tesseract v5.3.3.20231005
lm_pain_points.h
Go to the documentation of this file.
1
2// File: lm_pain_points.h
3// Description: Functions that utilize the knowledge about the properties
4// of the paths explored by the segmentation search in order
5// to generate "pain points" - the locations in the ratings
6// matrix which should be classified next.
7// Author: Rika Antonova
8//
9// (C) Copyright 2012, Google Inc.
10// Licensed under the Apache License, Version 2.0 (the "License");
11// you may not use this file except in compliance with the License.
12// You may obtain a copy of the License at
13// http://www.apache.org/licenses/LICENSE-2.0
14// Unless required by applicable law or agreed to in writing, software
15// distributed under the License is distributed on an "AS IS" BASIS,
16// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17// See the License for the specific language governing permissions and
18// limitations under the License.
19//
21
22#ifndef TESSERACT_WORDREC_PAIN_POINTS_H_
23#define TESSERACT_WORDREC_PAIN_POINTS_H_
24
25#include "genericheap.h" // for GenericHeap
26#include "matrix.h" // for MATRIX_COORD (ptr only), MatrixCoordPair
27#include "stopper.h" // for DANGERR
28
29namespace tesseract {
30
31class Dict;
32struct ViterbiStateEntry;
33class WERD_RES;
34
35// Heap of pain points used for determining where to chop/join.
37
38// Types of pain points (ordered in the decreasing level of importance).
44
46};
47
// Printable labels for the pain point types. The table is looked up
// directly by the numeric value of the pain point type, so the order of
// the entries must follow the order of the type values.
static char const *const LMPainPointsTypeName[] = {
    "LM_PPTYPE_BLAMER", // entry 0
    "LM_PPTYPE_AMBIGS", // entry 1
    "LM_PPTYPE_PATH",   // entry 2
    "LM_PPTYPE_SHAPE",  // entry 3
};
54
56public:
// Only declared here (no in-class initializer); the value is supplied by a
// definition outside this header.
// NOTE(review): by its name this is presumably a relaxed counterpart of the
// max character width/height ratio limit - confirm against the definition.
58 // If there is a significant drop in character ngram probability, or a
59 // dangerous ambiguity, make the thresholds on what blob combinations
60 // can be classified looser.
61 static const float kLooseMaxCharWhRatio;
62 // Returns a description of the type of a pain point.
64 return LMPainPointsTypeName[type];
65 }
66
67 LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb)
68 : max_heap_size_(max)
69 , max_char_wh_ratio_(rat)
70 , fixed_pitch_(fp)
71 , dict_(d)
72 , debug_level_(deb) {}
73 ~LMPainPoints() = default;
74
75 // Returns true if the heap of pain points of pp_type is not empty().
76 inline bool HasPainPoints(LMPainPointsType pp_type) const {
77 return !pain_points_heaps_[pp_type].empty();
78 }
79
// Declaration only; the implementation is outside this header.
// pp and priority are output parameters and are written through.
// NOTE(review): which heap is consulted first when several are non-empty is
// not visible here - presumably the order of the pain point types; confirm.
80 // Dequeues the next pain point from the pain points queue and copies
81 // its contents and priority to *pp and *priority.
82 // Returns LM_PPTYPE_NUM if the pain points queue is empty, otherwise the
// type of the pain point that was dequeued.
83 LMPainPointsType Deque(MATRIX_COORD *pp, float *priority);
84
85 // Clears pain points heap.
86 void Clear() {
87 for (auto &pain_points_heap : pain_points_heaps_) {
88 pain_points_heap.clear();
89 }
90 }
91
// Declaration only; the implementation is outside this header.
// NOTE(review): the "cells" are presumably those of the ratings matrix
// reachable through word_res - confirm against the implementation.
92 // For each cell, generate a "pain point" if the cell is not classified
93 // and has a left or right neighbor that was classified.
94 void GenerateInitial(WERD_RES *word_res);
95
// Declaration only; the implementation is outside this header.
// NOTE(review): vse presumably identifies the path to walk, and
// rating_cert_scale presumably scales the resulting priorities; neither is
// shown in this header - confirm against the implementation.
96 // Generate pain points from the given path.
97 void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res);
98
// Declaration only; the implementation is outside this header.
// fixpt is passed by const reference and so is not modified by this call.
// NOTE(review): how vse and word_res are used is not visible here - confirm
// against the implementation.
99 // Generate pain points from dangerous ambiguities in best choice.
100 void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res);
101
// Declaration only; the implementation is outside this header.
// NOTE(review): the comment below refers to chunks_record, but no such
// parameter exists; presumably the ratings matrix is reached through
// word_res - confirm and update the wording.
// NOTE(review): ok_to_extend and max_char_wh_ratio are not described here;
// their effect must be checked in the implementation.
102 // Adds a pain point to classify chunks_record->ratings(col, row).
103 // Returns true if a new pain point was added to an appropriate heap.
104 // Pain point priority is set to special_priority for pain points of
105 // LM_PPTYPE_AMBIG or LM_PPTYPE_PATH; for other pain points
106 // AssociateStats::gap_sum is used.
107 bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority,
108 bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res);
109
// Declaration only; the implementation is outside this header.
// index is the index of the blob that was split.
110 // Adjusts the pain point coordinates to cope with expansion of the ratings
111 // matrix due to a split of the blob with the given index.
112 void RemapForSplit(int index);
113
114private:
// One heap per pain point type: the array has LM_PPTYPE_NUM entries and is
// indexed by the LMPainPointsType value.
115 // Priority queues containing pain points generated by the language model.
116 // The priority is set by the language model components; adjustments like
117 // seam cost and width priority are factored into the priority.
118 PainPointHeap pain_points_heaps_[LM_PPTYPE_NUM];
119 // Maximum number of points to keep in the heap.
120 int max_heap_size_;
121 // Maximum character width/height ratio.
122 float max_char_wh_ratio_;
123 // Set to true if fixed pitch should be assumed.
124 bool fixed_pitch_;
125 // Cached pointer to dictionary. Stored from the constructor argument;
// nothing in this header frees it.
126 const Dict *dict_;
127 // Debug level for print statements.
128 int debug_level_;
129};
130
131} // namespace tesseract
132
133#endif // TESSERACT_WORDREC_PAIN_POINTS_H_
std::vector< DANGERR_INFO > DANGERR
Definition: stopper.h:47
type
Definition: upload.py:458
bool empty() const
Definition: genericheap.h:68
bool GeneratePainPoint(int col, int row, LMPainPointsType pp_type, float special_priority, bool ok_to_extend, float max_char_wh_ratio, WERD_RES *word_res)
void RemapForSplit(int index)
bool HasPainPoints(LMPainPointsType pp_type) const
LMPainPoints(int max, float rat, bool fp, const Dict *d, int deb)
void GenerateInitial(WERD_RES *word_res)
void GenerateFromPath(float rating_cert_scale, ViterbiStateEntry *vse, WERD_RES *word_res)
static const float kLooseMaxCharWhRatio
static const float kDefaultPainPointPriorityAdjustment
static const char * PainPointDescription(LMPainPointsType type)
void GenerateFromAmbigs(const DANGERR &fixpt, ViterbiStateEntry *vse, WERD_RES *word_res)
LMPainPointsType Deque(MATRIX_COORD *pp, float *priority)