tesseract v5.3.3.20231005
wordrec.h
Go to the documentation of this file.
1
2// File: wordrec.h
3// Description: wordrec class.
4// Author: Samuel Charron
5//
6// (C) Copyright 2006, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifndef TESSERACT_WORDREC_WORDREC_H_
20#define TESSERACT_WORDREC_WORDREC_H_
21
22#ifdef HAVE_CONFIG_H
23# include "config_auto.h" // DISABLED_LEGACY_ENGINE
24#endif
25
26#ifdef DISABLED_LEGACY_ENGINE
27
28# include <cstdint> // for int16_t, int32_t
29# include "classify.h" // for Classify
30# include "params.h" // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP...
31# include "ratngs.h" // for WERD_CHOICE
32
33namespace tesseract {
34class TessdataManager;
35}
36
37namespace tesseract {
38
39/* ccmain/tstruct.cpp */
40
41class TESS_API Wordrec : public Classify {
42public:
43 // config parameters
44
45 BOOL_VAR_H(wordrec_debug_blamer);
46 BOOL_VAR_H(wordrec_run_blamer);
47
48 // methods
49 Wordrec();
50 virtual ~Wordrec() = default;
51
52 // tface.cpp
53 void program_editup(const std::string &textbase, TessdataManager *init_classifier,
54 TessdataManager *init_dict);
55 void program_editdown(int32_t elasped_time);
56 int end_recog();
57 int dict_word(const WERD_CHOICE &word);
58
59 // Member variables
60 WERD_CHOICE *prev_word_best_choice_;
61};
62
63} // namespace tesseract
64
65#else // DISABLED_LEGACY_ENGINE not defined
66
67# include <memory>
68# include "associate.h"
69# include "chop.h" // for PointHeap, MAX_NUM_POINTS
70# include "classify.h" // for Classify
71# include "dict.h"
72# include "elst.h" // for ELIST_ITERATOR, ELISTIZEH, ELIST_LINK
73# include "findseam.h" // for SeamQueue, SeamPile
74# include "language_model.h"
75# include "matrix.h"
76# include "oldlist.h" // for LIST
77# include "params.h" // for INT_VAR_H, IntParam, BOOL_VAR_H, BoolP...
78# include "points.h" // for ICOORD
79# include "ratngs.h" // for BLOB_CHOICE_LIST (ptr only), BLOB_CHOI...
80# include "seam.h" // for SEAM (ptr only), PRIORITY
81# include "stopper.h" // for DANGERR
82
83# include <cstdint> // for int16_t, int32_t
84
85namespace tesseract {
86
87class EDGEPT_CLIST;
88class MATRIX;
89class TBOX;
90class UNICHARSET;
91class WERD_RES;
92
93class LMPainPoints;
94class TessdataManager;
95struct BestChoiceBundle;
96
97struct BlamerBundle;
98struct EDGEPT;
99struct MATRIX_COORD;
100struct SPLIT;
101struct TBLOB;
102struct TESSLINE;
103struct TWERD;
104
// A class for storing which nodes are to be processed by the segmentation
// search. There is a single SegSearchPending for each column in the ratings
// matrix, and it indicates whether the segsearch should combine all
// BLOB_CHOICES in the column, or just the given row with the parents
// corresponding to *this SegSearchPending, and whether only updated parent
// ViterbiStateEntries should be combined, or all, with the BLOB_CHOICEs.
class SegSearchPending {
public:
  // Starts with no work to do (same state as after Clear()).
  SegSearchPending() = default;

  // Marks the whole column as just classified. Used to start a search on
  // a newly initialized ratings matrix.
  void SetColumnClassified() {
    column_classified_ = true;
  }
  // Marks the matrix entry at the given row as just classified.
  // Used after classifying a new matrix cell.
  // Additional to, not overriding a previous RevisitWholeColumn.
  void SetBlobClassified(int row) {
    classified_row_ = row;
  }
  // Marks the whole column as needing work, but not just classified.
  // Used when the parent vse list is updated.
  // Additional to, not overriding a previous SetBlobClassified.
  void RevisitWholeColumn() {
    revisit_whole_column_ = true;
  }

  // Clears *this to indicate no work to do.
  void Clear() {
    classified_row_ = -1;
    revisit_whole_column_ = false;
    column_classified_ = false;
  }

  // Returns true if there are updates to do in the column that *this
  // represents.
  bool WorkToDo() const {
    return revisit_whole_column_ || column_classified_ || classified_row_ >= 0;
  }
  // Returns true if the given row was just classified, either individually
  // or because the whole column was marked as classified.
  bool IsRowJustClassified(int row) const {
    return row == classified_row_ || column_classified_;
  }
  // Returns the single row to process if there is only one, otherwise -1.
  // Any whole-column flag wins over the single classified row.
  int SingleRow() const {
    return (revisit_whole_column_ || column_classified_) ? -1 : classified_row_;
  }

private:
  // If non-negative, indicates the single row in the ratings matrix that has
  // just been classified, and so should be combined with all the parents in the
  // column that this SegSearchPending represents.
  // Operates independently of revisit_whole_column.
  int classified_row_ = -1;
  // If revisit_whole_column is true, then all BLOB_CHOICEs in this column will
  // be processed, but classified_row can indicate a row that is newly
  // classified. Overridden if column_classified is true.
  bool revisit_whole_column_ = false;
  // If column_classified is true, parent vses are processed with all rows
  // regardless of whether they are just updated, overriding
  // revisit_whole_column and classified_row.
  bool column_classified_ = false;
};
170
171/* ccmain/tstruct.cpp *********************************************************/
172class FRAGMENT : public ELIST_LINK {
173public:
174 FRAGMENT() { // constructor
175 }
176 FRAGMENT(EDGEPT *head_pt, // start
177 EDGEPT *tail_pt); // end
178
179 ICOORD head; // coords of start
180 ICOORD tail; // coords of end
181 EDGEPT *headpt; // start point
182 EDGEPT *tailpt; // end point
183};
185
186class TESS_API Wordrec : public Classify {
187public:
188 // config parameters *******************************************************
189 BOOL_VAR_H(merge_fragments_in_matrix);
190 BOOL_VAR_H(wordrec_enable_assoc);
191 BOOL_VAR_H(force_word_assoc);
192 INT_VAR_H(repair_unchopped_blobs);
193 double_VAR_H(tessedit_certainty_threshold);
194 INT_VAR_H(chop_debug);
195 BOOL_VAR_H(chop_enable);
196 BOOL_VAR_H(chop_vertical_creep);
197 INT_VAR_H(chop_split_length);
198 INT_VAR_H(chop_same_distance);
199 INT_VAR_H(chop_min_outline_points);
200 INT_VAR_H(chop_seam_pile_size);
201 BOOL_VAR_H(chop_new_seam_pile);
202 INT_VAR_H(chop_inside_angle);
203 INT_VAR_H(chop_min_outline_area);
204 double_VAR_H(chop_split_dist_knob);
205 double_VAR_H(chop_overlap_knob);
206 double_VAR_H(chop_center_knob);
207 INT_VAR_H(chop_centered_maxwidth);
208 double_VAR_H(chop_sharpness_knob);
209 double_VAR_H(chop_width_change_knob);
210 double_VAR_H(chop_ok_split);
211 double_VAR_H(chop_good_split);
212 INT_VAR_H(chop_x_y_weight);
213 BOOL_VAR_H(assume_fixed_pitch_char_segment);
214 INT_VAR_H(wordrec_debug_level);
215 INT_VAR_H(wordrec_max_join_chunks);
216 BOOL_VAR_H(wordrec_skip_no_truth_words);
217 BOOL_VAR_H(wordrec_debug_blamer);
218 BOOL_VAR_H(wordrec_run_blamer);
219 INT_VAR_H(segsearch_debug_level);
220 INT_VAR_H(segsearch_max_pain_points);
221 INT_VAR_H(segsearch_max_futile_classifications);
222 double_VAR_H(segsearch_max_char_wh_ratio);
223 BOOL_VAR_H(save_alt_choices);
224
225 // methods from wordrec/*.cpp ***********************************************
226 Wordrec();
227 ~Wordrec() override = default;
228
229 // Fills word->alt_choices with alternative paths found during
230 // chopping/segmentation search that are kept in best_choices.
231 void SaveAltChoices(const LIST &best_choices, WERD_RES *word);
232
233 // Fills character choice lattice in the given BlamerBundle
234 // using the given ratings matrix and best choice list.
235 void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
236 const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);
237
238 // Calls fill_lattice_ member function
239 // (assumes that fill_lattice_ is not nullptr).
240 void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
241 const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) {
242 (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
243 }
244
245 // tface.cpp
246 void program_editup(const std::string &textbase, TessdataManager *init_classifier,
247 TessdataManager *init_dict);
248 void cc_recog(WERD_RES *word);
249 void program_editdown(int32_t elasped_time);
250 void set_pass1();
251 void set_pass2();
252 int end_recog();
253 BLOB_CHOICE_LIST *call_matcher(TBLOB *blob);
254 int dict_word(const WERD_CHOICE &word);
255 // wordclass.cpp
256 BLOB_CHOICE_LIST *classify_blob(TBLOB *blob, const char *string, ScrollView::Color color,
257 BlamerBundle *blamer_bundle);
258
259 // segsearch.cpp
260 // SegSearch works on the lower diagonal matrix of BLOB_CHOICE_LISTs.
261 // Each entry in the matrix represents the classification choice
262 // for a chunk, i.e. an entry in row 2, column 1 represents the list
263 // of ratings for the chunks 1 and 2 classified as a single blob.
264 // The entries on the diagonal of the matrix are classifier choice lists
265 // for a single chunk from the maximal segmentation.
266 //
267 // The ratings matrix given to SegSearch represents the segmentation
268 // graph / trellis for the current word. The nodes in the graph are the
269 // individual BLOB_CHOICEs in each of the BLOB_CHOICE_LISTs in the ratings
270 // matrix. The children of each node (nodes connected by outgoing links)
271 // are the entries in the column that is equal to node's row+1. The parents
272 // (nodes connected by the incoming links) are the entries in the row that
273 // is equal to the node's column-1. Here is an example ratings matrix:
274 //
275 // 0 1 2 3 4
276 // -------------------------
277 // 0| c,( |
278 // 1| d l,1 |
279 // 2| o |
280 // 3| c,( |
281 // 4| g,y l,1 |
282 // -------------------------
283 //
284 // In the example above node "o" has children (outgoing connection to nodes)
285 // "c","(","g","y" and parents (incoming connections from nodes) "l","1","d".
286 //
287 // The objective of the search is to find the least cost path, where the cost
288 // is determined by the language model components and the properties of the
289 // cut between the blobs on the path. SegSearch starts by populating the
290 // matrix with the all the entries that were classified by the chopper and
291 // finding the initial best path. Based on the classifier ratings, language
292 // model scores and the properties of each cut, a list of "pain points" is
293 // constructed - those are the points on the path where the choices do not
294 // look consistent with the neighboring choices, the cuts look particularly
295 // problematic, or the certainties of the blobs are low. The most troublesome
296 // "pain point" is picked from the list and the new entry in the ratings
297 // matrix corresponding to this "pain point" is filled in. Then the language
298 // model state is updated to reflect the new classification and the new
299 // "pain points" are added to the list and the next most troublesome
300 // "pain point" is determined. This continues until either the word choice
301 // composed from the best paths in the segmentation graph is "good enough"
302 // (e.g. above a certain certainty threshold, is an unambiguous dictionary
303 // word, etc) or there are no more "pain points" to explore.
304 //
305 // If associate_blobs is set to false no new classifications will be done
306 // to combine blobs. Segmentation search will run only one "iteration"
307 // on the classifications already recorded in chunks_record.ratings.
308 //
309 // Note: this function assumes that word_res, best_choice_bundle arguments
310 // are not nullptr.
311 void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
312 BlamerBundle *blamer_bundle);
313
314 // Setup and run just the initial segsearch on an established matrix,
315 // without doing any additional chopping or joining.
316 // (Internal factored version that can be used as part of the main SegSearch.)
317 void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
318 std::vector<SegSearchPending> *pending,
319 BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle);
320
321 // chop.cpp
322 PRIORITY point_priority(EDGEPT *point);
323 void add_point_to_list(PointHeap *point_heap, EDGEPT *point);
324 // Returns true if the edgept supplied as input is an inside angle. This
325 // is determined by the angular change of the vectors from point to point.
326 bool is_inside_angle(EDGEPT *pt);
327 int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3);
328 EDGEPT *pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist);
329 void prioritize_points(TESSLINE *outline, PointHeap *points);
330 void new_min_point(EDGEPT *local_min, PointHeap *points);
331 void new_max_point(EDGEPT *local_max, PointHeap *points);
332 void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point,
333 EDGEPT_CLIST *new_points);
334
335 // chopper.cpp
336 SEAM *attempt_blob_chop(TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob,
337 const std::vector<SEAM *> &seams);
338 SEAM *chop_numbered_blob(TWERD *word, int32_t blob_number, bool italic_blob,
339 const std::vector<SEAM *> &seams);
340 SEAM *chop_overlapping_blob(const std::vector<TBOX> &boxes, bool italic_blob, WERD_RES *word_res,
341 unsigned *blob_number);
342 SEAM *improve_one_blob(const std::vector<BLOB_CHOICE *> &blob_choices, DANGERR *fixpt,
343 bool split_next_to_fragment, bool italic_blob, WERD_RES *word,
344 unsigned *blob_number);
345 SEAM *chop_one_blob(const std::vector<TBOX> &boxes,
346 const std::vector<BLOB_CHOICE *> &blob_choices, WERD_RES *word_res,
347 unsigned *blob_number);
348 void chop_word_main(WERD_RES *word);
349 void improve_by_chopping(float rating_cert_scale, WERD_RES *word,
350 BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle,
351 LMPainPoints *pain_points, std::vector<SegSearchPending> *pending);
352 int select_blob_to_split(const std::vector<BLOB_CHOICE *> &blob_choices, float rating_ceiling,
353 bool split_next_to_fragment);
354 int select_blob_to_split_from_fixpt(DANGERR *fixpt);
355
356 // findseam.cpp
357 void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams);
358 void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority,
359 SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile);
360 void combine_seam(const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue);
361 SEAM *pick_good_seam(TBLOB *blob);
362 void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue,
363 SeamPile *seam_pile, SEAM **seam, TBLOB *blob);
364 void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], int16_t num_points,
365 EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile,
366 SEAM **seam, TBLOB *blob);
367
368 // gradechop.cpp
369 PRIORITY grade_split_length(SPLIT *split);
370 PRIORITY grade_sharpness(SPLIT *split);
371
372 // outlines.cpp
373 bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt);
374
375 // pieces.cpp
376 virtual BLOB_CHOICE_LIST *classify_piece(const std::vector<SEAM *> &seams, int16_t start,
377 int16_t end, const char *description, TWERD *word,
378 BlamerBundle *blamer_bundle);
379
380 // Member variables.
381
382 std::unique_ptr<LanguageModel> language_model_;
384 // Stores the best choice for the previous word in the paragraph.
385 // This variable is modified by PAGE_RES_IT when iterating over
386 // words to OCR on the page.
388
389 // Function used to fill char choice lattices.
390 void (Wordrec::*fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices,
391 const UNICHARSET &unicharset, BlamerBundle *blamer_bundle);
392
393protected:
394 inline bool SegSearchDone(int num_futile_classifications) {
395 return (language_model_->AcceptableChoiceFound() ||
396 num_futile_classifications >= segsearch_max_futile_classifications);
397 }
398
399 // Updates the language model state recorded for the child entries specified
400 // in pending[starting_col]. Enqueues the children of the updated entries
401 // into pending and proceeds to update (and remove from pending) all the
402 // remaining entries in pending[col] (col >= starting_col). Upon termination
403 // of this function all the pending[col] lists will be empty.
404 //
405 // The arguments:
406 //
407 // starting_col: index of the column in chunks_record->ratings from
408 // which the update should be started
409 //
410 // pending: list of entries listing chunks_record->ratings entries
411 // that should be updated
412 //
413 // pain_points: priority heap listing the pain points generated by
414 // the language model
415 //
416 // temp_pain_points: temporary storage for tentative pain points generated
417 // by the language model after a single call to LanguageModel::UpdateState()
418 // (the argument is passed in rather than created before each
419 // LanguageModel::UpdateState() call to avoid dynamic memory re-allocation)
420 //
421 // best_choice_bundle: a collection of variables that should be updated
422 // if a new best choice is found
423 //
424 void UpdateSegSearchNodes(float rating_cert_scale, int starting_col,
425 std::vector<SegSearchPending> *pending, WERD_RES *word_res,
426 LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle,
427 BlamerBundle *blamer_bundle);
428
429 // Process the given pain point: classify the corresponding blob, enqueue
430 // new pain points to join the newly classified blob with its neighbors.
431 void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point,
432 const char *pain_point_type,
433 std::vector<SegSearchPending> *pending, WERD_RES *word_res,
434 LMPainPoints *pain_points, BlamerBundle *blamer_bundle);
435 // Resets enough of the results so that the Viterbi search is re-run.
436 // Needed when the n-gram model is enabled, as the multi-length comparison
437 // implementation will re-value existing paths to worse values.
438 void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
439 std::vector<SegSearchPending> &pending);
440
441 // Add pain points for classifying blobs on the correct segmentation path
442 // (so that we can evaluate correct segmentation path and discover the reason
443 // for incorrect result).
444 void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points,
445 BlamerBundle *blamer_bundle, std::string &blamer_debug);
446};
447
448} // namespace tesseract
449
450#endif // DISABLED_LEGACY_ENGINE
451
452#endif // TESSERACT_WORDREC_WORDREC_H_
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:803
#define MAX_NUM_POINTS
Definition: chop.h:28
@ TBOX
float PRIORITY
Definition: seam.h:31
BOOL_VAR_H(wordrec_display_splits)
std::vector< DANGERR_INFO > DANGERR
Definition: stopper.h:47
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:43
integer coordinate
Definition: points.h:36
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:226
void SetBlobClassified(int row)
Definition: wordrec.h:124
bool IsRowJustClassified(int row) const
Definition: wordrec.h:147
EDGEPT * headpt
Definition: wordrec.h:181
EDGEPT * tailpt
Definition: wordrec.h:182
FRAGMENT(EDGEPT *head_pt, EDGEPT *tail_pt)
BOOL_VAR_H(assume_fixed_pitch_char_segment)
INT_VAR_H(segsearch_debug_level)
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:387
INT_VAR_H(wordrec_debug_level)
double_VAR_H(chop_width_change_knob)
BOOL_VAR_H(merge_fragments_in_matrix)
void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:240
INT_VAR_H(segsearch_max_futile_classifications)
INT_VAR_H(chop_debug)
INT_VAR_H(chop_inside_angle)
INT_VAR_H(chop_same_distance)
BOOL_VAR_H(wordrec_skip_no_truth_words)
INT_VAR_H(chop_seam_pile_size)
void SaveAltChoices(const LIST &best_choices, WERD_RES *word)
BOOL_VAR_H(wordrec_debug_blamer)
double_VAR_H(chop_center_knob)
INT_VAR_H(chop_split_length)
void FillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
INT_VAR_H(repair_unchopped_blobs)
INT_VAR_H(chop_x_y_weight)
PRIORITY pass2_ok_split
Definition: wordrec.h:383
BOOL_VAR_H(chop_enable)
BOOL_VAR_H(wordrec_run_blamer)
BOOL_VAR_H(force_word_assoc)
double_VAR_H(chop_ok_split)
double_VAR_H(chop_sharpness_knob)
bool SegSearchDone(int num_futile_classifications)
Definition: wordrec.h:394
double_VAR_H(segsearch_max_char_wh_ratio)
INT_VAR_H(chop_min_outline_area)
double_VAR_H(chop_good_split)
INT_VAR_H(segsearch_max_pain_points)
BOOL_VAR_H(chop_vertical_creep)
BOOL_VAR_H(wordrec_enable_assoc)
double_VAR_H(tessedit_certainty_threshold)
~Wordrec() override=default
BOOL_VAR_H(save_alt_choices)
BOOL_VAR_H(chop_new_seam_pile)
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:382
INT_VAR_H(chop_centered_maxwidth)
double_VAR_H(chop_overlap_knob)
double_VAR_H(chop_split_dist_knob)
INT_VAR_H(wordrec_max_join_chunks)
INT_VAR_H(chop_min_outline_points)
#define TESS_API
Definition: export.h:32