tesseract  4.00.00dev
tesseractclass.h
Go to the documentation of this file.
1 // File: tesseractclass.h
3 // Description: The Tesseract class. It holds/owns everything needed
4 // to run Tesseract on a single language, and also a set of
5 // sub-Tesseracts to run sub-languages. For thread safety, *every*
6 // global variable goes in here, directly, or indirectly.
7 // This makes it safe to run multiple Tesseracts in different
8 // threads in parallel, and keeps the different language
9 // instances separate.
10 // Author: Ray Smith
11 // Created: Fri Mar 07 08:17:01 PST 2008
12 //
13 // (C) Copyright 2008, Google Inc.
14 // Licensed under the Apache License, Version 2.0 (the "License");
15 // you may not use this file except in compliance with the License.
16 // You may obtain a copy of the License at
17 // http://www.apache.org/licenses/LICENSE-2.0
18 // Unless required by applicable law or agreed to in writing, software
19 // distributed under the License is distributed on an "AS IS" BASIS,
20 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21 // See the License for the specific language governing permissions and
22 // limitations under the License.
23 //
25 
26 #ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H_
27 #define TESSERACT_CCMAIN_TESSERACTCLASS_H_
28 
29 #include "allheaders.h"
30 #include "control.h"
31 #include "debugpixa.h"
32 #include "devanagari_processing.h"
33 #include "docqual.h"
34 #include "genericvector.h"
35 #include "ocrclass.h"
36 #include "params.h"
37 #include "textord.h"
38 #include "wordrec.h"
39 
40 class BLOB_CHOICE_LIST_CLIST;
41 class BLOCK_LIST;
42 struct OSResults;
43 class PAGE_RES;
44 class PAGE_RES_IT;
45 struct Pix;
46 class ROW;
47 class SVMenuNode;
48 class TBOX;
49 class TO_BLOCK_LIST;
50 class WERD;
51 class WERD_CHOICE;
52 class WERD_RES;
53 
54 
55 // Top-level class for all tesseract global instance data.
56 // This class either holds or points to all data used by an instance
57 // of Tesseract, including the memory allocator. When this is
58 // complete, Tesseract will be thread-safe. UNTIL THEN, IT IS NOT!
59 //
60 // NOTE to developers: Do not create cyclic dependencies through this class!
61 // The directory dependency tree must remain a tree! To keep this clean,
62 // lower-level code (eg in ccutil, the bottom level) must never need to
63 // know about the content of a higher-level directory.
64 // The following scheme will grant the easiest access to lower-level
65 // global members without creating a cyclic dependency:
66 //
67 // Class Hierarchy (^ = inheritance):
68 //
69 // CCUtil (ccutil/ccutil.h)
70 // ^ Members include: UNICHARSET
71 // CUtil (cutil/cutil_class.h)
72 // ^ Members include: TBLOB*, TEXTBLOCK*
73 // CCStruct (ccstruct/ccstruct.h)
74 // ^ Members include: Image
75 // Classify (classify/classify.h)
76 // ^ Members include: Dict
77 // WordRec (wordrec/wordrec.h)
78 // ^ Members include: WERD*, DENORM*
79 // Tesseract (ccmain/tesseractclass.h)
80 // Members include: Pix*
81 //
82 // Other important classes:
83 //
84 // TessBaseAPI (api/baseapi.h)
85 // Members include: BLOCK_LIST*, PAGE_RES*,
86 // Tesseract*, ImageThresholder*
87 // Dict (dict/dict.h)
88 // Members include: Image* (private)
89 //
90 // NOTE: that each level contains members that correspond to global
91 // data that is defined (and used) at that level, not necessarily where
92 // the type is defined so for instance:
93 // BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs");
94 // goes inside the Textord class, not the cc_util class.
95 
96 namespace tesseract {
97 
98 class ColumnFinder;
99 class DocumentData;
100 class EquationDetect;
101 class ImageData;
102 class LSTMRecognizer;
103 class Tesseract;
104 
105 // A collection of various variables for statistics and debugging.
109  doc_blob_quality(0),
110  doc_outline_errs(0),
111  doc_char_quality(0),
112  good_char_count(0),
114  word_count(0),
115  dict_words(0),
116  tilde_crunch_written(false),
117  last_char_was_newline(true),
118  last_char_was_tilde(false),
120 
127  inT32 word_count; // count of word in the document
128  inT32 dict_words; // number of dictionary words in the document
129  STRING dump_words_str; // accumulator used by dump_words()
130  // Flags used by write_results()
135 };
136 
137 // Struct to hold all the pointers to relevant data for processing a word.
138 struct WordData {
139  WordData() : word(NULL), row(NULL), block(NULL), prev_word(NULL) {}
140  explicit WordData(const PAGE_RES_IT& page_res_it)
141  : word(page_res_it.word()), row(page_res_it.row()->row),
142  block(page_res_it.block()->block), prev_word(NULL) {}
143  WordData(BLOCK* block_in, ROW* row_in, WERD_RES* word_res)
144  : word(word_res), row(row_in), block(block_in), prev_word(NULL) {}
145 
151 };
152 
153 // Definition of a Tesseract WordRecognizer. The WordData provides the context
154 // of row/block, in_word holds an initialized, possibly pre-classified word,
155 // that the recognizer may or may not consume (but if so it sets *in_word=NULL)
156 // and produces one or more output words in out_words, which may be the
157 // consumed in_word, or may be generated independently.
158 // This api allows both a conventional tesseract classifier to work, or a
159 // line-level classifier that generates multiple words from a merged input.
160 typedef void (Tesseract::*WordRecognizer)(const WordData& word_data,
161  WERD_RES** in_word,
162  PointerVector<WERD_RES>* out_words);
163 
164 class Tesseract : public Wordrec {
165  public:
166  Tesseract();
167  ~Tesseract();
168 
169  // Return appropriate dictionary
170  Dict& getDict() override;
171 
172  // Clear as much used memory as possible without resetting the adaptive
173  // classifier or losing any other classifier data.
174  void Clear();
175  // Clear all memory of adaption for this and all subclassifiers.
176  void ResetAdaptiveClassifier();
177  // Clear the document dictionary for this and all subclassifiers.
178  void ResetDocumentDictionary();
179 
180  // Set the equation detector.
181  void SetEquationDetect(EquationDetect* detector);
182 
183  // Simple accessors.
184  const FCOORD& reskew() const {
185  return reskew_;
186  }
187  // Destroy any existing pix and return a pointer to the pointer.
189  pixDestroy(&pix_binary_);
190  return &pix_binary_;
191  }
192  Pix* pix_binary() const {
193  return pix_binary_;
194  }
195  Pix* pix_grey() const {
196  return pix_grey_;
197  }
198  void set_pix_grey(Pix* grey_pix) {
199  pixDestroy(&pix_grey_);
200  pix_grey_ = grey_pix;
201  }
202  Pix* pix_original() const { return pix_original_; }
203  // Takes ownership of the given original_pix.
204  void set_pix_original(Pix* original_pix) {
205  pixDestroy(&pix_original_);
206  pix_original_ = original_pix;
207  // Clone to sublangs as well.
208  for (int i = 0; i < sub_langs_.size(); ++i)
209  sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
210  : nullptr);
211  }
212  // Returns a pointer to a Pix representing the best available resolution image
213  // of the page, with best available bit depth as second priority. Result can
214  // be of any bit depth, but never color-mapped, as that has always been
215  // removed. Note that in grey and color, 0 is black and 255 is
216  // white. If the input was binary, then black is 1 and white is 0.
217  // To tell the difference pixGetDepth() will return 32, 8 or 1.
218  // In any case, the return value is a borrowed Pix, and should not be
219  // deleted or pixDestroyed.
220  Pix* BestPix() const {
221  if (pixGetWidth(pix_original_) == ImageWidth())
222  return pix_original_;
223  else if (pix_grey_ != NULL)
224  return pix_grey_;
225  else
226  return pix_binary_;
227  }
228  void set_pix_thresholds(Pix* thresholds) {
229  pixDestroy(&pix_thresholds_);
230  pix_thresholds_ = thresholds;
231  }
232  int source_resolution() const {
233  return source_resolution_;
234  }
235  void set_source_resolution(int ppi) {
236  source_resolution_ = ppi;
237  }
238  int ImageWidth() const {
239  return pixGetWidth(pix_binary_);
240  }
241  int ImageHeight() const {
242  return pixGetHeight(pix_binary_);
243  }
244  Pix* scaled_color() const {
245  return scaled_color_;
246  }
247  int scaled_factor() const {
248  return scaled_factor_;
249  }
250  void SetScaledColor(int factor, Pix* color) {
251  scaled_factor_ = factor;
252  scaled_color_ = color;
253  }
254  const Textord& textord() const {
255  return textord_;
256  }
258  return &textord_;
259  }
260 
261  bool right_to_left() const {
262  return right_to_left_;
263  }
264  int num_sub_langs() const {
265  return sub_langs_.size();
266  }
267  Tesseract* get_sub_lang(int index) const {
268  return sub_langs_[index];
269  }
270  // Returns true if any language uses Tesseract (as opposed to LSTM).
271  bool AnyTessLang() const {
272  if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true;
273  for (int i = 0; i < sub_langs_.size(); ++i) {
274  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true;
275  }
276  return false;
277  }
278  // Returns true if any language uses the LSTM.
279  bool AnyLSTMLang() const {
280  if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) return true;
281  for (int i = 0; i < sub_langs_.size(); ++i) {
282  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY)
283  return true;
284  }
285  return false;
286  }
287 
288  void SetBlackAndWhitelist();
289 
290  // Perform steps to prepare underlying binary image/other data structures for
291  // page segmentation. Uses the strategy specified in the global variable
292  // pageseg_devanagari_split_strategy for performing splitting while preparing for
293  // page segmentation.
294  void PrepareForPageseg();
295 
296  // Perform steps to prepare underlying binary image/other data structures for
297  // Tesseract OCR. The current segmentation is required by this method.
298  // Uses the strategy specified in the global variable
299  // ocr_devanagari_split_strategy for performing splitting while preparing for
300  // Tesseract ocr.
301  void PrepareForTessOCR(BLOCK_LIST* block_list,
302  Tesseract* osd_tess, OSResults* osr);
303 
304  int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
305  Tesseract* osd_tess, OSResults* osr);
306  void SetupWordScripts(BLOCK_LIST* blocks);
307  int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
308  TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
309  Tesseract* osd_tess, OSResults* osr);
310  ColumnFinder* SetupPageSegAndDetectOrientation(
311  PageSegMode pageseg_mode, BLOCK_LIST* blocks, Tesseract* osd_tess,
312  OSResults* osr, TO_BLOCK_LIST* to_blocks, Pix** photo_mask_pix,
313  Pix** music_mask_pix);
314  // par_control.cpp
315  void PrerecAllWordsPar(const GenericVector<WordData>& words);
316 
318  // Generates training data for training a line recognizer, eg LSTM.
319  // Breaks the page into lines, according to the boxes, and writes them to a
320  // serialized DocumentData based on output_basename.
321  void TrainLineRecognizer(const STRING& input_imagename,
322  const STRING& output_basename,
323  BLOCK_LIST *block_list);
324  // Generates training data for training a line recognizer, eg LSTM.
325  // Breaks the boxes into lines, normalizes them, converts to ImageData and
326  // appends them to the given training_data.
327  void TrainFromBoxes(const GenericVector<TBOX>& boxes,
328  const GenericVector<STRING>& texts,
329  BLOCK_LIST *block_list,
330  DocumentData* training_data);
331 
332  // Returns an Imagedata containing the image of the given textline,
333  // and ground truth boxes/truth text if available in the input.
334  // The image is not normalized in any way.
335  ImageData* GetLineData(const TBOX& line_box,
336  const GenericVector<TBOX>& boxes,
337  const GenericVector<STRING>& texts,
338  int start_box, int end_box,
339  const BLOCK& block);
340  // Helper gets the image of a rectangle, using the block.re_rotation() if
341  // needed to get to the image, and rotating the result back to horizontal
342  // layout. (CJK characters will be on their left sides) The vertical text flag
343  // is set in the returned ImageData if the text was originally vertical, which
344  // can be used to invoke a different CJK recognition engine. The revised_box
345  // is also returned to enable calculation of output bounding boxes.
346  ImageData* GetRectImage(const TBOX& box, const BLOCK& block, int padding,
347  TBOX* revised_box) const;
348  // Recognizes a word or group of words, converting to WERD_RES in *words.
349  // Analogous to classify_word_pass1, but can handle a group of words as well.
350  void LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
351  PointerVector<WERD_RES>* words);
352  // Apply segmentation search to the given set of words, within the constraints
353  // of the existing ratings matrix. If there is already a best_choice on a word
354  // leaves it untouched and just sets the done/accepted etc flags.
355  void SearchWords(PointerVector<WERD_RES>* words);
356 
358  bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box,
359  const char* word_config, int pass);
360  // Sets up the words ready for whichever engine is to be run
361  void SetupAllWordsPassN(int pass_n,
362  const TBOX* target_word_box,
363  const char* word_config,
364  PAGE_RES* page_res,
365  GenericVector<WordData>* words);
366  // Sets up the single word ready for whichever engine is to be run.
367  void SetupWordPassN(int pass_n, WordData* word);
368  // Runs word recognition on all the words.
369  bool RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
370  PAGE_RES_IT* pr_it,
371  GenericVector<WordData>* words);
372  bool recog_all_words(PAGE_RES* page_res,
373  ETEXT_DESC* monitor,
374  const TBOX* target_word_box,
375  const char* word_config,
376  int dopasses);
377  void rejection_passes(PAGE_RES* page_res,
378  ETEXT_DESC* monitor,
379  const TBOX* target_word_box,
380  const char* word_config);
381  void bigram_correction_pass(PAGE_RES *page_res);
382  void blamer_pass(PAGE_RES* page_res);
383  // Sets script positions and detects smallcaps on all output words.
384  void script_pos_pass(PAGE_RES* page_res);
385  // Helper to recognize the word using the given (language-specific) tesseract.
386  // Returns positive if this recognizer found more new best words than the
387  // number kept from best_words.
388  int RetryWithLanguage(const WordData& word_data, WordRecognizer recognizer,
389  bool debug, WERD_RES** in_word,
390  PointerVector<WERD_RES>* best_words);
391  // Moves good-looking "noise"/diacritics from the reject list to the main
392  // blob list on the current word. Returns true if anything was done, and
393  // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
394  bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
395  bool* make_next_word_fuzzy);
396  // Attempts to put noise/diacritic outlines into the blobs that they overlap.
397  // Input: a set of noisy outlines that probably belong to the real_word.
398  // Output: outlines that overlapped blobs are set to NULL and put back into
399  // the word, either in the blobs or in the reject list.
400  void AssignDiacriticsToOverlappingBlobs(
401  const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
402  PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
403  GenericVector<bool>* overlapped_any_blob,
404  GenericVector<C_BLOB*>* target_blobs);
405  // Attempts to assign non-overlapping outlines to their nearest blobs or
406  // make new blobs out of them.
407  void AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE*>& outlines,
408  int pass, WERD* real_word, PAGE_RES_IT* pr_it,
409  GenericVector<bool>* word_wanted,
410  GenericVector<C_BLOB*>* target_blobs);
411  // Starting with ok_outlines set to indicate which outlines overlap the blob,
412  // chooses the optimal set (approximately) and returns true if any outlines
413  // are desired, in which case ok_outlines indicates which ones.
414  bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
415  PAGE_RES_IT* pr_it, C_BLOB* blob,
416  const GenericVector<C_OUTLINE*>& outlines,
417  int num_outlines,
418  GenericVector<bool>* ok_outlines);
419  // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
420  // the inclusion of the outlines, and returns the certainty of the raw choice.
421  float ClassifyBlobPlusOutlines(const GenericVector<bool>& ok_outlines,
422  const GenericVector<C_OUTLINE*>& outlines,
423  int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
424  STRING* best_str);
425  // Classifies the given blob (part of word_data->word->word) as an individual
426  // word, using languages, chopper etc, returning only the certainty of the
427  // best raw choice, and undoing all the work done to fake out the word.
428  float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
429  STRING* best_str, float* c2);
430  void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
431  WordData* word_data);
432  void classify_word_pass1(const WordData& word_data,
433  WERD_RES** in_word,
434  PointerVector<WERD_RES>* out_words);
435  void recog_pseudo_word(PAGE_RES* page_res, // blocks to check
436  TBOX &selection_box);
437 
438  void fix_rep_char(PAGE_RES_IT* page_res_it);
439 
440  ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET& char_set,
441  const char *s,
442  const char *lengths);
443  void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK* block);
444  void classify_word_pass2(const WordData& word_data,
445  WERD_RES** in_word,
446  PointerVector<WERD_RES>* out_words);
447  void ReportXhtFixResult(bool accept_new_word, float new_x_ht,
448  WERD_RES* word, WERD_RES* new_word);
449  bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row);
450  bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row);
451  // Runs recognition with the test baseline shift and x-height and returns true
452  // if there was an improvement in recognition result.
453  bool TestNewNormalization(int original_misfits, float baseline_shift,
454  float new_x_ht, WERD_RES *word, BLOCK* block,
455  ROW *row);
456  BOOL8 recog_interactive(PAGE_RES_IT* pr_it);
457 
458  // Set fonts of this word.
459  void set_word_fonts(WERD_RES *word);
460  void font_recognition_pass(PAGE_RES* page_res);
461  void dictionary_correction_pass(PAGE_RES* page_res);
462  BOOL8 check_debug_pt(WERD_RES *word, int location);
463 
465  bool SubAndSuperscriptFix(WERD_RES *word_res);
466  void GetSubAndSuperscriptCandidates(const WERD_RES *word,
467  int *num_rebuilt_leading,
468  ScriptPos *leading_pos,
469  float *leading_certainty,
470  int *num_rebuilt_trailing,
471  ScriptPos *trailing_pos,
472  float *trailing_certainty,
473  float *avg_certainty,
474  float *unlikely_threshold);
475  WERD_RES *TrySuperscriptSplits(int num_chopped_leading,
476  float leading_certainty,
477  ScriptPos leading_pos,
478  int num_chopped_trailing,
479  float trailing_certainty,
480  ScriptPos trailing_pos,
481  WERD_RES *word,
482  bool *is_good,
483  int *retry_leading,
484  int *retry_trailing);
485  bool BelievableSuperscript(bool debug,
486  const WERD_RES &word,
487  float certainty_threshold,
488  int *left_ok,
489  int *right_ok) const;
490 
492 
493  void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box);
494  void write_results(PAGE_RES_IT &page_res_it, // full info
495  char newline_type, // type of newline
496  BOOL8 force_eol // override tilde crunch?
497  );
498  void set_unlv_suspects(WERD_RES *word);
499  UNICHAR_ID get_rep_char(WERD_RES *word); // what char is repeated?
500  BOOL8 acceptable_number_string(const char *s,
501  const char *lengths);
502  inT16 count_alphanums(const WERD_CHOICE &word);
503  inT16 count_alphas(const WERD_CHOICE &word);
505  void read_config_file(const char *filename, SetParamConstraint constraint);
506  // Initialize for potentially a set of languages defined by the language
507  // string and recursively any additional languages required by any language
508  // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
509  // See init_tesseract_internal for args.
510  int init_tesseract(const char* arg0, const char* textbase,
511  const char* language, OcrEngineMode oem, char** configs,
512  int configs_size, const GenericVector<STRING>* vars_vec,
513  const GenericVector<STRING>* vars_values,
514  bool set_only_init_params, TessdataManager* mgr);
515  int init_tesseract(const char *datapath,
516  const char *language,
517  OcrEngineMode oem) {
518  TessdataManager mgr;
519  return init_tesseract(datapath, NULL, language, oem, NULL, 0, NULL, NULL,
520  false, &mgr);
521  }
522  // Common initialization for a single language.
523  // arg0 is the datapath for the tessdata directory, which could be the
524  // path of the tessdata directory with no trailing /, or (if tessdata
525  // lives in the same directory as the executable) the path of the executable,
526  // hence the name arg0.
527  // textbase is an optional output file basename (used only for training)
528  // language is the language code to load.
529  // oem controls which engine(s) will operate on the image
530  // configs (argv) is an array of config filenames to load variables from.
531  // May be NULL.
532  // configs_size (argc) is the number of elements in configs.
533  // vars_vec is an optional vector of variables to set.
534  // vars_values is an optional corresponding vector of values for the variables
535  // in vars_vec.
536  // If set_only_init_params is true, then only the initialization variables
537  // will be set.
538  int init_tesseract_internal(const char* arg0, const char* textbase,
539  const char* language, OcrEngineMode oem,
540  char** configs, int configs_size,
541  const GenericVector<STRING>* vars_vec,
542  const GenericVector<STRING>* vars_values,
543  bool set_only_init_params, TessdataManager* mgr);
544 
545  // Set the universal_id member of each font to be unique among all
546  // instances of the same font loaded.
547  void SetupUniversalFontIds();
548 
549  int init_tesseract_lm(const char* arg0, const char* textbase,
550  const char* language, TessdataManager* mgr);
551 
552  void recognize_page(STRING& image_name);
553  void end_tesseract();
554 
555  bool init_tesseract_lang_data(const char* arg0, const char* textbase,
556  const char* language, OcrEngineMode oem,
557  char** configs, int configs_size,
558  const GenericVector<STRING>* vars_vec,
559  const GenericVector<STRING>* vars_values,
560  bool set_only_init_params,
561  TessdataManager* mgr);
562 
563  void ParseLanguageString(const char* lang_str,
564  GenericVector<STRING>* to_load,
565  GenericVector<STRING>* not_to_load);
566 
568  SVMenuNode *build_menu_new();
569  #ifndef GRAPHICS_DISABLED
570  void pgeditor_main(int width, int height, PAGE_RES* page_res);
571  #endif // GRAPHICS_DISABLED
572  void process_image_event( // action in image win
573  const SVEvent &event);
574  BOOL8 process_cmd_win_event( // UI command semantics
575  inT32 cmd_event, // which menu item?
576  char *new_value // any prompt data
577  );
578  void debug_word(PAGE_RES* page_res, const TBOX &selection_box);
579  void do_re_display(
580  BOOL8 (tesseract::Tesseract::*word_painter)(PAGE_RES_IT* pr_it));
581  BOOL8 word_display(PAGE_RES_IT* pr_it);
582  BOOL8 word_bln_display(PAGE_RES_IT* pr_it);
583  BOOL8 word_blank_and_set_display(PAGE_RES_IT* pr_its);
584  BOOL8 word_set_display(PAGE_RES_IT* pr_it);
585  // #ifndef GRAPHICS_DISABLED
586  BOOL8 word_dumper(PAGE_RES_IT* pr_it);
587  // #endif // GRAPHICS_DISABLED
588  void blob_feature_display(PAGE_RES* page_res, const TBOX& selection_box);
590  // make rej map for word
591  void make_reject_map(WERD_RES *word, ROW *row, inT16 pass);
592  BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map);
593  inT16 first_alphanum_index(const char *word,
594  const char *word_lengths);
595  inT16 first_alphanum_offset(const char *word,
596  const char *word_lengths);
597  inT16 alpha_count(const char *word,
598  const char *word_lengths);
599  BOOL8 word_contains_non_1_digit(const char *word,
600  const char *word_lengths);
601  void dont_allow_1Il(WERD_RES *word);
602  inT16 count_alphanums( //how many alphanums
603  WERD_RES *word);
604  void flip_0O(WERD_RES *word);
605  BOOL8 non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
606  BOOL8 non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id);
607  BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row);
608  void nn_match_word( //Match a word
609  WERD_RES *word,
610  ROW *row);
611  void nn_recover_rejects(WERD_RES *word, ROW *row);
612  void set_done( //set done flag
613  WERD_RES *word,
614  inT16 pass);
615  inT16 safe_dict_word(const WERD_RES *werd_res); // is best_choice in dict?
616  void flip_hyphens(WERD_RES *word);
617  void reject_I_1_L(WERD_RES *word);
618  void reject_edge_blobs(WERD_RES *word);
619  void reject_mostly_rejects(WERD_RES *word);
621  BOOL8 word_adaptable( //should we adapt?
622  WERD_RES *word,
623  uinT16 mode);
624 
626  void recog_word_recursive(WERD_RES* word);
627  void recog_word(WERD_RES *word);
628  void split_and_recog_word(WERD_RES* word);
629  void split_word(WERD_RES *word,
630  int split_pt,
631  WERD_RES **right_piece,
632  BlamerBundle **orig_blamer_bundle) const;
633  void join_words(WERD_RES *word,
634  WERD_RES *word2,
635  BlamerBundle *orig_bb) const;
637  BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position);
638  inT16 eval_word_spacing(WERD_RES_LIST &word_res_list);
639  void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK* block);
640  inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list);
641  void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
642  void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK* block);
643  void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK* block);
644  void fix_fuzzy_spaces( //find fuzzy words
645  ETEXT_DESC *monitor, //progress monitor
646  inT32 word_count, //count of words in doc
647  PAGE_RES *page_res);
648  void dump_words(WERD_RES_LIST &perm, inT16 score,
649  inT16 mode, BOOL8 improved);
650  BOOL8 fixspace_thinks_word_done(WERD_RES *word);
651  inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score);
652  float blob_noise_score(TBLOB *blob);
653  void break_noisiest_blob_word(WERD_RES_LIST &words);
655  GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word);
656  BOOL8 potential_word_crunch(WERD_RES *word,
657  GARBAGE_LEVEL garbage_level,
658  BOOL8 ok_dict_word);
659  void tilde_crunch(PAGE_RES_IT &page_res_it);
660  void unrej_good_quality_words( //unreject potential
661  PAGE_RES_IT &page_res_it);
662  void doc_and_block_rejection( //reject big chunks
663  PAGE_RES_IT &page_res_it,
664  BOOL8 good_quality_doc);
665  void quality_based_rejection(PAGE_RES_IT &page_res_it,
666  BOOL8 good_quality_doc);
667  void convert_bad_unlv_chs(WERD_RES *word_res);
668  void tilde_delete(PAGE_RES_IT &page_res_it);
669  inT16 word_blob_quality(WERD_RES *word, ROW *row);
670  void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count,
671  inT16 *accepted_match_count);
672  void unrej_good_chs(WERD_RES *word, ROW *row);
673  inT16 count_outline_errs(char c, inT16 outline_count);
674  inT16 word_outline_errs(WERD_RES *word);
675  BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level);
676  CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode);
677  inT16 failure_count(WERD_RES *word);
678  BOOL8 noise_outlines(TWERD *word);
680  void
681  process_selected_words (
682  PAGE_RES* page_res, // blocks to check
683  //function to call
684  TBOX & selection_box,
685  BOOL8 (tesseract::Tesseract::*word_processor)(PAGE_RES_IT* pr_it));
687  void tess_add_doc_word( //test acceptability
688  WERD_CHOICE *word_choice //after context
689  );
690  void tess_segment_pass_n(int pass_n, WERD_RES *word);
691  bool tess_acceptable_word(WERD_RES *word);
692 
694  // Applies the box file based on the image name fname, and resegments
695  // the words in the block_list (page), with:
696  // blob-mode: one blob per line in the box file, words as input.
697  // word/line-mode: one blob per space-delimited unit after the #, and one word
698  // per line in the box file. (See comment above for box file format.)
699  // If find_segmentation is true, (word/line mode) then the classifier is used
700  // to re-segment words/lines to match the space-delimited truth string for
701  // each box. In this case, the input box may be for a word or even a whole
702  // text line, and the output words will contain multiple blobs corresponding
703  // to the space-delimited input string.
704  // With find_segmentation false, no classifier is needed, but the chopper
705  // can still be used to correctly segment touching characters with the help
706  // of the input boxes.
707  // In the returned PAGE_RES, the WERD_RES are setup as they would be returned
708  // from normal classification, ie. with a word, chopped_word, rebuild_word,
709  // seam_array, denorm, box_word, and best_state, but NO best_choice or
710  // raw_choice, as they would require a UNICHARSET, which we aim to avoid.
711  // Instead, the correct_text member of WERD_RES is set, and this may be later
712  // converted to a best_choice using CorrectClassifyWords. CorrectClassifyWords
713  // is not required before calling ApplyBoxTraining.
714  PAGE_RES* ApplyBoxes(const STRING& fname, bool find_segmentation,
715  BLOCK_LIST *block_list);
716 
717  // Any row xheight that is significantly different from the median is set
718  // to the median.
719  void PreenXHeights(BLOCK_LIST *block_list);
720 
721  // Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
722  // All fuzzy spaces are removed, and all the words are maximally chopped.
723  PAGE_RES* SetupApplyBoxes(const GenericVector<TBOX>& boxes,
724  BLOCK_LIST *block_list);
725  // Tests the chopper by exhaustively running chop_one_blob.
726  // The word_res will contain filled chopped_word, seam_array, denorm,
727  // box_word and best_state for the maximally chopped word.
728  void MaximallyChopWord(const GenericVector<TBOX>& boxes,
729  BLOCK* block, ROW* row, WERD_RES* word_res);
730  // Gather consecutive blobs that match the given box into the best_state
731  // and corresponding correct_text.
732  // Fights over which box owns which blobs are settled by pre-chopping and
733  // applying the blobs to box or next_box with the least non-overlap.
734  // Returns false if the box was in error, which can only be caused by
735  // failing to find an appropriate blob for a box.
736  // This means that occasionally, blobs may be incorrectly segmented if the
737  // chopper fails to find a suitable chop point.
738  bool ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
739  const TBOX& box, const TBOX& next_box,
740  const char* correct_text);
741  // Consume all source blobs that strongly overlap the given box,
742  // putting them into a new word, with the correct_text label.
743  // Fights over which box owns which blobs are settled by
744  // applying the blobs to box or next_box with the least non-overlap.
745  // Returns false if the box was in error, which can only be caused by
746  // failing to find an overlapping blob for a box.
747  bool ResegmentWordBox(BLOCK_LIST *block_list,
748  const TBOX& box, const TBOX& next_box,
749  const char* correct_text);
750  // Resegments the words by running the classifier in an attempt to find the
751  // correct segmentation that produces the required string.
752  void ReSegmentByClassification(PAGE_RES* page_res);
753  // Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
754  // Returns false if an invalid UNICHAR_ID is encountered.
755  bool ConvertStringToUnichars(const char* utf8,
756  GenericVector<UNICHAR_ID>* class_ids);
757  // Resegments the word to achieve the target_text from the classifier.
758  // Returns false if the re-segmentation fails.
759  // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
760  // applies a full search on the classifier results to find the best classified
761  // segmentation. As a compromise to obtain better recall, 1-1 ambiguity
762  // substitutions ARE used.
763  bool FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
764  WERD_RES* word_res);
765  // Recursive helper to find a match to the target_text (from text_index
766  // position) in the choices (from choices_pos position).
767  // Choices is an array of GenericVectors, of length choices_length, with each
768  // element representing a starting position in the word, and the
769  // GenericVector holding classification results for a sequence of consecutive
770  // blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
771  void SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
772  int choices_pos, int choices_length,
773  const GenericVector<UNICHAR_ID>& target_text,
774  int text_index,
775  float rating, GenericVector<int>* segmentation,
776  float* best_rating, GenericVector<int>* best_segmentation);
777  // Counts up the labelled words and the blobs within.
778  // Deletes all unused or emptied words, counting the unused ones.
779  // Resets W_BOL and W_EOL flags correctly.
780  // Builds the rebuild_word and rebuilds the box_word.
781  void TidyUp(PAGE_RES* page_res);
782  // Logs a bad box by line in the box file and box coords.
783  void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch,
784  const char *err_msg);
785  // Creates a fake best_choice entry in each WERD_RES with the correct text.
786  void CorrectClassifyWords(PAGE_RES* page_res);
787  // Call LearnWord to extract features for labelled blobs within each word.
788  // Features are stored in an internal buffer.
789  void ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res);
790 
792  // Returns the number of misfit blob tops in this word.
793  int CountMisfitTops(WERD_RES *word_res);
794  // Returns a new x-height in pixels (original image coords) that is
795  // maximally compatible with the result in word_res.
796  // Returns 0.0f if no x-height is found that is better than the current
797  // estimate.
798  float ComputeCompatibleXheight(WERD_RES *word_res, float* baseline_shift);
800  // TODO(ocr-team): Find and remove obsolete parameters.
801  BOOL_VAR_H(tessedit_resegment_from_boxes, false,
802  "Take segmentation and labeling from box file");
803  BOOL_VAR_H(tessedit_resegment_from_line_boxes, false,
804  "Conversion of word/line box file to char box file");
805  BOOL_VAR_H(tessedit_train_from_boxes, false,
806  "Generate training data from boxed chars");
807  BOOL_VAR_H(tessedit_make_boxes_from_boxes, false,
808  "Generate more boxes from boxed chars");
809  BOOL_VAR_H(tessedit_train_line_recognizer, false,
810  "Break input into lines and remap boxes if present");
811  BOOL_VAR_H(tessedit_dump_pageseg_images, false,
812  "Dump intermediate images made during page segmentation");
813  INT_VAR_H(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
814  "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
815  " 5=line, 6=word, 7=char"
816  " (Values from PageSegMode enum in publictypes.h)");
817  INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
818  "Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults"
819  " to loading and running the most accurate available.");
820  STRING_VAR_H(tessedit_char_blacklist, "",
821  "Blacklist of chars not to recognize");
822  STRING_VAR_H(tessedit_char_whitelist, "",
823  "Whitelist of chars to recognize");
824  STRING_VAR_H(tessedit_char_unblacklist, "",
825  "List of chars to override tessedit_char_blacklist");
826  BOOL_VAR_H(tessedit_ambigs_training, false,
827  "Perform training for ambiguities");
828  INT_VAR_H(pageseg_devanagari_split_strategy,
829  tesseract::ShiroRekhaSplitter::NO_SPLIT,
830  "Whether to use the top-line splitting process for Devanagari "
831  "documents while performing page-segmentation.");
832  INT_VAR_H(ocr_devanagari_split_strategy,
833  tesseract::ShiroRekhaSplitter::NO_SPLIT,
834  "Whether to use the top-line splitting process for Devanagari "
835  "documents while performing ocr.");
836  STRING_VAR_H(tessedit_write_params_to_file, "",
837  "Write all parameters to the given file.");
838  BOOL_VAR_H(tessedit_adaption_debug, false,
839  "Generate and print debug information for adaption");
840  INT_VAR_H(bidi_debug, 0, "Debug level for BiDi");
841  INT_VAR_H(applybox_debug, 1, "Debug level");
842  INT_VAR_H(applybox_page, 0, "Page number to apply boxes from");
843  STRING_VAR_H(applybox_exposure_pattern, ".exp",
844  "Exposure value follows this pattern in the image"
845  " filename. The name of the image files are expected"
846  " to be in the form [lang].[fontname].exp[num].tif");
847  BOOL_VAR_H(applybox_learn_chars_and_char_frags_mode, false,
848  "Learn both character fragments (as is done in the"
849  " special low exposure mode) as well as unfragmented"
850  " characters.");
851  BOOL_VAR_H(applybox_learn_ngrams_mode, false,
852  "Each bounding box is assumed to contain ngrams. Only"
853  " learn the ngrams whose outlines overlap horizontally.");
854  BOOL_VAR_H(tessedit_display_outwords, false, "Draw output words");
855  BOOL_VAR_H(tessedit_dump_choices, false, "Dump char choices");
856  BOOL_VAR_H(tessedit_timing_debug, false, "Print timing stats");
857  BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true,
858  "Try to improve fuzzy spaces");
859  BOOL_VAR_H(tessedit_unrej_any_wd, false,
860  "Don't bother with word plausibility");
861  BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
862  BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height");
863  BOOL_VAR_H(tessedit_enable_doc_dict, true,
864  "Add words to the document dictionary");
865  BOOL_VAR_H(tessedit_debug_fonts, false, "Output font info per char");
866  BOOL_VAR_H(tessedit_debug_block_rejection, false, "Block and Row stats");
867  BOOL_VAR_H(tessedit_enable_bigram_correction, true,
868  "Enable correction based on the word bigram dictionary.");
869  BOOL_VAR_H(tessedit_enable_dict_correction, false,
870  "Enable single word correction based on the dictionary.");
871  INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram "
872  "correction.");
873  BOOL_VAR_H(enable_noise_removal, true,
874  "Remove and conditionally reassign small outlines when they"
875  " confuse layout analysis, determining diacritics vs noise");
876  INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines");
877  // Worst (min) certainty, for which a diacritic is allowed to make the base
878  // character worse and still be included.
879  double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty");
880  // Worst (min) certainty, for which a non-overlapping diacritic is allowed to
881  // make the base character worse and still be included.
882  double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty");
883  // Worst (min) certainty, for which a diacritic is allowed to make a new
884  // stand-alone blob.
885  double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty");
886  // Factor of certainty margin for adding diacritics to not count as worse.
887  double_VAR_H(noise_cert_factor, 0.375,
888  "Scaling on certainty diff from Hingepoint");
889  INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob");
890  INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word");
891  INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug");
892  BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk");
893  STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation");
894  STRING_VAR_H(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
895  STRING_VAR_H(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation");
896  double_VAR_H(quality_rej_pc, 0.08, "good_quality_doc lte rejection limit");
897  double_VAR_H(quality_blob_pc, 0.0, "good_quality_doc gte good blobs limit");
898  double_VAR_H(quality_outline_pc, 1.0,
899  "good_quality_doc lte outline error limit");
900  double_VAR_H(quality_char_pc, 0.95, "good_quality_doc gte good char limit");
901  INT_VAR_H(quality_min_initial_alphas_reqd, 2, "alphas in a good word");
902  INT_VAR_H(tessedit_tess_adaption_mode, 0x27,
903  "Adaptation decision algorithm for tess");
904  BOOL_VAR_H(tessedit_minimal_rej_pass1, false,
905  "Do minimal rejection on pass 1 output");
906  BOOL_VAR_H(tessedit_test_adaption, false, "Test adaption criteria");
907  BOOL_VAR_H(tessedit_matcher_log, false, "Log matcher activity");
908  INT_VAR_H(tessedit_test_adaption_mode, 3,
909  "Adaptation decision algorithm for tess");
910  BOOL_VAR_H(test_pt, false, "Test for point");
911  double_VAR_H(test_pt_x, 99999.99, "xcoord");
912  double_VAR_H(test_pt_y, 99999.99, "ycoord");
913  INT_VAR_H(multilang_debug_level, 0, "Print multilang debug info.");
914  INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info.");
915  BOOL_VAR_H(paragraph_text_based, true,
916  "Run paragraph detection on the post-text-recognition "
917  "(more accurate)");
918  BOOL_VAR_H(lstm_use_matrix, 1, "Use ratings matrix/beam search with lstm");
919  STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines");
920  STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines");
921  BOOL_VAR_H(docqual_excuse_outline_errs, false,
922  "Allow outline errs in unrejection?");
923  BOOL_VAR_H(tessedit_good_quality_unrej, true,
924  "Reduce rejection on good docs");
925  BOOL_VAR_H(tessedit_use_reject_spaces, true, "Reject spaces?");
926  double_VAR_H(tessedit_reject_doc_percent, 65.00,
927  "%rej allowed before rej whole doc");
928  double_VAR_H(tessedit_reject_block_percent, 45.00,
929  "%rej allowed before rej whole block");
930  double_VAR_H(tessedit_reject_row_percent, 40.00,
931  "%rej allowed before rej whole row");
932  double_VAR_H(tessedit_whole_wd_rej_row_percent, 70.00,
933  "Number of row rejects in whole word rejects"
934  " which prevents whole row rejection");
935  BOOL_VAR_H(tessedit_preserve_blk_rej_perfect_wds, true,
936  "Only rej partially rejected words in block rejection");
937  BOOL_VAR_H(tessedit_preserve_row_rej_perfect_wds, true,
938  "Only rej partially rejected words in row rejection");
939  BOOL_VAR_H(tessedit_dont_blkrej_good_wds, false,
940  "Use word segmentation quality metric");
941  BOOL_VAR_H(tessedit_dont_rowrej_good_wds, false,
942  "Use word segmentation quality metric");
943  INT_VAR_H(tessedit_preserve_min_wd_len, 2,
944  "Only preserve wds longer than this");
945  BOOL_VAR_H(tessedit_row_rej_good_docs, true,
946  "Apply row rejection to good docs");
947  double_VAR_H(tessedit_good_doc_still_rowrej_wd, 1.1,
948  "rej good doc wd if more than this fraction rejected");
949  BOOL_VAR_H(tessedit_reject_bad_qual_wds, true,
950  "Reject all bad quality wds");
951  BOOL_VAR_H(tessedit_debug_doc_rejection, false, "Page stats");
952  BOOL_VAR_H(tessedit_debug_quality_metrics, false,
953  "Output data to debug file");
954  BOOL_VAR_H(bland_unrej, false, "unrej potential with no checks");
955  double_VAR_H(quality_rowrej_pc, 1.1,
956  "good_quality_doc gte good char limit");
957  BOOL_VAR_H(unlv_tilde_crunching, true,
958  "Mark v.bad words for tilde crunch");
959  BOOL_VAR_H(hocr_font_info, false,
960  "Add font info to hocr output");
961  BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
962  BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
963  double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
964  BOOL_VAR_H(crunch_terrible_garbage, true, "As it says");
965  double_VAR_H(crunch_poor_garbage_cert, -9.0,
966  "crunch garbage cert lt this");
967  double_VAR_H(crunch_poor_garbage_rate, 60, "crunch garbage rating lt this");
968  double_VAR_H(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this");
969  double_VAR_H(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this");
970  BOOL_VAR_H(crunch_pot_garbage, true, "POTENTIAL crunch garbage");
971  double_VAR_H(crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
972  double_VAR_H(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
973  double_VAR_H(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
974  double_VAR_H(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
975  double_VAR_H(crunch_del_min_width, 3.0, "Del if word width lt xht x this");
976  double_VAR_H(crunch_del_high_word, 1.5,
977  "Del if word gt xht x this above bl");
978  double_VAR_H(crunch_del_low_word, 0.5, "Del if word gt xht x this below bl");
979  double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this");
980  INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch");
981  INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed");
982  BOOL_VAR_H(crunch_leave_ok_strings, true, "Don't touch sensible strings");
983  BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring");
984  BOOL_VAR_H(crunch_leave_accept_strings, false,
985  "Don't pot crunch sensible strings");
986  BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures");
987  INT_VAR_H(crunch_leave_lc_strings, 4,
988  "Don't crunch words with long lower case strings");
989  INT_VAR_H(crunch_leave_uc_strings, 4,
990  "Don't crunch words with long upper case strings");
991  INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions");
992  INT_VAR_H(crunch_debug, 0, "As it says");
993  INT_VAR_H(fixsp_non_noise_limit, 1,
994  "How many non-noise blbs either side?");
995  double_VAR_H(fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
996  BOOL_VAR_H(tessedit_prefer_joined_punct, false, "Reward punctuation joins");
997  INT_VAR_H(fixsp_done_mode, 1, "What constitutes done for spacing");
998  INT_VAR_H(debug_fix_space_level, 0, "Contextual fixspace debug");
999  STRING_VAR_H(numeric_punctuation, ".,",
1000  "Punct. chs expected WITHIN numbers");
1001  INT_VAR_H(x_ht_acceptance_tolerance, 8,
1002  "Max allowed deviation of blob top outside of font data");
1003  INT_VAR_H(x_ht_min_change, 8, "Min change in xht before actually trying it");
1004  INT_VAR_H(superscript_debug, 0, "Debug level for sub & superscript fixer");
1005  double_VAR_H(superscript_worse_certainty, 2.0, "How many times worse "
1006  "certainty does a superscript position glyph need to be for us "
1007  "to try classifying it as a char with a different baseline?");
1008  double_VAR_H(superscript_bettered_certainty, 0.97, "What reduction in "
1009  "badness do we think sufficient to choose a superscript over "
1010  "what we'd thought. For example, a value of 0.6 means we want "
1011  "to reduce badness of certainty by 40%");
1012  double_VAR_H(superscript_scaledown_ratio, 0.4,
1013  "A superscript scaled down more than this is unbelievably "
1014  "small. For example, 0.3 means we expect the font size to "
1015  "be no smaller than 30% of the text line font size.");
1016  double_VAR_H(subscript_max_y_top, 0.5,
1017  "Maximum top of a character measured as a multiple of x-height "
1018  "above the baseline for us to reconsider whether it's a "
1019  "subscript.");
1020  double_VAR_H(superscript_min_y_bottom, 0.3,
1021  "Minimum bottom of a character measured as a multiple of "
1022  "x-height above the baseline for us to reconsider whether it's "
1023  "a superscript.");
1024  BOOL_VAR_H(tessedit_write_block_separators, false,
1025  "Write block separators in output");
1026  BOOL_VAR_H(tessedit_write_rep_codes, false,
1027  "Write repetition char code");
1028  BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
1029  BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
1030  BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
1031  BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
1032  BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
1033  BOOL_VAR_H(textonly_pdf, false,
1034  "Create PDF with only one invisible text layer");
1035  STRING_VAR_H(unrecognised_char, "|",
1036  "Output char for unidentified blobs");
1037  INT_VAR_H(suspect_level, 99, "Suspect marker level");
1038  INT_VAR_H(suspect_space_level, 100,
1039  "Min suspect level for rejecting spaces");
1040  INT_VAR_H(suspect_short_words, 2, "Don't Suspect dict wds longer than this");
1041  BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected");
1042  double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit");
1043  double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit");
1044  BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures");
1045  BOOL_VAR_H(tessedit_zero_rejection, false, "Don't reject ANYTHING");
1046  BOOL_VAR_H(tessedit_word_for_word, false,
1047  "Make output have exactly one word per WERD");
1048  BOOL_VAR_H(tessedit_zero_kelvin_rejection, false,
1049  "Don't reject ANYTHING AT ALL");
1050  BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same");
1051  INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
1052  BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");
1053  BOOL_VAR_H(tessedit_flip_0O, true, "Contextual 0O O0 flips");
1054  double_VAR_H(tessedit_lower_flip_hyphen, 1.5,
1055  "Aspect ratio dot/hyphen test");
1056  double_VAR_H(tessedit_upper_flip_hyphen, 1.8,
1057  "Aspect ratio dot/hyphen test");
1058  BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector");
1059  BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test");
1060  BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Don't double check");
1061  BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control");
1062  BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control");
1063  BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control");
1064  BOOL_VAR_H(rej_use_sensible_wd, false, "Extend permuter check");
1065  BOOL_VAR_H(rej_alphas_in_number_perm, false, "Extend permuter check");
1066  double_VAR_H(rej_whole_of_mostly_reject_word_fract, 0.85, "if >this fract");
1067  INT_VAR_H(tessedit_image_border, 2, "Rej blbs near image edge limit");
1068  STRING_VAR_H(ok_repeated_ch_non_alphanum_wds, "-?*\075",
1069  "Allow NN to unrej");
1070  STRING_VAR_H(conflict_set_I_l_1, "Il1[]", "Il1 conflict set");
1071  INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this");
1072  BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes");
1073  INT_VAR_H(tessedit_page_number, -1,
1074  "-1 -> All pages, else specific page to process");
1075  BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE");
1076  BOOL_VAR_H(interactive_display_mode, false, "Run interactively?");
1077  STRING_VAR_H(file_type, ".tif", "Filename extension");
1078  BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word");
1079  STRING_VAR_H(tessedit_load_sublangs, "",
1080  "List of languages to load with this one");
1081  BOOL_VAR_H(tessedit_use_primary_params_model, false,
1082  "In multilingual mode use params model of the primary language");
1083  // Min acceptable orientation margin (difference in scores between top and 2nd
1084  // choice in OSResults::orientations) to believe the page orientation.
1085  double_VAR_H(min_orientation_margin, 7.0,
1086  "Min acceptable orientation margin");
1087  BOOL_VAR_H(textord_tabfind_show_vlines, false, "Debug line finding");
1088  BOOL_VAR_H(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model");
1089  BOOL_VAR_H(poly_allow_detailed_fx, false,
1090  "Allow feature extractors to see the original outline");
1091  BOOL_VAR_H(tessedit_init_config_only, false,
1092  "Only initialize with the config file. Useful if the instance is "
1093  "not going to be used for OCR but say only for layout analysis.");
1094  BOOL_VAR_H(textord_equation_detect, false, "Turn on equation detector");
1095  BOOL_VAR_H(textord_tabfind_vertical_text, true, "Enable vertical detection");
1096  BOOL_VAR_H(textord_tabfind_force_vertical_text, false,
1097  "Force using vertical text page mode");
1098  double_VAR_H(textord_tabfind_vertical_text_ratio, 0.5,
1099  "Fraction of textlines deemed vertical to use vertical page "
1100  "mode");
1101  double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
1102  "Fraction of height used as a minimum gap for aligned blobs.");
1103  INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
1104  BOOL_VAR_H(preserve_interword_spaces, false,
1105  "Preserve multiple interword spaces");
1106  STRING_VAR_H(page_separator, "\f",
1107  "Page separator (default is form feed control character)");
1108 
1109  // The following parameters were deprecated and removed from their original
1110  // locations. The parameters are temporarily kept here to give Tesseract
1111  // users a chance to update their [lang].traineddata and config files
1112  // without introducing failures during Tesseract initialization.
1113  // TODO(ocr-team): remove these parameters from the code once we are
1114  // reasonably sure that Tesseract users have updated their data files.
1115  //
1116  // BEGIN DEPRECATED PARAMETERS
1117  BOOL_VAR_H(textord_tabfind_vertical_horizontal_mix, true,
1118  "find horizontal lines such as headers in vertical page mode");
1119  INT_VAR_H(tessedit_ok_mode, 5, "Acceptance decision algorithm");
1120  BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length"
1121  " dawgs (e.g. for non-space delimited languages)");
1122  INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process");
1123  BOOL_VAR_H(permute_debug, 0, "char permutation debug");
1124  double_VAR_H(bestrate_pruning_factor, 2.0, "Multiplying factor of"
1125  " current best rate to prune other hypotheses");
1126  BOOL_VAR_H(permute_script_word, 0,
1127  "Turn on word script consistency permuter");
1128  BOOL_VAR_H(segment_segcost_rating, 0,
1129  "incorporate segmentation cost in word rating?");
1130  double_VAR_H(segment_reward_script, 0.95,
1131  "Score multiplier for script consistency within a word. "
1132  "Being a 'reward' factor, it should be <= 1. "
1133  "Smaller value implies bigger reward.");
1134  BOOL_VAR_H(permute_fixed_length_dawg, 0,
1135  "Turn on fixed-length phrasebook search permuter");
1136  BOOL_VAR_H(permute_chartype_word, 0,
1137  "Turn on character type (property) consistency permuter");
1138  double_VAR_H(segment_reward_chartype, 0.97,
1139  "Score multiplier for char type consistency within a word. ");
1140  double_VAR_H(segment_reward_ngram_best_choice, 0.99,
1141  "Score multiplier for ngram permuter's best choice"
1142  " (only used in the Han script path).");
1143  BOOL_VAR_H(ngram_permuter_activated, false,
1144  "Activate character-level n-gram-based permuter");
1145  BOOL_VAR_H(permute_only_top, false, "Run only the top choice permuter");
1146  INT_VAR_H(language_model_fixed_length_choices_depth, 3,
1147  "Depth of blob choice lists to explore"
1148  " when fixed length dawgs are on");
1149  BOOL_VAR_H(use_new_state_cost, FALSE,
1150  "use new state cost heuristics for segmentation state evaluation");
1151  double_VAR_H(heuristic_segcost_rating_base, 1.25,
1152  "base factor for adding segmentation cost into word rating. "
1153  "It's a multiplying factor, the larger the value above 1, "
1154  "the bigger the effect of segmentation cost.");
1155  double_VAR_H(heuristic_weight_rating, 1,
1156  "weight associated with char rating in combined cost of state");
1157  double_VAR_H(heuristic_weight_width, 1000.0,
1158  "weight associated with width evidence in combined cost of"
1159  " state");
1160  double_VAR_H(heuristic_weight_seamcut, 0,
1161  "weight associated with seam cut in combined cost of state");
1162  double_VAR_H(heuristic_max_char_wh_ratio, 2.0,
1163  "max char width-to-height ratio allowed in segmentation");
1164  BOOL_VAR_H(enable_new_segsearch, false,
1165  "Enable new segmentation search path.");
1166  double_VAR_H(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
1167  "Maximum character width-to-height ratio for "
1168  "fixed pitch fonts");
1169  // END DEPRECATED PARAMETERS
1170 
1172  FILE *init_recog_training(const STRING &fname);
1173  void recog_training_segmented(const STRING &fname,
1174  PAGE_RES *page_res,
1175  volatile ETEXT_DESC *monitor,
1176  FILE *output_file);
1177  void ambigs_classify_and_output(const char *label,
1178  PAGE_RES_IT* pr_it,
1179  FILE *output_file);
1180 
1181  private:
1182  // The filename of a backup config file. If not null, then we currently
1183  // have a temporary debug config file loaded, and backup_config_file_
1184  // will be loaded, and set to null when debug is complete.
1185  const char* backup_config_file_;
1186  // The filename of a config file to read when processing a debug word.
1187  STRING word_config_;
1188  // Image used for input to layout analysis and tesseract recognition.
1189  // May be modified by the ShiroRekhaSplitter to eliminate the top-line.
1190  Pix* pix_binary_;
1191  // Grey-level input image if the input was not binary, otherwise NULL.
1192  Pix* pix_grey_;
1193  // Original input image. Color if the input was color.
1194  Pix* pix_original_;
1195  // Thresholds that were used to generate the thresholded image from grey.
1196  Pix* pix_thresholds_;
1197  // Debug images. If non-empty, will be written on destruction.
1198  DebugPixa pixa_debug_;
1199  // Input image resolution after any scaling. The resolution is not well
1200  // transmitted by operations on Pix, so we keep an independent record here.
1201  int source_resolution_;
1202  // The shiro-rekha splitter object which is used to split top-lines in
1203  // Devanagari words to provide a better word and grapheme segmentation.
1204  ShiroRekhaSplitter splitter_;
1205  // Page segmentation/layout
1206  Textord textord_;
1207  // True if the primary language uses right_to_left reading order.
1208  bool right_to_left_;
1209  Pix* scaled_color_;
1210  int scaled_factor_;
1211  FCOORD deskew_;
1212  FCOORD reskew_;
1213  TesseractStats stats_;
1214  // Sub-languages to be tried in addition to this.
1215  GenericVector<Tesseract*> sub_langs_;
1216  // Most recently used Tesseract out of this and sub_langs_. The default
1217  // language for the next word.
1218  Tesseract* most_recently_used_;
1219  // The size of the font table, ie max possible font id + 1.
1220  int font_table_size_;
1221  // Equation detector. Note: this pointer is NOT owned by the class.
1222  EquationDetect* equ_detect_;
1223  // LSTM recognizer, if available.
1224  LSTMRecognizer* lstm_recognizer_;
1225  // Output "page" number (actually line number) using TrainLineRecognizer.
1226  int train_line_page_num_;
1227 };
1228 
1229 } // namespace tesseract
1230 
1231 #endif // TESSERACT_CCMAIN_TESSERACTCLASS_H_
Definition: points.h:189
void flip_0O(WERD_RES *word)
const Textord & textord() const
Pix * pix_binary() const
CRUNCH_MODE
Definition: pageres.h:145
void SetScaledColor(int factor, Pix *color)
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:172
Pix * pix_grey() const
void set_pix_original(Pix *original_pix)
Tesseract * get_sub_lang(int index) const
int16_t inT16
Definition: host.h:36
PointerVector< WERD_RES > lang_words
void dont_allow_1Il(WERD_RES *word)
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: ocrrow.h:32
Pix * pix_original() const
Definition: ocrblock.h:30
void flip_hyphens(WERD_RES *word)
Pix * scaled_color() const
SetParamConstraint
Definition: params.h:36
int num_sub_langs() const
int init_tesseract(const char *datapath, const char *language, OcrEngineMode oem)
ACCEPTABLE_WERD_TYPE
Definition: control.h:34
#define INT_VAR_H(name, val, comment)
Definition: params.h:264
Definition: blobs.h:395
const FCOORD & reskew() const
void set_pix_thresholds(Pix *thresholds)
#define FALSE
Definition: capi.h:46
int scaled_factor() const
BOOL8 non_0_digit(const char *str, int length)
bool AnyLSTMLang() const
WordData(BLOCK *block_in, ROW *row_in, WERD_RES *word_res)
GARBAGE_LEVEL
Definition: docqual.h:25
int source_resolution() const
Definition: strngs.h:45
int32_t inT32
Definition: host.h:38
Definition: rect.h:30
Definition: blobs.h:261
#define double_VAR_H(name, val, comment)
Definition: params.h:273
Textord * mutable_textord()
CMD_EVENTS mode
Definition: pgedit.cpp:116
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:267
void set_source_resolution(int ppi)
unsigned char BOOL8
Definition: host.h:44
void set_pix_grey(Pix *grey_pix)
Definition: werd.h:60
Pix * BestPix() const
#define STRING_VAR_H(name, val, comment)
Definition: params.h:270
bool right_to_left() const
uint16_t uinT16
Definition: host.h:37
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths)
WordData(const PAGE_RES_IT &page_res_it)
bool AnyTessLang() const
int UNICHAR_ID
Definition: unichar.h:35
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)