tesseract  4.00.00dev
tesseract::Dict Class Reference

#include <dict.h>

Public Member Functions

 Dict (CCUtil *image_ptr)
 
 ~Dict ()
 
const CCUtil * getCCUtil () const
 
CCUtil * getCCUtil ()
 
const UNICHARSET & getUnicharset () const
 
UNICHARSET & getUnicharset ()
 
const UnicharAmbigs & getUnicharAmbigs () const
 
bool compound_marker (UNICHAR_ID unichar_id)
 
bool is_apostrophe (UNICHAR_ID unichar_id)
 
bool hyphenated () const
 Returns true if we've recorded the beginning of a hyphenated word. More...
 
int hyphen_base_size () const
 Size of the base word (the part on the line before) of a hyphenated word. More...
 
void copy_hyphen_info (WERD_CHOICE *word) const
 
bool has_hyphen_end (UNICHAR_ID unichar_id, bool first_pos) const
 Check whether the word has a hyphen at the end. More...
 
bool has_hyphen_end (const WERD_CHOICE &word) const
 Same as above, but check the unichar at the end of the word. More...
 
void reset_hyphen_vars (bool last_word_on_line)
 
void set_hyphen_word (const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
 
void update_best_choice (const WERD_CHOICE &word, WERD_CHOICE *best_choice)
 
void init_active_dawgs (DawgPositionVector *active_dawgs, bool ambigs_mode) const
 
void default_dawgs (DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
 
bool NoDangerousAmbig (WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
 
void ReplaceAmbig (int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
 
int LengthOfShortestAlphaRun (const WERD_CHOICE &WordChoice) const
 Returns the length of the shortest alpha run in WordChoice. More...
 
int UniformCertainties (const WERD_CHOICE &word)
 
bool AcceptableChoice (const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
 Returns true if the given best_choice is good enough to stop. More...
 
bool AcceptableResult (WERD_RES *word) const
 
void EndDangerousAmbigs ()
 
void DebugWordChoices ()
 Prints the current choices for this word to stdout. More...
 
void SettupStopperPass1 ()
 Sets up stopper variables in preparation for the first pass. More...
 
void SettupStopperPass2 ()
 Sets up stopper variables in preparation for the second pass. More...
 
int case_ok (const WERD_CHOICE &word, const UNICHARSET &unicharset) const
 Check a string to see if it matches a set of lexical rules. More...
 
bool absolute_garbage (const WERD_CHOICE &word, const UNICHARSET &unicharset)
 
void SetupForLoad (DawgCache *dawg_cache)
 
void Load (const STRING &lang, TessdataManager *data_file)
 
void LoadLSTM (const STRING &lang, TessdataManager *data_file)
 
bool FinishLoad ()
 
void End ()
 
void ResetDocumentDictionary ()
 
int def_letter_is_okay (void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
 
int LetterIsOkay (void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
 Calls letter_is_okay_ member function. More...
 
double ProbabilityInContext (const char *context, int context_bytes, const char *character, int character_bytes)
 Calls probability_in_context_ member function. More...
 
double def_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Default (no-op) implementation of probability in context function. More...
 
double ngram_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 
float ParamsModelClassify (const char *lang, void *path)
 
float CallParamsModelClassify (void *path)
 
void SetWildcardID (UNICHAR_ID id)
 
UNICHAR_ID WildcardID () const
 
int NumDawgs () const
 Return the number of dawgs in the dawgs_ vector. More...
 
const Dawg * GetDawg (int index) const
 Return i-th dawg pointer recorded in the dawgs_ vector. More...
 
const Dawg * GetPuncDawg () const
 Return the pointer to the punctuation dawg. More...
 
const Dawg * GetUnambigDawg () const
 Return the pointer to the unambiguous words dawg. More...
 
UNICHAR_ID char_for_dawg (UNICHAR_ID ch, const Dawg *dawg) const
 
void ProcessPatternEdges (const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
 
int valid_word (const WERD_CHOICE &word, bool numbers_ok) const
 
int valid_word (const WERD_CHOICE &word) const
 
int valid_word_or_number (const WERD_CHOICE &word) const
 
int valid_word (const char *string) const
 This function is used by api/tesseract_cube_combiner.cpp. More...
 
bool valid_bigram (const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
 
bool valid_punctuation (const WERD_CHOICE &word)
 
int good_choice (const WERD_CHOICE &choice)
 Returns true if a good answer is found for the unknown blob rating. More...
 
void add_document_word (const WERD_CHOICE &best_choice)
 Adds a word found on this document to the document specific dictionary. More...
 
void adjust_word (WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
 Adjusts the rating of the given word. More...
 
void SetWordsegRatingAdjustFactor (float f)
 Set wordseg_rating_adjust_factor_ to the given value. More...
 
bool IsSpaceDelimitedLang () const
 Returns true if the language is space-delimited (not CJ, or T). More...
 
go_deeper_dawg_fxn

If the choice being composed so far could be a dictionary word keep exploring choices.

WERD_CHOICE * dawg_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
 
void go_deeper_dawg_fxn (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 
void permute_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
void append_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
fragment_state

Given the current char choice and information about previously seen fragments, determines whether adjacent character fragments are present and whether they can be concatenated.

The given prev_char_frag_info contains:

  • fragment: if not NULL contains information about immediately preceding fragmented character choice
  • num_fragments: number of fragments that have been used so far to construct a character
  • certainty: certainty of the current choice or minimum certainty of all fragments concatenated so far
  • rating: rating of the current choice or sum of fragment ratings concatenated so far

The output char_frag_info is filled in as follows:

  • character: is set to be NULL if the choice is a non-matching or non-ending fragment piece; is set to unichar of the given choice if it represents a regular character or a matching ending fragment
  • fragment,num_fragments,certainty,rating are set as described above
Returns
false if a non-matching fragment is discovered, true otherwise.
bool fragment_state_okay (UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
 

Static Public Member Functions

static DawgCache * GlobalDawgCache ()
 
static NODE_REF GetStartingNode (const Dawg *dawg, EDGE_REF edge_ref)
 Returns the appropriate next node given the EDGE_REF. More...
 
static bool valid_word_permuter (uinT8 perm, bool numbers_ok)
 Check all the DAWGs to see if this word is in any of them. More...
 

Public Attributes

void(Dict::* go_deeper_fxn_ )(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 Pointer to go_deeper function. More...
 
int(Dict::* letter_is_okay_ )(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
 
double(Dict::* probability_in_context_ )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Probability in context function used by the ngram permuter. More...
 
float(Dict::* params_model_classify_ )(const char *lang, void *path)
 
char * user_words_file = ""
 
char * user_words_suffix = ""
 
char * user_patterns_file = ""
 
char * user_patterns_suffix = ""
 
bool load_system_dawg = true
 
bool load_freq_dawg = true
 
bool load_unambig_dawg = true
 
bool load_punc_dawg = true
 
bool load_number_dawg = true
 
bool load_bigram_dawg = true
 
double xheight_penalty_subscripts = 0.125
 
double xheight_penalty_inconsistent = 0.25
 
double segment_penalty_dict_frequent_word = 1.0
 
double segment_penalty_dict_case_ok = 1.1
 
double segment_penalty_dict_case_bad = 1.3125
 
double segment_penalty_ngram_best_choice = 1.24
 
double segment_penalty_dict_nonword = 1.25
 
double segment_penalty_garbage = 1.50
 
char * output_ambig_words_file = ""
 
int dawg_debug_level = 0
 
int hyphen_debug_level = 0
 
int max_viterbi_list_size = 10
 
bool use_only_first_uft8_step = false
 
double certainty_scale = 20.0
 
double stopper_nondict_certainty_base = -2.50
 
double stopper_phase2_certainty_rejection_offset = 1.0
 
int stopper_smallword_size = 2
 
double stopper_certainty_per_char = -0.50
 
double stopper_allowable_character_badness = 3.0
 
int stopper_debug_level = 0
 
bool stopper_no_acceptable_choices = false
 
bool save_raw_choices = false
 
int tessedit_truncate_wordchoice_log = 10
 
char * word_to_debug = ""
 
char * word_to_debug_lengths = ""
 
int fragments_debug = 0
 
bool segment_nonalphabetic_script = false
 
bool save_doc_words = 0
 
double doc_dict_pending_threshold = 0.0
 
double doc_dict_certainty_threshold = -2.25
 
int max_permuter_attempts = 10000
 

Detailed Description

Definition at line 87 of file dict.h.

Constructor & Destructor Documentation

◆ Dict()

tesseract::Dict::Dict ( CCUtil * image_ptr)

Definition at line 33 of file dict.cpp.

37  ccutil_(ccutil),
38  STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
39  getCCUtil()->params()),
41  "A suffix of user-provided words located in tessdata.",
42  getCCUtil()->params()),
44  "A filename of user-provided patterns.",
45  getCCUtil()->params()),
47  "A suffix of user-provided patterns located in "
48  "tessdata.",
49  getCCUtil()->params()),
50  BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
51  getCCUtil()->params()),
52  BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
53  getCCUtil()->params()),
54  BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
55  getCCUtil()->params()),
57  "Load dawg with punctuation"
58  " patterns.",
59  getCCUtil()->params()),
61  "Load dawg with number"
62  " patterns.",
63  getCCUtil()->params()),
65  "Load dawg with special word "
66  "bigrams.",
67  getCCUtil()->params()),
69  "Score penalty (0.1 = 10%) added if there are subscripts "
70  "or superscripts in a word, but it is otherwise OK.",
71  getCCUtil()->params()),
73  "Score penalty (0.1 = 10%) added if an xheight is "
74  "inconsistent.",
75  getCCUtil()->params()),
77  "Score multiplier for word matches which have good case and"
78  " are frequent in the given language (lower is better).",
79  getCCUtil()->params()),
81  "Score multiplier for word matches that have good case "
82  "(lower is better).",
83  getCCUtil()->params()),
85  "Default score multiplier for word matches, which may have "
86  "case issues (lower is better).",
87  getCCUtil()->params()),
89  "Multipler to for the best choice from the ngram model.",
90  getCCUtil()->params()),
92  "Score multiplier for glyph fragment segmentations which "
93  "do not match a dictionary word (lower is better).",
94  getCCUtil()->params()),
96  "Score multiplier for poorly cased strings that are not in"
97  " the dictionary and generally look like garbage (lower is"
98  " better).",
99  getCCUtil()->params()),
101  "Output file for ambiguities found in the dictionary",
102  getCCUtil()->params()),
104  "Set to 1 for general debug info"
105  ", to 2 for more details, to 3 to see all the debug messages",
106  getCCUtil()->params()),
107  INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
108  getCCUtil()->params()),
109  INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.",
110  getCCUtil()->params()),
112  "Use only the first UTF8 step of the given string"
113  " when computing log probabilities.",
114  getCCUtil()->params()),
115  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
116  getCCUtil()->params()),
118  "Certainty threshold for non-dict words",
119  getCCUtil()->params()),
121  "Reject certainty offset", getCCUtil()->params()),
123  "Size of dict word to be treated as non-dict word",
124  getCCUtil()->params()),
126  "Certainty to add"
127  " for each dict char above small word size.",
128  getCCUtil()->params()),
130  "Max certaintly variation allowed in a word (in sigma)",
131  getCCUtil()->params()),
132  INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
133  getCCUtil()->params()),
135  "Make AcceptableChoice() always return false. Useful"
136  " when there is a need to explore all segmentations",
137  getCCUtil()->params()),
139  "Deprecated- backward compatibility only",
140  getCCUtil()->params()),
142  "Max words to keep in list", getCCUtil()->params()),
144  "Word for which stopper debug"
145  " information should be printed to stdout",
146  getCCUtil()->params()),
148  "Lengths of unichars in word_to_debug",
149  getCCUtil()->params()),
150  INT_MEMBER(fragments_debug, 0, "Debug character fragments",
151  getCCUtil()->params()),
153  "Don't use any alphabetic-specific tricks."
154  " Set to true in the traineddata config file for"
155  " scripts that are cursive or inherently fixed-pitch",
156  getCCUtil()->params()),
157  BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
158  getCCUtil()->params()),
160  "Worst certainty for using pending dictionary",
161  getCCUtil()->params()),
163  "Worst certainty for words that can be inserted into the"
164  " document dictionary",
165  getCCUtil()->params()),
167  "Maximum number of different"
168  " character choices to consider during permutation."
169  " This limit is especially useful when user patterns"
170  " are specified, since overly generic patterns can result in"
171  " dawg search exploring an overly large number of options.",
172  getCCUtil()->params()) {
173  dang_ambigs_table_ = NULL;
174  replace_ambigs_table_ = NULL;
175  reject_offset_ = 0.0;
176  go_deeper_fxn_ = NULL;
177  hyphen_word_ = NULL;
178  last_word_on_line_ = false;
179  hyphen_unichar_id_ = INVALID_UNICHAR_ID;
180  document_words_ = NULL;
181  dawg_cache_ = NULL;
182  dawg_cache_is_ours_ = false;
183  pending_words_ = NULL;
184  bigram_dawg_ = NULL;
185  freq_dawg_ = NULL;
186  punc_dawg_ = NULL;
187  unambig_dawg_ = NULL;
188  wordseg_rating_adjust_factor_ = -1.0f;
189  output_ambig_words_file_ = NULL;
190 }
bool load_unambig_dawg
Definition: dict.h:566
bool load_freq_dawg
Definition: dict.h:565
double certainty_scale
Definition: dict.h:611
int max_permuter_attempts
Definition: dict.h:647
double stopper_allowable_character_badness
Definition: dict.h:621
bool load_bigram_dawg
Definition: dict.h:571
double segment_penalty_dict_case_bad
Definition: dict.h:588
char * word_to_debug
Definition: dict.h:630
int fragments_debug
Definition: dict.h:633
char * output_ambig_words_file
Definition: dict.h:603
int hyphen_debug_level
Definition: dict.h:606
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:615
double doc_dict_certainty_threshold
Definition: dict.h:642
char * user_words_file
Definition: dict.h:557
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:356
bool load_number_dawg
Definition: dict.h:569
bool use_only_first_uft8_step
Definition: dict.h:610
double segment_penalty_dict_frequent_word
Definition: dict.h:580
double segment_penalty_garbage
Definition: dict.h:601
int def_letter_is_okay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:372
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:400
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:306
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:318
int stopper_debug_level
Definition: dict.h:622
bool segment_nonalphabetic_script
Definition: dict.h:637
int stopper_smallword_size
Definition: dict.h:617
const CCUtil * getCCUtil() const
Definition: dict.h:91
double segment_penalty_ngram_best_choice
Definition: dict.h:592
bool save_doc_words
Definition: dict.h:638
char * user_words_suffix
Definition: dict.h:559
double xheight_penalty_subscripts
Definition: dict.h:574
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:204
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:315
bool load_system_dawg
Definition: dict.h:564
double stopper_nondict_certainty_base
Definition: dict.h:613
char * user_patterns_file
Definition: dict.h:561
char * user_patterns_suffix
Definition: dict.h:563
CCUtil ccutil
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:303
char * word_to_debug_lengths
Definition: dict.h:632
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:300
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:383
bool save_raw_choices
Definition: dict.h:627
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:366
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:309
bool stopper_no_acceptable_choices
Definition: dict.h:625
double segment_penalty_dict_nonword
Definition: dict.h:596
double xheight_penalty_inconsistent
Definition: dict.h:577
double stopper_certainty_per_char
Definition: dict.h:619
int tessedit_truncate_wordchoice_log
Definition: dict.h:628
bool load_punc_dawg
Definition: dict.h:568
double doc_dict_pending_threshold
Definition: dict.h:640
double segment_penalty_dict_case_ok
Definition: dict.h:584
int max_viterbi_list_size
Definition: dict.h:607
int dawg_debug_level
Definition: dict.h:605

◆ ~Dict()

tesseract::Dict::~Dict ( )

Definition at line 192 of file dict.cpp.

192  {
193  End();
194  delete hyphen_word_;
195  if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_);
196 }
void End()
Definition: dict.cpp:348

Member Function Documentation

◆ absolute_garbage()

bool tesseract::Dict::absolute_garbage ( const WERD_CHOICE & word,
const UNICHARSET & unicharset 
)

Returns true if the word looks like absolute garbage (e.g. an image mistakenly recognized as text).

Definition at line 70 of file context.cpp.

71  {
72  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
73  int num_alphanum = 0;
74  for (int x = 0; x < word.length(); ++x) {
75  num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
76  unicharset.get_isdigit(word.unichar_id(x)));
77  }
78  return (static_cast<float>(num_alphanum) /
79  static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
80 }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
int length() const
Definition: ratngs.h:299

◆ AcceptableChoice()

bool tesseract::Dict::AcceptableChoice ( const WERD_CHOICE & best_choice,
XHeightConsistencyEnum  xheight_consistency 
)

Returns true if the given best_choice is good enough to stop.

Definition at line 50 of file stopper.cpp.

51  {
52  float CertaintyThreshold = stopper_nondict_certainty_base;
53  int WordSize;
54 
55  if (stopper_no_acceptable_choices) return false;
56 
57  if (best_choice.length() == 0) return false;
58 
59  bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
60  bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
61  bool is_case_ok = case_ok(best_choice, getUnicharset());
62 
63  if (stopper_debug_level >= 1) {
64  const char *xht = "UNKNOWN";
65  switch (xheight_consistency) {
66  case XH_GOOD: xht = "NORMAL"; break;
67  case XH_SUBNORMAL: xht = "SUBNORMAL"; break;
68  case XH_INCONSISTENT: xht = "INCONSISTENT"; break;
69  default: xht = "UNKNOWN";
70  }
71  tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
72  best_choice.unichar_string().string(),
73  (is_valid_word ? 'y' : 'n'),
74  (is_case_ok ? 'y' : 'n'),
75  xht,
76  best_choice.min_x_height(),
77  best_choice.max_x_height());
78  }
79  // Do not accept invalid words in PASS1.
80  if (reject_offset_ <= 0.0f && !is_valid_word) return false;
81  if (is_valid_word && is_case_ok) {
82  WordSize = LengthOfShortestAlphaRun(best_choice);
83  WordSize -= stopper_smallword_size;
84  if (WordSize < 0)
85  WordSize = 0;
86  CertaintyThreshold += WordSize * stopper_certainty_per_char;
87  }
88 
89  if (stopper_debug_level >= 1)
90  tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
91  best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
92 
93  if (no_dang_ambigs &&
94  best_choice.certainty() > CertaintyThreshold &&
95  xheight_consistency < XH_INCONSISTENT &&
96  UniformCertainties(best_choice)) {
97  return true;
98  } else {
99  if (stopper_debug_level >= 1) {
100  tprintf("AcceptableChoice() returned false"
101  " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
102  no_dang_ambigs, best_choice.certainty(),
103  CertaintyThreshold,
104  UniformCertainties(best_choice));
105  }
106  return false;
107  }
108 }
uinT8 permuter() const
Definition: ratngs.h:342
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:52
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:455
float rating() const
Definition: ratngs.h:323
float certainty() const
Definition: ratngs.h:326
#define tprintf(...)
Definition: tprintf.h:31
float min_x_height() const
Definition: ratngs.h:332
const char * string() const
Definition: strngs.cpp:198
int stopper_debug_level
Definition: dict.h:622
int stopper_smallword_size
Definition: dict.h:617
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:451
double stopper_nondict_certainty_base
Definition: dict.h:613
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:470
bool dangerous_ambig_found() const
Definition: ratngs.h:359
int length() const
Definition: ratngs.h:299
bool stopper_no_acceptable_choices
Definition: dict.h:625
double stopper_certainty_per_char
Definition: dict.h:619
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
float max_x_height() const
Definition: ratngs.h:335
const STRING & unichar_string() const
Definition: ratngs.h:537

◆ AcceptableResult()

bool tesseract::Dict::AcceptableResult ( WERD_RES * word) const

Returns false if the best choice for the current word is questionable and should be tried again on the second pass or should be flagged to the user.

Definition at line 110 of file stopper.cpp.

110  {
111  if (word->best_choice == NULL) return false;
112  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
113  int WordSize;
114 
115  if (stopper_debug_level >= 1) {
116  tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
117  word->best_choice->debug_string().string(),
118  (valid_word(*word->best_choice) ? 'y' : 'n'),
119  (case_ok(*word->best_choice, getUnicharset()) ? 'y' : 'n'),
120  word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
121  word->best_choices.singleton() ? 'n' : 'y');
122  }
123 
124  if (word->best_choice->length() == 0 || !word->best_choices.singleton())
125  return false;
126  if (valid_word(*word->best_choice) &&
127  case_ok(*word->best_choice, getUnicharset())) {
128  WordSize = LengthOfShortestAlphaRun(*word->best_choice);
129  WordSize -= stopper_smallword_size;
130  if (WordSize < 0)
131  WordSize = 0;
132  CertaintyThreshold += WordSize * stopper_certainty_per_char;
133  }
134 
135  if (stopper_debug_level >= 1)
136  tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
137  word->best_choice->certainty(), CertaintyThreshold);
138 
139  if (word->best_choice->certainty() > CertaintyThreshold &&
141  if (stopper_debug_level >= 1)
142  tprintf("ACCEPTED\n");
143  return true;
144  } else {
145  if (stopper_debug_level >= 1)
146  tprintf("REJECTED\n");
147  return false;
148  }
149 }
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:52
float certainty() const
Definition: ratngs.h:326
const STRING debug_string() const
Definition: ratngs.h:501
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
int stopper_debug_level
Definition: dict.h:622
int stopper_smallword_size
Definition: dict.h:617
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:451
double stopper_nondict_certainty_base
Definition: dict.h:613
bool dangerous_ambig_found() const
Definition: ratngs.h:359
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:752
int length() const
Definition: ratngs.h:299
bool stopper_no_acceptable_choices
Definition: dict.h:625
double stopper_certainty_per_char
Definition: dict.h:619
const UNICHARSET & getUnicharset() const
Definition: dict.h:97

◆ add_document_word()

void tesseract::Dict::add_document_word ( const WERD_CHOICE & best_choice)

Adds a word found on this document to the document specific dictionary.

Definition at line 614 of file dict.cpp.

614  {
615  // Do not add hyphenated word parts to the document dawg.
616  // hyphen_word_ will be non-NULL after the set_hyphen_word() is
617  // called when the first part of the hyphenated word is
618  // discovered and while the second part of the word is recognized.
619  // hyphen_word_ is cleared in cc_recg() before the next word on
620  // the line is recognized.
621  if (hyphen_word_) return;
622 
623  char filename[CHARS_PER_LINE];
624  FILE *doc_word_file;
625  int stringlen = best_choice.length();
626 
627  if (valid_word(best_choice) || stringlen < 2)
628  return;
629 
630  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
631  if (best_choice.length() >= kDocDictMaxRepChars) {
632  int num_rep_chars = 1;
633  UNICHAR_ID uch_id = best_choice.unichar_id(0);
634  for (int i = 1; i < best_choice.length(); ++i) {
635  if (best_choice.unichar_id(i) != uch_id) {
636  num_rep_chars = 1;
637  uch_id = best_choice.unichar_id(i);
638  } else {
639  ++num_rep_chars;
640  if (num_rep_chars == kDocDictMaxRepChars) return;
641  }
642  }
643  }
644 
645  if (best_choice.certainty() < doc_dict_certainty_threshold ||
646  stringlen == 2) {
647  if (best_choice.certainty() < doc_dict_pending_threshold)
648  return;
649 
650  if (!pending_words_->word_in_dawg(best_choice)) {
651  if (stringlen > 2 ||
652  (stringlen == 2 &&
653  getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
654  getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
655  pending_words_->add_word_to_dawg(best_choice);
656  }
657  return;
658  }
659  }
660 
661  if (save_doc_words) {
662  strcpy(filename, getCCUtil()->imagefile.string());
663  strcat(filename, ".doc");
664  doc_word_file = open_file (filename, "a");
665  fprintf(doc_word_file, "%s\n",
666  best_choice.debug_string().string());
667  fclose(doc_word_file);
668  }
669  document_words_->add_word_to_dawg(best_choice);
670 }
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82
float certainty() const
Definition: ratngs.h:326
const STRING debug_string() const
Definition: ratngs.h:501
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
double doc_dict_certainty_threshold
Definition: dict.h:642
const char * string() const
Definition: strngs.cpp:198
const CCUtil * getCCUtil() const
Definition: dict.h:91
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:177
bool save_doc_words
Definition: dict.h:638
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:504
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:69
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:752
int length() const
Definition: ratngs.h:299
double doc_dict_pending_threshold
Definition: dict.h:640
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
#define CHARS_PER_LINE
Definition: cutil.h:57
int UNICHAR_ID
Definition: unichar.h:35

◆ adjust_word()

void tesseract::Dict::adjust_word ( WERD_CHOICE * word,
bool  nonword,
XHeightConsistencyEnum  xheight_consistency,
float  additional_adjust,
bool  modify_rating,
bool  debug 
)

Adjusts the rating of the given word.

Definition at line 672 of file dict.cpp.

677  {
678  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
679  word->GetTopScriptID() == getUnicharset().han_sid());
680  bool case_is_ok = (is_han || case_ok(*word, getUnicharset()));
681  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
682 
683  float adjust_factor = additional_adjust;
684  float new_rating = word->rating();
685  new_rating += kRatingPad;
686  const char *xheight_triggered = "";
687  if (word->length() > 1) {
688  // Calculate x-height and y-offset consistency penalties.
689  switch (xheight_consistency) {
690  case XH_INCONSISTENT:
691  adjust_factor += xheight_penalty_inconsistent;
692  xheight_triggered = ", xhtBAD";
693  break;
694  case XH_SUBNORMAL:
695  adjust_factor += xheight_penalty_subscripts;
696  xheight_triggered = ", xhtSUB";
697  break;
698  case XH_GOOD:
699  // leave the factor alone - all good!
700  break;
701  }
702  // TODO(eger): if nonword is true, but there is a "core" thats' a dict
703  // word, negate nonword status.
704  } else {
705  if (debug) {
706  tprintf("Consistency could not be calculated.\n");
707  }
708  }
709  if (debug) {
710  tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
711  word->unichar_string().string(), word->rating(),
712  xheight_triggered);
713  }
714 
715  if (nonword) { // non-dictionary word
716  if (case_is_ok && punc_is_ok) {
717  adjust_factor += segment_penalty_dict_nonword;
718  new_rating *= adjust_factor;
719  if (debug) tprintf(", W");
720  } else {
721  adjust_factor += segment_penalty_garbage;
722  new_rating *= adjust_factor;
723  if (debug) {
724  if (!case_is_ok) tprintf(", C");
725  if (!punc_is_ok) tprintf(", P");
726  }
727  }
728  } else { // dictionary word
729  if (case_is_ok) {
730  if (!is_han && freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
732  adjust_factor += segment_penalty_dict_frequent_word;
733  new_rating *= adjust_factor;
734  if (debug) tprintf(", F");
735  } else {
736  adjust_factor += segment_penalty_dict_case_ok;
737  new_rating *= adjust_factor;
738  if (debug) tprintf(", ");
739  }
740  } else {
741  adjust_factor += segment_penalty_dict_case_bad;
742  new_rating *= adjust_factor;
743  if (debug) tprintf(", C");
744  }
745  }
746  new_rating -= kRatingPad;
747  if (modify_rating) word->set_rating(new_rating);
748  if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
749  word->set_adjust_factor(adjust_factor);
750 }
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:52
int han_sid() const
Definition: unicharset.h:887
int GetTopScriptID() const
Definition: ratngs.cpp:656
void set_rating(float new_val)
Definition: ratngs.h:365
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:828
double segment_penalty_dict_case_bad
Definition: dict.h:588
float rating() const
Definition: ratngs.h:323
#define tprintf(...)
Definition: tprintf.h:31
double segment_penalty_dict_frequent_word
Definition: dict.h:580
double segment_penalty_garbage
Definition: dict.h:601
const char * string() const
Definition: strngs.cpp:198
void set_permuter(uinT8 perm)
Definition: ratngs.h:371
double xheight_penalty_subscripts
Definition: dict.h:574
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:69
void set_adjust_factor(float factor)
Definition: ratngs.h:305
int length() const
Definition: ratngs.h:299
double segment_penalty_dict_nonword
Definition: dict.h:596
double xheight_penalty_inconsistent
Definition: dict.h:577
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
double segment_penalty_dict_case_ok
Definition: dict.h:584
int null_sid() const
Definition: unicharset.h:882
const STRING & unichar_string() const
Definition: ratngs.h:537

◆ append_choices()

void tesseract::Dict::append_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
const BLOB_CHOICE blob_choice,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  more_args 
)

append_choices

Checks to see whether or not the next choice is worth appending to the word being generated. If so then keeps going deeper into the word.

This function assumes that Dict::go_deeper_fxn_ is set.

Definition at line 245 of file permdawg.cpp.

256  {
257  int word_ending =
258  (char_choice_index == char_choices.length() - 1) ? true : false;
259 
260  // Deal with fragments.
261  CHAR_FRAGMENT_INFO char_frag_info;
262  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
263  blob_choice.certainty(), prev_char_frag_info, debug,
264  word_ending, &char_frag_info)) {
265  return; // blob_choice must be an invalid fragment
266  }
267  // Search the next letter if this character is a fragment.
268  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
269  permute_choices(debug, char_choices, char_choice_index + 1,
270  &char_frag_info, word, certainties, limit,
271  best_choice, attempts_left, more_args);
272  return;
273  }
274 
275  // Add the next unichar.
276  float old_rating = word->rating();
277  float old_certainty = word->certainty();
278  uinT8 old_permuter = word->permuter();
279  certainties[word->length()] = char_frag_info.certainty;
281  char_frag_info.unichar_id, char_frag_info.num_fragments,
282  char_frag_info.rating, char_frag_info.certainty);
283 
284  // Explore the next unichar.
285  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
286  &char_frag_info, word_ending, word, certainties,
287  limit, best_choice, attempts_left, more_args);
288 
289  // Remove the unichar we added to explore other choices in its place.
290  word->remove_last_unichar_id();
291  word->set_rating(old_rating);
292  word->set_certainty(old_certainty);
293  word->set_permuter(old_permuter);
294 }
uinT8 permuter() const
Definition: ratngs.h:342
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:203
void set_rating(float new_val)
Definition: ratngs.h:365
float rating() const
Definition: ratngs.h:323
float certainty() const
Definition: ratngs.h:326
uint8_t uinT8
Definition: host.h:35
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:321
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
UNICHAR_ID unichar_id
Definition: dict.h:40
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:448
int length() const
Definition: genericvector.h:86
void set_permuter(uinT8 perm)
Definition: ratngs.h:371
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:204
void set_certainty(float new_val)
Definition: ratngs.h:368
float rating() const
Definition: ratngs.h:79
void remove_last_unichar_id()
Definition: ratngs.h:479
int length() const
Definition: ratngs.h:299
float certainty
Definition: dict.h:44
float rating
Definition: dict.h:43
float certainty() const
Definition: ratngs.h:82
int num_fragments
Definition: dict.h:42

◆ CallParamsModelClassify()

float tesseract::Dict::CallParamsModelClassify ( void *  path)
inline

Definition at line 403 of file dict.h.

403  {
404  ASSERT_HOST(params_model_classify_ != NULL); // ASSERT_HOST -> assert
405  return (this->*params_model_classify_)(
406  getCCUtil()->lang.string(), path);
407  }
const char * string() const
Definition: strngs.cpp:198
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:400
STRING lang
Definition: ccutil.h:66
const CCUtil * getCCUtil() const
Definition: dict.h:91
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ case_ok()

int tesseract::Dict::case_ok ( const WERD_CHOICE word,
const UNICHARSET unicharset 
) const

Check a string to see if it matches a set of lexical rules.

Definition at line 52 of file context.cpp.

52  {
53  int state = 0;
54  int x;
55  for (x = 0; x < word.length(); ++x) {
56  UNICHAR_ID ch_id = word.unichar_id(x);
57  if (unicharset.get_isupper(ch_id))
58  state = case_state_table[state][1];
59  else if (unicharset.get_islower(ch_id))
60  state = case_state_table[state][2];
61  else if (unicharset.get_isdigit(ch_id))
62  state = case_state_table[state][3];
63  else
64  state = case_state_table[state][0];
65  if (state == -1) return false;
66  }
67  return state != 5; // single lower is bad
68 }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:504
const int case_state_table[6][4]
Definition: context.cpp:35
int length() const
Definition: ratngs.h:299
int UNICHAR_ID
Definition: unichar.h:35

◆ char_for_dawg()

UNICHAR_ID tesseract::Dict::char_for_dawg ( UNICHAR_ID  ch,
const Dawg dawg 
) const
inline

Definition at line 430 of file dict.h.

430  {
431  if (!dawg) return ch;
432  switch (dawg->type()) {
433  case DAWG_TYPE_NUMBER:
435  default:
436  return ch;
437  }
438  }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126
const UNICHARSET & getUnicharset() const
Definition: dict.h:97

◆ compound_marker()

bool tesseract::Dict::compound_marker ( UNICHAR_ID  unichar_id)
inline

Definition at line 108 of file dict.h.

108  {
109  const GenericVector<UNICHAR_ID>& normed_ids =
110  getUnicharset().normed_ids(unichar_id);
111  return normed_ids.size() == 1 &&
112  (normed_ids[0] == hyphen_unichar_id_ ||
113  normed_ids[0] == slash_unichar_id_);
114  }
int size() const
Definition: genericvector.h:72
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:834
const UNICHARSET & getUnicharset() const
Definition: dict.h:97

◆ copy_hyphen_info()

void tesseract::Dict::copy_hyphen_info ( WERD_CHOICE word) const
inline

If this word is hyphenated copy the base word (the part on the line before) of a hyphenated word into the given word. This function assumes that word is not NULL.

Definition at line 136 of file dict.h.

136  {
137  if (this->hyphenated()) {
138  *word = *hyphen_word_;
139  if (hyphen_debug_level) word->print("copy_hyphen_info: ");
140  }
141  }
int hyphen_debug_level
Definition: dict.h:606
void print() const
Definition: ratngs.h:576
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:126

◆ dawg_permute_and_select()

WERD_CHOICE * tesseract::Dict::dawg_permute_and_select ( const BLOB_CHOICE_LIST_VECTOR char_choices,
float  rating_limit 
)

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to explore all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

dawg_permute_and_select

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to search all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

Definition at line 174 of file permdawg.cpp.

175  {
176  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
177  best_choice->make_bad();
178  best_choice->set_rating(rating_limit);
179  if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH)
180  return best_choice;
181  DawgPositionVector *active_dawgs =
182  new DawgPositionVector[char_choices.length() + 1];
183  init_active_dawgs(&(active_dawgs[0]), true);
184  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
186 
187  float certainties[MAX_WERD_LENGTH];
189  int attempts_left = max_permuter_attempts;
190  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : NULL,
191  char_choices, 0, NULL, &word, certainties, &rating_limit, best_choice,
192  &attempts_left, &dawg_args);
193  delete[] active_dawgs;
194  return best_choice;
195 }
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:203
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:570
int max_permuter_attempts
Definition: dict.h:647
void set_rating(float new_val)
Definition: ratngs.h:365
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:51
#define MAX_WERD_LENGTH
Definition: dict.h:35
int length() const
Definition: genericvector.h:86
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:204
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:439
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int dawg_debug_level
Definition: dict.h:605

◆ DebugWordChoices()

void tesseract::Dict::DebugWordChoices ( )

Prints the current choices for this word to stdout.

◆ def_letter_is_okay()

int tesseract::Dict::def_letter_is_okay ( void *  void_dawg_args,
UNICHAR_ID  unichar_id,
bool  word_end 
) const

Returns the maximal permuter code (from ccstruct/ratngs.h) if in light of the current state the letter at word_index in the given word is allowed according to at least one of the dawgs in dawgs_, otherwise returns NO_PERM.

The state is described by void_dawg_args, which are interpreted as DawgArgs and contain relevant active dawg positions. Each entry in the active_dawgs vector contains an index into the dawgs_ vector and an EDGE_REF that indicates the last edge followed in the dawg. It also may contain a position in the punctuation dawg which describes surrounding punctuation (see struct DawgPosition).

Input: At word_index 0 dawg_args->active_dawgs should contain an entry for each dawg that may start at the beginning of a word, with punc_ref and edge_ref initialized to NO_EDGE. Since the punctuation dawg includes the empty pattern " " (meaning anything without surrounding punctuation), having a single entry for the punctuation dawg will cover all dawgs reachable therefrom – that includes all number and word dawgs. The only dawg non-reachable from the punctuation_dawg is the pattern dawg. If hyphen state needs to be applied, initial dawg_args->active_dawgs can be copied from the saved hyphen state (maintained by Dict). For word_index > 0 the corresponding state (active_dawgs and punc position) can be obtained from dawg_args->updated_dawgs passed to def_letter_is_okay for word_index-1. Note: the function assumes that active_dawgs and updated_dawgs member variables of dawg_args are not NULL.

Output: The function fills in dawg_args->updated_dawgs vector with the entries for dawgs that contain the word up to the letter at word_index.

Definition at line 372 of file dict.cpp.

374  {
375  DawgArgs *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
376 
377  if (dawg_debug_level >= 3) {
378  tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
379  " num active dawgs=%d\n",
380  getUnicharset().debug_str(unichar_id).string(), word_end,
381  dawg_args->active_dawgs->length());
382  }
383 
384  // Do not accept words that contain kPatternUnicharID.
385  // (otherwise pattern dawgs would not function correctly).
386  // Do not accept words containing INVALID_UNICHAR_IDs.
387  if (unichar_id == Dawg::kPatternUnicharID ||
388  unichar_id == INVALID_UNICHAR_ID) {
389  dawg_args->permuter = NO_PERM;
390  return NO_PERM;
391  }
392 
393  // Initialization.
394  PermuterType curr_perm = NO_PERM;
395  dawg_args->updated_dawgs->clear();
396  dawg_args->valid_end = false;
397 
398  // Go over the active_dawgs vector and insert DawgPosition records
399  // with the updated ref (an edge with the corresponding unichar id) into
400  // dawg_args->updated_dawgs.
401  for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
402  const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
403  const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : NULL;
404  const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : NULL;
405 
406  if (!dawg && !punc_dawg) {
407  // shouldn't happen.
408  tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
409  continue;
410  }
411  if (!dawg) {
412  // We're in the punctuation dawg. A core dawg has not been chosen.
413  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
414  EDGE_REF punc_transition_edge = punc_dawg->edge_char_of(
415  punc_node, Dawg::kPatternUnicharID, word_end);
416  if (punc_transition_edge != NO_EDGE) {
417  // Find all successors, and see which can transition.
418  const SuccessorList &slist = *(successors_[pos.punc_index]);
419  for (int s = 0; s < slist.length(); ++s) {
420  int sdawg_index = slist[s];
421  const Dawg *sdawg = dawgs_[sdawg_index];
422  UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg);
423  EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
424  if (dawg_edge != NO_EDGE) {
425  if (dawg_debug_level >=3) {
426  tprintf("Letter found in dawg %d\n", sdawg_index);
427  }
428  dawg_args->updated_dawgs->add_unique(
429  DawgPosition(sdawg_index, dawg_edge,
430  pos.punc_index, punc_transition_edge, false),
431  dawg_debug_level > 0,
432  "Append transition from punc dawg to current dawgs: ");
433  if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
434  if (sdawg->end_of_word(dawg_edge) &&
435  punc_dawg->end_of_word(punc_transition_edge))
436  dawg_args->valid_end = true;
437  }
438  }
439  }
440  EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id,
441  word_end);
442  if (punc_edge != NO_EDGE) {
443  if (dawg_debug_level >=3) {
444  tprintf("Letter found in punctuation dawg\n");
445  }
446  dawg_args->updated_dawgs->add_unique(
447  DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
448  dawg_debug_level > 0,
449  "Extend punctuation dawg: ");
450  if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
451  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
452  }
453  continue;
454  }
455 
456  if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
457  // We can end the main word here.
458  // If we can continue on the punc ref, add that possibility.
459  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
460  EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE
461  : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
462  if (punc_edge != NO_EDGE) {
463  dawg_args->updated_dawgs->add_unique(
464  DawgPosition(pos.dawg_index, pos.dawg_ref,
465  pos.punc_index, punc_edge, true),
466  dawg_debug_level > 0,
467  "Return to punctuation dawg: ");
468  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
469  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
470  }
471  }
472 
473  if (pos.back_to_punc) continue;
474 
475  // If we are dealing with the pattern dawg, look up all the
476  // possible edges, not only for the exact unichar_id, but also
477  // for all its character classes (alpha, digit, etc).
478  if (dawg->type() == DAWG_TYPE_PATTERN) {
479  ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args,
480  &curr_perm);
481  // There can't be any successors to dawg that is of type
482  // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
483  continue;
484  }
485 
486  // Find the edge out of the node for the unichar_id.
487  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
488  EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
489  : dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end);
490 
491  if (dawg_debug_level >= 3) {
492  tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
493  pos.dawg_index, node, edge);
494  }
495 
496  if (edge != NO_EDGE) { // the unichar was found in the current dawg
497  if (dawg_debug_level >=3) {
498  tprintf("Letter found in dawg %d\n", pos.dawg_index);
499  }
500  if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
501  if (dawg_debug_level >= 3) {
502  tprintf("Punctuation constraint not satisfied at end of word.\n");
503  }
504  continue;
505  }
506  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
507  if (dawg->end_of_word(edge) &&
508  (punc_dawg == NULL || punc_dawg->end_of_word(pos.punc_ref)))
509  dawg_args->valid_end = true;
510  dawg_args->updated_dawgs->add_unique(
511  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
512  false),
513  dawg_debug_level > 0,
514  "Append current dawg to updated active dawgs: ");
515  }
516  } // end for
517  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
518  // or if we found the current letter in a non-punctuation dawg. This
519  // allows preserving information on which dawg the "core" word came from.
520  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
521  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
522  (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
523  dawg_args->permuter = curr_perm;
524  }
525  if (dawg_debug_level >= 2) {
526  tprintf("Returning %d for permuter code for this character.\n",
527  dawg_args->permuter);
528  }
529  return dawg_args->permuter;
530 }
GenericVector< int > SuccessorList
Definition: dawg.h:69
#define REFFORMAT
Definition: dawg.h:93
PermuterType
Definition: ratngs.h:238
inT64 NODE_REF
Definition: dawg.h:56
inT64 EDGE_REF
Definition: dawg.h:55
#define tprintf(...)
Definition: tprintf.h:31
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:532
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:420
UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:430
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int dawg_debug_level
Definition: dict.h:605
int UNICHAR_ID
Definition: unichar.h:35

◆ def_probability_in_context()

double tesseract::Dict::def_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Default (no-op) implementation of probability in context function.

Definition at line 383 of file dict.h.

385  {
386  (void)lang;
387  (void)context;
388  (void)context_bytes;
389  (void)character;
390  (void)character_bytes;
391  return 0.0;
392  }

◆ default_dawgs()

void tesseract::Dict::default_dawgs ( DawgPositionVector anylength_dawgs,
bool  suppress_patterns 
) const

Definition at line 587 of file dict.cpp.

588  {
589  bool punc_dawg_available =
590  (punc_dawg_ != NULL) &&
591  punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
592 
593  for (int i = 0; i < dawgs_.length(); i++) {
594  if (dawgs_[i] != NULL &&
595  !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
596  int dawg_ty = dawgs_[i]->type();
597  bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
598  if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
599  *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
600  if (dawg_debug_level >= 3) {
601  tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
602  NO_EDGE);
603  }
604  } else if (!punc_dawg_available || !subsumed_by_punc) {
605  *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
606  if (dawg_debug_level >= 3) {
607  tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
608  }
609  }
610  }
611  }
612 }
#define REFFORMAT
Definition: dawg.h:93
#define tprintf(...)
Definition: tprintf.h:31
int length() const
Definition: genericvector.h:86
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
int dawg_debug_level
Definition: dict.h:605

◆ End()

void tesseract::Dict::End ( )

Definition at line 348 of file dict.cpp.

348  {
349  if (dawgs_.length() == 0)
350  return; // Not safe to call twice.
351  for (int i = 0; i < dawgs_.size(); i++) {
352  if (!dawg_cache_->FreeDawg(dawgs_[i])) {
353  delete dawgs_[i];
354  }
355  }
356  dawg_cache_->FreeDawg(bigram_dawg_);
357  if (dawg_cache_is_ours_) {
358  delete dawg_cache_;
359  dawg_cache_ = NULL;
360  }
361  successors_.delete_data_pointers();
362  dawgs_.clear();
363  successors_.clear();
364  document_words_ = NULL;
365  delete pending_words_;
366  pending_words_ = NULL;
367 }
int size() const
Definition: genericvector.h:72
void delete_data_pointers()
int length() const
Definition: genericvector.h:86
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:38

◆ EndDangerousAmbigs()

void tesseract::Dict::EndDangerousAmbigs ( )

Definition at line 367 of file stopper.cpp.

367 {}

◆ FinishLoad()

bool tesseract::Dict::FinishLoad ( )

Definition at line 328 of file dict.cpp.

328  {
329  if (dawgs_.empty()) return false;
330  // Construct a list of corresponding successors for each dawg. Each entry, i,
331  // in the successors_ vector is a vector of integers that represent the
332  // indices into the dawgs_ vector of the successors for dawg i.
333  successors_.reserve(dawgs_.length());
334  for (int i = 0; i < dawgs_.length(); ++i) {
335  const Dawg *dawg = dawgs_[i];
336  SuccessorList *lst = new SuccessorList();
337  for (int j = 0; j < dawgs_.length(); ++j) {
338  const Dawg *other = dawgs_[j];
339  if (dawg != NULL && other != NULL &&
340  (dawg->lang() == other->lang()) &&
341  kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
342  }
343  successors_ += lst;
344  }
345  return true;
346 }
void reserve(int size)
GenericVector< int > SuccessorList
Definition: dawg.h:69
bool empty() const
Definition: genericvector.h:91
int length() const
Definition: genericvector.h:86

◆ fragment_state_okay()

bool tesseract::Dict::fragment_state_okay ( UNICHAR_ID  curr_unichar_id,
float  curr_rating,
float  curr_certainty,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
const char *  debug,
int  word_ending,
CHAR_FRAGMENT_INFO char_frag_info 
)

Definition at line 321 of file permdawg.cpp.

325  {
326  const CHAR_FRAGMENT *this_fragment =
327  getUnicharset().get_fragment(curr_unichar_id);
328  const CHAR_FRAGMENT *prev_fragment =
329  prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL;
330 
331  // Print debug info for fragments.
332  if (debug && (prev_fragment || this_fragment)) {
333  tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
334  getUnicharset().debug_str(curr_unichar_id).string(),
335  word_ending);
336  if (prev_fragment) {
337  tprintf("prev_fragment %s\n", prev_fragment->to_string().string());
338  }
339  if (this_fragment) {
340  tprintf("this_fragment %s\n", this_fragment->to_string().string());
341  }
342  }
343 
344  char_frag_info->unichar_id = curr_unichar_id;
345  char_frag_info->fragment = this_fragment;
346  char_frag_info->rating = curr_rating;
347  char_frag_info->certainty = curr_certainty;
348  char_frag_info->num_fragments = 1;
349  if (prev_fragment && !this_fragment) {
350  if (debug) tprintf("Skip choice with incomplete fragment\n");
351  return false;
352  }
353  if (this_fragment) {
354  // We are dealing with a fragment.
355  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
356  if (prev_fragment) {
357  if (!this_fragment->is_continuation_of(prev_fragment)) {
358  if (debug) tprintf("Non-matching fragment piece\n");
359  return false;
360  }
361  if (this_fragment->is_ending()) {
362  char_frag_info->unichar_id =
363  getUnicharset().unichar_to_id(this_fragment->get_unichar());
364  char_frag_info->fragment = NULL;
365  if (debug) {
366  tprintf("Built character %s from fragments\n",
367  getUnicharset().debug_str(
368  char_frag_info->unichar_id).string());
369  }
370  } else {
371  if (debug) tprintf("Record fragment continuation\n");
372  char_frag_info->fragment = this_fragment;
373  }
374  // Update certainty and rating.
375  char_frag_info->rating =
376  prev_char_frag_info->rating + curr_rating;
377  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
378  char_frag_info->certainty =
379  MIN(curr_certainty, prev_char_frag_info->certainty);
380  } else {
381  if (this_fragment->is_beginning()) {
382  if (debug) tprintf("Record fragment beginning\n");
383  } else {
384  if (debug) {
385  tprintf("Non-starting fragment piece with no prev_fragment\n");
386  }
387  return false;
388  }
389  }
390  }
391  if (word_ending && char_frag_info->fragment) {
392  if (debug) tprintf("Word can not end with a fragment\n");
393  return false;
394  }
395  return true;
396 }
#define MIN(x, y)
Definition: ndminx.h:28
const CHAR_FRAGMENT * fragment
Definition: dict.h:41
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:99
#define tprintf(...)
Definition: tprintf.h:31
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:733
const char * string() const
Definition: strngs.cpp:198
static STRING to_string(const char *unichar, int pos, int total, bool natural)
UNICHAR_ID unichar_id
Definition: dict.h:40
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
const char * get_unichar() const
Definition: unicharset.h:71
bool is_ending() const
Definition: unicharset.h:109
float certainty
Definition: dict.h:44
float rating
Definition: dict.h:43
bool is_beginning() const
Definition: unicharset.h:106
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int num_fragments
Definition: dict.h:42

◆ getCCUtil() [1/2]

const CCUtil* tesseract::Dict::getCCUtil ( ) const
inline

Definition at line 91 of file dict.h.

91  {
92  return ccutil_;
93  }

◆ getCCUtil() [2/2]

CCUtil* tesseract::Dict::getCCUtil ( )
inline

Definition at line 94 of file dict.h.

94  {
95  return ccutil_;
96  }

◆ GetDawg()

const Dawg* tesseract::Dict::GetDawg ( int  index) const
inline

Return i-th dawg pointer recorded in the dawgs_ vector.

Definition at line 414 of file dict.h.

414 { return dawgs_[index]; }

◆ GetPuncDawg()

const Dawg* tesseract::Dict::GetPuncDawg ( ) const
inline

Return the pointer to the punctuation dawg.

Definition at line 416 of file dict.h.

416 { return punc_dawg_; }

◆ GetStartingNode()

static NODE_REF tesseract::Dict::GetStartingNode ( const Dawg dawg,
EDGE_REF  edge_ref 
)
inlinestatic

Returns the appropriate next node given the EDGE_REF.

Definition at line 420 of file dict.h.

420  {
421  if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
422  NODE_REF node = dawg->next_node(edge_ref);
423  if (node == 0) node = NO_EDGE; // end of word
424  return node;
425  }
inT64 NODE_REF
Definition: dawg.h:56

◆ GetUnambigDawg()

const Dawg* tesseract::Dict::GetUnambigDawg ( ) const
inline

Return the pointer to the unambiguous words dawg.

Definition at line 418 of file dict.h.

418 { return unambig_dawg_; }

◆ getUnicharAmbigs()

const UnicharAmbigs& tesseract::Dict::getUnicharAmbigs ( ) const
inline

Definition at line 103 of file dict.h.

103  {
104  return getCCUtil()->unichar_ambigs;
105  }
const CCUtil * getCCUtil() const
Definition: dict.h:91
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:69

◆ getUnicharset() [1/2]

const UNICHARSET& tesseract::Dict::getUnicharset ( ) const
inline

Definition at line 97 of file dict.h.

97  {
98  return getCCUtil()->unicharset;
99  }
UNICHARSET unicharset
Definition: ccutil.h:68
const CCUtil * getCCUtil() const
Definition: dict.h:91

◆ getUnicharset() [2/2]

UNICHARSET& tesseract::Dict::getUnicharset ( )
inline

Definition at line 100 of file dict.h.

100  {
101  return getCCUtil()->unicharset;
102  }
UNICHARSET unicharset
Definition: ccutil.h:68
const CCUtil * getCCUtil() const
Definition: dict.h:91

◆ GlobalDawgCache()

DawgCache * tesseract::Dict::GlobalDawgCache ( )
static

Initialize Dict class - load dawgs from [lang].traineddata and user-specified wordlist and pattern list.

Definition at line 198 of file dict.cpp.

198  {
199  // This global cache (a singleton) will outlive every Tesseract instance
200  // (even those that someone else might declare as global statics).
201  static DawgCache cache;
202  return &cache;
203 }

◆ go_deeper_dawg_fxn()

void tesseract::Dict::go_deeper_dawg_fxn ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
bool  word_ending,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  void_more_args 
)

If the choice being composed so far could be a dictionary word and we have not reached the end of the word, keep exploring the char_choices further.

Definition at line 51 of file permdawg.cpp.

55  {
56  DawgArgs *more_args = static_cast<DawgArgs *>(void_more_args);
57  word_ending = (char_choice_index == char_choices.size()-1);
58  int word_index = word->length() - 1;
59  if (best_choice->rating() < *limit) return;
60  // Look up char in DAWG
61 
62  // If the current unichar is an ngram first try calling
63  // letter_is_okay() for each unigram it contains separately.
64  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
65  bool checked_unigrams = false;
66  if (getUnicharset().get_isngram(orig_uch_id)) {
67  if (dawg_debug_level) {
68  tprintf("checking unigrams in an ngram %s\n",
69  getUnicharset().debug_str(orig_uch_id).string());
70  }
71  int num_unigrams = 0;
72  word->remove_last_unichar_id();
74  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
75  // Since the string came out of the unicharset, failure is impossible.
76  ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, NULL,
77  NULL));
78  bool unigrams_ok = true;
79  // Construct DawgArgs that reflect the current state.
80  DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
81  DawgPositionVector unigram_updated_dawgs;
82  DawgArgs unigram_dawg_args(&unigram_active_dawgs,
83  &unigram_updated_dawgs,
84  more_args->permuter);
85  // Check unigrams in the ngram with letter_is_okay().
86  for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
87  UNICHAR_ID uch_id = encoding[i];
88  ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
89  ++num_unigrams;
90  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
91  unigrams_ok = (this->*letter_is_okay_)(
92  &unigram_dawg_args,
93  word->unichar_id(word_index+num_unigrams-1),
94  word_ending && i == encoding.size() - 1);
95  (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
96  if (dawg_debug_level) {
97  tprintf("unigram %s is %s\n",
98  getUnicharset().debug_str(uch_id).string(),
99  unigrams_ok ? "OK" : "not OK");
100  }
101  }
102  // Restore the word and copy the updated dawg state if needed.
103  while (num_unigrams-- > 0) word->remove_last_unichar_id();
104  word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
105  if (unigrams_ok) {
106  checked_unigrams = true;
107  more_args->permuter = unigram_dawg_args.permuter;
108  *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
109  }
110  }
111 
112  // Check which dawgs from the dawgs_ vector contain the word
113  // up to and including the current unichar.
114  if (checked_unigrams || (this->*letter_is_okay_)(
115  more_args, word->unichar_id(word_index), word_ending)) {
116  // Add a new word choice
117  if (word_ending) {
118  if (dawg_debug_level) {
119  tprintf("found word = %s\n", word->debug_string().string());
120  }
121  if (strcmp(output_ambig_words_file.string(), "") != 0) {
122  if (output_ambig_words_file_ == NULL) {
123  output_ambig_words_file_ =
124  fopen(output_ambig_words_file.string(), "wb+");
125  if (output_ambig_words_file_ == NULL) {
126  tprintf("Failed to open output_ambig_words_file %s\n",
127  output_ambig_words_file.string());
128  exit(1);
129  }
130  STRING word_str;
131  word->string_and_lengths(&word_str, NULL);
132  word_str += " ";
133  fprintf(output_ambig_words_file_, "%s", word_str.string());
134  }
135  STRING word_str;
136  word->string_and_lengths(&word_str, NULL);
137  word_str += " ";
138  fprintf(output_ambig_words_file_, "%s", word_str.string());
139  }
140  WERD_CHOICE *adjusted_word = word;
141  adjusted_word->set_permuter(more_args->permuter);
142  update_best_choice(*adjusted_word, best_choice);
143  } else { // search the next letter
144  // Make updated_* point to the next entries in the DawgPositionVector
145  // arrays (that were originally created in dawg_permute_and_select)
146  ++(more_args->updated_dawgs);
147  // Make active_dawgs and constraints point to the updated ones.
148  ++(more_args->active_dawgs);
149  permute_choices(debug, char_choices, char_choice_index + 1,
150  prev_char_frag_info, word, certainties, limit,
151  best_choice, attempts_left, more_args);
152  // Restore previous state to explore another letter in this position.
153  --(more_args->updated_dawgs);
154  --(more_args->active_dawgs);
155  }
156  } else {
157  if (dawg_debug_level) {
158  tprintf("last unichar not OK at index %d in %s\n",
159  word_index, word->debug_string().string());
160  }
161  }
162 }
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:170
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:203
char * output_ambig_words_file
Definition: dict.h:603
float rating() const
Definition: ratngs.h:323
int size() const
Definition: genericvector.h:72
const STRING debug_string() const
Definition: ratngs.h:501
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:356
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:449
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:448
Definition: strngs.h:45
void set_permuter(uinT8 perm)
Definition: ratngs.h:371
#define ASSERT_HOST(x)
Definition: errcode.h:84
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:430
void remove_last_unichar_id()
Definition: ratngs.h:479
int length() const
Definition: ratngs.h:299
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int dawg_debug_level
Definition: dict.h:605
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288
int UNICHAR_ID
Definition: unichar.h:35

◆ good_choice()

int tesseract::Dict::good_choice ( const WERD_CHOICE choice)

Returns true if a good answer is found for the unknown blob rating.

◆ has_hyphen_end() [1/2]

bool tesseract::Dict::has_hyphen_end ( UNICHAR_ID  unichar_id,
bool  first_pos 
) const
inline

Check whether the word has a hyphen at the end.

Definition at line 143 of file dict.h.

143  {
144  if (!last_word_on_line_ || first_pos)
145  return false;
146  const GenericVector<UNICHAR_ID>& normed_ids =
147  getUnicharset().normed_ids(unichar_id);
148  return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
149  }
int size() const
Definition: genericvector.h:72
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:834
const UNICHARSET & getUnicharset() const
Definition: dict.h:97

◆ has_hyphen_end() [2/2]

bool tesseract::Dict::has_hyphen_end ( const WERD_CHOICE word) const
inline

Same as above, but check the unichar at the end of the word.

Definition at line 151 of file dict.h.

151  {
152  int word_index = word.length() - 1;
153  return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
154  }
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:143
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
int length() const
Definition: ratngs.h:299

◆ hyphen_base_size()

int tesseract::Dict::hyphen_base_size ( ) const
inline

Size of the base word (the part on the line before) of a hyphenated word.

Definition at line 130 of file dict.h.

130  {
131  return this->hyphenated() ? hyphen_word_->length() : 0;
132  }
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:126
int length() const
Definition: ratngs.h:299

◆ hyphenated()

bool tesseract::Dict::hyphenated ( ) const
inline

Returns true if we've recorded the beginning of a hyphenated word.

Definition at line 126 of file dict.h.

126  { return
127  !last_word_on_line_ && hyphen_word_;
128  }

◆ init_active_dawgs()

void tesseract::Dict::init_active_dawgs ( DawgPositionVector active_dawgs,
bool  ambigs_mode 
) const

Fill the given active_dawgs vector with dawgs that could contain the beginning of the word. If hyphenated() returns true, copy the entries from hyphen_active_dawgs_ instead.

Definition at line 570 of file dict.cpp.

571  {
572  int i;
573  if (hyphenated()) {
574  *active_dawgs = hyphen_active_dawgs_;
575  if (dawg_debug_level >= 3) {
576  for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
577  tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
578  hyphen_active_dawgs_[i].dawg_index,
579  hyphen_active_dawgs_[i].dawg_ref);
580  }
581  }
582  } else {
583  default_dawgs(active_dawgs, ambigs_mode);
584  }
585 }
#define REFFORMAT
Definition: dawg.h:93
int size() const
Definition: genericvector.h:72
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:126
#define tprintf(...)
Definition: tprintf.h:31
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:587
int dawg_debug_level
Definition: dict.h:605

◆ is_apostrophe()

bool tesseract::Dict::is_apostrophe ( UNICHAR_ID  unichar_id)
inline

Definition at line 117 of file dict.h.

117  {
118  const GenericVector<UNICHAR_ID>& normed_ids =
119  getUnicharset().normed_ids(unichar_id);
120  return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
121  }
int size() const
Definition: genericvector.h:72
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:834
const UNICHARSET & getUnicharset() const
Definition: dict.h:97

◆ IsSpaceDelimitedLang()

bool tesseract::Dict::IsSpaceDelimitedLang ( ) const

Returns true if the language is space-delimited, i.e. its unicharset has no Han, Katakana, or Thai script (not Chinese, Japanese, or Thai).

Definition at line 855 of file dict.cpp.

855  {
856  const UNICHARSET &u_set = getUnicharset();
857  if (u_set.han_sid() > 0) return false;
858  if (u_set.katakana_sid() > 0) return false;
859  if (u_set.thai_sid() > 0) return false;
860  return true;
861 }
int han_sid() const
Definition: unicharset.h:887
int katakana_sid() const
Definition: unicharset.h:889
int thai_sid() const
Definition: unicharset.h:890
const UNICHARSET & getUnicharset() const
Definition: dict.h:97

◆ LengthOfShortestAlphaRun()

int tesseract::Dict::LengthOfShortestAlphaRun ( const WERD_CHOICE WordChoice) const

Returns the length of the shortest alpha run in WordChoice.

Definition at line 451 of file stopper.cpp.

451  {
452  int shortest = MAX_INT32;
453  int curr_len = 0;
454  for (int w = 0; w < WordChoice.length(); ++w) {
455  if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
456  curr_len++;
457  } else if (curr_len > 0) {
458  if (curr_len < shortest) shortest = curr_len;
459  curr_len = 0;
460  }
461  }
462  if (curr_len > 0 && curr_len < shortest) {
463  shortest = curr_len;
464  } else if (shortest == MAX_INT32) {
465  shortest = 0;
466  }
467  return shortest;
468 }
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
#define MAX_INT32
Definition: host.h:62
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
int length() const
Definition: ratngs.h:299
const UNICHARSET & getUnicharset() const
Definition: dict.h:97

◆ LetterIsOkay()

int tesseract::Dict::LetterIsOkay ( void *  void_dawg_args,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inline

Calls letter_is_okay_ member function.

Definition at line 359 of file dict.h.

360  {
361  return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
362  }
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:356

◆ Load()

void tesseract::Dict::Load ( const STRING lang,
TessdataManager data_file 
)

Definition at line 224 of file dict.cpp.

224  {
225  // Load dawgs_.
226  if (load_punc_dawg) {
227  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
228  dawg_debug_level, data_file);
229  if (punc_dawg_) dawgs_ += punc_dawg_;
230  }
231  if (load_system_dawg) {
232  Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
233  lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
234  if (system_dawg) dawgs_ += system_dawg;
235  }
236  if (load_number_dawg) {
237  Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
238  lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
239  if (number_dawg) dawgs_ += number_dawg;
240  }
241  if (load_bigram_dawg) {
242  bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
243  dawg_debug_level, data_file);
244  // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
245  // dawgs_!!
246  }
247  if (load_freq_dawg) {
248  freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
249  dawg_debug_level, data_file);
250  if (freq_dawg_) dawgs_ += freq_dawg_;
251  }
252  if (load_unambig_dawg) {
253  unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
254  dawg_debug_level, data_file);
255  if (unambig_dawg_) dawgs_ += unambig_dawg_;
256  }
257 
258  STRING name;
259  if (((STRING &)user_words_suffix).length() > 0 ||
260  ((STRING &)user_words_file).length() > 0) {
261  Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
262  getUnicharset().size(), dawg_debug_level);
263  if (((STRING &)user_words_file).length() > 0) {
264  name = user_words_file;
265  } else {
267  name += user_words_suffix;
268  }
269  if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
271  tprintf("Error: failed to load %s\n", name.string());
272  delete trie_ptr;
273  } else {
274  dawgs_ += trie_ptr;
275  }
276  }
277 
278  if (((STRING &)user_patterns_suffix).length() > 0 ||
279  ((STRING &)user_patterns_file).length() > 0) {
280  Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
281  getUnicharset().size(), dawg_debug_level);
282  trie_ptr->initialize_patterns(&(getUnicharset()));
283  if (((STRING &)user_patterns_file).length() > 0) {
284  name = user_patterns_file;
285  } else {
287  name += user_patterns_suffix;
288  }
289  if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
290  tprintf("Error: failed to load %s\n", name.string());
291  delete trie_ptr;
292  } else {
293  dawgs_ += trie_ptr;
294  }
295  }
296 
297  document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
298  getUnicharset().size(), dawg_debug_level);
299  dawgs_ += document_words_;
300 
301  // This dawg is temporary and should not be searched by letter_is_ok.
302  pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
303  getUnicharset().size(), dawg_debug_level);
304 }
bool load_unambig_dawg
Definition: dict.h:566
bool load_freq_dawg
Definition: dict.h:565
bool load_bigram_dawg
Definition: dict.h:571
char * user_words_file
Definition: dict.h:557
#define tprintf(...)
Definition: tprintf.h:31
bool load_number_dawg
Definition: dict.h:569
const char * string() const
Definition: strngs.cpp:198
Definition: strngs.h:45
const CCUtil * getCCUtil() const
Definition: dict.h:91
char * user_words_suffix
Definition: dict.h:559
bool load_system_dawg
Definition: dict.h:564
char * user_patterns_file
Definition: dict.h:561
char * user_patterns_suffix
Definition: dict.h:563
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:45
bool load_punc_dawg
Definition: dict.h:568
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int dawg_debug_level
Definition: dict.h:605
STRING language_data_path_prefix
Definition: ccutil.h:67

◆ LoadLSTM()

void tesseract::Dict::LoadLSTM ( const STRING lang,
TessdataManager data_file 
)

Definition at line 307 of file dict.cpp.

307  {
308  // Load dawgs_.
309  if (load_punc_dawg) {
310  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
311  dawg_debug_level, data_file);
312  if (punc_dawg_) dawgs_ += punc_dawg_;
313  }
314  if (load_system_dawg) {
315  Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
316  lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
317  if (system_dawg) dawgs_ += system_dawg;
318  }
319  if (load_number_dawg) {
320  Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
321  lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
322  if (number_dawg) dawgs_ += number_dawg;
323  }
324 }
bool load_number_dawg
Definition: dict.h:569
bool load_system_dawg
Definition: dict.h:564
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:45
bool load_punc_dawg
Definition: dict.h:568
int dawg_debug_level
Definition: dict.h:605

◆ ngram_probability_in_context()

double tesseract::Dict::ngram_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)

◆ NoDangerousAmbig()

bool tesseract::Dict::NoDangerousAmbig ( WERD_CHOICE BestChoice,
DANGERR fixpt,
bool  fix_replaceable,
MATRIX ratings 
)

Definition at line 151 of file stopper.cpp.

154  {
155  if (stopper_debug_level > 2) {
156  tprintf("\nRunning NoDangerousAmbig() for %s\n",
157  best_choice->debug_string().string());
158  }
159 
160  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
161  // for each unichar id in BestChoice.
162  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
163  int i;
164  bool ambigs_found = false;
165  // For each position in best_choice:
166  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
167  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
168  // -- look for ambiguities corresponding to wrong_ngram in the list while
169  // adding the following unichar_ids from best_choice to wrong_ngram
170  //
171  // Repeat the above procedure twice: first time look through
172  // ambigs to be replaced and replace all the ambiguities found;
173  // second time look through dangerous ambiguities and construct
174  // ambig_blob_choices with a fake blob choice for each ambiguity
175  // and pass them to dawg_permute_and_select() to search for
176  // ambiguous words in the dictionaries.
177  //
178  // Note that during the execution of the for loop (on the first pass)
179  // if replacements are made the length of best_choice might change.
180  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
181  bool replace = (fix_replaceable && pass == 0);
182  const UnicharAmbigsVector &table = replace ?
184  if (!replace) {
185  // Initialize ambig_blob_choices with lists containing a single
186  // unichar id for the corresponding position in best_choice.
187  // best_choice consisting from only the original letters will
188  // have a rating of 0.0.
189  for (i = 0; i < best_choice->length(); ++i) {
190  BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
191  BLOB_CHOICE_IT lst_it(lst);
192  // TODO(rays/antonova) Put real xheights and y shifts here.
193  lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
194  0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
195  ambig_blob_choices.push_back(lst);
196  }
197  }
198  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
199  int wrong_ngram_index;
200  int next_index;
201  int blob_index = 0;
202  for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
203  ++i) {
204  UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
205  if (stopper_debug_level > 2) {
206  tprintf("Looking for %s ngrams starting with %s:\n",
207  replace ? "replaceable" : "ambiguous",
208  getUnicharset().debug_str(curr_unichar_id).string());
209  }
210  int num_wrong_blobs = best_choice->state(i);
211  wrong_ngram_index = 0;
212  wrong_ngram[wrong_ngram_index] = curr_unichar_id;
213  if (curr_unichar_id == INVALID_UNICHAR_ID ||
214  curr_unichar_id >= table.size() ||
215  table[curr_unichar_id] == NULL) {
216  continue; // there is no ambig spec for this unichar id
217  }
218  AmbigSpec_IT spec_it(table[curr_unichar_id]);
219  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
220  const AmbigSpec *ambig_spec = spec_it.data();
221  wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
222  int compare = UnicharIdArrayUtils::compare(wrong_ngram,
223  ambig_spec->wrong_ngram);
224  if (stopper_debug_level > 2) {
225  tprintf("candidate ngram: ");
227  tprintf("current ngram from spec: ");
228  UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
229  tprintf("comparison result: %d\n", compare);
230  }
231  if (compare == 0) {
232  // Record the place where we found an ambiguity.
233  if (fixpt != NULL) {
234  UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
235  fixpt->push_back(DANGERR_INFO(
236  blob_index, blob_index + num_wrong_blobs, replace,
237  getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
238  leftmost_id));
239  if (stopper_debug_level > 1) {
240  tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
241  blob_index + num_wrong_blobs, false,
242  getUnicharset().get_isngram(
243  ambig_spec->correct_ngram_id),
244  getUnicharset().id_to_unichar(leftmost_id));
245  }
246  }
247 
248  if (replace) {
249  if (stopper_debug_level > 2) {
250  tprintf("replace ambiguity with %s : ",
251  getUnicharset().id_to_unichar(
252  ambig_spec->correct_ngram_id));
254  ambig_spec->correct_fragments, getUnicharset());
255  }
256  ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
257  ambig_spec->correct_ngram_id,
258  best_choice, ratings);
259  } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
260  // We found dang ambig - update ambig_blob_choices.
261  if (stopper_debug_level > 2) {
262  tprintf("found ambiguity: ");
264  ambig_spec->correct_fragments, getUnicharset());
265  }
266  ambigs_found = true;
267  for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
268  ++tmp_index) {
269  // Add a blob choice for the corresponding fragment of the
270  // ambiguity. These fake blob choices are initialized with
271  // negative ratings (which are not possible for real blob
272  // choices), so that dawg_permute_and_select() considers any
273  // word not consisting of only the original letters a better
274  // choice and stops searching for alternatives once such a
275  // choice is found.
276  BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
277  bc_it.add_to_end(new BLOB_CHOICE(
278  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
279  -1, 0, 1, 0, BCC_AMBIG));
280  }
281  }
282  spec_it.forward();
283  } else if (compare == -1) {
284  if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
285  ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
286  // Add the next unichar id to wrong_ngram and keep looking for
287  // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
288  wrong_ngram[++wrong_ngram_index] =
289  best_choice->unichar_id(next_index);
290  num_wrong_blobs += best_choice->state(next_index);
291  } else {
292  break; // no more matching ambigs in this AMBIG_SPEC_LIST
293  }
294  } else {
295  spec_it.forward();
296  }
297  } // end searching AmbigSpec_LIST
298  } // end searching best_choice
299  } // end searching replace and dangerous ambigs
300 
301  // If any ambiguities were found permute the constructed ambig_blob_choices
302  // to see if an alternative dictionary word can be found.
303  if (ambigs_found) {
304  if (stopper_debug_level > 2) {
305  tprintf("\nResulting ambig_blob_choices:\n");
306  for (i = 0; i < ambig_blob_choices.length(); ++i) {
307  print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
308  tprintf("\n");
309  }
310  }
311  WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
312  ambigs_found = (alt_word->rating() < 0.0);
313  if (ambigs_found) {
314  if (stopper_debug_level >= 1) {
315  tprintf ("Stopper: Possible ambiguous word = %s\n",
316  alt_word->debug_string().string());
317  }
318  if (fixpt != NULL) {
319  // Note: Currently character choices combined from fragments can only
320  // be generated by NoDangerousAmbig(). This code should be updated if
321  // the capability to produce classifications combined from character
322  // fragments is added to other functions.
323  int orig_i = 0;
324  for (i = 0; i < alt_word->length(); ++i) {
325  const UNICHARSET &uchset = getUnicharset();
326  bool replacement_is_ngram =
327  uchset.get_isngram(alt_word->unichar_id(i));
328  UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
329  if (replacement_is_ngram) {
330  // we have to extract the leftmost unichar from the ngram.
331  const char *str = uchset.id_to_unichar(leftmost_id);
332  int step = uchset.step(str);
333  if (step) leftmost_id = uchset.unichar_to_id(str, step);
334  }
335  int end_i = orig_i + alt_word->state(i);
336  if (alt_word->state(i) > 1 ||
337  (orig_i + 1 == end_i && replacement_is_ngram)) {
338  // Compute proper blob indices.
339  int blob_start = 0;
340  for (int j = 0; j < orig_i; ++j)
341  blob_start += best_choice->state(j);
342  int blob_end = blob_start;
343  for (int j = orig_i; j < end_i; ++j)
344  blob_end += best_choice->state(j);
345  fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
346  replacement_is_ngram, leftmost_id));
347  if (stopper_debug_level > 1) {
348  tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
349  true, replacement_is_ngram,
350  uchset.id_to_unichar(leftmost_id));
351  }
352  }
353  orig_i += alt_word->state(i);
354  }
355  }
356  }
357  delete alt_word;
358  }
359  if (output_ambig_words_file_ != NULL) {
360  fprintf(output_ambig_words_file_, "\n");
361  }
362 
363  ambig_blob_choices.delete_data_pointers();
364  return !ambigs_found;
365 }
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:98
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:174
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:822
float rating() const
Definition: ratngs.h:323
const STRING debug_string() const
Definition: ratngs.h:501
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:103
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:152
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:141
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
int push_back(T object)
int stopper_debug_level
Definition: dict.h:622
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
Definition: ambigs.h:62
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:525
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:377
#define MAX_AMBIG_SIZE
Definition: ambigs.h:30
int length() const
Definition: ratngs.h:299
int step(const char *str) const
Definition: unicharset.cpp:230
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:153
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288
int state(int index) const
Definition: ratngs.h:315
int UNICHAR_ID
Definition: unichar.h:35

◆ NumDawgs()

int tesseract::Dict::NumDawgs ( ) const
inline

Return the number of dawgs in the dawgs_ vector.

Definition at line 412 of file dict.h.

412 { return dawgs_.size(); }
int size() const
Definition: genericvector.h:72

◆ ParamsModelClassify()

float tesseract::Dict::ParamsModelClassify ( const char *  lang,
void *  path 
)

◆ permute_choices()

void tesseract::Dict::permute_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  more_args 
)

permute_choices

Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST with the given char_choice_index in char_choices.

Definition at line 203 of file permdawg.cpp.

213  {
214  if (debug) {
215  tprintf("%s permute_choices: char_choice_index=%d"
216  " limit=%g rating=%g, certainty=%g word=%s\n",
217  debug, char_choice_index, *limit, word->rating(),
218  word->certainty(), word->debug_string().string());
219  }
220  if (char_choice_index < char_choices.length()) {
221  BLOB_CHOICE_IT blob_choice_it;
222  blob_choice_it.set_to_list(char_choices.get(char_choice_index));
223  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
224  blob_choice_it.forward()) {
225  (*attempts_left)--;
226  append_choices(debug, char_choices, *(blob_choice_it.data()),
227  char_choice_index, prev_char_frag_info, word,
228  certainties, limit, best_choice, attempts_left, more_args);
229  if (*attempts_left <= 0) {
230  if (debug) tprintf("permute_choices(): attempts_left is 0\n");
231  break;
232  }
233  }
234  }
235 }
float rating() const
Definition: ratngs.h:323
float certainty() const
Definition: ratngs.h:326
const STRING debug_string() const
Definition: ratngs.h:501
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
int length() const
Definition: genericvector.h:86
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:245
T & get(int index) const

◆ ProbabilityInContext()

double tesseract::Dict::ProbabilityInContext ( const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Calls probability_in_context_ member function.

Definition at line 372 of file dict.h.

375  {
376  return (this->*probability_in_context_)(
377  getCCUtil()->lang.string(),
378  context, context_bytes,
379  character, character_bytes);
380  }
const char * string() const
Definition: strngs.cpp:198
STRING lang
Definition: ccutil.h:66
const CCUtil * getCCUtil() const
Definition: dict.h:91
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:366

◆ ProcessPatternEdges()

void tesseract::Dict::ProcessPatternEdges ( const Dawg dawg,
const DawgPosition info,
UNICHAR_ID  unichar_id,
bool  word_end,
DawgArgs dawg_args,
PermuterType current_permuter 
) const

For each of the character classes of the given unichar_id (and the unichar_id itself), finds the corresponding outgoing node or self-loop in the given dawg and (after checking that it is valid) records it in dawg_args->updated_dawgs. Updates current_permuter if any valid edges were found.

Definition at line 532 of file dict.cpp.

535  {
536  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
537  // Try to find the edge corresponding to the exact unichar_id and to all the
538  // edges corresponding to the character class of unichar_id.
539  GenericVector<UNICHAR_ID> unichar_id_patterns;
540  unichar_id_patterns.push_back(unichar_id);
541  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
542  &unichar_id_patterns);
543  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
544  // On the first iteration check all the outgoing edges.
545  // On the second iteration check all self-loops.
546  for (int k = 0; k < 2; ++k) {
547  EDGE_REF edge = (k == 0)
548  ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
549  : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_patterns[i], word_end);
550  if (edge == NO_EDGE) continue;
551  if (dawg_debug_level >= 3) {
552  tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
553  pos.dawg_index, node, edge);
554  tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
555  }
556  if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
557  if (dawg->end_of_word(edge)) dawg_args->valid_end = true;
558  dawg_args->updated_dawgs->add_unique(
559  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
560  pos.back_to_punc),
561  dawg_debug_level > 0,
562  "Append current dawg to updated active dawgs: ");
563  }
564  }
565 }
#define REFFORMAT
Definition: dawg.h:93
inT64 NODE_REF
Definition: dawg.h:56
int size() const
Definition: genericvector.h:72
inT64 EDGE_REF
Definition: dawg.h:55
#define tprintf(...)
Definition: tprintf.h:31
int push_back(T object)
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:420
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int dawg_debug_level
Definition: dict.h:605

◆ ReplaceAmbig()

void tesseract::Dict::ReplaceAmbig ( int  wrong_ngram_begin_index,
int  wrong_ngram_size,
UNICHAR_ID  correct_ngram_id,
WERD_CHOICE werd_choice,
MATRIX ratings 
)

Definition at line 377 of file stopper.cpp.

379  {
380  int num_blobs_to_replace = 0;
381  int begin_blob_index = 0;
382  int i;
383  // Rating and certainty for the new BLOB_CHOICE are derived from the
384  // replaced choices.
385  float new_rating = 0.0f;
386  float new_certainty = 0.0f;
387  BLOB_CHOICE* old_choice = NULL;
388  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
389  if (i >= wrong_ngram_begin_index) {
390  int num_blobs = werd_choice->state(i);
391  int col = begin_blob_index + num_blobs_to_replace;
392  int row = col + num_blobs - 1;
393  BLOB_CHOICE_LIST* choices = ratings->get(col, row);
394  ASSERT_HOST(choices != NULL);
395  old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
396  ASSERT_HOST(old_choice != NULL);
397  new_rating += old_choice->rating();
398  new_certainty += old_choice->certainty();
399  num_blobs_to_replace += num_blobs;
400  } else {
401  begin_blob_index += werd_choice->state(i);
402  }
403  }
404  new_certainty /= wrong_ngram_size;
405  // If there is no entry in the ratings matrix, add it.
406  MATRIX_COORD coord(begin_blob_index,
407  begin_blob_index + num_blobs_to_replace - 1);
408  if (!coord.Valid(*ratings)) {
409  ratings->IncreaseBandSize(coord.row - coord.col + 1);
410  }
411  if (ratings->get(coord.col, coord.row) == NULL)
412  ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
413  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
414  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
415  if (choice != NULL) {
416  // Already there. Upgrade if new rating better.
417  if (new_rating < choice->rating())
418  choice->set_rating(new_rating);
419  if (new_certainty < choice->certainty())
420  choice->set_certainty(new_certainty);
421  // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
422  } else {
423  // Need a new choice with the correct_ngram_id.
424  choice = new BLOB_CHOICE(*old_choice);
425  choice->set_unichar_id(correct_ngram_id);
426  choice->set_rating(new_rating);
427  choice->set_certainty(new_certainty);
428  choice->set_classifier(BCC_AMBIG);
429  choice->set_matrix_cell(coord.col, coord.row);
430  BLOB_CHOICE_IT it (new_choices);
431  it.add_to_end(choice);
432  }
433  // Remove current unichar from werd_choice. On the last iteration
434  // set the correct replacement unichar instead of removing a unichar.
435  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
436  ++replaced_count) {
437  if (replaced_count + 1 == wrong_ngram_size) {
438  werd_choice->set_blob_choice(wrong_ngram_begin_index,
439  num_blobs_to_replace, choice);
440  } else {
441  werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
442  }
443  }
444  if (stopper_debug_level >= 1) {
445  werd_choice->print("ReplaceAmbig() ");
446  tprintf("Modified blob_choices: ");
447  print_ratings_list("\n", new_choices, getUnicharset());
448  }
449 }
void set_classifier(BlobChoiceClassifier classifier)
Definition: ratngs.h:166
void remove_unichar_id(int index)
Definition: ratngs.h:480
void put(ICOORD pos, const T &thing)
Definition: matrix.h:219
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:822
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:161
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
void print() const
Definition: ratngs.h:576
#define tprintf(...)
Definition: tprintf.h:31
T get(ICOORD pos) const
Definition: matrix.h:227
int stopper_debug_level
Definition: dict.h:622
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:144
float rating() const
Definition: ratngs.h:79
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:293
void set_certainty(float newrat)
Definition: ratngs.h:150
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
float certainty() const
Definition: ratngs.h:82
void set_matrix_cell(int col, int row)
Definition: ratngs.h:156
void set_rating(float newrat)
Definition: ratngs.h:147
int state(int index) const
Definition: ratngs.h:315

◆ reset_hyphen_vars()

void tesseract::Dict::reset_hyphen_vars ( bool  last_word_on_line)

Unless the previous word was the last one on the line and the current one is not (thus it is the first one on the line), erases hyphen_word_ and clears hyphen_active_dawgs_. In every case, updates last_word_on_line_ to the given value.

Definition at line 32 of file hyphen.cpp.

32  {
33  if (!(last_word_on_line_ == true && last_word_on_line == false)) {
34  if (hyphen_word_ != NULL) {
35  delete hyphen_word_;
36  hyphen_word_ = NULL;
37  hyphen_active_dawgs_.clear();
38  }
39  }
40  if (hyphen_debug_level) {
41  tprintf("reset_hyphen_vars: last_word_on_line %d -> %d\n",
42  last_word_on_line_, last_word_on_line);
43  }
44  last_word_on_line_ = last_word_on_line;
45 }
int hyphen_debug_level
Definition: dict.h:606
#define tprintf(...)
Definition: tprintf.h:31

◆ ResetDocumentDictionary()

void tesseract::Dict::ResetDocumentDictionary ( )
inline

Definition at line 310 of file dict.h.

310  {
311  if (pending_words_ != NULL)
312  pending_words_->clear();
313  if (document_words_ != NULL)
314  document_words_->clear();
315  }
void clear()
Definition: trie.cpp:65

◆ set_hyphen_word()

void tesseract::Dict::set_hyphen_word ( const WERD_CHOICE word,
const DawgPositionVector active_dawgs 
)

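If the given word has a lower rating than the current hyphen_word_ (or no hyphen word has been recorded yet), updates hyphen_word_ to the given word with its trailing hyphen removed, and copies the given DawgPositionVector into hyphen_active_dawgs_.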

Definition at line 49 of file hyphen.cpp.

50  {
51  if (hyphen_word_ == NULL) {
52  hyphen_word_ = new WERD_CHOICE(word.unicharset());
53  hyphen_word_->make_bad();
54  }
55  if (hyphen_word_->rating() > word.rating()) {
56  *hyphen_word_ = word;
57  // Remove the last unichar id as it is a hyphen, and remove
58  // any unichar_string/lengths that are present.
59  hyphen_word_->remove_last_unichar_id();
60  hyphen_active_dawgs_ = active_dawgs;
61  }
62  if (hyphen_debug_level) {
63  hyphen_word_->print("set_hyphen_word: ");
64  }
65 }
const UNICHARSET * unicharset() const
Definition: ratngs.h:296
float rating() const
Definition: ratngs.h:323
int hyphen_debug_level
Definition: dict.h:606
void print() const
Definition: ratngs.h:576
void remove_last_unichar_id()
Definition: ratngs.h:479
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:439

◆ SettupStopperPass1()

void tesseract::Dict::SettupStopperPass1 ( )

Sets up stopper variables in preparation for the first pass.

Definition at line 369 of file stopper.cpp.

369  {
370  reject_offset_ = 0.0;
371 }

◆ SettupStopperPass2()

void tesseract::Dict::SettupStopperPass2 ( )

Sets up stopper variables in preparation for the second pass.

Definition at line 373 of file stopper.cpp.

373  {
375 }
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:615

◆ SetupForLoad()

void tesseract::Dict::SetupForLoad ( DawgCache dawg_cache)

Definition at line 206 of file dict.cpp.

206  {
207  if (dawgs_.length() != 0) this->End();
208 
209  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
210  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
211  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
212  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
213 
214  if (dawg_cache != NULL) {
215  dawg_cache_ = dawg_cache;
216  dawg_cache_is_ours_ = false;
217  } else {
218  dawg_cache_ = new DawgCache();
219  dawg_cache_is_ours_ = true;
220  }
221 }
void End()
Definition: dict.cpp:348
int length() const
Definition: genericvector.h:86
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
const UNICHARSET & getUnicharset() const
Definition: dict.h:97

◆ SetWildcardID()

void tesseract::Dict::SetWildcardID ( UNICHAR_ID  id)
inline

Definition at line 409 of file dict.h.

409 { wildcard_unichar_id_ = id; }

◆ SetWordsegRatingAdjustFactor()

void tesseract::Dict::SetWordsegRatingAdjustFactor ( float  f)
inline

Set wordseg_rating_adjust_factor_ to the given value.

Definition at line 491 of file dict.h.

491  {
492  wordseg_rating_adjust_factor_ = f;
493  }

◆ UniformCertainties()

int tesseract::Dict::UniformCertainties ( const WERD_CHOICE word)

Returns true if the certainty of the BestChoice word is within a reasonable range of the average certainties for the best choices for each character in the segmentation. This test is used to catch words in which one character is much worse than the other characters in the word (i.e. false will be returned in that case). The algorithm computes the mean and std deviation of the certainties in the word with the worst certainty thrown out.

Definition at line 470 of file stopper.cpp.

470  {
471  float Certainty;
472  float WorstCertainty = MAX_FLOAT32;
473  float CertaintyThreshold;
474  FLOAT64 TotalCertainty;
475  FLOAT64 TotalCertaintySquared;
476  FLOAT64 Variance;
477  FLOAT32 Mean, StdDev;
478  int word_length = word.length();
479 
480  if (word_length < 3)
481  return true;
482 
483  TotalCertainty = TotalCertaintySquared = 0.0;
484  for (int i = 0; i < word_length; ++i) {
485  Certainty = word.certainty(i);
486  TotalCertainty += Certainty;
487  TotalCertaintySquared += Certainty * Certainty;
488  if (Certainty < WorstCertainty)
489  WorstCertainty = Certainty;
490  }
491 
492  // Subtract off worst certainty from statistics.
493  word_length--;
494  TotalCertainty -= WorstCertainty;
495  TotalCertaintySquared -= WorstCertainty * WorstCertainty;
496 
497  Mean = TotalCertainty / word_length;
498  Variance = ((word_length * TotalCertaintySquared -
499  TotalCertainty * TotalCertainty) /
500  (word_length * (word_length - 1)));
501  if (Variance < 0.0)
502  Variance = 0.0;
503  StdDev = sqrt(Variance);
504 
505  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
506  if (CertaintyThreshold > stopper_nondict_certainty_base)
507  CertaintyThreshold = stopper_nondict_certainty_base;
508 
509  if (word.certainty() < CertaintyThreshold) {
510  if (stopper_debug_level >= 1)
511  tprintf("Stopper: Non-uniform certainty = %4.1f"
512  " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
513  word.certainty(), Mean, StdDev, CertaintyThreshold);
514  return false;
515  } else {
516  return true;
517  }
518 }
double stopper_allowable_character_badness
Definition: dict.h:621
float certainty() const
Definition: ratngs.h:326
#define tprintf(...)
Definition: tprintf.h:31
int stopper_debug_level
Definition: dict.h:622
double stopper_nondict_certainty_base
Definition: dict.h:613
float FLOAT32
Definition: host.h:42
int length() const
Definition: ratngs.h:299
#define MAX_FLOAT32
Definition: host.h:66
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:644
double FLOAT64
Definition: host.h:43

◆ update_best_choice()

void tesseract::Dict::update_best_choice ( const WERD_CHOICE word,
WERD_CHOICE best_choice 
)
inline

Copies word into best_choice if its rating is smaller than that of best_choice.

Definition at line 170 of file dict.h.

171  {
172  if (word.rating() < best_choice->rating()) {
173  *best_choice = word;
174  }
175  }
float rating() const
Definition: ratngs.h:323

◆ valid_bigram()

bool tesseract::Dict::valid_bigram ( const WERD_CHOICE word1,
const WERD_CHOICE word2 
) const

Definition at line 785 of file dict.cpp.

786  {
787  if (bigram_dawg_ == NULL) return false;
788 
789  // Extract the core word from the middle of each word with any digits
790  // replaced with question marks.
791  int w1start, w1end, w2start, w2end;
792  word1.punct_stripped(&w1start, &w1end);
793  word2.punct_stripped(&w2start, &w2end);
794 
795  // We don't want to penalize a single guillemet, hyphen, etc.
796  // But our bigram list doesn't have any information about punctuation.
797  if (w1start >= w1end) return word1.length() < 3;
798  if (w2start >= w2end) return word2.length() < 3;
799 
800  const UNICHARSET& uchset = getUnicharset();
801  GenericVector<UNICHAR_ID> bigram_string;
802  bigram_string.reserve(w1end + w2end + 1);
803  for (int i = w1start; i < w1end; i++) {
804  const GenericVector<UNICHAR_ID>& normed_ids =
805  getUnicharset().normed_ids(word1.unichar_id(i));
806  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
807  bigram_string.push_back(question_unichar_id_);
808  else
809  bigram_string += normed_ids;
810  }
811  bigram_string.push_back(UNICHAR_SPACE);
812  for (int i = w2start; i < w2end; i++) {
813  const GenericVector<UNICHAR_ID>& normed_ids =
814  getUnicharset().normed_ids(word2.unichar_id(i));
815  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
816  bigram_string.push_back(question_unichar_id_);
817  else
818  bigram_string += normed_ids;
819  }
820  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
821  for (int i = 0; i < bigram_string.size(); ++i) {
822  normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1,
823  0.0f, 0.0f);
824  }
825  return bigram_dawg_->word_in_dawg(normalized_word);
826 }
void reserve(int size)
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
int size() const
Definition: genericvector.h:72
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
int push_back(T object)
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:448
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:69
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:834
int length() const
Definition: ratngs.h:299
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:364
const UNICHARSET & getUnicharset() const
Definition: dict.h:97

◆ valid_punctuation()

bool tesseract::Dict::valid_punctuation ( const WERD_CHOICE word)

Returns true if the word contains a valid punctuation pattern. Note: Since the domains of punctuation symbols and symbols used in numbers are not disjoint, a valid number might contain an invalid punctuation pattern (e.g. .99).

Definition at line 828 of file dict.cpp.

828  {
829  if (word.length() == 0) return NO_PERM;
830  int i;
831  WERD_CHOICE new_word(word.unicharset());
832  int last_index = word.length() - 1;
833  int new_len = 0;
834  for (i = 0; i <= last_index; ++i) {
835  UNICHAR_ID unichar_id = (word.unichar_id(i));
836  if (getUnicharset().get_ispunctuation(unichar_id)) {
837  new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
838  } else if (!getUnicharset().get_isalpha(unichar_id) &&
839  !getUnicharset().get_isdigit(unichar_id)) {
840  return false; // neither punc, nor alpha, nor digit
841  } else if ((new_len = new_word.length()) == 0 ||
842  new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
843  new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
844  }
845  }
846  for (i = 0; i < dawgs_.size(); ++i) {
847  if (dawgs_[i] != NULL &&
848  dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
849  dawgs_[i]->word_in_dawg(new_word)) return true;
850  }
851  return false;
852 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
const UNICHARSET * unicharset() const
Definition: ratngs.h:296
int size() const
Definition: genericvector.h:72
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126
int length() const
Definition: ratngs.h:299
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int UNICHAR_ID
Definition: unichar.h:35

◆ valid_word() [1/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE word,
bool  numbers_ok 
) const

Definition at line 752 of file dict.cpp.

752  {
753  const WERD_CHOICE *word_ptr = &word;
754  WERD_CHOICE temp_word(word.unicharset());
755  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
756  copy_hyphen_info(&temp_word);
757  temp_word += word;
758  word_ptr = &temp_word;
759  }
760  if (word_ptr->length() == 0) return NO_PERM;
761  // Allocate vectors for holding current and updated
762  // active_dawgs and initialize them.
763  DawgPositionVector *active_dawgs = new DawgPositionVector[2];
764  init_active_dawgs(&(active_dawgs[0]), false);
765  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
766  int last_index = word_ptr->length() - 1;
767  // Call letter_is_okay for each letter in the word.
768  for (int i = hyphen_base_size(); i <= last_index; ++i) {
769  if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i),
770  i == last_index))) break;
771  // Swap active_dawgs, constraints with the corresponding updated vector.
772  if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
773  dawg_args.updated_dawgs = &(active_dawgs[0]);
774  ++(dawg_args.active_dawgs);
775  } else {
776  ++(dawg_args.updated_dawgs);
777  dawg_args.active_dawgs = &(active_dawgs[0]);
778  }
779  }
780  delete[] active_dawgs;
781  return valid_word_permuter(dawg_args.permuter, numbers_ok) ?
782  dawg_args.permuter : NO_PERM;
783 }
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:570
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:130
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:455
const UNICHARSET * unicharset() const
Definition: ratngs.h:296
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:356
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:126
int length() const
Definition: ratngs.h:299
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:136

◆ valid_word() [2/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE word) const
inline

Definition at line 462 of file dict.h.

462  {
463  return valid_word(word, false); // return NO_PERM for words with digits
464  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:752

◆ valid_word() [3/3]

int tesseract::Dict::valid_word ( const char *  string) const
inline

This function is used by api/tesseract_cube_combiner.cpp.

Definition at line 469 of file dict.h.

469  {
470  WERD_CHOICE word(string, getUnicharset());
471  return valid_word(word);
472  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:752
const UNICHARSET & getUnicharset() const
Definition: dict.h:97

◆ valid_word_or_number()

int tesseract::Dict::valid_word_or_number ( const WERD_CHOICE word) const
inline

Definition at line 465 of file dict.h.

465  {
466  return valid_word(word, true); // return NUMBER_PERM for valid numbers
467  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:752

◆ valid_word_permuter()

static bool tesseract::Dict::valid_word_permuter ( uinT8  perm,
bool  numbers_ok 
)
inlinestatic

Check all the DAWGs to see if this word is in any of them.

Read/Write/Access special purpose dawgs which contain words only of a certain length (used for phrase search for non-space-delimited languages).

Definition at line 455 of file dict.h.

455  {
456  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
457  perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
458  perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
459  (numbers_ok && perm == NUMBER_PERM));
460  }

◆ WildcardID()

UNICHAR_ID tesseract::Dict::WildcardID ( ) const
inline

Definition at line 410 of file dict.h.

410 { return wildcard_unichar_id_; }

Member Data Documentation

◆ certainty_scale

double tesseract::Dict::certainty_scale = 20.0

"Certainty scaling factor"

Definition at line 611 of file dict.h.

◆ dawg_debug_level

int tesseract::Dict::dawg_debug_level = 0

"Set to 1 for general debug info" ", to 2 for more details, to 3 to see all the debug messages"

Definition at line 605 of file dict.h.

◆ doc_dict_certainty_threshold

double tesseract::Dict::doc_dict_certainty_threshold = -2.25

"Worst certainty" " for words that can be inserted into the document dictionary"

Definition at line 642 of file dict.h.

◆ doc_dict_pending_threshold

double tesseract::Dict::doc_dict_pending_threshold = 0.0

"Worst certainty for using pending dictionary"

Definition at line 640 of file dict.h.

◆ fragments_debug

int tesseract::Dict::fragments_debug = 0

"Debug character fragments"

Definition at line 633 of file dict.h.

◆ go_deeper_fxn_

void(Dict::* tesseract::Dict::go_deeper_fxn_) (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)

Pointer to go_deeper function.

Definition at line 204 of file dict.h.

◆ hyphen_debug_level

int tesseract::Dict::hyphen_debug_level = 0

"Debug level for hyphenated words."

Definition at line 606 of file dict.h.

◆ letter_is_okay_

int(Dict::* tesseract::Dict::letter_is_okay_) (void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const

Definition at line 356 of file dict.h.

◆ load_bigram_dawg

bool tesseract::Dict::load_bigram_dawg = true

"Load dawg with special word bigrams."

Definition at line 571 of file dict.h.

◆ load_freq_dawg

bool tesseract::Dict::load_freq_dawg = true

"Load frequent word dawg."

Definition at line 565 of file dict.h.

◆ load_number_dawg

bool tesseract::Dict::load_number_dawg = true

"Load dawg with number patterns."

Definition at line 569 of file dict.h.

◆ load_punc_dawg

bool tesseract::Dict::load_punc_dawg = true

"Load dawg with punctuation patterns."

Definition at line 568 of file dict.h.

◆ load_system_dawg

bool tesseract::Dict::load_system_dawg = true

"Load system word dawg."

Definition at line 564 of file dict.h.

◆ load_unambig_dawg

bool tesseract::Dict::load_unambig_dawg = true

"Load unambiguous word dawg."

Definition at line 566 of file dict.h.

◆ max_permuter_attempts

int tesseract::Dict::max_permuter_attempts = 10000

"Maximum number of different" " character choices to consider during permutation." " This limit is especially useful when user patterns" " are specified, since overly generic patterns can result in" " dawg search exploring an overly large number of options."

Definition at line 647 of file dict.h.

◆ max_viterbi_list_size

int tesseract::Dict::max_viterbi_list_size = 10

"Maximum size of viterbi list."

Definition at line 607 of file dict.h.

◆ output_ambig_words_file

char* tesseract::Dict::output_ambig_words_file = ""

"Output file for ambiguities found in the dictionary"

Definition at line 603 of file dict.h.

◆ params_model_classify_

float(Dict::* tesseract::Dict::params_model_classify_) (const char *lang, void *path)

Definition at line 400 of file dict.h.

◆ probability_in_context_

double(Dict::* tesseract::Dict::probability_in_context_) (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Probability in context function used by the ngram permuter.

Definition at line 366 of file dict.h.

◆ save_doc_words

bool tesseract::Dict::save_doc_words = 0

"Save Document Words"

Definition at line 638 of file dict.h.

◆ save_raw_choices

bool tesseract::Dict::save_raw_choices = false

"Deprecated- backward compatibility only"

Definition at line 627 of file dict.h.

◆ segment_nonalphabetic_script

bool tesseract::Dict::segment_nonalphabetic_script = false

"Don't use any alphabetic-specific tricks." "Set to true in the traineddata config file for" " scripts that are cursive or inherently fixed-pitch"

Definition at line 637 of file dict.h.

◆ segment_penalty_dict_case_bad

double tesseract::Dict::segment_penalty_dict_case_bad = 1.3125

"Default score multiplier for word matches, which may have " "case issues (lower is better)."

Definition at line 588 of file dict.h.

◆ segment_penalty_dict_case_ok

double tesseract::Dict::segment_penalty_dict_case_ok = 1.1

"Score multiplier for word matches that have good case " "(lower is better)."

Definition at line 584 of file dict.h.

◆ segment_penalty_dict_frequent_word

double tesseract::Dict::segment_penalty_dict_frequent_word = 1.0

"Score multiplier for word matches which have good case and" "are frequent in the given language (lower is better)."

Definition at line 580 of file dict.h.

◆ segment_penalty_dict_nonword

double tesseract::Dict::segment_penalty_dict_nonword = 1.25

"Score multiplier for glyph fragment segmentations which " "do not match a dictionary word (lower is better)."

Definition at line 596 of file dict.h.

◆ segment_penalty_garbage

double tesseract::Dict::segment_penalty_garbage = 1.50

"Score multiplier for poorly cased strings that are not in" " the dictionary and generally look like garbage (lower is" " better)."

Definition at line 601 of file dict.h.

◆ segment_penalty_ngram_best_choice

double tesseract::Dict::segment_penalty_ngram_best_choice = 1.24

"Multiplier for the best choice from the ngram model."

Definition at line 592 of file dict.h.

◆ stopper_allowable_character_badness

double tesseract::Dict::stopper_allowable_character_badness = 3.0

"Max certainty variation allowed in a word (in sigma)"

Definition at line 621 of file dict.h.

◆ stopper_certainty_per_char

double tesseract::Dict::stopper_certainty_per_char = -0.50

"Certainty to add for each dict char above small word size."

Definition at line 619 of file dict.h.

◆ stopper_debug_level

int tesseract::Dict::stopper_debug_level = 0

"Stopper debug level"

Definition at line 622 of file dict.h.

◆ stopper_no_acceptable_choices

bool tesseract::Dict::stopper_no_acceptable_choices = false

"Make AcceptableChoice() always return false. Useful" " when there is a need to explore all segmentations"

Definition at line 625 of file dict.h.

◆ stopper_nondict_certainty_base

double tesseract::Dict::stopper_nondict_certainty_base = -2.50

"Certainty threshold for non-dict words"

Definition at line 613 of file dict.h.

◆ stopper_phase2_certainty_rejection_offset

double tesseract::Dict::stopper_phase2_certainty_rejection_offset = 1.0

"Reject certainty offset"

Definition at line 615 of file dict.h.

◆ stopper_smallword_size

int tesseract::Dict::stopper_smallword_size = 2

"Size of dict word to be treated as non-dict word"

Definition at line 617 of file dict.h.

◆ tessedit_truncate_wordchoice_log

int tesseract::Dict::tessedit_truncate_wordchoice_log = 10

"Max words to keep in list"

Definition at line 628 of file dict.h.

◆ use_only_first_uft8_step

bool tesseract::Dict::use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string" " when computing log probabilities."

Definition at line 610 of file dict.h.

◆ user_patterns_file

char* tesseract::Dict::user_patterns_file = ""

"A filename of user-provided patterns."

Definition at line 561 of file dict.h.

◆ user_patterns_suffix

char* tesseract::Dict::user_patterns_suffix = ""

"A suffix of user-provided patterns located in tessdata."

Definition at line 563 of file dict.h.

◆ user_words_file

char* tesseract::Dict::user_words_file = ""

Variable members. These have to be declared and initialized after image_ptr_, which contains the pointer to the params vector - the member of its base CCUtil class. "A filename of user-provided words."

Definition at line 557 of file dict.h.

◆ user_words_suffix

char* tesseract::Dict::user_words_suffix = ""

"A suffix of user-provided words located in tessdata."

Definition at line 559 of file dict.h.

◆ word_to_debug

char* tesseract::Dict::word_to_debug = ""

"Word for which stopper debug information" " should be printed to stdout"

Definition at line 630 of file dict.h.

◆ word_to_debug_lengths

char* tesseract::Dict::word_to_debug_lengths = ""

"Lengths of unichars in word_to_debug"

Definition at line 632 of file dict.h.

◆ xheight_penalty_inconsistent

double tesseract::Dict::xheight_penalty_inconsistent = 0.25

"Score penalty (0.1 = 10%) added if an xheight is " "inconsistent."

Definition at line 577 of file dict.h.

◆ xheight_penalty_subscripts

double tesseract::Dict::xheight_penalty_subscripts = 0.125

"Score penalty (0.1 = 10%) added if there are subscripts " "or superscripts in a word, but it is otherwise OK."

Definition at line 574 of file dict.h.


The documentation for this class was generated from the following files: