tesseract v5.3.3.20231005
tesseract::Dict Class Reference

#include <dict.h>

Public Member Functions

 Dict (CCUtil *image_ptr)
 
 ~Dict ()
 
const CCUtil * getCCUtil () const
 
CCUtil * getCCUtil ()
 
const UNICHARSET & getUnicharset () const
 
UNICHARSET & getUnicharset ()
 
const UnicharAmbigs & getUnicharAmbigs () const
 
bool compound_marker (UNICHAR_ID unichar_id)
 
bool is_apostrophe (UNICHAR_ID unichar_id)
 
bool hyphenated () const
 Returns true if we've recorded the beginning of a hyphenated word. More...
 
int hyphen_base_size () const
 Size of the base word (the part on the line before) of a hyphenated word. More...
 
void copy_hyphen_info (WERD_CHOICE *word) const
 
bool has_hyphen_end (const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
 Check whether the word has a hyphen at the end. More...
 
bool has_hyphen_end (const WERD_CHOICE &word) const
 Same as above, but check the unichar at the end of the word. More...
 
void reset_hyphen_vars (bool last_word_on_line)
 
void set_hyphen_word (const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
 
void update_best_choice (const WERD_CHOICE &word, WERD_CHOICE *best_choice)
 
void init_active_dawgs (DawgPositionVector *active_dawgs, bool ambigs_mode) const
 
void default_dawgs (DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
 
bool NoDangerousAmbig (WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
 
void ReplaceAmbig (int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
 
int LengthOfShortestAlphaRun (const WERD_CHOICE &WordChoice) const
 Returns the length of the shortest alpha run in WordChoice. More...
 
int UniformCertainties (const WERD_CHOICE &word)
 
bool AcceptableChoice (const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
 Returns true if the given best_choice is good enough to stop. More...
 
bool AcceptableResult (WERD_RES *word) const
 
void EndDangerousAmbigs ()
 
void DebugWordChoices ()
 Prints the current choices for this word to stdout. More...
 
void SettupStopperPass1 ()
 Sets up stopper variables in preparation for the first pass. More...
 
void SettupStopperPass2 ()
 Sets up stopper variables in preparation for the second pass. More...
 
int case_ok (const WERD_CHOICE &word) const
 Check a string to see if it matches a set of lexical rules. More...
 
bool absolute_garbage (const WERD_CHOICE &word, const UNICHARSET &unicharset)
 
void SetupForLoad (DawgCache *dawg_cache)
 
void Load (const std::string &lang, TessdataManager *data_file)
 
void LoadLSTM (const std::string &lang, TessdataManager *data_file)
 
bool FinishLoad ()
 
void End ()
 
void ResetDocumentDictionary ()
 
int def_letter_is_okay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 
int LetterIsOkay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 Calls letter_is_okay_ member function. More...
 
double ProbabilityInContext (const char *context, int context_bytes, const char *character, int character_bytes)
 Calls probability_in_context_ member function. More...
 
double def_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Default (no-op) implementation of probability in context function. More...
 
void SetWildcardID (UNICHAR_ID id)
 
UNICHAR_ID WildcardID () const
 
int NumDawgs () const
 Return the number of dawgs in the dawgs_ vector. More...
 
const Dawg * GetDawg (int index) const
 Return i-th dawg pointer recorded in the dawgs_ vector. More...
 
const Dawg * GetPuncDawg () const
 Return the pointer to the punctuation dawg. More...
 
const Dawg * GetUnambigDawg () const
 Return the pointer to the unambiguous words dawg. More...
 
UNICHAR_ID char_for_dawg (const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
 
void ProcessPatternEdges (const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
 
int valid_word (const WERD_CHOICE &word, bool numbers_ok) const
 
int valid_word (const WERD_CHOICE &word) const
 
int valid_word_or_number (const WERD_CHOICE &word) const
 
int valid_word (const char *string) const
 This function is used by api/tesseract_cube_combiner.cpp. More...
 
bool valid_bigram (const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
 
bool valid_punctuation (const WERD_CHOICE &word)
 
int good_choice (const WERD_CHOICE &choice)
 Returns true if a good answer is found for the unknown blob rating. More...
 
void add_document_word (const WERD_CHOICE &best_choice)
 Adds a word found on this document to the document specific dictionary. More...
 
void adjust_word (WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
 Adjusts the rating of the given word. More...
 
void SetWordsegRatingAdjustFactor (float f)
 Set wordseg_rating_adjust_factor_ to the given value. More...
 
bool IsSpaceDelimitedLang () const
 Returns true if the language is space-delimited (i.e. not Chinese/Japanese (CJ) or Thai (T)). More...
 
 STRING_VAR_H (user_words_file)
 
 STRING_VAR_H (user_words_suffix)
 
 STRING_VAR_H (user_patterns_file)
 
 STRING_VAR_H (user_patterns_suffix)
 
 BOOL_VAR_H (load_system_dawg)
 
 BOOL_VAR_H (load_freq_dawg)
 
 BOOL_VAR_H (load_unambig_dawg)
 
 BOOL_VAR_H (load_punc_dawg)
 
 BOOL_VAR_H (load_number_dawg)
 
 BOOL_VAR_H (load_bigram_dawg)
 
 double_VAR_H (xheight_penalty_subscripts)
 
 double_VAR_H (xheight_penalty_inconsistent)
 
 double_VAR_H (segment_penalty_dict_frequent_word)
 
 double_VAR_H (segment_penalty_dict_case_ok)
 
 double_VAR_H (segment_penalty_dict_case_bad)
 
 double_VAR_H (segment_penalty_dict_nonword)
 
 double_VAR_H (segment_penalty_garbage)
 
 STRING_VAR_H (output_ambig_words_file)
 
 INT_VAR_H (dawg_debug_level)
 
 INT_VAR_H (hyphen_debug_level)
 
 BOOL_VAR_H (use_only_first_uft8_step)
 
 double_VAR_H (certainty_scale)
 
 double_VAR_H (stopper_nondict_certainty_base)
 
 double_VAR_H (stopper_phase2_certainty_rejection_offset)
 
 INT_VAR_H (stopper_smallword_size)
 
 double_VAR_H (stopper_certainty_per_char)
 
 double_VAR_H (stopper_allowable_character_badness)
 
 INT_VAR_H (stopper_debug_level)
 
 BOOL_VAR_H (stopper_no_acceptable_choices)
 
 INT_VAR_H (tessedit_truncate_wordchoice_log)
 
 STRING_VAR_H (word_to_debug)
 
 BOOL_VAR_H (segment_nonalphabetic_script)
 
 BOOL_VAR_H (save_doc_words)
 
 double_VAR_H (doc_dict_pending_threshold)
 
 double_VAR_H (doc_dict_certainty_threshold)
 
 INT_VAR_H (max_permuter_attempts)
 
go_deeper_dawg_fxn

If the choice being composed so far could be a dictionary word, keep exploring choices.

WERD_CHOICE * dawg_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
 
void go_deeper_dawg_fxn (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 
void permute_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
void append_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
fragment_state

Given the current char choice and information about previously seen fragments, determines whether adjacent character fragments are present and whether they can be concatenated.

The given prev_char_frag_info contains:

  • fragment: if not nullptr contains information about immediately preceding fragmented character choice
  • num_fragments: number of fragments that have been used so far to construct a character
  • certainty: certainty of the current choice or minimum certainty of all fragments concatenated so far
  • rating: rating of the current choice or sum of fragment ratings concatenated so far

The output char_frag_info is filled in as follows:

  • character: is set to be nullptr if the choice is a non-matching or non-ending fragment piece; is set to unichar of the given choice if it represents a regular character or a matching ending fragment
  • fragment,num_fragments,certainty,rating are set as described above
Returns
false if a non-matching fragment is discovered, true otherwise.
bool fragment_state_okay (UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
 

Static Public Member Functions

static DawgCache * GlobalDawgCache ()
 
static NODE_REF GetStartingNode (const Dawg *dawg, EDGE_REF edge_ref)
 Returns the appropriate next node given the EDGE_REF. More...
 
static bool valid_word_permuter (uint8_t perm, bool numbers_ok)
 Check all the DAWGs to see if this word is in any of them. More...
 

Public Attributes

void(Dict::* go_deeper_fxn_ )(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 Pointer to go_deeper function. More...
 
int(Dict::* letter_is_okay_ )(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
 
double(Dict::* probability_in_context_ )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Probability in context function used by the ngram permuter. More...
 

Detailed Description

Definition at line 94 of file dict.h.

Constructor & Destructor Documentation

◆ Dict()

tesseract::Dict::Dict ( CCUtil * image_ptr)

Definition at line 29 of file dict.cpp.

32 , ccutil_(ccutil)
33 , wildcard_unichar_id_(INVALID_UNICHAR_ID)
34 , apostrophe_unichar_id_(INVALID_UNICHAR_ID)
35 , question_unichar_id_(INVALID_UNICHAR_ID)
36 , slash_unichar_id_(INVALID_UNICHAR_ID)
37 , hyphen_unichar_id_(INVALID_UNICHAR_ID)
38 , STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
39 getCCUtil()->params())
40 , STRING_INIT_MEMBER(user_words_suffix, "",
41 "A suffix of user-provided words located in tessdata.",
42 getCCUtil()->params())
43 , STRING_MEMBER(user_patterns_file, "", "A filename of user-provided patterns.",
44 getCCUtil()->params())
45 , STRING_INIT_MEMBER(user_patterns_suffix, "",
46 "A suffix of user-provided patterns located in "
47 "tessdata.",
48 getCCUtil()->params())
49 , BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", getCCUtil()->params())
50 , BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", getCCUtil()->params())
51 , BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
52 getCCUtil()->params())
53 , BOOL_INIT_MEMBER(load_punc_dawg, true,
54 "Load dawg with punctuation"
55 " patterns.",
56 getCCUtil()->params())
57 , BOOL_INIT_MEMBER(load_number_dawg, true,
58 "Load dawg with number"
59 " patterns.",
60 getCCUtil()->params())
61 , BOOL_INIT_MEMBER(load_bigram_dawg, true,
62 "Load dawg with special word "
63 "bigrams.",
64 getCCUtil()->params())
65 , double_MEMBER(xheight_penalty_subscripts, 0.125,
66 "Score penalty (0.1 = 10%) added if there are subscripts "
67 "or superscripts in a word, but it is otherwise OK.",
68 getCCUtil()->params())
69 , double_MEMBER(xheight_penalty_inconsistent, 0.25,
70 "Score penalty (0.1 = 10%) added if an xheight is "
71 "inconsistent.",
72 getCCUtil()->params())
73 , double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
74 "Score multiplier for word matches which have good case and"
75 " are frequent in the given language (lower is better).",
76 getCCUtil()->params())
77 , double_MEMBER(segment_penalty_dict_case_ok, 1.1,
78 "Score multiplier for word matches that have good case "
79 "(lower is better).",
80 getCCUtil()->params())
81 , double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
82 "Default score multiplier for word matches, which may have "
83 "case issues (lower is better).",
84 getCCUtil()->params())
85 , double_MEMBER(segment_penalty_dict_nonword, 1.25,
86 "Score multiplier for glyph fragment segmentations which "
87 "do not match a dictionary word (lower is better).",
88 getCCUtil()->params())
89 , double_MEMBER(segment_penalty_garbage, 1.50,
90 "Score multiplier for poorly cased strings that are not in"
91 " the dictionary and generally look like garbage (lower is"
92 " better).",
93 getCCUtil()->params())
94 , STRING_MEMBER(output_ambig_words_file, "",
95 "Output file for ambiguities found in the dictionary", getCCUtil()->params())
96 , INT_MEMBER(dawg_debug_level, 0,
97 "Set to 1 for general debug info"
98 ", to 2 for more details, to 3 to see all the debug messages",
99 getCCUtil()->params())
100 , INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", getCCUtil()->params())
101 , BOOL_MEMBER(use_only_first_uft8_step, false,
102 "Use only the first UTF8 step of the given string"
103 " when computing log probabilities.",
104 getCCUtil()->params())
105 , double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", getCCUtil()->params())
106 , double_MEMBER(stopper_nondict_certainty_base, -2.50, "Certainty threshold for non-dict words",
107 getCCUtil()->params())
108 , double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, "Reject certainty offset",
109 getCCUtil()->params())
110 , INT_MEMBER(stopper_smallword_size, 2, "Size of dict word to be treated as non-dict word",
111 getCCUtil()->params())
112 , double_MEMBER(stopper_certainty_per_char, -0.50,
113 "Certainty to add"
114 " for each dict char above small word size.",
115 getCCUtil()->params())
116 , double_MEMBER(stopper_allowable_character_badness, 3.0,
117 "Max certainty variation allowed in a word (in sigma)", getCCUtil()->params())
118 , INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", getCCUtil()->params())
119 , BOOL_MEMBER(stopper_no_acceptable_choices, false,
120 "Make AcceptableChoice() always return false. Useful"
121 " when there is a need to explore all segmentations",
122 getCCUtil()->params())
123 , INT_MEMBER(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list",
124 getCCUtil()->params())
125 , STRING_MEMBER(word_to_debug, "",
126 "Word for which stopper debug"
127 " information should be printed to stdout",
128 getCCUtil()->params())
129 , BOOL_MEMBER(segment_nonalphabetic_script, false,
130 "Don't use any alphabetic-specific tricks."
131 " Set to true in the traineddata config file for"
132 " scripts that are cursive or inherently fixed-pitch",
133 getCCUtil()->params())
134 , BOOL_MEMBER(save_doc_words, 0, "Save Document Words", getCCUtil()->params())
135 , double_MEMBER(doc_dict_pending_threshold, 0.0, "Worst certainty for using pending dictionary",
136 getCCUtil()->params())
137 , double_MEMBER(doc_dict_certainty_threshold, -2.25,
138 "Worst certainty for words that can be inserted into the"
139 " document dictionary",
140 getCCUtil()->params())
141 , INT_MEMBER(max_permuter_attempts, 10000,
142 "Maximum number of different"
143 " character choices to consider during permutation."
144 " This limit is especially useful when user patterns"
145 " are specified, since overly generic patterns can result in"
146 " dawg search exploring an overly large number of options.",
147 getCCUtil()->params()) {
148 reject_offset_ = 0.0;
149 go_deeper_fxn_ = nullptr;
150 hyphen_word_ = nullptr;
151 last_word_on_line_ = false;
152 document_words_ = nullptr;
153 dawg_cache_ = nullptr;
154 dawg_cache_is_ours_ = false;
155 pending_words_ = nullptr;
156 bigram_dawg_ = nullptr;
157 freq_dawg_ = nullptr;
158 punc_dawg_ = nullptr;
159 unambig_dawg_ = nullptr;
160 wordseg_rating_adjust_factor_ = -1.0f;
161 output_ambig_words_file_ = nullptr;
162}
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:369
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:381
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:379
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:375
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:373
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:371
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:345
const CCUtil * getCCUtil() const
Definition: dict.h:98
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:210
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:364
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:354
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:406

◆ ~Dict()

tesseract::Dict::~Dict ( )

Definition at line 164 of file dict.cpp.

164 {
165 End();
166 delete hyphen_word_;
167 if (output_ambig_words_file_ != nullptr) {
168 fclose(output_ambig_words_file_);
169 }
170}
void End()
Definition: dict.cpp:379

Member Function Documentation

◆ absolute_garbage()

bool tesseract::Dict::absolute_garbage ( const WERD_CHOICE & word,
const UNICHARSET & unicharset 
)

Returns true if the word looks like absolute garbage (e.g. an image mistakenly recognized as text).

Definition at line 66 of file context.cpp.

66 {
67 if (word.length() < kMinAbsoluteGarbageWordLength) {
68 return false;
69 }
70 int num_alphanum = 0;
71 for (unsigned x = 0; x < word.length(); ++x) {
72 num_alphanum +=
73 (unicharset.get_isalpha(word.unichar_id(x)) || unicharset.get_isdigit(word.unichar_id(x)));
74 }
75 return (static_cast<float>(num_alphanum) / static_cast<float>(word.length()) <
76 kMinAbsoluteGarbageAlphanumFrac);
77}

◆ AcceptableChoice()

bool tesseract::Dict::AcceptableChoice ( const WERD_CHOICE & best_choice,
XHeightConsistencyEnum  xheight_consistency 
)

Returns true if the given best_choice is good enough to stop.

Definition at line 42 of file stopper.cpp.

43 {
44 float CertaintyThreshold = stopper_nondict_certainty_base;
45 int WordSize;
46
47 if (stopper_no_acceptable_choices) {
48 return false;
49 }
50
51 if (best_choice.empty()) {
52 return false;
53 }
54
55 bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
56 bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
57 bool is_case_ok = case_ok(best_choice);
58
59 if (stopper_debug_level >= 1) {
60 const char *xht = "UNKNOWN";
61 switch (xheight_consistency) {
62 case XH_GOOD:
63 xht = "NORMAL";
64 break;
65 case XH_SUBNORMAL:
66 xht = "SUBNORMAL";
67 break;
68 case XH_INCONSISTENT:
69 xht = "INCONSISTENT";
70 break;
71 default:
72 xht = "UNKNOWN";
73 }
74 tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
75 best_choice.unichar_string().c_str(), (is_valid_word ? 'y' : 'n'),
76 (is_case_ok ? 'y' : 'n'), xht, best_choice.min_x_height(), best_choice.max_x_height());
77 }
78 // Do not accept invalid words in PASS1.
79 if (reject_offset_ <= 0.0f && !is_valid_word) {
80 return false;
81 }
82 if (is_valid_word && is_case_ok) {
83 WordSize = LengthOfShortestAlphaRun(best_choice);
84 WordSize -= stopper_smallword_size;
85 if (WordSize < 0) {
86 WordSize = 0;
87 }
88 CertaintyThreshold += WordSize * stopper_certainty_per_char;
89 }
90
91 if (stopper_debug_level >= 1) {
92 tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
93 best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
94 }
95
96 if (no_dang_ambigs && best_choice.certainty() > CertaintyThreshold &&
97 xheight_consistency < XH_INCONSISTENT && UniformCertainties(best_choice)) {
98 return true;
99 } else {
100 if (stopper_debug_level >= 1) {
101 tprintf(
102 "AcceptableChoice() returned false"
103 " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
104 no_dang_ambigs, best_choice.certainty(), CertaintyThreshold,
105 UniformCertainties(best_choice));
106 }
107 return false;
108 }
109}
@ XH_GOOD
Definition: dict.h:81
@ XH_SUBNORMAL
Definition: dict.h:81
@ XH_INCONSISTENT
Definition: dict.h:81
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:464
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:443
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:437
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:45

◆ AcceptableResult()

bool tesseract::Dict::AcceptableResult ( WERD_RES * word) const

Returns false if the best choice for the current word is questionable and should be tried again on the second pass or should be flagged to the user.

Definition at line 111 of file stopper.cpp.

111 {
112 if (word->best_choice == nullptr) {
113 return false;
114 }
115 float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
116 int WordSize;
117
118 if (stopper_debug_level >= 1) {
119 tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
120 word->best_choice->debug_string().c_str(), (valid_word(*word->best_choice) ? 'y' : 'n'),
121 (case_ok(*word->best_choice) ? 'y' : 'n'),
122 word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
123 word->best_choices.singleton() ? 'n' : 'y');
124 }
125
126 if (word->best_choice->empty() || !word->best_choices.singleton()) {
127 return false;
128 }
129 if (valid_word(*word->best_choice) && case_ok(*word->best_choice)) {
130 WordSize = LengthOfShortestAlphaRun(*word->best_choice);
131 WordSize -= stopper_smallword_size;
132 if (WordSize < 0) {
133 WordSize = 0;
134 }
135 CertaintyThreshold += WordSize * stopper_certainty_per_char;
136 }
137
138 if (stopper_debug_level >= 1) {
139 tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ", word->best_choice->certainty(),
140 CertaintyThreshold);
141 }
142
143 if (word->best_choice->certainty() > CertaintyThreshold && !stopper_no_acceptable_choices) {
144 if (stopper_debug_level >= 1) {
145 tprintf("ACCEPTED\n");
146 }
147 return true;
148 } else {
149 if (stopper_debug_level >= 1) {
150 tprintf("REJECTED\n");
151 }
152 return false;
153 }
154}
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:801

◆ add_document_word()

void tesseract::Dict::add_document_word ( const WERD_CHOICE & best_choice)

Adds a word found on this document to the document specific dictionary.

Definition at line 647 of file dict.cpp.

647 {
648 // Do not add hyphenated word parts to the document dawg.
649 // hyphen_word_ will be non-nullptr after the set_hyphen_word() is
650 // called when the first part of the hyphenated word is
651 // discovered and while the second part of the word is recognized.
652 // hyphen_word_ is cleared in cc_recg() before the next word on
653 // the line is recognized.
654 if (hyphen_word_) {
655 return;
656 }
657
658 int stringlen = best_choice.length();
659
660 if (valid_word(best_choice) || stringlen < 2) {
661 return;
662 }
663
664 // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
665 if (best_choice.length() >= kDocDictMaxRepChars) {
666 int num_rep_chars = 1;
667 UNICHAR_ID uch_id = best_choice.unichar_id(0);
668 for (unsigned i = 1; i < best_choice.length(); ++i) {
669 if (best_choice.unichar_id(i) != uch_id) {
670 num_rep_chars = 1;
671 uch_id = best_choice.unichar_id(i);
672 } else {
673 ++num_rep_chars;
674 if (num_rep_chars == kDocDictMaxRepChars) {
675 return;
676 }
677 }
678 }
679 }
680
681 if (best_choice.certainty() < doc_dict_certainty_threshold || stringlen == 2) {
682 if (best_choice.certainty() < doc_dict_pending_threshold) {
683 return;
684 }
685
686 if (!pending_words_->word_in_dawg(best_choice)) {
687 if (stringlen > 2 ||
688 (stringlen == 2 && getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
689 getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
690 pending_words_->add_word_to_dawg(best_choice);
691 }
692 return;
693 }
694 }
695
696 if (save_doc_words) {
697 std::string filename(getCCUtil()->imagefile);
698 filename += ".doc";
699 FILE *doc_word_file = fopen(filename.c_str(), "a");
700 if (doc_word_file == nullptr) {
701 tprintf("Error: Could not open file %s\n", filename.c_str());
702 ASSERT_HOST(doc_word_file);
703 }
704 fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str());
705 fclose(doc_word_file);
706 }
707 document_words_->add_word_to_dawg(best_choice);
708}
#define ASSERT_HOST(x)
Definition: errcode.h:54
int UNICHAR_ID
Definition: unichar.h:34
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:64
const UNICHARSET & getUnicharset() const
Definition: dict.h:104
bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector< bool > *repetitions)
Definition: trie.cpp:159

◆ adjust_word()

void tesseract::Dict::adjust_word ( WERD_CHOICE * word,
bool  nonword,
XHeightConsistencyEnum  xheight_consistency,
float  additional_adjust,
bool  modify_rating,
bool  debug 
)

Adjusts the rating of the given word.

Definition at line 710 of file dict.cpp.

711 {
712 bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
713 word->GetTopScriptID() == getUnicharset().han_sid());
714 bool case_is_ok = (is_han || case_ok(*word));
715 bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
716
717 float adjust_factor = additional_adjust;
718 float new_rating = word->rating();
719 new_rating += kRatingPad;
720 const char *xheight_triggered = "";
721 if (word->length() > 1) {
722 // Calculate x-height and y-offset consistency penalties.
723 switch (xheight_consistency) {
724 case XH_INCONSISTENT:
725 adjust_factor += xheight_penalty_inconsistent;
726 xheight_triggered = ", xhtBAD";
727 break;
728 case XH_SUBNORMAL:
729 adjust_factor += xheight_penalty_subscripts;
730 xheight_triggered = ", xhtSUB";
731 break;
732 case XH_GOOD:
733 // leave the factor alone - all good!
734 break;
735 }
736 // TODO(eger): if nonword is true, but there is a "core" that is a dict
737 // word, negate nonword status.
738 } else {
739 if (debug) {
740 tprintf("Consistency could not be calculated.\n");
741 }
742 }
743 if (debug) {
744 tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "", word->unichar_string().c_str(),
745 word->rating(), xheight_triggered);
746 }
747
748 if (nonword) { // non-dictionary word
749 if (case_is_ok && punc_is_ok) {
750 adjust_factor += segment_penalty_dict_nonword;
751 new_rating *= adjust_factor;
752 if (debug) {
753 tprintf(", W");
754 }
755 } else {
756 adjust_factor += segment_penalty_garbage;
757 new_rating *= adjust_factor;
758 if (debug) {
759 if (!case_is_ok) {
760 tprintf(", C");
761 }
762 if (!punc_is_ok) {
763 tprintf(", P");
764 }
765 }
766 }
767 } else { // dictionary word
768 if (case_is_ok) {
769 if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
770 word->set_permuter(FREQ_DAWG_PERM);
771 adjust_factor += segment_penalty_dict_frequent_word;
772 new_rating *= adjust_factor;
773 if (debug) {
774 tprintf(", F");
775 }
776 } else {
777 adjust_factor += segment_penalty_dict_case_ok;
778 new_rating *= adjust_factor;
779 if (debug) {
780 tprintf(", ");
781 }
782 }
783 } else {
784 adjust_factor += segment_penalty_dict_case_bad;
785 new_rating *= adjust_factor;
786 if (debug) {
787 tprintf(", C");
788 }
789 }
790 }
791 new_rating -= kRatingPad;
792 if (modify_rating) {
793 word->set_rating(new_rating);
794 }
795 if (debug) {
796 tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
797 }
798 word->set_adjust_factor(adjust_factor);
799}
@ FREQ_DAWG_PERM
Definition: ratngs.h:247
int han_sid() const
Definition: unicharset.h:931
int null_sid() const
Definition: unicharset.h:916
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:883

◆ append_choices()

void tesseract::Dict::append_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR & char_choices,
const BLOB_CHOICE & blob_choice,
int  char_choice_index,
const CHAR_FRAGMENT_INFO * prev_char_frag_info,
WERD_CHOICE * word,
float  certainties[],
float *  limit,
WERD_CHOICE * best_choice,
int *  attempts_left,
void *  more_args 
)

append_choices

Checks to see whether or not the next choice is worth appending to the word being generated. If so then keeps going deeper into the word.

This function assumes that Dict::go_deeper_fxn_ is set.

Definition at line 224 of file permdawg.cpp.

228 {
229 auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
230
231 // Deal with fragments.
232 CHAR_FRAGMENT_INFO char_frag_info;
233 if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(),
234 prev_char_frag_info, debug, word_ending, &char_frag_info)) {
235 return; // blob_choice must be an invalid fragment
236 }
237 // Search the next letter if this character is a fragment.
238 if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
239 permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties,
240 limit, best_choice, attempts_left, more_args);
241 return;
242 }
243
244 // Add the next unichar.
245 float old_rating = word->rating();
246 float old_certainty = word->certainty();
247 uint8_t old_permuter = word->permuter();
248 certainties[word->length()] = char_frag_info.certainty;
249 word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments,
250 char_frag_info.rating, char_frag_info.certainty);
251
252 // Explore the next unichar.
253 (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending,
254 word, certainties, limit, best_choice, attempts_left, more_args);
255
256 // Remove the unichar we added to explore other choices in it's place.
257 word->remove_last_unichar_id();
258 word->set_rating(old_rating);
259 word->set_certainty(old_certainty);
260 word->set_permuter(old_permuter);
261}
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:187
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:288

◆ BOOL_VAR_H() [1/10]

tesseract::Dict::BOOL_VAR_H ( load_bigram_dawg  )

◆ BOOL_VAR_H() [2/10]

tesseract::Dict::BOOL_VAR_H ( load_freq_dawg  )

◆ BOOL_VAR_H() [3/10]

tesseract::Dict::BOOL_VAR_H ( load_number_dawg  )

◆ BOOL_VAR_H() [4/10]

tesseract::Dict::BOOL_VAR_H ( load_punc_dawg  )

◆ BOOL_VAR_H() [5/10]

tesseract::Dict::BOOL_VAR_H ( load_system_dawg  )

◆ BOOL_VAR_H() [6/10]

tesseract::Dict::BOOL_VAR_H ( load_unambig_dawg  )

◆ BOOL_VAR_H() [7/10]

tesseract::Dict::BOOL_VAR_H ( save_doc_words  )

◆ BOOL_VAR_H() [8/10]

tesseract::Dict::BOOL_VAR_H ( segment_nonalphabetic_script  )

◆ BOOL_VAR_H() [9/10]

tesseract::Dict::BOOL_VAR_H ( stopper_no_acceptable_choices  )

◆ BOOL_VAR_H() [10/10]

tesseract::Dict::BOOL_VAR_H ( use_only_first_uft8_step  )

◆ case_ok()

int tesseract::Dict::case_ok ( const WERD_CHOICE word) const

Check a string to see if it matches a set of lexical rules.

Definition at line 45 of file context.cpp.

45 {
46 int state = 0;
47 const UNICHARSET *unicharset = word.unicharset();
48 for (unsigned x = 0; x < word.length(); ++x) {
49 UNICHAR_ID ch_id = word.unichar_id(x);
50 if (unicharset->get_isupper(ch_id)) {
51 state = case_state_table[state][1];
52 } else if (unicharset->get_islower(ch_id)) {
53 state = case_state_table[state][2];
54 } else if (unicharset->get_isdigit(ch_id)) {
55 state = case_state_table[state][3];
56 } else {
57 state = case_state_table[state][0];
58 }
59 if (state == -1) {
60 return false;
61 }
62 }
63 return state != 5; // single lower is bad
64}
const int case_state_table[6][4]
Definition: context.cpp:28

◆ char_for_dawg()

UNICHAR_ID tesseract::Dict::char_for_dawg ( const UNICHARSET unicharset,
UNICHAR_ID  ch,
const Dawg dawg 
) const
inline

Definition at line 411 of file dict.h.

411 {
412 if (!dawg) {
413 return ch;
414 }
415 switch (dawg->type()) {
416 case DAWG_TYPE_NUMBER:
417 return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
418 default:
419 return ch;
420 }
421 }
@ DAWG_TYPE_NUMBER
Definition: dawg.h:67
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:117

◆ compound_marker()

bool tesseract::Dict::compound_marker ( UNICHAR_ID  unichar_id)
inline

Definition at line 116 of file dict.h.

116 {
117 const UNICHARSET &unicharset = getUnicharset();
118 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
119 const auto &normed_ids = unicharset.normed_ids(unichar_id);
120 return normed_ids.size() == 1 &&
121 (normed_ids[0] == hyphen_unichar_id_ || normed_ids[0] == slash_unichar_id_);
122 }

◆ copy_hyphen_info()

void tesseract::Dict::copy_hyphen_info ( WERD_CHOICE word) const
inline

If this word is hyphenated, copy the base word (the part on the line before) of the hyphenated word into the given word. This function assumes that word is not nullptr.

Definition at line 145 of file dict.h.

145 {
146 if (this->hyphenated()) {
147 *word = *hyphen_word_;
148 if (hyphen_debug_level) {
149 word->print("copy_hyphen_info: ");
150 }
151 }
152 }
void print() const
Definition: ratngs.h:561
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135

◆ dawg_permute_and_select()

WERD_CHOICE * tesseract::Dict::dawg_permute_and_select ( const BLOB_CHOICE_LIST_VECTOR char_choices,
float  rating_limit 
)

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to explore all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

dawg_permute_and_select

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to search all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

Definition at line 159 of file permdawg.cpp.

160 {
161 auto *best_choice = new WERD_CHOICE(&getUnicharset());
162 best_choice->make_bad();
163 best_choice->set_rating(rating_limit);
164 if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) {
165 return best_choice;
166 }
167 auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1];
168 init_active_dawgs(&(active_dawgs[0]), true);
169 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
170 WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);
171
172 float certainties[MAX_WERD_LENGTH];
174 int attempts_left = max_permuter_attempts;
175 permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr,
176 &word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args);
177 delete[] active_dawgs;
178 return best_choice;
179}
#define MAX_WERD_LENGTH
Definition: dict.h:45
@ NO_PERM
Definition: ratngs.h:236
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:43
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:610

◆ DebugWordChoices()

void tesseract::Dict::DebugWordChoices ( )

Prints the current choices for this word to stdout.

◆ def_letter_is_okay()

int tesseract::Dict::def_letter_is_okay ( void *  void_dawg_args,
const UNICHARSET unicharset,
UNICHAR_ID  unichar_id,
bool  word_end 
) const

Returns the maximal permuter code (from ccstruct/ratngs.h) if, in light of the current state, the letter at word_index in the given word is allowed according to at least one of the dawgs in dawgs_; otherwise returns NO_PERM.

The state is described by void_dawg_args, which are interpreted as DawgArgs and contain relevant active dawg positions. Each entry in the active_dawgs vector contains an index into the dawgs_ vector and an EDGE_REF that indicates the last edge followed in the dawg. It also may contain a position in the punctuation dawg which describes surrounding punctuation (see struct DawgPosition).

Input: At word_index 0, dawg_args->active_dawgs should contain an entry for each dawg that may start at the beginning of a word, with punc_ref and edge_ref initialized to NO_EDGE. Since the punctuation dawg includes the empty pattern " " (meaning anything without surrounding punctuation), having a single entry for the punctuation dawg will cover all dawgs reachable from it – that includes all number and word dawgs. The only dawg not reachable from the punctuation dawg is the pattern dawg. If hyphen state needs to be applied, the initial dawg_args->active_dawgs can be copied from the saved hyphen state (maintained by Dict). For word_index > 0, the corresponding state (active_dawgs and punc position) can be obtained from the dawg_args->updated_dawgs passed to def_letter_is_okay for word_index-1. Note: the function assumes that the active_dawgs and updated_dawgs member variables of dawg_args are not nullptr.

Output: The function fills in dawg_args->updated_dawgs vector with the entries for dawgs that contain the word up to the letter at word_index.

Definition at line 406 of file dict.cpp.

407 {
408 auto *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
409
410 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
411
412 if (dawg_debug_level >= 3) {
413 tprintf(
414 "def_letter_is_okay: current unichar=%s word_end=%d"
415 " num active dawgs=%zu\n",
416 getUnicharset().debug_str(unichar_id).c_str(), word_end, dawg_args->active_dawgs->size());
417 }
418
419 // Do not accept words that contain kPatternUnicharID.
420 // (otherwise pattern dawgs would not function correctly).
421 // Do not accept words containing INVALID_UNICHAR_IDs.
422 if (unichar_id == Dawg::kPatternUnicharID || unichar_id == INVALID_UNICHAR_ID) {
423 dawg_args->permuter = NO_PERM;
424 return NO_PERM;
425 }
426
427 // Initialization.
428 PermuterType curr_perm = NO_PERM;
429 dawg_args->updated_dawgs->clear();
430 dawg_args->valid_end = false;
431
432 // Go over the active_dawgs vector and insert DawgPosition records
433 // with the updated ref (an edge with the corresponding unichar id) into
434 // dawg_args->updated_dawgs.
435 for (unsigned a = 0; a < dawg_args->active_dawgs->size(); ++a) {
436 const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
437 const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
438 const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
439
440 if (!dawg && !punc_dawg) {
441 // shouldn't happen.
442 tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
443 continue;
444 }
445 if (!dawg) {
446 // We're in the punctuation dawg. A core dawg has not been chosen.
447 NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
448 EDGE_REF punc_transition_edge =
449 punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
450 if (punc_transition_edge != NO_EDGE) {
451 // Find all successors, and see which can transition.
452 const SuccessorList &slist = *(successors_[pos.punc_index]);
453 for (int sdawg_index : slist) {
454 const Dawg *sdawg = dawgs_[sdawg_index];
455 UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
456 EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
457 if (dawg_edge != NO_EDGE) {
458 if (dawg_debug_level >= 3) {
459 tprintf("Letter found in dawg %d\n", sdawg_index);
460 }
461 dawg_args->updated_dawgs->add_unique(
462 DawgPosition(sdawg_index, dawg_edge, pos.punc_index, punc_transition_edge, false),
463 dawg_debug_level > 0, "Append transition from punc dawg to current dawgs: ");
464 if (sdawg->permuter() > curr_perm) {
465 curr_perm = sdawg->permuter();
466 }
467 if (sdawg->end_of_word(dawg_edge) && punc_dawg->end_of_word(punc_transition_edge)) {
468 dawg_args->valid_end = true;
469 }
470 }
471 }
472 }
473 EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
474 if (punc_edge != NO_EDGE) {
475 if (dawg_debug_level >= 3) {
476 tprintf("Letter found in punctuation dawg\n");
477 }
478 dawg_args->updated_dawgs->add_unique(
479 DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false), dawg_debug_level > 0,
480 "Extend punctuation dawg: ");
481 if (PUNC_PERM > curr_perm) {
482 curr_perm = PUNC_PERM;
483 }
484 if (punc_dawg->end_of_word(punc_edge)) {
485 dawg_args->valid_end = true;
486 }
487 }
488 continue;
489 }
490
491 if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
492 // We can end the main word here.
493 // If we can continue on the punc ref, add that possibility.
494 NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
495 EDGE_REF punc_edge =
496 punc_node == NO_EDGE ? NO_EDGE : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
497 if (punc_edge != NO_EDGE) {
498 dawg_args->updated_dawgs->add_unique(
499 DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index, punc_edge, true),
500 dawg_debug_level > 0, "Return to punctuation dawg: ");
501 if (dawg->permuter() > curr_perm) {
502 curr_perm = dawg->permuter();
503 }
504 if (punc_dawg->end_of_word(punc_edge)) {
505 dawg_args->valid_end = true;
506 }
507 }
508 }
509
510 if (pos.back_to_punc) {
511 continue;
512 }
513
514 // If we are dealing with the pattern dawg, look up all the
515 // possible edges, not only for the exact unichar_id, but also
516 // for all its character classes (alpha, digit, etc).
517 if (dawg->type() == DAWG_TYPE_PATTERN) {
518 ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args, &curr_perm);
519 // There can't be any successors to dawg that is of type
520 // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
521 continue;
522 }
523
524 // Find the edge out of the node for the unichar_id.
525 NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
526 EDGE_REF edge =
527 (node == NO_EDGE)
528 ? NO_EDGE
529 : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
530
531 if (dawg_debug_level >= 3) {
532 tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node, edge);
533 }
534
535 if (edge != NO_EDGE) { // the unichar was found in the current dawg
536 if (dawg_debug_level >= 3) {
537 tprintf("Letter found in dawg %d\n", pos.dawg_index);
538 }
539 if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
540 if (dawg_debug_level >= 3) {
541 tprintf("Punctuation constraint not satisfied at end of word.\n");
542 }
543 continue;
544 }
545 if (dawg->permuter() > curr_perm) {
546 curr_perm = dawg->permuter();
547 }
548 if (dawg->end_of_word(edge) &&
549 (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref))) {
550 dawg_args->valid_end = true;
551 }
552 dawg_args->updated_dawgs->add_unique(
553 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, false),
554 dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
555 }
556 } // end for
557 // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
558 // or if we found the current letter in a non-punctuation dawg. This
559 // allows preserving information on which dawg the "core" word came from.
560 // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
561 if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
562 (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
563 dawg_args->permuter = curr_perm;
564 }
565 if (dawg_debug_level >= 2) {
566 tprintf("Returning %d for permuter code for this character.\n", dawg_args->permuter);
567 }
568 return dawg_args->permuter;
569}
#define REFFORMAT
Definition: dawg.h:85
@ DAWG_TYPE_PATTERN
Definition: dawg.h:68
int64_t EDGE_REF
Definition: dawg.h:49
std::vector< int > SuccessorList
Definition: dawg.h:61
int64_t NODE_REF
Definition: dawg.h:50
PermuterType
Definition: ratngs.h:235
@ COMPOUND_PERM
Definition: ratngs.h:248
@ PUNC_PERM
Definition: ratngs.h:237
size_t size() const
Definition: unicharset.h:355
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:571
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:411
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:397

◆ def_probability_in_context()

double tesseract::Dict::def_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Default (no-op) implementation of probability in context function.

Definition at line 364 of file dict.h.

365 {
366 (void)lang;
367 (void)context;
368 (void)context_bytes;
369 (void)character;
370 (void)character_bytes;
371 return 0.0;
372 }
@ character
Definition: mfoutline.h:53

◆ default_dawgs()

void tesseract::Dict::default_dawgs ( DawgPositionVector anylength_dawgs,
bool  suppress_patterns 
) const

Definition at line 624 of file dict.cpp.

624 {
625 bool punc_dawg_available = (punc_dawg_ != nullptr) &&
626 punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
627
628 for (unsigned i = 0; i < dawgs_.size(); i++) {
629 if (dawgs_[i] != nullptr && !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
630 int dawg_ty = dawgs_[i]->type();
631 bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
632 if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
633 dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false));
634 if (dawg_debug_level >= 3) {
635 tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
636 }
637 } else if (!punc_dawg_available || !subsumed_by_punc) {
638 dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false));
639 if (dawg_debug_level >= 3) {
640 tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
641 }
642 }
643 }
644 }
645}
@ DAWG_TYPE_PUNCTUATION
Definition: dawg.h:65
type
Definition: upload.py:458
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.

◆ double_VAR_H() [1/14]

tesseract::Dict::double_VAR_H ( certainty_scale  )

◆ double_VAR_H() [2/14]

tesseract::Dict::double_VAR_H ( doc_dict_certainty_threshold  )

◆ double_VAR_H() [3/14]

tesseract::Dict::double_VAR_H ( doc_dict_pending_threshold  )

◆ double_VAR_H() [4/14]

tesseract::Dict::double_VAR_H ( segment_penalty_dict_case_bad  )

◆ double_VAR_H() [5/14]

tesseract::Dict::double_VAR_H ( segment_penalty_dict_case_ok  )

◆ double_VAR_H() [6/14]

tesseract::Dict::double_VAR_H ( segment_penalty_dict_frequent_word  )

◆ double_VAR_H() [7/14]

tesseract::Dict::double_VAR_H ( segment_penalty_dict_nonword  )

◆ double_VAR_H() [8/14]

tesseract::Dict::double_VAR_H ( segment_penalty_garbage  )

◆ double_VAR_H() [9/14]

tesseract::Dict::double_VAR_H ( stopper_allowable_character_badness  )

◆ double_VAR_H() [10/14]

tesseract::Dict::double_VAR_H ( stopper_certainty_per_char  )

◆ double_VAR_H() [11/14]

tesseract::Dict::double_VAR_H ( stopper_nondict_certainty_base  )

◆ double_VAR_H() [12/14]

tesseract::Dict::double_VAR_H ( stopper_phase2_certainty_rejection_offset  )

◆ double_VAR_H() [13/14]

tesseract::Dict::double_VAR_H ( xheight_penalty_inconsistent  )

◆ double_VAR_H() [14/14]

tesseract::Dict::double_VAR_H ( xheight_penalty_subscripts  )

◆ End()

void tesseract::Dict::End ( )

Definition at line 379 of file dict.cpp.

379 {
380 if (dawgs_.empty()) {
381 return; // Not safe to call twice.
382 }
383 for (auto &dawg : dawgs_) {
384 if (!dawg_cache_->FreeDawg(dawg)) {
385 delete dawg;
386 }
387 }
388 dawg_cache_->FreeDawg(bigram_dawg_);
389 if (dawg_cache_is_ours_) {
390 delete dawg_cache_;
391 dawg_cache_ = nullptr;
392 }
393 for (auto successor : successors_) {
394 delete successor;
395 }
396 dawgs_.clear();
397 successors_.clear();
398 document_words_ = nullptr;
399 delete pending_words_;
400 pending_words_ = nullptr;
401}
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:37

◆ EndDangerousAmbigs()

void tesseract::Dict::EndDangerousAmbigs ( )

Definition at line 358 of file stopper.cpp.

358{}

◆ FinishLoad()

bool tesseract::Dict::FinishLoad ( )

Definition at line 357 of file dict.cpp.

357 {
358 if (dawgs_.empty()) {
359 return false;
360 }
361 // Construct a list of corresponding successors for each dawg. Each entry, i,
362 // in the successors_ vector is a vector of integers that represent the
363 // indices into the dawgs_ vector of the successors for dawg i.
364 successors_.reserve(dawgs_.size());
365 for (auto dawg : dawgs_) {
366 auto *lst = new SuccessorList();
367 for (unsigned j = 0; j < dawgs_.size(); ++j) {
368 const Dawg *other = dawgs_[j];
369 if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) &&
370 kDawgSuccessors[dawg->type()][other->type()]) {
371 lst->push_back(j);
372 }
373 }
374 successors_.push_back(lst);
375 }
376 return true;
377}

◆ fragment_state_okay()

bool tesseract::Dict::fragment_state_okay ( UNICHAR_ID  curr_unichar_id,
float  curr_rating,
float  curr_certainty,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
const char *  debug,
int  word_ending,
CHAR_FRAGMENT_INFO char_frag_info 
)

Definition at line 288 of file permdawg.cpp.

290 {
291 const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id);
292 const CHAR_FRAGMENT *prev_fragment =
293 prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
294
295 // Print debug info for fragments.
296 if (debug && (prev_fragment || this_fragment)) {
297 tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
298 getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending);
299 if (prev_fragment) {
300 tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
301 }
302 if (this_fragment) {
303 tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
304 }
305 }
306
307 char_frag_info->unichar_id = curr_unichar_id;
308 char_frag_info->fragment = this_fragment;
309 char_frag_info->rating = curr_rating;
310 char_frag_info->certainty = curr_certainty;
311 char_frag_info->num_fragments = 1;
312 if (prev_fragment && !this_fragment) {
313 if (debug) {
314 tprintf("Skip choice with incomplete fragment\n");
315 }
316 return false;
317 }
318 if (this_fragment) {
319 // We are dealing with a fragment.
320 char_frag_info->unichar_id = INVALID_UNICHAR_ID;
321 if (prev_fragment) {
322 if (!this_fragment->is_continuation_of(prev_fragment)) {
323 if (debug) {
324 tprintf("Non-matching fragment piece\n");
325 }
326 return false;
327 }
328 if (this_fragment->is_ending()) {
329 char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar());
330 char_frag_info->fragment = nullptr;
331 if (debug) {
332 tprintf("Built character %s from fragments\n",
333 getUnicharset().debug_str(char_frag_info->unichar_id).c_str());
334 }
335 } else {
336 if (debug) {
337 tprintf("Record fragment continuation\n");
338 }
339 char_frag_info->fragment = this_fragment;
340 }
341 // Update certainty and rating.
342 char_frag_info->rating = prev_char_frag_info->rating + curr_rating;
343 char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
344 char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty);
345 } else {
346 if (this_fragment->is_beginning()) {
347 if (debug) {
348 tprintf("Record fragment beginning\n");
349 }
350 } else {
351 if (debug) {
352 tprintf("Non-starting fragment piece with no prev_fragment\n");
353 }
354 return false;
355 }
356 }
357 }
358 if (word_ending && char_frag_info->fragment) {
359 if (debug) {
360 tprintf("Word cannot end with a fragment\n");
361 }
362 return false;
363 }
364 return true;
365}
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:768
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186

◆ getCCUtil() [1/2]

CCUtil * tesseract::Dict::getCCUtil ( )
inline

Definition at line 101 of file dict.h.

101 {
102 return ccutil_;
103 }

◆ getCCUtil() [2/2]

const CCUtil * tesseract::Dict::getCCUtil ( ) const
inline

Definition at line 98 of file dict.h.

98 {
99 return ccutil_;
100 }

◆ GetDawg()

const Dawg * tesseract::Dict::GetDawg ( int  index) const
inline

Return the i-th dawg pointer recorded in the dawgs_ vector.

Definition at line 385 of file dict.h.

385 {
386 return dawgs_[index];
387 }

◆ GetPuncDawg()

const Dawg * tesseract::Dict::GetPuncDawg ( ) const
inline

Return the pointer to the punctuation dawg.

Definition at line 389 of file dict.h.

389 {
390 return punc_dawg_;
391 }

◆ GetStartingNode()

static NODE_REF tesseract::Dict::GetStartingNode ( const Dawg dawg,
EDGE_REF  edge_ref 
)
inlinestatic

Returns the appropriate next node given the EDGE_REF.

Definition at line 397 of file dict.h.

397 {
398 if (edge_ref == NO_EDGE) {
399 return 0; // beginning to explore the dawg
400 }
401 NODE_REF node = dawg->next_node(edge_ref);
402 if (node == 0) {
403 node = NO_EDGE; // end of word
404 }
405 return node;
406 }

◆ GetUnambigDawg()

const Dawg * tesseract::Dict::GetUnambigDawg ( ) const
inline

Return the pointer to the unambiguous words dawg.

Definition at line 393 of file dict.h.

393 {
394 return unambig_dawg_;
395 }

◆ getUnicharAmbigs()

const UnicharAmbigs & tesseract::Dict::getUnicharAmbigs ( ) const
inline

Definition at line 111 of file dict.h.

111 {
112 return getCCUtil()->unichar_ambigs;
113 }
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:63

◆ getUnicharset() [1/2]

UNICHARSET & tesseract::Dict::getUnicharset ( )
inline

Definition at line 107 of file dict.h.

107 {
108 return getCCUtil()->unicharset;
109 }
UNICHARSET unicharset
Definition: ccutil.h:61

◆ getUnicharset() [2/2]

const UNICHARSET & tesseract::Dict::getUnicharset ( ) const
inline

Definition at line 104 of file dict.h.

104 {
105 return getCCUtil()->unicharset;
106 }

◆ GlobalDawgCache()

DawgCache * tesseract::Dict::GlobalDawgCache ( )
static

Initialize Dict class - load dawgs from [lang].traineddata and user-specified wordlist and pattern list.

Definition at line 172 of file dict.cpp.

172 {
173 // This global cache (a singleton) will outlive every Tesseract instance
174 // (even those that someone else might declare as global static variables).
175 static DawgCache cache;
176 return &cache;
177}

◆ go_deeper_dawg_fxn()

void tesseract::Dict::go_deeper_dawg_fxn ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
bool  word_ending,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  void_more_args 
)

If the choice being composed so far could be a dictionary word and we have not reached the end of the word, keep exploring the char_choices further.

Definition at line 43 of file permdawg.cpp.

47 {
48 auto *more_args = static_cast<DawgArgs *>(void_more_args);
49 word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
50 int word_index = word->length() - 1;
51 if (best_choice->rating() < *limit) {
52 return;
53 }
54 // Look up char in DAWG
55
56 // If the current unichar is an ngram first try calling
57 // letter_is_okay() for each unigram it contains separately.
58 UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
59 bool checked_unigrams = false;
60 if (getUnicharset().get_isngram(orig_uch_id)) {
61 if (dawg_debug_level) {
62 tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str());
63 }
64 int num_unigrams = 0;
65 word->remove_last_unichar_id();
66 std::vector<UNICHAR_ID> encoding;
67 const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
68 // Since the string came out of the unicharset, failure is impossible.
69 ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr));
70 bool unigrams_ok = true;
71 // Construct DawgArgs that reflect the current state.
72 DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
73 DawgPositionVector unigram_updated_dawgs;
74 DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);
75 // Check unigrams in the ngram with letter_is_okay().
76 for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) {
77 UNICHAR_ID uch_id = encoding[i];
78 ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
79 ++num_unigrams;
80 word->append_unichar_id(uch_id, 1, 0.0, 0.0);
81 unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(),
82 word->unichar_id(word_index + num_unigrams - 1),
83 word_ending && i == encoding.size() - 1);
84 (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
85 if (dawg_debug_level) {
86 tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(),
87 unigrams_ok ? "OK" : "not OK");
88 }
89 }
90 // Restore the word and copy the updated dawg state if needed.
91 while (num_unigrams-- > 0) {
92 word->remove_last_unichar_id();
93 }
94 word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
95 if (unigrams_ok) {
96 checked_unigrams = true;
97 more_args->permuter = unigram_dawg_args.permuter;
98 *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
99 }
100 }
101
102 // Check which dawgs from the dawgs_ vector contain the word
103 // up to and including the current unichar.
104 if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(),
105 word->unichar_id(word_index), word_ending)) {
106 // Add a new word choice
107 if (word_ending) {
108 if (dawg_debug_level) {
109 tprintf("found word = %s\n", word->debug_string().c_str());
110 }
111 if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
112 if (output_ambig_words_file_ == nullptr) {
113 output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+");
114 if (output_ambig_words_file_ == nullptr) {
115 tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str());
116 exit(1);
117 }
118 std::string word_str;
119 word->string_and_lengths(&word_str, nullptr);
120 word_str += " ";
121 fprintf(output_ambig_words_file_, "%s", word_str.c_str());
122 }
123 std::string word_str;
124 word->string_and_lengths(&word_str, nullptr);
125 word_str += " ";
126 fprintf(output_ambig_words_file_, "%s", word_str.c_str());
127 }
128 WERD_CHOICE *adjusted_word = word;
129 adjusted_word->set_permuter(more_args->permuter);
130 update_best_choice(*adjusted_word, best_choice);
131 } else { // search the next letter
132 // Make updated_* point to the next entries in the DawgPositionVector
133 // arrays (that were originally created in dawg_permute_and_select)
134 ++(more_args->updated_dawgs);
135 // Make active_dawgs and constraints point to the updated ones.
136 ++(more_args->active_dawgs);
137 permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word,
138 certainties, limit, best_choice, attempts_left, more_args);
139 // Restore previous state to explore another letter in this position.
140 --(more_args->updated_dawgs);
141 --(more_args->active_dawgs);
142 }
143 } else {
144 if (dawg_debug_level) {
145 tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str());
146 }
147 }
148}
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:542
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:182

◆ good_choice()

int tesseract::Dict::good_choice ( const WERD_CHOICE choice)

Returns true if a good answer is found for the unknown blob rating.

◆ has_hyphen_end() [1/2]

bool tesseract::Dict::has_hyphen_end ( const UNICHARSET unicharset,
UNICHAR_ID  unichar_id,
bool  first_pos 
) const
inline

Check whether the word has a hyphen at the end.

Definition at line 154 of file dict.h.

155 {
156 if (!last_word_on_line_ || first_pos) {
157 return false;
158 }
159 ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
160 const auto &normed_ids = unicharset->normed_ids(unichar_id);
161 return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
162 }

◆ has_hyphen_end() [2/2]

bool tesseract::Dict::has_hyphen_end ( const WERD_CHOICE word) const
inline

Same as above, but check the unichar at the end of the word.

Definition at line 164 of file dict.h.

164 {
165 int word_index = word.length() - 1;
166 return has_hyphen_end(word.unicharset(), word.unichar_id(word_index), word_index == 0);
167 }
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:154

◆ hyphen_base_size()

int tesseract::Dict::hyphen_base_size ( ) const
inline

Size of the base word (the part on the line before) of a hyphenated word.

Definition at line 139 of file dict.h.

139 {
140 return this->hyphenated() ? hyphen_word_->length() : 0;
141 }
unsigned length() const
Definition: ratngs.h:287

◆ hyphenated()

bool tesseract::Dict::hyphenated ( ) const
inline

Returns true if we've recorded the beginning of a hyphenated word.

Definition at line 135 of file dict.h.

135 {
136 return !last_word_on_line_ && hyphen_word_;
137 }

◆ init_active_dawgs()

void tesseract::Dict::init_active_dawgs ( DawgPositionVector active_dawgs,
bool  ambigs_mode 
) const

Fill the given active_dawgs vector with dawgs that could contain the beginning of the word. If hyphenated() returns true, copy the entries from hyphen_active_dawgs_ instead.

Definition at line 610 of file dict.cpp.

610 {
611 if (hyphenated()) {
612 *active_dawgs = hyphen_active_dawgs_;
613 if (dawg_debug_level >= 3) {
614 for (unsigned i = 0; i < hyphen_active_dawgs_.size(); ++i) {
615 tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
616 hyphen_active_dawgs_[i].dawg_index, hyphen_active_dawgs_[i].dawg_ref);
617 }
618 }
619 } else {
620 default_dawgs(active_dawgs, ambigs_mode);
621 }
622}
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:624

◆ INT_VAR_H() [1/6]

tesseract::Dict::INT_VAR_H ( dawg_debug_level  )

◆ INT_VAR_H() [2/6]

tesseract::Dict::INT_VAR_H ( hyphen_debug_level  )

◆ INT_VAR_H() [3/6]

tesseract::Dict::INT_VAR_H ( max_permuter_attempts  )

◆ INT_VAR_H() [4/6]

tesseract::Dict::INT_VAR_H ( stopper_debug_level  )

◆ INT_VAR_H() [5/6]

tesseract::Dict::INT_VAR_H ( stopper_smallword_size  )

◆ INT_VAR_H() [6/6]

tesseract::Dict::INT_VAR_H ( tessedit_truncate_wordchoice_log  )

◆ is_apostrophe()

bool tesseract::Dict::is_apostrophe ( UNICHAR_ID  unichar_id)
inline

Definition at line 125 of file dict.h.

125 {
126 const UNICHARSET &unicharset = getUnicharset();
127 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
128 const auto &normed_ids = unicharset.normed_ids(unichar_id);
129 return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
130 }

◆ IsSpaceDelimitedLang()

bool tesseract::Dict::IsSpaceDelimitedLang ( ) const

Returns true if the language is space-delimited, i.e. its unicharset contains no Han, Katakana, or Thai script (not Chinese, Japanese, or Thai).

Definition at line 912 of file dict.cpp.

912 {
913 const UNICHARSET &u_set = getUnicharset();
914 if (u_set.han_sid() > 0) {
915 return false;
916 }
917 if (u_set.katakana_sid() > 0) {
918 return false;
919 }
920 if (u_set.thai_sid() > 0) {
921 return false;
922 }
923 return true;
924}

◆ LengthOfShortestAlphaRun()

int tesseract::Dict::LengthOfShortestAlphaRun ( const WERD_CHOICE WordChoice) const

Returns the length of the shortest alpha run in WordChoice.

Definition at line 443 of file stopper.cpp.

443 {
444 int shortest = INT32_MAX;
445 int curr_len = 0;
446 for (unsigned w = 0; w < WordChoice.length(); ++w) {
447 if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) {
448 curr_len++;
449 } else if (curr_len > 0) {
450 if (curr_len < shortest) {
451 shortest = curr_len;
452 }
453 curr_len = 0;
454 }
455 }
456 if (curr_len > 0 && curr_len < shortest) {
457 shortest = curr_len;
458 } else if (shortest == INT32_MAX) {
459 shortest = 0;
460 }
461 return shortest;
462}

◆ LetterIsOkay()

int tesseract::Dict::LetterIsOkay ( void *  void_dawg_args,
const UNICHARSET unicharset,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inline

Calls letter_is_okay_ member function.

Definition at line 348 of file dict.h.

349 {
350 return (this->*letter_is_okay_)(void_dawg_args, unicharset, unichar_id, word_end);
351 }

◆ Load()

void tesseract::Dict::Load ( const std::string &  lang,
TessdataManager data_file 
)

Definition at line 200 of file dict.cpp.

200 {
201 // Load dawgs_.
202 if (load_punc_dawg) {
203 punc_dawg_ =
204 dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file);
205 if (punc_dawg_) {
206 dawgs_.push_back(punc_dawg_);
207 }
208 }
209 if (load_system_dawg) {
210 Dawg *system_dawg =
211 dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
212 if (system_dawg) {
213 dawgs_.push_back(system_dawg);
214 }
215 }
216 if (load_number_dawg) {
217 Dawg *number_dawg =
218 dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
219 if (number_dawg) {
220 dawgs_.push_back(number_dawg);
221 }
222 }
223 if (load_bigram_dawg) {
224 bigram_dawg_ =
225 dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG, dawg_debug_level, data_file);
226 // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
227 // dawgs_!!
228 }
229 if (load_freq_dawg) {
230 freq_dawg_ =
231 dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file);
232 if (freq_dawg_) {
233 dawgs_.push_back(freq_dawg_);
234 }
235 }
236 if (load_unambig_dawg) {
237 unambig_dawg_ =
238 dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file);
239 if (unambig_dawg_) {
240 dawgs_.push_back(unambig_dawg_);
241 }
242 }
243
244 std::string name;
245 if (!user_words_suffix.empty() || !user_words_file.empty()) {
246 Trie *trie_ptr =
247 new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
248 if (!user_words_file.empty()) {
249 name = user_words_file;
250 } else {
251 name = getCCUtil()->language_data_path_prefix;
252 name += user_words_suffix;
253 }
254 if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
255 Trie::RRP_REVERSE_IF_HAS_RTL)) {
256 tprintf("Error: failed to load %s\n", name.c_str());
257 delete trie_ptr;
258 } else {
259 dawgs_.push_back(trie_ptr);
260 }
261 }
262
263 if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
264 Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
265 dawg_debug_level);
266 trie_ptr->initialize_patterns(&(getUnicharset()));
267 if (!user_patterns_file.empty()) {
268 name = user_patterns_file;
269 } else {
270 name = getCCUtil()->language_data_path_prefix;
271 name += user_patterns_suffix;
272 }
273 if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
274 tprintf("Error: failed to load %s\n", name.c_str());
275 delete trie_ptr;
276 } else {
277 dawgs_.push_back(trie_ptr);
278 }
279 }
280
281 document_words_ =
282 new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
283 dawgs_.push_back(document_words_);
284
285 // This dawg is temporary and should not be searched by letter_is_ok.
286 pending_words_ =
287 new Trie(DAWG_TYPE_WORD, lang, NO_PERM, getUnicharset().size(), dawg_debug_level);
288}
@ DAWG_TYPE_WORD
Definition: dawg.h:66
@ TESSDATA_UNAMBIG_DAWG
@ TESSDATA_NUMBER_DAWG
@ TESSDATA_BIGRAM_DAWG
@ TESSDATA_SYSTEM_DAWG
@ USER_DAWG_PERM
Definition: ratngs.h:246
@ USER_PATTERN_PERM
Definition: ratngs.h:243
@ DOC_DAWG_PERM
Definition: ratngs.h:245
std::string language_data_path_prefix
Definition: ccutil.h:60
Dawg * GetSquishedDawg(const std::string &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:43
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:57

◆ LoadLSTM()

void tesseract::Dict::LoadLSTM ( const std::string &  lang,
TessdataManager data_file 
)

Definition at line 291 of file dict.cpp.

291 {
292 // Load dawgs_.
293 if (load_punc_dawg) {
294 punc_dawg_ =
295 dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file);
296 if (punc_dawg_) {
297 dawgs_.push_back(punc_dawg_);
298 }
299 }
300 if (load_system_dawg) {
301 Dawg *system_dawg =
302 dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
303 if (system_dawg) {
304 dawgs_.push_back(system_dawg);
305 }
306 }
307 if (load_number_dawg) {
308 Dawg *number_dawg =
309 dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
310 if (number_dawg) {
311 dawgs_.push_back(number_dawg);
312 }
313 }
314
315 // stolen from Dict::Load (but needs params_ from Tesseract
316 // langdata/config/api):
317 std::string name;
318 if (!user_words_suffix.empty() || !user_words_file.empty()) {
319 Trie *trie_ptr =
320 new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
321 if (!user_words_file.empty()) {
322 name = user_words_file;
323 } else {
324 name = getCCUtil()->language_data_path_prefix;
325 name += user_words_suffix;
326 }
327 if (!trie_ptr->read_and_add_word_list(name.c_str(), getUnicharset(),
328 Trie::RRP_REVERSE_IF_HAS_RTL)) {
329 tprintf("Error: failed to load %s\n", name.c_str());
330 delete trie_ptr;
331 } else {
332 dawgs_.push_back(trie_ptr);
333 }
334 }
335
336 if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
337 Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM, getUnicharset().size(),
338 dawg_debug_level);
339 trie_ptr->initialize_patterns(&(getUnicharset()));
340 if (!user_patterns_file.empty()) {
341 name = user_patterns_file;
342 } else {
343 name = getCCUtil()->language_data_path_prefix;
344 name += user_patterns_suffix;
345 }
346 if (!trie_ptr->read_pattern_list(name.c_str(), getUnicharset())) {
347 tprintf("Error: failed to load %s\n", name.c_str());
348 delete trie_ptr;
349 } else {
350 dawgs_.push_back(trie_ptr);
351 }
352 }
353}
@ TESSDATA_LSTM_SYSTEM_DAWG
@ TESSDATA_LSTM_PUNC_DAWG
@ TESSDATA_LSTM_NUMBER_DAWG

◆ NoDangerousAmbig()

bool tesseract::Dict::NoDangerousAmbig ( WERD_CHOICE BestChoice,
DANGERR fixpt,
bool  fix_replaceable,
MATRIX ratings 
)

Definition at line 158 of file stopper.cpp.

159 {
160 if (stopper_debug_level > 2) {
161 tprintf("\nRunning NoDangerousAmbig() for %s\n", best_choice->debug_string().c_str());
162 }
163
164 // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
165 // for each unichar id in BestChoice.
166 BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
167 bool ambigs_found = false;
168 // For each position in best_choice:
169 // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
170 // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
171 // -- look for ambiguities corresponding to wrong_ngram in the list while
172 // adding the following unichar_ids from best_choice to wrong_ngram
173 //
174 // Repeat the above procedure twice: first time look through
175 // ambigs to be replaced and replace all the ambiguities found;
176 // second time look through dangerous ambiguities and construct
177 // ambig_blob_choices with a fake blob choice for each ambiguity
178 // and pass them to dawg_permute_and_select() to search for
179 // ambiguous words in the dictionaries.
180 //
181 // Note that during the execution of the for loop (on the first pass)
182 // if replacements are made the length of best_choice might change.
183 for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
184 bool replace = (fix_replaceable && pass == 0);
185 const UnicharAmbigsVector &table =
186 replace ? getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
187 if (!replace) {
188 // Initialize ambig_blob_choices with lists containing a single
189 // unichar id for the corresponding position in best_choice.
190 // best_choice consisting of only the original letters will
191 // have a rating of 0.0.
192 for (unsigned i = 0; i < best_choice->length(); ++i) {
193 auto *lst = new BLOB_CHOICE_LIST();
194 BLOB_CHOICE_IT lst_it(lst);
195 // TODO(rays/antonova) Put real xheights and y shifts here.
196 lst_it.add_to_end(
197 new BLOB_CHOICE(best_choice->unichar_id(i), 0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
198 ambig_blob_choices.push_back(lst);
199 }
200 }
201 UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
202 int wrong_ngram_index;
203 int blob_index = 0;
204 for (unsigned i = 0; i < best_choice->length(); blob_index += best_choice->state(i), ++i) {
205 auto curr_unichar_id = best_choice->unichar_id(i);
206 if (stopper_debug_level > 2) {
207 tprintf("Looking for %s ngrams starting with %s:\n", replace ? "replaceable" : "ambiguous",
208 getUnicharset().debug_str(curr_unichar_id).c_str());
209 }
210 int num_wrong_blobs = best_choice->state(i);
211 wrong_ngram_index = 0;
212 wrong_ngram[wrong_ngram_index] = curr_unichar_id;
213 if (curr_unichar_id == INVALID_UNICHAR_ID || static_cast<size_t>(curr_unichar_id) >= table.size() ||
214 table[curr_unichar_id] == nullptr) {
215 continue; // there is no ambig spec for this unichar id
216 }
217 AmbigSpec_IT spec_it(table[curr_unichar_id]);
218 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
219 const AmbigSpec *ambig_spec = spec_it.data();
220 wrong_ngram[wrong_ngram_index + 1] = INVALID_UNICHAR_ID;
221 int compare = UnicharIdArrayUtils::compare(wrong_ngram, ambig_spec->wrong_ngram);
222 if (stopper_debug_level > 2) {
223 tprintf("candidate ngram: ");
224 UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
225 tprintf("current ngram from spec: ");
226 UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
227 tprintf("comparison result: %d\n", compare);
228 }
229 if (compare == 0) {
230 // Record the place where we found an ambiguity.
231 if (fixpt != nullptr) {
232 UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
233 fixpt->push_back(DANGERR_INFO(blob_index, blob_index + num_wrong_blobs, replace,
234 getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
235 leftmost_id));
236 if (stopper_debug_level > 1) {
237 tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index, blob_index + num_wrong_blobs, false,
238 getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
239 getUnicharset().id_to_unichar(leftmost_id));
240 }
241 }
242
243 if (replace) {
244 if (stopper_debug_level > 2) {
245 tprintf("replace ambiguity with %s : ",
246 getUnicharset().id_to_unichar(ambig_spec->correct_ngram_id));
247 UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset());
248 }
249 ReplaceAmbig(i, ambig_spec->wrong_ngram_size, ambig_spec->correct_ngram_id, best_choice,
250 ratings);
251 } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
252 // We found dang ambig - update ambig_blob_choices.
253 if (stopper_debug_level > 2) {
254 tprintf("found ambiguity: ");
255 UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset());
256 }
257 ambigs_found = true;
258 for (int tmp_index = 0; tmp_index <= wrong_ngram_index; ++tmp_index) {
259 // Add a blob choice for the corresponding fragment of the
260 // ambiguity. These fake blob choices are initialized with
261 // negative ratings (which are not possible for real blob
262 // choices), so that dawg_permute_and_select() considers any
263 // word not consisting of only the original letters a better
264 // choice and stops searching for alternatives once such a
265 // choice is found.
266 BLOB_CHOICE_IT bc_it(ambig_blob_choices[i + tmp_index]);
267 bc_it.add_to_end(new BLOB_CHOICE(ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
268 -1, 0, 1, 0, BCC_AMBIG));
269 }
270 }
271 spec_it.forward();
272 } else if (compare == -1) {
273 unsigned next_index;
274 if (wrong_ngram_index + 1 < ambig_spec->wrong_ngram_size &&
275 ((next_index = wrong_ngram_index + 1 + i) < best_choice->length())) {
276 // Add the next unichar id to wrong_ngram and keep looking for
277 // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
278 wrong_ngram[++wrong_ngram_index] = best_choice->unichar_id(next_index);
279 num_wrong_blobs += best_choice->state(next_index);
280 } else {
281 break; // no more matching ambigs in this AMBIG_SPEC_LIST
282 }
283 } else {
284 spec_it.forward();
285 }
286 } // end searching AmbigSpec_LIST
287 } // end searching best_choice
288 } // end searching replace and dangerous ambigs
289
290 // If any ambiguities were found permute the constructed ambig_blob_choices
291 // to see if an alternative dictionary word can be found.
292 if (ambigs_found) {
293 if (stopper_debug_level > 2) {
294 tprintf("\nResulting ambig_blob_choices:\n");
295 for (unsigned i = 0; i < ambig_blob_choices.size(); ++i) {
296 print_ratings_list("", ambig_blob_choices.at(i), getUnicharset());
297 tprintf("\n");
298 }
299 }
300 WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
301 ambigs_found = (alt_word->rating() < 0.0);
302 if (ambigs_found) {
303 if (stopper_debug_level >= 1) {
304 tprintf("Stopper: Possible ambiguous word = %s\n", alt_word->debug_string().c_str());
305 }
306 if (fixpt != nullptr) {
307 // Note: Currently character choices combined from fragments can only
308 // be generated by NoDangerousAmbig(). This code should be updated if
309 // the capability to produce classifications combined from character
310 // fragments is added to other functions.
311 int orig_i = 0;
312 for (unsigned i = 0; i < alt_word->length(); ++i) {
313 const UNICHARSET &uchset = getUnicharset();
314 bool replacement_is_ngram = uchset.get_isngram(alt_word->unichar_id(i));
315 UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
316 if (replacement_is_ngram) {
317 // we have to extract the leftmost unichar from the ngram.
318 const char *str = uchset.id_to_unichar(leftmost_id);
319 int step = uchset.step(str);
320 if (step) {
321 leftmost_id = uchset.unichar_to_id(str, step);
322 }
323 }
324 int end_i = orig_i + alt_word->state(i);
325 if (alt_word->state(i) > 1 || (orig_i + 1 == end_i && replacement_is_ngram)) {
326 // Compute proper blob indices.
327 int blob_start = 0;
328 for (int j = 0; j < orig_i; ++j) {
329 blob_start += best_choice->state(j);
330 }
331 int blob_end = blob_start;
332 for (int j = orig_i; j < end_i; ++j) {
333 blob_end += best_choice->state(j);
334 }
335 fixpt->push_back(
336 DANGERR_INFO(blob_start, blob_end, true, replacement_is_ngram, leftmost_id));
337 if (stopper_debug_level > 1) {
338 tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i, true,
339 replacement_is_ngram, uchset.id_to_unichar(leftmost_id));
340 }
341 }
342 orig_i += alt_word->state(i);
343 }
344 }
345 }
346 delete alt_word;
347 }
348 if (output_ambig_words_file_ != nullptr) {
349 fprintf(output_ambig_words_file_, "\n");
350 }
351
352 for (auto data : ambig_blob_choices) {
353 delete data;
354 }
355 return !ambigs_found;
356}
#define MAX_AMBIG_SIZE
Definition: ambigs.h:34
std::vector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:140
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:804
@ CASE_AMBIG
Definition: ambigs.h:45
@ BCC_AMBIG
Definition: ratngs.h:52
std::vector< BLOB_CHOICE_LIST * > BLOB_CHOICE_LIST_VECTOR
Definition: ratngs.h:627
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:93
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
Definition: ambigs.h:58
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:160
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:157
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:370
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:159
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:111

◆ NumDawgs()

int tesseract::Dict::NumDawgs ( ) const
inline

Return the number of dawgs in the dawgs_ vector.

Definition at line 381 of file dict.h.

381 {
382 return dawgs_.size();
383 }

◆ permute_choices()

void tesseract::Dict::permute_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  more_args 
)

permute_choices

Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST with the given char_choice_index in char_choices.

Definition at line 187 of file permdawg.cpp.

190 {
191 if (debug) {
192 tprintf(
193 "%s permute_choices: char_choice_index=%d"
194 " limit=%g rating=%g, certainty=%g word=%s\n",
195 debug, char_choice_index, *limit, word->rating(), word->certainty(),
196 word->debug_string().c_str());
197 }
198 if (static_cast<unsigned>(char_choice_index) < char_choices.size()) {
199 BLOB_CHOICE_IT blob_choice_it;
200 blob_choice_it.set_to_list(char_choices.at(char_choice_index));
201 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
202 (*attempts_left)--;
203 append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index,
204 prev_char_frag_info, word, certainties, limit, best_choice, attempts_left,
205 more_args);
206 if (*attempts_left <= 0) {
207 if (debug) {
208 tprintf("permute_choices(): attempts_left is 0\n");
209 }
210 break;
211 }
212 }
213 }
214}
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:224

◆ ProbabilityInContext()

double tesseract::Dict::ProbabilityInContext ( const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Calls probability_in_context_ member function.

Definition at line 357 of file dict.h.

358 {
359 return (this->*probability_in_context_)(getCCUtil()->lang.c_str(), context, context_bytes,
360 character, character_bytes);
361 }
std::string lang
Definition: ccutil.h:59

◆ ProcessPatternEdges()

void tesseract::Dict::ProcessPatternEdges ( const Dawg dawg,
const DawgPosition info,
UNICHAR_ID  unichar_id,
bool  word_end,
DawgArgs dawg_args,
PermuterType current_permuter 
) const

For each of the character classes of the given unichar_id (and the unichar_id itself), finds the corresponding outgoing edge or self-loop in the given dawg and (after checking that it is valid) records it in dawg_args->updated_dawgs. Updates current_permuter if any valid edges were found.

Definition at line 571 of file dict.cpp.

572 {
573 NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
574 // Try to find the edge corresponding to the exact unichar_id and to all the
575 // edges corresponding to the character class of unichar_id.
576 std::vector<UNICHAR_ID> unichar_id_patterns;
577 unichar_id_patterns.push_back(unichar_id);
578 dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns);
579 for (int unichar_id_pattern : unichar_id_patterns) {
580 // On the first iteration check all the outgoing edges.
581 // On the second iteration check all self-loops.
582 for (int k = 0; k < 2; ++k) {
583 EDGE_REF edge = (k == 0)
584 ? dawg->edge_char_of(node, unichar_id_pattern, word_end)
585 : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_pattern, word_end);
586 if (edge == NO_EDGE) {
587 continue;
588 }
589 if (dawg_debug_level >= 3) {
590 tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node,
591 edge);
592 tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
593 }
594 if (dawg->permuter() > *curr_perm) {
595 *curr_perm = dawg->permuter();
596 }
597 if (dawg->end_of_word(edge)) {
598 dawg_args->valid_end = true;
599 }
600 dawg_args->updated_dawgs->add_unique(
601 DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, pos.back_to_punc),
602 dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
603 }
604 }
605}

◆ ReplaceAmbig()

void tesseract::Dict::ReplaceAmbig ( int  wrong_ngram_begin_index,
int  wrong_ngram_size,
UNICHAR_ID  correct_ngram_id,
WERD_CHOICE werd_choice,
MATRIX ratings 
)

Definition at line 370 of file stopper.cpp.

371 {
372 int num_blobs_to_replace = 0;
373 int begin_blob_index = 0;
374 int i;
375 // Rating and certainty for the new BLOB_CHOICE are derived from the
376 // replaced choices.
377 float new_rating = 0.0f;
378 float new_certainty = 0.0f;
379 BLOB_CHOICE *old_choice = nullptr;
380 for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
381 if (i >= wrong_ngram_begin_index) {
382 int num_blobs = werd_choice->state(i);
383 int col = begin_blob_index + num_blobs_to_replace;
384 int row = col + num_blobs - 1;
385 BLOB_CHOICE_LIST *choices = ratings->get(col, row);
386 ASSERT_HOST(choices != nullptr);
387 old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
388 ASSERT_HOST(old_choice != nullptr);
389 new_rating += old_choice->rating();
390 new_certainty += old_choice->certainty();
391 num_blobs_to_replace += num_blobs;
392 } else {
393 begin_blob_index += werd_choice->state(i);
394 }
395 }
396 new_certainty /= wrong_ngram_size;
397 // If there is no entry in the ratings matrix, add it.
398 MATRIX_COORD coord(begin_blob_index, begin_blob_index + num_blobs_to_replace - 1);
399 if (!coord.Valid(*ratings)) {
400 ratings->IncreaseBandSize(coord.row - coord.col + 1);
401 }
402 if (ratings->get(coord.col, coord.row) == nullptr) {
403 ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
404 }
405 BLOB_CHOICE_LIST *new_choices = ratings->get(coord.col, coord.row);
406 BLOB_CHOICE *choice = FindMatchingChoice(correct_ngram_id, new_choices);
407 if (choice != nullptr) {
408 // Already there. Upgrade if new rating better.
409 if (new_rating < choice->rating()) {
410 choice->set_rating(new_rating);
411 }
412 if (new_certainty < choice->certainty()) {
413 choice->set_certainty(new_certainty);
414 }
415 // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
416 } else {
417 // Need a new choice with the correct_ngram_id.
418 choice = new BLOB_CHOICE(*old_choice);
419 choice->set_unichar_id(correct_ngram_id);
420 choice->set_rating(new_rating);
421 choice->set_certainty(new_certainty);
422 choice->set_classifier(BCC_AMBIG);
423 choice->set_matrix_cell(coord.col, coord.row);
424 BLOB_CHOICE_IT it(new_choices);
425 it.add_to_end(choice);
426 }
427 // Remove current unichar from werd_choice. On the last iteration
428 // set the correct replacement unichar instead of removing a unichar.
429 for (int replaced_count = 0; replaced_count < wrong_ngram_size; ++replaced_count) {
430 if (replaced_count + 1 == wrong_ngram_size) {
431 werd_choice->set_blob_choice(wrong_ngram_begin_index, num_blobs_to_replace, choice);
432 } else {
433 werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
434 }
435 }
436 if (stopper_debug_level >= 1) {
437 werd_choice->print("ReplaceAmbig() ");
438 tprintf("Modified blob_choices: ");
439 print_ratings_list("\n", new_choices, getUnicharset());
440 }
441}
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:177

◆ reset_hyphen_vars()

void tesseract::Dict::reset_hyphen_vars ( bool  last_word_on_line)

Unless the previous word was the last one on the line and the current one is not (thus it is the first one on the line), erase hyphen_word_ and clear hyphen_active_dawgs_. In all cases, update last_word_on_line_ to the given value.

Definition at line 27 of file hyphen.cpp.

27 {
28 if (!(last_word_on_line_ == true && last_word_on_line == false)) {
29 if (hyphen_word_ != nullptr) {
30 delete hyphen_word_;
31 hyphen_word_ = nullptr;
32 hyphen_active_dawgs_.clear();
33 }
34 }
35 if (hyphen_debug_level) {
36 tprintf("reset_hyphen_vars: last_word_on_line %d -> %d\n", last_word_on_line_,
37 last_word_on_line);
38 }
39 last_word_on_line_ = last_word_on_line;
40}

◆ ResetDocumentDictionary()

void tesseract::Dict::ResetDocumentDictionary ( )
inline

Definition at line 297 of file dict.h.

297 {
298 if (pending_words_ != nullptr) {
299 pending_words_->clear();
300 }
301 if (document_words_ != nullptr) {
302 document_words_->clear();
303 }
304 }
void clear()
Definition: trie.cpp:50

◆ set_hyphen_word()

void tesseract::Dict::set_hyphen_word ( const WERD_CHOICE word,
const DawgPositionVector active_dawgs 
)

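If the given word has a lower rating than the currently stored hyphen_word_ (or none is stored yet), update hyphen_word_ to the word with its trailing hyphen removed and copy the given DawgPositionVector into hyphen_active_dawgs_.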

Definition at line 44 of file hyphen.cpp.

44 {
45 if (hyphen_word_ == nullptr) {
46 hyphen_word_ = new WERD_CHOICE(word.unicharset());
47 hyphen_word_->make_bad();
48 }
49 if (hyphen_word_->rating() > word.rating()) {
50 *hyphen_word_ = word;
51 // Remove the last unichar id as it is a hyphen, and remove
52 // any unichar_string/lengths that are present.
53 hyphen_word_->remove_last_unichar_id();
54 hyphen_active_dawgs_ = active_dawgs;
55 }
56 if (hyphen_debug_level) {
57 hyphen_word_->print("set_hyphen_word: ");
58 }
59}
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:419
void remove_last_unichar_id()
Definition: ratngs.h:455
float rating() const
Definition: ratngs.h:312

◆ SettupStopperPass1()

void tesseract::Dict::SettupStopperPass1 ( )

Sets up stopper variables in preparation for the first pass.

Definition at line 362 of file stopper.cpp.

362 {
363 reject_offset_ = 0.0;
364}

◆ SettupStopperPass2()

void tesseract::Dict::SettupStopperPass2 ( )

Sets up stopper variables in preparation for the second pass.

Definition at line 366 of file stopper.cpp.

366 {
367 reject_offset_ = stopper_phase2_certainty_rejection_offset;
368}

◆ SetupForLoad()

void tesseract::Dict::SetupForLoad ( DawgCache dawg_cache)

Definition at line 180 of file dict.cpp.

180 {
181 if (dawgs_.size() != 0) {
182 this->End();
183 }
184
185 apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
186 question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
187 slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
188 hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
189
190 if (dawg_cache != nullptr) {
191 dawg_cache_ = dawg_cache;
192 dawg_cache_is_ours_ = false;
193 } else {
194 dawg_cache_ = new DawgCache();
195 dawg_cache_is_ours_ = true;
196 }
197}

◆ SetWildcardID()

void tesseract::Dict::SetWildcardID ( UNICHAR_ID  id)
inline

Definition at line 374 of file dict.h.

374 {
375 wildcard_unichar_id_ = id;
376 }

◆ SetWordsegRatingAdjustFactor()

void tesseract::Dict::SetWordsegRatingAdjustFactor ( float  f)
inline

Set wordseg_rating_adjust_factor_ to the given value.

Definition at line 469 of file dict.h.

469 {
470 wordseg_rating_adjust_factor_ = f;
471 }

◆ STRING_VAR_H() [1/6]

tesseract::Dict::STRING_VAR_H ( output_ambig_words_file  )

◆ STRING_VAR_H() [2/6]

tesseract::Dict::STRING_VAR_H ( user_patterns_file  )

◆ STRING_VAR_H() [3/6]

tesseract::Dict::STRING_VAR_H ( user_patterns_suffix  )

◆ STRING_VAR_H() [4/6]

tesseract::Dict::STRING_VAR_H ( user_words_file  )

Variable members. These have to be declared and initialized after image_ptr_, which contains the pointer to the params vector - the member of its base CCUtil class.

◆ STRING_VAR_H() [5/6]

tesseract::Dict::STRING_VAR_H ( user_words_suffix  )

◆ STRING_VAR_H() [6/6]

tesseract::Dict::STRING_VAR_H ( word_to_debug  )

◆ UniformCertainties()

int tesseract::Dict::UniformCertainties ( const WERD_CHOICE word)

Returns true if the certainty of the BestChoice word is within a reasonable range of the average certainties for the best choices for each character in the segmentation. This test is used to catch words in which one character is much worse than the other characters in the word (i.e. false will be returned in that case). The algorithm computes the mean and std deviation of the certainties in the word with the worst certainty thrown out.

Definition at line 464 of file stopper.cpp.

464 {
465 float Certainty;
466 float WorstCertainty = FLT_MAX;
467 float CertaintyThreshold;
468 double TotalCertainty;
469 double TotalCertaintySquared;
470 double Variance;
471 float Mean, StdDev;
472 int word_length = word.length();
473
474 if (word_length < 3) {
475 return true;
476 }
477
478 TotalCertainty = TotalCertaintySquared = 0.0;
479 for (int i = 0; i < word_length; ++i) {
480 Certainty = word.certainty(i);
481 TotalCertainty += Certainty;
482 TotalCertaintySquared += static_cast<double>(Certainty) * Certainty;
483 if (Certainty < WorstCertainty) {
484 WorstCertainty = Certainty;
485 }
486 }
487
488 // Subtract off worst certainty from statistics.
489 word_length--;
490 TotalCertainty -= WorstCertainty;
491 TotalCertaintySquared -= static_cast<double>(WorstCertainty) * WorstCertainty;
492
493 Mean = TotalCertainty / word_length;
494 Variance = ((word_length * TotalCertaintySquared - TotalCertainty * TotalCertainty) /
495 (word_length * (word_length - 1)));
496 if (Variance < 0.0) {
497 Variance = 0.0;
498 }
499 StdDev = sqrt(Variance);
500
501 CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
502 if (CertaintyThreshold > stopper_nondict_certainty_base) {
503 CertaintyThreshold = stopper_nondict_certainty_base;
504 }
505
506 if (word.certainty() < CertaintyThreshold) {
507 if (stopper_debug_level >= 1) {
508 tprintf(
509 "Stopper: Non-uniform certainty = %4.1f"
510 " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
511 word.certainty(), Mean, StdDev, CertaintyThreshold);
512 }
513 return false;
514 } else {
515 return true;
516 }
517}
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:1662

◆ update_best_choice()

void tesseract::Dict::update_best_choice ( const WERD_CHOICE & word,
WERD_CHOICE * best_choice 
)
inline

Copies word into best_choice if its rating is smaller than that of best_choice.

Definition at line 182 of file dict.h.

182 {
183 if (word.rating() < best_choice->rating()) {
184 *best_choice = word;
185 }
186 }

◆ valid_bigram()

bool tesseract::Dict::valid_bigram ( const WERD_CHOICE & word1,
const WERD_CHOICE & word2 
) const

Definition at line 836 of file dict.cpp.

836 {
837 if (bigram_dawg_ == nullptr) {
838 return false;
839 }
840
841 // Extract the core word from the middle of each word with any digits
842 // replaced with question marks.
843 unsigned w1start, w1end, w2start, w2end;
844 word1.punct_stripped(&w1start, &w1end);
845 word2.punct_stripped(&w2start, &w2end);
846
847 // We don't want to penalize a single guillemet, hyphen, etc.
848 // But our bigram list doesn't have any information about punctuation.
849 if (w1start >= w1end) {
850 return word1.length() < 3;
851 }
852 if (w2start >= w2end) {
853 return word2.length() < 3;
854 }
855
856 const UNICHARSET &uchset = getUnicharset();
857 std::vector<UNICHAR_ID> bigram_string;
858 bigram_string.reserve(w1end + w2end + 1);
859 for (auto i = w1start; i < w1end; i++) {
860 const auto &normed_ids = getUnicharset().normed_ids(word1.unichar_id(i));
861 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
862 bigram_string.push_back(question_unichar_id_);
863 } else {
864 bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
865 }
866 }
867 bigram_string.push_back(UNICHAR_SPACE);
868 for (auto i = w2start; i < w2end; i++) {
869 const auto &normed_ids = getUnicharset().normed_ids(word2.unichar_id(i));
870 if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0])) {
871 bigram_string.push_back(question_unichar_id_);
872 } else {
873 bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
874 }
875 }
876 WERD_CHOICE normalized_word(&uchset, bigram_string.size());
877 for (int i : bigram_string) {
878 normalized_word.append_unichar_id_space_allocated(i, 1, 0.0f, 0.0f);
879 }
880 return bigram_dawg_->word_in_dawg(normalized_word);
881}
@ UNICHAR_SPACE
Definition: unicharset.h:36
const std::vector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:868

◆ valid_punctuation()

bool tesseract::Dict::valid_punctuation ( const WERD_CHOICE & word)

Returns true if the word contains a valid punctuation pattern. Note: Since the domains of punctuation symbols and symbols used in numbers are not disjoint, a valid number might contain an invalid punctuation pattern (e.g. .99).

Definition at line 883 of file dict.cpp.

883 {
884 if (word.empty()) {
885 return NO_PERM;
886 }
887 WERD_CHOICE new_word(word.unicharset());
888 auto last_index = word.length() - 1;
889 int new_len = 0;
890 for (unsigned i = 0; i <= last_index; ++i) {
891 UNICHAR_ID unichar_id = (word.unichar_id(i));
892 if (getUnicharset().get_ispunctuation(unichar_id)) {
893 new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
894 } else if (!getUnicharset().get_isalpha(unichar_id) &&
895 !getUnicharset().get_isdigit(unichar_id)) {
896 return false; // neither punc, nor alpha, nor digit
897 } else if ((new_len = new_word.length()) == 0 ||
898 new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
899 new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
900 }
901 }
902 for (unsigned i = 0; i < dawgs_.size(); ++i) {
903 if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
904 dawgs_[i]->word_in_dawg(new_word)) {
905 return true;
906 }
907 }
908 return false;
909}
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:533

◆ valid_word() [1/3]

int tesseract::Dict::valid_word ( const char *  string) const
inline

This function is used by api/tesseract_cube_combiner.cpp.

Definition at line 450 of file dict.h.

450 {
451 WERD_CHOICE word(string, getUnicharset());
452 return valid_word(word);
453 }

◆ valid_word() [2/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE & word) const
inline

Definition at line 443 of file dict.h.

443 {
444 return valid_word(word, false); // return NO_PERM for words with digits
445 }

◆ valid_word() [3/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE & word,
bool  numbers_ok 
) const

Definition at line 801 of file dict.cpp.

801 {
802 const WERD_CHOICE *word_ptr = &word;
803 WERD_CHOICE temp_word(word.unicharset());
804 if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
805 copy_hyphen_info(&temp_word);
806 temp_word += word;
807 word_ptr = &temp_word;
808 }
809 if (word_ptr->empty()) {
810 return NO_PERM;
811 }
812 // Allocate vectors for holding current and updated
813 // active_dawgs and initialize them.
814 DawgPositionVector active_dawgs[2];
815 init_active_dawgs(&(active_dawgs[0]), false);
816 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
817 int last_index = word_ptr->length() - 1;
818 // Call letter_is_okay for each letter in the word.
819 for (int i = hyphen_base_size(); i <= last_index; ++i) {
820 if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(), word_ptr->unichar_id(i),
821 i == last_index))) {
822 break;
823 }
824 // Swap active_dawgs, constraints with the corresponding updated vector.
825 if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
826 dawg_args.updated_dawgs = &(active_dawgs[0]);
827 ++(dawg_args.active_dawgs);
828 } else {
829 ++(dawg_args.updated_dawgs);
830 dawg_args.active_dawgs = &(active_dawgs[0]);
831 }
832 }
833 return valid_word_permuter(dawg_args.permuter, numbers_ok) ? dawg_args.permuter : NO_PERM;
834}
const UNICHARSET * unicharset() const
Definition: ratngs.h:281
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:145
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:139

◆ valid_word_or_number()

int tesseract::Dict::valid_word_or_number ( const WERD_CHOICE & word) const
inline

Definition at line 446 of file dict.h.

446 {
447 return valid_word(word, true); // return NUMBER_PERM for valid numbers
448 }

◆ valid_word_permuter()

static bool tesseract::Dict::valid_word_permuter ( uint8_t  perm,
bool  numbers_ok 
)
inlinestatic

Check all the DAWGs to see if this word is in any of them.

Read/Write/Access special purpose dawgs which contain words only of a certain length (used for phrase search for non-space-delimited languages).

Definition at line 437 of file dict.h.

437 {
438 return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == DOC_DAWG_PERM ||
439 perm == USER_DAWG_PERM || perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
440 (numbers_ok && perm == NUMBER_PERM));
441 }
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:244
@ NUMBER_PERM
Definition: ratngs.h:242

◆ WildcardID()

UNICHAR_ID tesseract::Dict::WildcardID ( ) const
inline

Definition at line 377 of file dict.h.

377 {
378 return wildcard_unichar_id_;
379 }

Member Data Documentation

◆ go_deeper_fxn_

void(Dict::* tesseract::Dict::go_deeper_fxn_) (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)

Pointer to go_deeper function.

Definition at line 210 of file dict.h.

◆ letter_is_okay_

int(Dict::* tesseract::Dict::letter_is_okay_) (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const

Definition at line 345 of file dict.h.

◆ probability_in_context_

double(Dict::* tesseract::Dict::probability_in_context_) (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Probability in context function used by the ngram permuter.

Definition at line 354 of file dict.h.


The documentation for this class was generated from the following files: