tesseract v5.3.3.20231005
dict.h
Go to the documentation of this file.
1
2// File: dict.h
3// Description: dict class.
4// Author: Samuel Charron
5//
6// (C) Copyright 2006, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifndef TESSERACT_DICT_DICT_H_
20#define TESSERACT_DICT_DICT_H_
21
22#ifdef HAVE_CONFIG_H
23# include "config_auto.h" // DISABLED_LEGACY_ENGINE
24#endif
25
26#ifndef DISABLED_LEGACY_ENGINE
27# include "ambigs.h"
28#endif
29#include "dawg.h"
30#include "dawg_cache.h"
31#include "ratngs.h"
32#include "stopper.h"
33#include "trie.h"
34#include "unicharset.h"
35#ifndef DISABLED_LEGACY_ENGINE
37#endif // ndef DISABLED_LEGACY_ENGINE
38
39namespace tesseract {
40
41class MATRIX;
42class WERD_RES;
43
// Numeric macro constants. Their use sites are not visible in this header.
44#define CHARS_PER_LINE 500
// NOTE(review): the expansion is a bare cast expression; writing it as
// ((int64_t)128) would keep it a single operand in every context.
45#define MAX_WERD_LENGTH (int64_t)128
// NOTE(review): negative literal without enclosing parentheses; (-1) is the
// conventional macro-safe spelling.
46#define NO_RATING -1
47
53 float rating;
54 float certainty;
55};
56
57using DawgVector = std::vector<Dawg *>;
58
59//
60// Constants
61//
// File-scope constants. Because they are declared `static` in a header, each
// translation unit that includes this file gets its own copy.
62static const int kRatingPad = 4;
63static const int kDictMaxWildcards = 2; // max wildcards for a word
64// TODO(daria): If hyphens are different in different languages and can be
65// inferred from training data we should load their values dynamically.
// Single-character, NUL-terminated symbol strings.
66static const char kHyphenSymbol[] = "-";
67static const char kSlashSymbol[] = "/";
68static const char kQuestionSymbol[] = "?";
69static const char kApostropheSymbol[] = "'";
// The initializers below are double literals converted to float.
70static const float kSimCertaintyScale = -10.0; // similarity matcher scaling
71static const float kSimCertaintyOffset = -10.0; // similarity matcher offset
72static const float kSimilarityFloor = 100.0; // worst E*L product to stop on
// NOTE(review): presumably the maximum run of repeated characters accepted
// for document-dictionary words -- confirm against the use site.
73static const int kDocDictMaxRepChars = 4;
74
75// Enum for describing whether the x-height for the word is consistent:
76// 0 - everything is good.
77// 1 - there are one or two secondary (but consistent) baselines
78// [think subscript and superscript], or there is an oversized
79// first character.
80// 2 - the word is inconsistent.
82
83struct DawgArgs {
85 : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {}
86
90 // True if the current position is a valid word end.
92};
93
95public:
96 Dict(CCUtil *image_ptr);
97 ~Dict();
// Read-only accessor: returns the stored ccutil_ pointer as pointer-to-const.
98 const CCUtil *getCCUtil() const {
99 return ccutil_;
100 }
102 return ccutil_;
103 }
// Returns a const reference to the `unicharset` member of the object
// returned by getCCUtil().
104 const UNICHARSET &getUnicharset() const {
105 return getCCUtil()->unicharset;
106 }
108 return getCCUtil()->unicharset;
109 }
110#ifndef DISABLED_LEGACY_ENGINE
112 return getCCUtil()->unichar_ambigs;
113 }
114#endif
115 // Returns true if unichar_id is a word compounding character like - or /.
// unichar_id must be contained in the unicharset (checked with ASSERT_HOST).
// The result is true only when the id has exactly one normalized id and that
// id equals the cached hyphen_unichar_id_ or slash_unichar_id_.
// NOTE(review): only reads members and has a const getUnicharset() overload
// available, yet is not const-qualified like has_hyphen_end().
116 inline bool compound_marker(UNICHAR_ID unichar_id) {
117 const UNICHARSET &unicharset = getUnicharset();
118 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
119 const auto &normed_ids = unicharset.normed_ids(unichar_id);
120 return normed_ids.size() == 1 &&
121 (normed_ids[0] == hyphen_unichar_id_ || normed_ids[0] == slash_unichar_id_);
122 }
123 // Returns true if unichar_id is an apostrophe-like character that may
124 // separate prefix/suffix words from a main body word.
// unichar_id must be contained in the unicharset (checked with ASSERT_HOST).
// The result is true only when the id has exactly one normalized id and that
// id equals the cached apostrophe_unichar_id_.
125 inline bool is_apostrophe(UNICHAR_ID unichar_id) {
126 const UNICHARSET &unicharset = getUnicharset();
127 ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
128 const auto &normed_ids = unicharset.normed_ids(unichar_id);
129 return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
130 }
131
132 /* hyphen.cpp ************************************************************/
133
// Returns true if we've recorded the beginning of a hyphenated word, i.e.
// hyphen_word_ is non-null and last_word_on_line_ is false.
135 inline bool hyphenated() const {
136 return !last_word_on_line_ && hyphen_word_;
137 }
// Size of the base word (the part on the line before) of a hyphenated word.
// Returns hyphen_word_->length() when hyphenated() is true, otherwise 0.
139 inline int hyphen_base_size() const {
140 return this->hyphenated() ? hyphen_word_->length() : 0;
141 }
// If hyphenated() is true, overwrites *word with a copy of *hyphen_word_;
// otherwise leaves *word untouched. When hyphen_debug_level is non-zero the
// copied word is also printed with the prefix "copy_hyphen_info: ".
145 inline void copy_hyphen_info(WERD_CHOICE *word) const {
146 if (this->hyphenated()) {
147 *word = *hyphen_word_;
148 if (hyphen_debug_level) {
149 word->print("copy_hyphen_info: ");
150 }
151 }
152 }
// Check whether the word has a hyphen at the end.
// Returns false immediately unless last_word_on_line_ is set and first_pos
// is false. Otherwise unichar_id must be contained in *unicharset (checked
// with ASSERT_HOST), and the result is true only when it has exactly one
// normalized id equal to the cached hyphen_unichar_id_.
154 inline bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id,
155 bool first_pos) const {
156 if (!last_word_on_line_ || first_pos) {
157 return false;
158 }
159 ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
160 const auto &normed_ids = unicharset->normed_ids(unichar_id);
161 return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
162 }
// Overload taking a whole word: checks the unichar at the end of the word by
// forwarding its unicharset, its last unichar id, and whether that id is also
// the first position to the three-argument has_hyphen_end().
// An empty word has no last unichar and is reported as not ending in a
// hyphen. The length is converted to int before subtracting so that an
// unsigned length of 0 yields -1 instead of wrapping, and the element lookup
// is skipped in that case.
164 inline bool has_hyphen_end(const WERD_CHOICE &word) const {
165 const int word_index = static_cast<int>(word.length()) - 1;
166 return word_index >= 0 && has_hyphen_end(word.unicharset(), word.unichar_id(word_index), word_index == 0);
167 }
// Declarations only; the definitions are not part of this header
// (presumably in hyphen.cpp, per the section banner above -- confirm).
171 void reset_hyphen_vars(bool last_word_on_line);
// Takes the word and the active dawg positions by const reference.
174 void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs);
175
176 /* permdawg.cpp ************************************************************/
177 // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().
178 // When this function is refactored, permdawg.cpp can be removed.
179
// Copies word into *best_choice when word's rating is strictly lower than
// the current best choice's rating; otherwise *best_choice is left as is.
182 inline void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice) {
183 if (word.rating() < best_choice->rating()) {
184 *best_choice = word;
185 }
186 }
// Declaration only. Writes through the active_dawgs out-parameter;
// NOTE(review): the effect of ambigs_mode is not visible here -- see the
// definition.
190 void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const;
191 // Fill the given vector with the default collection of any-length dawgs
192 void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const;
193
// Declaration only. Returns a WERD_CHOICE pointer;
// NOTE(review): ownership of the returned object is not stated here --
// confirm against the definition.
199 WERD_CHOICE *dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices,
200 float rating_limit);
// Declaration only. Has the same parameter list as the go_deeper_fxn_
// member-function pointer declared below, so it can be assigned to it.
204 void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
205 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
206 bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
207 WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args);
208
// Pointer to a non-const Dict member function whose signature matches
// go_deeper_dawg_fxn(). Where it is assigned and invoked is not visible in
// this header.
210 void (Dict::*go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
211 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
212 bool word_ending, WERD_CHOICE *word, float certainties[],
213 float *limit, WERD_CHOICE *best_choice, int *attempts_left,
214 void *void_more_args);
215 //
216 // Helper functions for dawg_permute_and_select().
217 //
// All three are declarations only. permute_choices() and append_choices()
// share the trailing (word, certainties, limit, best_choice, attempts_left,
// more_args) parameters; append_choices() additionally takes the
// BLOB_CHOICE being appended.
218 void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
219 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
220 WERD_CHOICE *word, float certainties[], float *limit,
221 WERD_CHOICE *best_choice, int *attempts_left, void *more_args);
222
223 void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
224 const BLOB_CHOICE &blob_choice, int char_choice_index,
225 const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
226 float certainties[], float *limit, WERD_CHOICE *best_choice,
227 int *attempts_left, void *more_args);
228
// NOTE(review): word_ending is declared int here but bool in
// go_deeper_dawg_fxn(); char_frag_info is a non-const pointer, presumably
// an output -- confirm against the definition.
229 bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
230 const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,
231 int word_ending, CHAR_FRAGMENT_INFO *char_frag_info);
232
233 /* stopper.cpp *************************************************************/
// NoDangerousAmbig() is only declared when the legacy engine is enabled
// (DISABLED_LEGACY_ENGINE not defined).
234#if !defined(DISABLED_LEGACY_ENGINE)
235 bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable,
236 MATRIX *ratings);
237#endif // !defined(DISABLED_LEGACY_ENGINE)
238 // Replaces the corresponding wrong ngram in werd_choice with the correct
239 // one. The whole correct n-gram is inserted into the ratings matrix and
240 // the werd_choice: no more fragments! Rating and certainty of new entries
241 // in matrix and werd_choice are the sum and mean of the wrong ngram
242 // respectively.
243 // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
244 // mystring", with a new entry in the ratings matrix for ".
245 void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id,
246 WERD_CHOICE *werd_choice, MATRIX *ratings);
247
// Declarations only; the definitions are not part of this header.
249 int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;
257 int UniformCertainties(const WERD_CHOICE &word);
259 bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency);
263 bool AcceptableResult(WERD_RES *word) const;
// EndDangerousAmbigs() is only declared when the legacy engine is enabled.
264#if !defined(DISABLED_LEGACY_ENGINE)
265 void EndDangerousAmbigs();
266#endif // !defined(DISABLED_LEGACY_ENGINE)
268 void DebugWordChoices();
// NOTE(review): the "Settup" spelling is part of the public method names;
// correcting it would break existing callers.
270 void SettupStopperPass1();
272 void SettupStopperPass2();
273 /* context.cpp *************************************************************/
275 int case_ok(const WERD_CHOICE &word) const;
// NOTE(review): takes the unicharset explicitly rather than using
// getUnicharset(), and is not const-qualified.
278 bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
279
280 /* dict.cpp ****************************************************************/
281
// Static accessor returning a DawgCache pointer.
// NOTE(review): lifetime and ownership of the returned cache are not
// visible in this header.
284 static DawgCache *GlobalDawgCache();
285 // Sets up ready for a Load or LoadLSTM.
286 void SetupForLoad(DawgCache *dawg_cache);
287 // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
288 void Load(const std::string &lang, TessdataManager *data_file);
289 // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
290 void LoadLSTM(const std::string &lang, TessdataManager *data_file);
291 // Completes the loading process after Load() and/or LoadLSTM().
292 // Returns false if no dictionaries were loaded.
293 bool FinishLoad();
// NOTE(review): presumably releases what the load calls above acquired --
// confirm against the definition.
294 void End();
295
296 // Resets the document dictionary analogous to ResetAdaptiveClassifier.
298 if (pending_words_ != nullptr) {
299 pending_words_->clear();
300 }
301 if (document_words_ != nullptr) {
302 document_words_->clear();
303 }
304 }
305
341 //
// Declaration only. Its signature matches the letter_is_okay_ pointer
// below, so it can be assigned to that pointer.
342 int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id,
343 bool word_end) const;
344
// Pointer to a const Dict member function used by LetterIsOkay().
345 int (Dict::*letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset,
346 UNICHAR_ID unichar_id, bool word_end) const;
// Calls letter_is_okay_ member function, forwarding all arguments unchanged
// and returning its result. letter_is_okay_ must be set before this is
// called.
348 int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id,
349 bool word_end) const {
350 return (this->*letter_is_okay_)(void_dawg_args, unicharset, unichar_id, word_end);
351 }
352
// Pointer to a non-const Dict member function used by
// ProbabilityInContext(). Its signature matches
// def_probability_in_context().
354 double (Dict::*probability_in_context_)(const char *lang, const char *context, int context_bytes,
355 const char *character, int character_bytes);
// Calls probability_in_context_ member function. The lang argument is
// supplied from getCCUtil()->lang; the remaining arguments are forwarded
// unchanged and the callee's result is returned.
357 double ProbabilityInContext(const char *context, int context_bytes, const char *character,
358 int character_bytes) {
359 return (this->*probability_in_context_)(getCCUtil()->lang.c_str(), context, context_bytes,
360 character, character_bytes);
361 }
362
// Default (no-op) implementation of probability in context function.
// Ignores every argument and always returns 0.0.
364 double def_probability_in_context(const char *lang, const char *context, int context_bytes,
365 const char *character, int character_bytes) {
// The void casts only mark the parameters as intentionally unused.
366 (void)lang;
367 (void)context;
368 (void)context_bytes;
369 (void)character;
370 (void)character_bytes;
371 return 0.0;
372 }
373
// Stores id in wildcard_unichar_id_.
374 inline void SetWildcardID(UNICHAR_ID id) {
375 wildcard_unichar_id_ = id;
376 }
// Returns the value currently stored in wildcard_unichar_id_.
377 inline UNICHAR_ID WildcardID() const {
378 return wildcard_unichar_id_;
379 }
// Return the number of dawgs in the dawgs_ vector.
// The vector's size is implicitly converted to int.
381 inline int NumDawgs() const {
382 return dawgs_.size();
383 }
// Return i-th dawg pointer recorded in the dawgs_ vector.
// Uses unchecked operator[]; the caller must pass an index below
// NumDawgs().
385 inline const Dawg *GetDawg(int index) const {
386 return dawgs_[index];
387 }
// Return the pointer to the punctuation dawg (the punc_dawg_ member).
389 inline const Dawg *GetPuncDawg() const {
390 return punc_dawg_;
391 }
// Return the pointer to the unambiguous words dawg (the unambig_dawg_
// member).
393 inline const Dawg *GetUnambigDawg() const {
394 return unambig_dawg_;
395 }
// Returns the appropriate next node given the EDGE_REF.
//  - edge_ref == NO_EDGE: returns 0 without touching dawg.
//  - otherwise: returns dawg->next_node(edge_ref), except that a result of
//    0 is mapped to NO_EDGE.
397 static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
398 if (edge_ref == NO_EDGE) {
399 return 0; // beginning to explore the dawg
400 }
401 NODE_REF node = dawg->next_node(edge_ref);
402 if (node == 0) {
403 node = NO_EDGE; // end of word
404 }
405 return node;
406 }
407
408 // Given a unichar from a string and a given dawg, return the unichar
409 // we should use to match in that dawg type. (for example, in the number
410 // dawg, all numbers are transformed to Dawg::kPatternUnicharID).
// A null dawg, or any dawg type other than DAWG_TYPE_NUMBER, returns ch
// unchanged. For DAWG_TYPE_NUMBER only ids for which
// unicharset.get_isdigit(ch) is true are replaced.
411 UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const {
412 if (!dawg) {
413 return ch;
414 }
415 switch (dawg->type()) {
416 case DAWG_TYPE_NUMBER:
417 return unicharset.get_isdigit(ch) ? Dawg::kPatternUnicharID : ch;
418 default:
419 return ch;
420 }
421 }
422
// Declaration only. dawg_args and current_permuter are non-const pointers,
// presumably updated by the call -- confirm against the definition.
428 void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id,
429 bool word_end, DawgArgs *dawg_args,
430 PermuterType *current_permuter) const;
431
435
// Returns true when perm is one of SYSTEM_DAWG_PERM, FREQ_DAWG_PERM,
// DOC_DAWG_PERM, USER_DAWG_PERM, USER_PATTERN_PERM or COMPOUND_PERM.
// NUMBER_PERM is accepted only when numbers_ok is true. This is a pure
// check on the permuter code; it does not consult any dawg.
437 inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {
438 return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == DOC_DAWG_PERM ||
439 perm == USER_DAWG_PERM || perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
440 (numbers_ok && perm == NUMBER_PERM));
441 }
// Two-argument form: declaration only. The inline overloads below all
// forward to it.
442 int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
// Convenience overload: same as valid_word(word, false).
443 int valid_word(const WERD_CHOICE &word) const {
444 return valid_word(word, false); // return NO_PERM for words with digits
445 }
// Convenience wrapper: same as valid_word(word, true).
446 int valid_word_or_number(const WERD_CHOICE &word) const {
447 return valid_word(word, true); // return NUMBER_PERM for valid numbers
448 }
// String overload: builds a temporary WERD_CHOICE from string using
// getUnicharset() and checks it with valid_word(word), i.e. numbers_ok is
// false.
450 int valid_word(const char *string) const {
451 WERD_CHOICE word(string, getUnicharset());
452 return valid_word(word);
453 }
454 // Do the two WERD_CHOICEs form a meaningful bigram?
455 bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
// The remaining entries are declarations only; the definitions are not
// part of this header.
460 bool valid_punctuation(const WERD_CHOICE &word);
// NOTE(review): declared to return int, not bool -- confirm the meaning of
// the returned value against the definition.
462 int good_choice(const WERD_CHOICE &choice);
464 void add_document_word(const WERD_CHOICE &best_choice);
// word is a non-const pointer, presumably modified in place -- confirm.
466 void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency,
467 float additional_adjust, bool modify_rating, bool debug);
// Set wordseg_rating_adjust_factor_ to the given value.
469 inline void SetWordsegRatingAdjustFactor(float f) {
470 wordseg_rating_adjust_factor_ = f;
471 }
// Declaration only; takes no arguments and does not modify the Dict.
// NOTE(review): how the language is determined is not visible here.
473 bool IsSpaceDelimitedLang() const;
474
475private:
// Pointer handed out by getCCUtil(); also the source of the unicharset
// returned by getUnicharset().
477 CCUtil *ccutil_;
// Ambiguity tables exist only when the legacy engine is enabled; both start
// out null.
484#ifndef DISABLED_LEGACY_ENGINE
485 UnicharAmbigs *dang_ambigs_table_ = nullptr;
487 UnicharAmbigs *replace_ambigs_table_ = nullptr;
488#endif
490 float reject_offset_;
491 // Cached UNICHAR_IDs:
492 UNICHAR_ID wildcard_unichar_id_; // kDictWildcard.
493 UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
494 UNICHAR_ID question_unichar_id_; // kQuestionSymbol.
495 UNICHAR_ID slash_unichar_id_; // kSlashSymbol.
496 UNICHAR_ID hyphen_unichar_id_; // kHyphenSymbol.
497 // Hyphen-related variables.
// hyphen_word_ is tested for null by hyphenated() and copied from by
// copy_hyphen_info().
498 WERD_CHOICE *hyphen_word_;
499 DawgPositionVector hyphen_active_dawgs_;
// Read by hyphenated() and has_hyphen_end().
500 bool last_word_on_line_;
501 // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
502 // matching. The first member of each list is taken as canonical. For
503 // example, the first list contains hyphens and dashes with the first symbol
504 // being the ASCII hyphen minus.
505 std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_;
506 // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
507 DawgCache *dawg_cache_;
508 bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
509 // Dawgs.
// Exposed read-only through NumDawgs() and GetDawg().
510 DawgVector dawgs_;
511 SuccessorListsVector successors_;
// May be null; the document-dictionary reset code in this class checks for
// null before clearing it.
512 Trie *pending_words_;
515 // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
516 // any of them are present on the best choices list for a word pair.
517 // the bigrams are stored as space-separated words where:
518 // (1) leading and trailing punctuation has been removed from each word and
519 // (2) any digits have been replaced with '?' marks.
520 Dawg *bigram_dawg_;
521 // TODO(daria): need to support multiple languages in the future,
522 // so maybe will need to maintain a list of dawgs of each kind.
523 Dawg *freq_dawg_;
524 Dawg *unambig_dawg_;
525 Dawg *punc_dawg_;
// May be null; cleared together with pending_words_ when the document
// dictionary is reset.
526 Trie *document_words_;
// Written by SetWordsegRatingAdjustFactor().
529 float wordseg_rating_adjust_factor_;
530 // File for recording ambiguities discovered during dictionary search.
531 FILE *output_ambig_words_file_;
532
533public:
// Public parameter members declared through the STRING_/BOOL_/INT_/
// double_VAR_H macros. The macro definitions, default values and help
// strings are not visible in this header.
537 STRING_VAR_H(user_words_file);
538 STRING_VAR_H(user_words_suffix);
539 STRING_VAR_H(user_patterns_file);
540 STRING_VAR_H(user_patterns_suffix);
541 BOOL_VAR_H(load_system_dawg);
542 BOOL_VAR_H(load_freq_dawg);
543 BOOL_VAR_H(load_unambig_dawg);
544 BOOL_VAR_H(load_punc_dawg);
545 BOOL_VAR_H(load_number_dawg);
546 BOOL_VAR_H(load_bigram_dawg);
547 double_VAR_H(xheight_penalty_subscripts);
548 double_VAR_H(xheight_penalty_inconsistent);
549 double_VAR_H(segment_penalty_dict_frequent_word);
550 double_VAR_H(segment_penalty_dict_case_ok);
551 double_VAR_H(segment_penalty_dict_case_bad);
552 double_VAR_H(segment_penalty_dict_nonword);
553 double_VAR_H(segment_penalty_garbage);
554 STRING_VAR_H(output_ambig_words_file);
555 INT_VAR_H(dawg_debug_level);
// Tested by copy_hyphen_info() to decide whether to print the copied word.
556 INT_VAR_H(hyphen_debug_level);
// NOTE(review): "uft8" (not "utf8") is the spelling of the parameter name;
// changing it would rename a public member.
557 BOOL_VAR_H(use_only_first_uft8_step);
558 double_VAR_H(certainty_scale);
559 double_VAR_H(stopper_nondict_certainty_base);
560 double_VAR_H(stopper_phase2_certainty_rejection_offset);
561 INT_VAR_H(stopper_smallword_size);
562 double_VAR_H(stopper_certainty_per_char);
563 double_VAR_H(stopper_allowable_character_badness);
564 INT_VAR_H(stopper_debug_level);
565 BOOL_VAR_H(stopper_no_acceptable_choices);
566 INT_VAR_H(tessedit_truncate_wordchoice_log);
567 STRING_VAR_H(word_to_debug);
568 BOOL_VAR_H(segment_nonalphabetic_script);
569 BOOL_VAR_H(save_doc_words);
570 double_VAR_H(doc_dict_pending_threshold);
571 double_VAR_H(doc_dict_certainty_threshold);
572 INT_VAR_H(max_permuter_attempts);
573};
574
575} // namespace tesseract
576
577#endif // TESSERACT_DICT_DICT_H_
#define ASSERT_HOST(x)
Definition: errcode.h:54
const char * p
@ DAWG_TYPE_NUMBER
Definition: dawg.h:67
int64_t EDGE_REF
Definition: dawg.h:49
XHeightConsistencyEnum
Definition: dict.h:81
@ XH_GOOD
Definition: dict.h:81
@ XH_SUBNORMAL
Definition: dict.h:81
@ XH_INCONSISTENT
Definition: dict.h:81
std::vector< SuccessorList * > SuccessorListsVector
Definition: dawg.h:62
int64_t NODE_REF
Definition: dawg.h:50
@ character
Definition: mfoutline.h:53
std::vector< Dawg * > DawgVector
Definition: dict.h:57
int UNICHAR_ID
Definition: unichar.h:34
std::vector< DANGERR_INFO > DANGERR
Definition: stopper.h:47
PermuterType
Definition: ratngs.h:235
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:244
@ NUMBER_PERM
Definition: ratngs.h:242
@ COMPOUND_PERM
Definition: ratngs.h:248
@ USER_DAWG_PERM
Definition: ratngs.h:246
@ USER_PATTERN_PERM
Definition: ratngs.h:243
@ DOC_DAWG_PERM
Definition: ratngs.h:245
@ FREQ_DAWG_PERM
Definition: ratngs.h:247
std::vector< BLOB_CHOICE_LIST * > BLOB_CHOICE_LIST_VECTOR
Definition: ratngs.h:627
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:299
const UNICHARSET * unicharset() const
Definition: ratngs.h:281
unsigned length() const
Definition: ratngs.h:287
void print() const
Definition: ratngs.h:561
float rating() const
Definition: ratngs.h:312
const std::vector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:868
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:303
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
DawgType type() const
Definition: dawg.h:119
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:117
const CHAR_FRAGMENT * fragment
Definition: dict.h:51
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
Definition: dict.h:84
DawgPositionVector * updated_dawgs
Definition: dict.h:88
DawgPositionVector * active_dawgs
Definition: dict.h:87
PermuterType permuter
Definition: dict.h:89
bool valid_end
Definition: dict.h:91
BOOL_VAR_H(use_only_first_uft8_step)
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:145
double_VAR_H(segment_penalty_dict_case_ok)
UNICHAR_ID WildcardID() const
Definition: dict.h:377
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
Definition: dict.h:469
STRING_VAR_H(user_words_file)
double_VAR_H(doc_dict_certainty_threshold)
BOOL_VAR_H(load_punc_dawg)
double_VAR_H(stopper_nondict_certainty_base)
const CCUtil * getCCUtil() const
Definition: dict.h:98
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:139
int valid_word(const WERD_CHOICE &word) const
Definition: dict.h:443
BOOL_VAR_H(load_bigram_dawg)
STRING_VAR_H(word_to_debug)
INT_VAR_H(dawg_debug_level)
STRING_VAR_H(user_patterns_suffix)
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:348
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:437
double_VAR_H(segment_penalty_dict_case_bad)
STRING_VAR_H(user_words_suffix)
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:411
INT_VAR_H(hyphen_debug_level)
UNICHARSET & getUnicharset()
Definition: dict.h:107
double_VAR_H(certainty_scale)
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:381
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
Definition: dict.h:450
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:182
double_VAR_H(xheight_penalty_subscripts)
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:364
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:154
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:385
BOOL_VAR_H(load_unambig_dawg)
INT_VAR_H(tessedit_truncate_wordchoice_log)
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:116
double_VAR_H(stopper_phase2_certainty_rejection_offset)
STRING_VAR_H(output_ambig_words_file)
double_VAR_H(doc_dict_pending_threshold)
double_VAR_H(segment_penalty_dict_nonword)
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:111
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:397
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
Definition: dict.h:393
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:125
void ResetDocumentDictionary()
Definition: dict.h:297
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
Definition: dict.h:389
BOOL_VAR_H(load_system_dawg)
BOOL_VAR_H(load_number_dawg)
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:357
int good_choice(const WERD_CHOICE &choice)
Returns true if a good answer is found for the unknown blob rating.
BOOL_VAR_H(segment_nonalphabetic_script)
double_VAR_H(segment_penalty_garbage)
INT_VAR_H(stopper_debug_level)
double_VAR_H(segment_penalty_dict_frequent_word)
STRING_VAR_H(user_patterns_file)
BOOL_VAR_H(stopper_no_acceptable_choices)
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as above, but check the unichar at the end of the word.
Definition: dict.h:164
INT_VAR_H(stopper_smallword_size)
double_VAR_H(stopper_certainty_per_char)
BOOL_VAR_H(save_doc_words)
double_VAR_H(stopper_allowable_character_badness)
CCUtil * getCCUtil()
Definition: dict.h:101
BOOL_VAR_H(load_freq_dawg)
const UNICHARSET & getUnicharset() const
Definition: dict.h:104
INT_VAR_H(max_permuter_attempts)
void SetWildcardID(UNICHAR_ID id)
Definition: dict.h:374
double_VAR_H(xheight_penalty_inconsistent)
int valid_word_or_number(const WERD_CHOICE &word) const
Definition: dict.h:446
#define TESS_API
Definition: export.h:32