31 , probability_in_context_(&
tesseract::
Dict::def_probability_in_context)
33 , wildcard_unichar_id_(INVALID_UNICHAR_ID)
34 , apostrophe_unichar_id_(INVALID_UNICHAR_ID)
35 , question_unichar_id_(INVALID_UNICHAR_ID)
36 , slash_unichar_id_(INVALID_UNICHAR_ID)
37 , hyphen_unichar_id_(INVALID_UNICHAR_ID)
38 ,
STRING_MEMBER(user_words_file,
"",
"A filename of user-provided words.",
39 getCCUtil()->params())
41 "A suffix of user-provided words located in tessdata.",
42 getCCUtil()->params())
43 ,
STRING_MEMBER(user_patterns_file,
"",
"A filename of user-provided patterns.",
44 getCCUtil()->params())
46 "A suffix of user-provided patterns located in "
48 getCCUtil()->params())
49 ,
BOOL_INIT_MEMBER(load_system_dawg, true,
"Load system word dawg.", getCCUtil()->params())
50 ,
BOOL_INIT_MEMBER(load_freq_dawg, true,
"Load frequent word dawg.", getCCUtil()->params())
52 getCCUtil()->params())
54 "Load dawg with punctuation"
56 getCCUtil()->params())
58 "Load dawg with number"
60 getCCUtil()->params())
62 "Load dawg with special word "
64 getCCUtil()->params())
66 "Score penalty (0.1 = 10%) added if there are subscripts "
67 "or superscripts in a word, but it is otherwise OK.",
68 getCCUtil()->params())
70 "Score penalty (0.1 = 10%) added if an xheight is "
72 getCCUtil()->params())
74 "Score multiplier for word matches which have good case and"
75 " are frequent in the given language (lower is better).",
76 getCCUtil()->params())
78 "Score multiplier for word matches that have good case "
80 getCCUtil()->params())
82 "Default score multiplier for word matches, which may have "
83 "case issues (lower is better).",
84 getCCUtil()->params())
86 "Score multiplier for glyph fragment segmentations which "
87 "do not match a dictionary word (lower is better).",
88 getCCUtil()->params())
90 "Score multiplier for poorly cased strings that are not in"
91 " the dictionary and generally look like garbage (lower is"
93 getCCUtil()->params())
95 "Output file for ambiguities found in the dictionary", getCCUtil()->params())
97 "Set to 1 for general debug info"
98 ", to 2 for more details, to 3 to see all the debug messages",
99 getCCUtil()->params())
100 ,
INT_MEMBER(hyphen_debug_level, 0,
"Debug level for hyphenated words.", getCCUtil()->params())
102 "Use only the first UTF8 step of the given string"
103 " when computing log probabilities.",
104 getCCUtil()->params())
105 ,
double_MEMBER(certainty_scale, 20.0,
"Certainty scaling factor", getCCUtil()->params())
106 ,
double_MEMBER(stopper_nondict_certainty_base, -2.50,
"Certainty threshold for non-dict words",
107 getCCUtil()->params())
108 ,
double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
"Reject certainty offset",
109 getCCUtil()->params())
110 ,
INT_MEMBER(stopper_smallword_size, 2,
"Size of dict word to be treated as non-dict word",
111 getCCUtil()->params())
114 " for each dict char above small word size.",
115 getCCUtil()->params())
117 "Max certainty variation allowed in a word (in sigma)", getCCUtil()->params())
118 ,
INT_MEMBER(stopper_debug_level, 0,
"Stopper debug level", getCCUtil()->params())
119 ,
BOOL_MEMBER(stopper_no_acceptable_choices, false,
120 "Make AcceptableChoice() always return false. Useful"
121 " when there is a need to explore all segmentations",
122 getCCUtil()->params())
123 ,
INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
"Max words to keep in list",
124 getCCUtil()->params())
126 "Word for which stopper debug"
127 " information should be printed to stdout",
128 getCCUtil()->params())
130 "Don't use any alphabetic-specific tricks."
131 " Set to true in the traineddata config file for"
132 " scripts that are cursive or inherently fixed-pitch",
133 getCCUtil()->params())
134 ,
BOOL_MEMBER(save_doc_words, 0,
"Save Document Words", getCCUtil()->params())
135 ,
double_MEMBER(doc_dict_pending_threshold, 0.0,
"Worst certainty for using pending dictionary",
136 getCCUtil()->params())
138 "Worst certainty for words that can be inserted into the"
139 " document dictionary",
140 getCCUtil()->params())
142 "Maximum number of different"
143 " character choices to consider during permutation."
144 " This limit is especially useful when user patterns"
145 " are specified, since overly generic patterns can result in"
146 " dawg search exploring an overly large number of options.",
147 getCCUtil()->params()) {
148 reject_offset_ = 0.0;
150 hyphen_word_ =
nullptr;
151 last_word_on_line_ =
false;
152 document_words_ =
nullptr;
153 dawg_cache_ =
nullptr;
154 dawg_cache_is_ours_ =
false;
155 pending_words_ =
nullptr;
156 bigram_dawg_ =
nullptr;
157 freq_dawg_ =
nullptr;
158 punc_dawg_ =
nullptr;
159 unambig_dawg_ =
nullptr;
160 wordseg_rating_adjust_factor_ = -1.0f;
161 output_ambig_words_file_ =
nullptr;
167 if (output_ambig_words_file_ !=
nullptr) {
168 fclose(output_ambig_words_file_);
181 if (dawgs_.size() != 0) {
190 if (dawg_cache !=
nullptr) {
191 dawg_cache_ = dawg_cache;
192 dawg_cache_is_ours_ =
false;
195 dawg_cache_is_ours_ =
true;
202 if (load_punc_dawg) {
206 dawgs_.push_back(punc_dawg_);
209 if (load_system_dawg) {
213 dawgs_.push_back(system_dawg);
216 if (load_number_dawg) {
220 dawgs_.push_back(number_dawg);
223 if (load_bigram_dawg) {
229 if (load_freq_dawg) {
233 dawgs_.push_back(freq_dawg_);
236 if (load_unambig_dawg) {
240 dawgs_.push_back(unambig_dawg_);
245 if (!user_words_suffix.empty() || !user_words_file.empty()) {
248 if (!user_words_file.empty()) {
249 name = user_words_file;
252 name += user_words_suffix;
256 tprintf(
"Error: failed to load %s\n", name.c_str());
259 dawgs_.push_back(trie_ptr);
263 if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
267 if (!user_patterns_file.empty()) {
268 name = user_patterns_file;
271 name += user_patterns_suffix;
274 tprintf(
"Error: failed to load %s\n", name.c_str());
277 dawgs_.push_back(trie_ptr);
283 dawgs_.push_back(document_words_);
293 if (load_punc_dawg) {
297 dawgs_.push_back(punc_dawg_);
300 if (load_system_dawg) {
304 dawgs_.push_back(system_dawg);
307 if (load_number_dawg) {
311 dawgs_.push_back(number_dawg);
318 if (!user_words_suffix.empty() || !user_words_file.empty()) {
321 if (!user_words_file.empty()) {
322 name = user_words_file;
325 name += user_words_suffix;
329 tprintf(
"Error: failed to load %s\n", name.c_str());
332 dawgs_.push_back(trie_ptr);
336 if (!user_patterns_suffix.empty() || !user_patterns_file.empty()) {
340 if (!user_patterns_file.empty()) {
341 name = user_patterns_file;
344 name += user_patterns_suffix;
347 tprintf(
"Error: failed to load %s\n", name.c_str());
350 dawgs_.push_back(trie_ptr);
358 if (dawgs_.empty()) {
364 successors_.reserve(dawgs_.size());
365 for (
auto dawg : dawgs_) {
367 for (
unsigned j = 0; j < dawgs_.size(); ++j) {
368 const Dawg *other = dawgs_[j];
369 if (dawg !=
nullptr && other !=
nullptr && (dawg->lang() == other->
lang()) &&
370 kDawgSuccessors[dawg->type()][other->
type()]) {
374 successors_.push_back(lst);
380 if (dawgs_.empty()) {
383 for (
auto &dawg : dawgs_) {
388 dawg_cache_->
FreeDawg(bigram_dawg_);
389 if (dawg_cache_is_ours_) {
391 dawg_cache_ =
nullptr;
393 for (
auto successor : successors_) {
398 document_words_ =
nullptr;
399 delete pending_words_;
400 pending_words_ =
nullptr;
408 auto *dawg_args =
static_cast<DawgArgs *
>(void_dawg_args);
412 if (dawg_debug_level >= 3) {
414 "def_letter_is_okay: current unichar=%s word_end=%d"
415 " num active dawgs=%zu\n",
416 getUnicharset().debug_str(unichar_id).c_str(), word_end, dawg_args->active_dawgs->
size());
429 dawg_args->updated_dawgs->clear();
430 dawg_args->valid_end =
false;
435 for (
unsigned a = 0; a < dawg_args->active_dawgs->size(); ++a) {
436 const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
440 if (!dawg && !punc_dawg) {
442 tprintf(
"Received DawgPosition with no dawg or punc_dawg. wth?\n");
450 if (punc_transition_edge != NO_EDGE) {
453 for (
int sdawg_index : slist) {
454 const Dawg *sdawg = dawgs_[sdawg_index];
457 if (dawg_edge != NO_EDGE) {
458 if (dawg_debug_level >= 3) {
459 tprintf(
"Letter found in dawg %d\n", sdawg_index);
461 dawg_args->updated_dawgs->add_unique(
463 dawg_debug_level > 0,
"Append transition from punc dawg to current dawgs: ");
464 if (sdawg->
permuter() > curr_perm) {
468 dawg_args->valid_end =
true;
474 if (punc_edge != NO_EDGE) {
475 if (dawg_debug_level >= 3) {
476 tprintf(
"Letter found in punctuation dawg\n");
478 dawg_args->updated_dawgs->add_unique(
480 "Extend punctuation dawg: ");
485 dawg_args->valid_end =
true;
496 punc_node == NO_EDGE ? NO_EDGE : punc_dawg->
edge_char_of(punc_node, unichar_id, word_end);
497 if (punc_edge != NO_EDGE) {
498 dawg_args->updated_dawgs->add_unique(
500 dawg_debug_level > 0,
"Return to punctuation dawg: ");
505 dawg_args->valid_end =
true;
531 if (dawg_debug_level >= 3) {
535 if (edge != NO_EDGE) {
536 if (dawg_debug_level >= 3) {
540 if (dawg_debug_level >= 3) {
541 tprintf(
"Punctuation constraint not satisfied at end of word.\n");
550 dawg_args->valid_end =
true;
552 dawg_args->updated_dawgs->add_unique(
554 dawg_debug_level > 0,
"Append current dawg to updated active dawgs: ");
563 dawg_args->permuter = curr_perm;
565 if (dawg_debug_level >= 2) {
566 tprintf(
"Returning %d for permuter code for this character.\n", dawg_args->permuter);
568 return dawg_args->permuter;
576 std::vector<UNICHAR_ID> unichar_id_patterns;
577 unichar_id_patterns.push_back(unichar_id);
579 for (
int unichar_id_pattern : unichar_id_patterns) {
582 for (
int k = 0; k < 2; ++k) {
584 ? dawg->
edge_char_of(node, unichar_id_pattern, word_end)
586 if (edge == NO_EDGE) {
589 if (dawg_debug_level >= 3) {
594 if (dawg->
permuter() > *curr_perm) {
602 dawg_debug_level > 0,
"Append current dawg to updated active dawgs: ");
612 *active_dawgs = hyphen_active_dawgs_;
613 if (dawg_debug_level >= 3) {
614 for (
unsigned i = 0;
i < hyphen_active_dawgs_.size(); ++
i) {
616 hyphen_active_dawgs_[
i].dawg_index, hyphen_active_dawgs_[
i].dawg_ref);
625 bool punc_dawg_available = (punc_dawg_ !=
nullptr) &&
628 for (
unsigned i = 0;
i < dawgs_.size();
i++) {
630 int dawg_ty = dawgs_[
i]->type();
633 dawg_pos_vec->push_back(
DawgPosition(-1, NO_EDGE,
i, NO_EDGE,
false));
634 if (dawg_debug_level >= 3) {
637 }
else if (!punc_dawg_available || !subsumed_by_punc) {
638 dawg_pos_vec->push_back(
DawgPosition(
i, NO_EDGE, -1, NO_EDGE,
false));
639 if (dawg_debug_level >= 3) {
658 int stringlen = best_choice.
length();
660 if (
valid_word(best_choice) || stringlen < 2) {
665 if (best_choice.
length() >= kDocDictMaxRepChars) {
666 int num_rep_chars = 1;
668 for (
unsigned i = 1;
i < best_choice.
length(); ++
i) {
674 if (num_rep_chars == kDocDictMaxRepChars) {
681 if (best_choice.
certainty() < doc_dict_certainty_threshold || stringlen == 2) {
682 if (best_choice.
certainty() < doc_dict_pending_threshold) {
696 if (save_doc_words) {
697 std::string filename(
getCCUtil()->imagefile);
699 FILE *doc_word_file = fopen(filename.c_str(),
"a");
700 if (doc_word_file ==
nullptr) {
701 tprintf(
"Error: Could not open file %s\n", filename.c_str());
704 fprintf(doc_word_file,
"%s\n", best_choice.
debug_string().c_str());
705 fclose(doc_word_file);
711 float additional_adjust,
bool modify_rating,
bool debug) {
714 bool case_is_ok = (is_han ||
case_ok(*word));
717 float adjust_factor = additional_adjust;
718 float new_rating = word->
rating();
719 new_rating += kRatingPad;
720 const char *xheight_triggered =
"";
723 switch (xheight_consistency) {
725 adjust_factor += xheight_penalty_inconsistent;
726 xheight_triggered =
", xhtBAD";
729 adjust_factor += xheight_penalty_subscripts;
730 xheight_triggered =
", xhtSUB";
740 tprintf(
"Consistency could not be calculated.\n");
745 word->
rating(), xheight_triggered);
749 if (case_is_ok && punc_is_ok) {
750 adjust_factor += segment_penalty_dict_nonword;
751 new_rating *= adjust_factor;
756 adjust_factor += segment_penalty_garbage;
757 new_rating *= adjust_factor;
769 if (!is_han && freq_dawg_ !=
nullptr && freq_dawg_->
word_in_dawg(*word)) {
771 adjust_factor += segment_penalty_dict_frequent_word;
772 new_rating *= adjust_factor;
777 adjust_factor += segment_penalty_dict_case_ok;
778 new_rating *= adjust_factor;
784 adjust_factor += segment_penalty_dict_case_bad;
785 new_rating *= adjust_factor;
791 new_rating -= kRatingPad;
796 tprintf(
" %4.2f --> %4.2f\n", adjust_factor, new_rating);
807 word_ptr = &temp_word;
809 if (word_ptr->
empty()) {
817 int last_index = word_ptr->
length() - 1;
837 if (bigram_dawg_ ==
nullptr) {
843 unsigned w1start, w1end, w2start, w2end;
849 if (w1start >= w1end) {
850 return word1.
length() < 3;
852 if (w2start >= w2end) {
853 return word2.
length() < 3;
857 std::vector<UNICHAR_ID> bigram_string;
858 bigram_string.reserve(w1end + w2end + 1);
859 for (
auto i = w1start;
i < w1end;
i++) {
861 if (normed_ids.size() == 1 && uchset.
get_isdigit(normed_ids[0])) {
862 bigram_string.push_back(question_unichar_id_);
864 bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
868 for (
auto i = w2start;
i < w2end;
i++) {
870 if (normed_ids.size() == 1 && uchset.
get_isdigit(normed_ids[0])) {
871 bigram_string.push_back(question_unichar_id_);
873 bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
877 for (
int i : bigram_string) {
888 auto last_index = word.
length() - 1;
890 for (
unsigned i = 0;
i <= last_index; ++
i) {
897 }
else if ((new_len = new_word.
length()) == 0 ||
902 for (
unsigned i = 0;
i < dawgs_.size(); ++
i) {
904 dawgs_[
i]->word_in_dawg(new_word)) {
#define INT_MEMBER(name, val, comment, vec)
#define STRING_INIT_MEMBER(name, val, comment, vec)
#define BOOL_INIT_MEMBER(name, val, comment, vec)
#define double_MEMBER(name, val, comment, vec)
#define STRING_MEMBER(name, val, comment, vec)
#define BOOL_MEMBER(name, val, comment, vec)
std::vector< int > SuccessorList
void tprintf(const char *format,...)
@ TESSDATA_LSTM_SYSTEM_DAWG
@ TESSDATA_LSTM_PUNC_DAWG
@ TESSDATA_LSTM_NUMBER_DAWG
void punct_stripped(unsigned *start_core, unsigned *end_core) const
std::string debug_string() const
int GetTopScriptID() const
UNICHAR_ID unichar_id(unsigned index) const
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
void set_permuter(uint8_t perm)
const UNICHARSET * unicharset() const
void set_adjust_factor(float factor)
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
std::string & unichar_string()
void set_rating(float new_val)
std::string language_data_path_prefix
const std::vector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
bool get_isalpha(UNICHAR_ID unichar_id) const
bool contains_unichar_id(UNICHAR_ID unichar_id) const
bool get_isdigit(UNICHAR_ID unichar_id) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
bool get_ispunctuation(UNICHAR_ID unichar_id) const
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
virtual bool end_of_word(EDGE_REF edge_ref) const =0
const std::string & lang() const
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, std::vector< UNICHAR_ID > *vec) const
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
PermuterType permuter() const
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
static const UNICHAR_ID kPatternUnicharID
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
Dawg * GetSquishedDawg(const std::string &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
bool FreeDawg(Dawg *dawg)
DawgPositionVector * updated_dawgs
DawgPositionVector * active_dawgs
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
void copy_hyphen_info(WERD_CHOICE *word) const
static DawgCache * GlobalDawgCache()
bool IsSpaceDelimitedLang() const
Returns true if the language is space-delimited (i.e., not Chinese, Japanese, or Thai).
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
const CCUtil * getCCUtil() const
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
void LoadLSTM(const std::string &lang, TessdataManager *data_file)
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
void SetupForLoad(DawgCache *dawg_cache)
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
bool valid_punctuation(const WERD_CHOICE &word)
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
void Load(const std::string &lang, TessdataManager *data_file)
int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
const UNICHARSET & getUnicharset() const
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
void initialize_patterns(UNICHARSET *unicharset)
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector< bool > *repetitions)