// Fallback for toolchains that lack log2(): log2(n) = ln(n) / ln(2).
static inline double log2(double n) {
  return log(n) / log(2.0);
}
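
// The constructor below wires every tunable language-model parameter into
// Tesseract's parameter vector via the INT_MEMBER/BOOL_MEMBER/double_MEMBER
// (and BOOL_INIT_MEMBER) macros: each macro initializes the member with a
// default value and a human-readable description, and registers it with
// dict->getCCUtil()->params() so it can be overridden from config files.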
LanguageModel::LanguageModel(const UnicityTable<FontInfo> *fontinfo_table, Dict *dict)
    : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
                 dict->getCCUtil()->params())
    , BOOL_INIT_MEMBER(language_model_ngram_on, false,
                       "Turn on/off the use of character ngram model",
                       dict->getCCUtil()->params())
    , INT_MEMBER(language_model_ngram_order, 8,
                 "Maximum order of the character ngram model",
                 dict->getCCUtil()->params())
    , INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
                 "Maximum number of prunable (those for which"
                 " PrunablePath() is true) entries in each viterbi list"
                 " recorded in BLOB_CHOICEs",
                 dict->getCCUtil()->params())
    , INT_MEMBER(language_model_viterbi_list_max_size, 500,
                 "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
                 dict->getCCUtil()->params())
    , double_MEMBER(language_model_ngram_small_prob, 0.000001,
                    "To avoid overly small denominators use this as the "
                    "floor of the probability returned by the ngram model.",
                    dict->getCCUtil()->params())
    , double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
                    "Average classifier score of a non-matching unichar.",
                    dict->getCCUtil()->params())
    , BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
                  "Use only the first UTF8 step of the given string"
                  " when computing log probabilities.",
                  dict->getCCUtil()->params())
    , double_MEMBER(language_model_ngram_scale_factor, 0.03,
                    "Strength of the character ngram model relative to the"
                    " character classifier ",
                    dict->getCCUtil()->params())
    , double_MEMBER(language_model_ngram_rating_factor, 16.0,
                    "Factor to bring log-probs into the same range as ratings"
                    " when multiplied by outline length ",
                    dict->getCCUtil()->params())
    , BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
                  "Words are delimited by space", dict->getCCUtil()->params())
    , INT_MEMBER(language_model_min_compound_length, 3,
                 "Minimum length of compound words",
                 dict->getCCUtil()->params())
    , double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
                    "Penalty for words not in the frequent word dictionary",
                    dict->getCCUtil()->params())
    , double_MEMBER(language_model_penalty_non_dict_word, 0.15,
                    "Penalty for non-dictionary words",
                    dict->getCCUtil()->params())
    , double_MEMBER(language_model_penalty_punc, 0.2,
                    "Penalty for inconsistent punctuation",
                    dict->getCCUtil()->params())
    , double_MEMBER(language_model_penalty_case, 0.1,
                    "Penalty for inconsistent case",
                    dict->getCCUtil()->params())
    , double_MEMBER(language_model_penalty_script, 0.5,
                    "Penalty for inconsistent script",
                    dict->getCCUtil()->params())
    , double_MEMBER(language_model_penalty_chartype, 0.3,
                    "Penalty for inconsistent character type",
                    dict->getCCUtil()->params())
    // TODO(daria, rays): enable font consistency checking
    // after improving font analysis.
    , double_MEMBER(language_model_penalty_font, 0.00,
                    "Penalty for inconsistent font",
                    dict->getCCUtil()->params())
    , double_MEMBER(language_model_penalty_spacing, 0.05,
                    "Penalty for inconsistent spacing",
                    dict->getCCUtil()->params())
    , double_MEMBER(language_model_penalty_increment, 0.01,
                    "Penalty increment",
                    dict->getCCUtil()->params())
    , INT_MEMBER(wordrec_display_segmentations, 0,
                 "Display Segmentations (ScrollView)",
                 dict->getCCUtil()->params())
    , BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
                       "Use sigmoidal score for certainty", dict->getCCUtil()->params())
    , dawg_args_(nullptr, new DawgPositionVector(), NO_PERM)
    , fontinfo_table_(fontinfo_table)
    , dict_(dict) {
  ASSERT_HOST(dict_ != nullptr);
}
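
// InitForWord() is called once per recognized word. Only its n-gram context
// setup is shown below: the text of the previous word (plus a trailing space
// for space-delimited languages) becomes the initial n-gram context.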
void LanguageModel::InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch,
                                float max_char_wh_ratio, float rating_cert_scale) {
  // ...
  // Initialize the n-gram context from the previous word.
  if (language_model_ngram_on) {
    if (prev_word != nullptr && !prev_word->unichar_string().empty()) {
      prev_word_str_ = prev_word->unichar_string();
      if (language_model_ngram_space_delimited_language) {
        prev_word_str_ += ' ';
      }
    } else {
      prev_word_str_ = " ";
    }
    // ...
  }
}
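
// ScanParentsForCaseMix() is a helper that scans a parent Viterbi list for
// pairs of entries holding the same letter in opposite cases and links each
// entry to its rival via competing_vse, so the better-fitting case can win
// when blob choices are bound to parents.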
static void ScanParentsForCaseMix(const UNICHARSET &unicharset,
                                  LanguageModelState *parent_node) {
  if (parent_node == nullptr) {
    return;
  }
  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
    ViterbiStateEntry *vse = vit.data();
    vse->competing_vse = nullptr;
    UNICHAR_ID unichar_id = vse->curr_b->unichar_id();
    // ...
    UNICHAR_ID other_case = unicharset.get_other_case(unichar_id);
    if (other_case == unichar_id) {
      continue; // Not a two-case character.
    }
    // Find the other-case entry in the same parent list.
    ViterbiStateEntry_IT vit2(&parent_node->viterbi_state_entries);
    for (vit2.mark_cycle_pt();
         !vit2.cycled_list() && vit2.data()->curr_b->unichar_id() != other_case;
         vit2.forward()) {
    }
    if (!vit2.cycled_list()) {
      vse->competing_vse = vit2.data();
    }
  }
}
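
// HasBetterCaseVariant() returns true if the other-case version of
// choice->unichar_id() appears earlier (i.e. with a better rating) in the
// given choices list and cannot be distinguished from it by size alone.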
static bool HasBetterCaseVariant(const UNICHARSET &unicharset, const BLOB_CHOICE *choice,
                                 BLOB_CHOICE_LIST *choices) {
  UNICHAR_ID choice_id = choice->unichar_id();
  UNICHAR_ID other_case = unicharset.get_other_case(choice_id);
  if (other_case == choice_id || other_case == INVALID_UNICHAR_ID) {
    return false; // Not a case ambiguity.
  }
  if (unicharset.SizesDistinct(choice_id, other_case)) {
    return false; // Can be separated by size.
  }
  BLOB_CHOICE_IT bc_it(choices);
  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
    BLOB_CHOICE *better_choice = bc_it.data();
    if (better_choice->unichar_id() == other_case) {
      return true; // Found an earlier instance of other_case.
    } else if (better_choice == choice) {
      return false; // Reached the given choice without finding other_case.
    }
  }
  return false;
}
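
// UpdateState() is the core beam-search step: for the ratings-matrix cell at
// (curr_col, curr_row) it combines every surviving parent path with every
// classifier choice for the cell, and records the new paths (with dawg,
// n-gram and consistency scores) in the LanguageModelState of curr_row.
// Returns true if any new entry was added.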
bool LanguageModel::UpdateState(bool just_classified, int curr_col, int curr_row,
                                BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node,
                                LMPainPoints *pain_points, WERD_RES *word_res,
                                BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) {
  if (language_model_debug_level > 0) {
    tprintf("\nUpdateState: col=%d row=%d %s", curr_col, curr_row,
            just_classified ? "just_classified" : "");
    if (language_model_debug_level > 5) {
      tprintf("(parent=%p)\n", static_cast<void *>(parent_node));
    } else {
      tprintf("\n");
    }
  }
  // Initialize helper variables.
  bool word_end = (curr_row + 1 >= word_res->ratings->dimension());
  bool new_changed = false;
  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
  const UNICHARSET &unicharset = dict_->getUnicharset();
  BLOB_CHOICE *first_lower = nullptr;
  BLOB_CHOICE *first_upper = nullptr;
  BLOB_CHOICE *first_digit = nullptr;
  bool has_alnum_mix = false;
  if (parent_node != nullptr) {
    int result = SetTopParentLowerUpperDigit(parent_node);
    if (result < 0) {
      if (language_model_debug_level > 0) {
        tprintf("No parents found to process\n");
      }
      return false;
    }
    if (result > 0) {
      has_alnum_mix = true;
    }
  }
  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper, &first_digit)) {
    has_alnum_mix = false;
  }
  ScanParentsForCaseMix(unicharset, parent_node);
  if (language_model_debug_level > 3 && parent_node != nullptr) {
    parent_node->Print("Parent viterbi list");
  }
  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];

  // Call AddViterbiStateEntry() for each parent+child pair of entries.
  ViterbiStateEntry_IT vit;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    BLOB_CHOICE *choice = c_it.data();
    // ...
    // Set the top choice flags for this blob choice.
    LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
    if (c_it.at_first() || !new_changed) {
      blob_choice_flags |= kSmallestRatingFlag;
    }
    if (first_lower == choice) {
      blob_choice_flags |= kLowerCaseFlag;
    }
    if (first_upper == choice) {
      blob_choice_flags |= kUpperCaseFlag;
    }
    if (first_digit == choice) {
      blob_choice_flags |= kDigitFlag;
    }
    if (parent_node == nullptr) {
      // ...
      if (HasBetterCaseVariant(unicharset, choice, curr_list)) {
        continue;
      }
      // ...
      new_changed |= AddViterbiStateEntry(blob_choice_flags, denom, word_end, curr_col, curr_row,
                                          choice, curr_state, nullptr, pain_points, word_res,
                                          best_choice_bundle, blamer_bundle);
    } else {
      // Find the parent entries that this blob choice may bind to.
      int vit_counter = 0;
      vit.set_to_list(&parent_node->viterbi_state_entries);
      ViterbiStateEntry *parent_vse = nullptr;
      LanguageModelFlagsType top_choice_flags;
      while ((parent_vse =
                  GetNextParentVSE(just_classified, has_alnum_mix, c_it.data(), blob_choice_flags,
                                   unicharset, word_res, &vit, &top_choice_flags)) != nullptr) {
        // Skip pruned entries and do not look at prunable entries if
        // language_model_viterbi_list_max_num_prunable have been examined.
        if (PrunablePath(*parent_vse) &&
            (++vit_counter > language_model_viterbi_list_max_num_prunable ||
             (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
          continue;
        }
        // Skip this parent if a better case variant of choice exists.
        if (!parent_vse->HasAlnumChoice(unicharset) &&
            HasBetterCaseVariant(unicharset, choice, curr_list)) {
          continue;
        }
        new_changed |= AddViterbiStateEntry(top_choice_flags, denom, word_end, curr_col, curr_row,
                                            c_it.data(), curr_state, parent_vse, pain_points,
                                            word_res, best_choice_bundle, blamer_bundle);
      }
    }
  }
  return new_changed;
}
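
// GetTopLowerUpperDigit() finds the best (first) lower-case, upper-case and
// digit choice in the given list, falling back to the first valid unichar
// when a category is absent. Returns true if the list mixes alphas and
// digits.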
bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower,
                                          BLOB_CHOICE **first_upper,
                                          BLOB_CHOICE **first_digit) const {
  BLOB_CHOICE_IT c_it(curr_list);
  const UNICHARSET &unicharset = dict_->getUnicharset();
  BLOB_CHOICE *first_unichar = nullptr;
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    UNICHAR_ID unichar_id = c_it.data()->unichar_id();
    if (unicharset.get_fragment(unichar_id)) {
      continue; // skip fragments
    }
    if (first_unichar == nullptr) {
      first_unichar = c_it.data();
    }
    if (*first_lower == nullptr && unicharset.get_islower(unichar_id)) {
      *first_lower = c_it.data();
    }
    if (*first_upper == nullptr && unicharset.get_isalpha(unichar_id) &&
        !unicharset.get_islower(unichar_id)) {
      *first_upper = c_it.data();
    }
    if (*first_digit == nullptr && unicharset.get_isdigit(unichar_id)) {
      *first_digit = c_it.data();
    }
  }
  ASSERT_HOST(first_unichar != nullptr);
  bool mixed = (*first_lower != nullptr || *first_upper != nullptr) && *first_digit != nullptr;
  if (*first_lower == nullptr) {
    *first_lower = first_unichar;
  }
  if (*first_upper == nullptr) {
    *first_upper = first_unichar;
  }
  if (*first_digit == nullptr) {
    *first_digit = first_unichar;
  }
  return mixed;
}
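
// SetTopParentLowerUpperDigit() marks the best lower/upper/digit entries in
// the parent Viterbi list with the corresponding top choice flags. Returns
// -1 if there are no entries to process, 1 if the parents mix alphas and
// digits, 0 otherwise.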
int LanguageModel::SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const {
  if (parent_node == nullptr) {
    return -1;
  }
  ViterbiStateEntry *top_lower = nullptr;
  ViterbiStateEntry *top_upper = nullptr;
  ViterbiStateEntry *top_digit = nullptr;
  ViterbiStateEntry *top_choice = nullptr;
  float lower_rating = 0.0f;
  float upper_rating = 0.0f;
  float digit_rating = 0.0f;
  float top_rating = 0.0f;
  const UNICHARSET &unicharset = dict_->getUnicharset();
  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
    ViterbiStateEntry *vse = vit.data();
    // INVALID_UNICHAR_ID is treated like a zero-length path: walk up the
    // parents until a valid character is found.
    ViterbiStateEntry *unichar_vse = vse;
    UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
    float rating = unichar_vse->curr_b->rating();
    while (unichar_id == INVALID_UNICHAR_ID && unichar_vse->parent_vse != nullptr) {
      unichar_vse = unichar_vse->parent_vse;
      unichar_id = unichar_vse->curr_b->unichar_id();
      rating = unichar_vse->curr_b->rating();
    }
    if (unichar_id != INVALID_UNICHAR_ID) {
      if (unicharset.get_islower(unichar_id)) {
        if (top_lower == nullptr || lower_rating > rating) {
          top_lower = vse;
          lower_rating = rating;
        }
      } else if (unicharset.get_isalpha(unichar_id)) {
        if (top_upper == nullptr || upper_rating > rating) {
          top_upper = vse;
          upper_rating = rating;
        }
      } else if (unicharset.get_isdigit(unichar_id)) {
        if (top_digit == nullptr || digit_rating > rating) {
          top_digit = vse;
          digit_rating = rating;
        }
      }
    }
    if (top_choice == nullptr || top_rating > rating) {
      top_choice = vse;
      top_rating = rating;
    }
  }
  if (top_choice == nullptr) {
    return -1;
  }
  bool mixed = (top_lower != nullptr || top_upper != nullptr) && top_digit != nullptr;
  if (top_lower == nullptr) {
    top_lower = top_choice;
  }
  top_lower->top_choice_flags |= kLowerCaseFlag;
  if (top_upper == nullptr) {
    top_upper = top_choice;
  }
  top_upper->top_choice_flags |= kUpperCaseFlag;
  if (top_digit == nullptr) {
    top_digit = top_choice;
  }
  top_digit->top_choice_flags |= kDigitFlag;
  top_choice->top_choice_flags |= kSmallestRatingFlag;
  // ...
  return mixed ? 1 : 0;
}
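
// GetNextParentVSE() advances *vse_it to the next parent entry that the blob
// choice bc may bind to, filtering out stale parents, implausible
// digit/alpha transitions and losing case variants. Returns the parent
// entry, or nullptr when the list is exhausted.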
ViterbiStateEntry *LanguageModel::GetNextParentVSE(
    bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc,
    LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res,
    ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const {
  for (; !vse_it->cycled_list(); vse_it->forward()) {
    ViterbiStateEntry *parent_vse = vse_it->data();
    // Only consider the parent if it has been updated or the current
    // ratings cell has just been classified.
    if (!just_classified && !parent_vse->updated) {
      continue;
    }
    if (language_model_debug_level > 2) {
      parent_vse->Print("Considering");
    }
    *top_choice_flags = blob_choice_flags;
    // ...
    // Digits do not bind to alphas (and vice versa) if there is a mix in
    // both the parent and current lists, or if the digit/alpha is not the
    // top choice, i.e. skip when (mixed_alnum || *top_choice_flags == 0).
    // A competing case variant of the parent also wins (and this parent is
    // skipped) when it agrees with bc in position and size while the parent
    // does not; PosAndSizeAgree() is called with
    // debug = language_model_debug_level >= 5.
    // ...
    vse_it->forward();
    return parent_vse; // This one is good.
  }
  return nullptr; // Ran out of possibilities.
}
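
// AddViterbiStateEntry() scores one candidate transition: it runs the dawg
// and n-gram components, checks path consistency, builds a ViterbiStateEntry
// with the adjusted path cost, and inserts it into curr_state unless one of
// several pruning rules rejects it. Returns true if an entry was added.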
bool LanguageModel::AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom,
                                         bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b,
                                         LanguageModelState *curr_state,
                                         ViterbiStateEntry *parent_vse, LMPainPoints *pain_points,
                                         WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
                                         BlamerBundle *blamer_bundle) {
  ViterbiStateEntry_IT vit;
  if (language_model_debug_level > 1) {
    tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
            " certainty=%.4f top_choice_flags=0x%x",
            dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->rating(), b->certainty(),
            top_choice_flags);
    if (language_model_debug_level > 5) {
      tprintf(" parent_vse=%p\n", static_cast<void *>(parent_vse));
    } else {
      tprintf("\n");
    }
  }
  // Check whether the list is full.
  if (curr_state != nullptr &&
      curr_state->viterbi_state_entries_length >= language_model_viterbi_list_max_size) {
    if (language_model_debug_level > 1) {
      tprintf("AddViterbiStateEntry: viterbi list is full!\n");
    }
    return false;
  }

  // Invoke the dawg language model component.
  LanguageModelDawgInfo *dawg_info = GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);

  float outline_length = AssociateUtils::ComputeOutlineLength(rating_cert_scale_, *b);
  // Invoke the character n-gram language model component.
  LanguageModelNgramInfo *ngram_info = nullptr;
  if (language_model_ngram_on) {
    ngram_info = GenerateNgramInfo(dict_->getUnicharset().id_to_unichar(b->unichar_id()),
                                   b->certainty(), denom, curr_col, curr_row, outline_length,
                                   parent_vse);
    ASSERT_HOST(ngram_info != nullptr);
  }
  bool liked_by_language_model =
      dawg_info != nullptr || (ngram_info != nullptr && !ngram_info->pruned);
  // Quick escape if the path is not liked by the language model, cannot be
  // xheight-consistent, and is not a top choice.
  if (!liked_by_language_model && top_choice_flags == 0) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components very early pruned this entry\n");
    }
    delete ngram_info;
    delete dawg_info;
    return false;
  }

  // Check the consistency of the path: start with the x-height, since it
  // provides a significant pruning opportunity on its own.
  LMConsistencyInfo consistency_info(parent_vse != nullptr ? &parent_vse->consistency_info
                                                           : nullptr);
  consistency_info.ComputeXheightConsistency(
      b, dict_->getUnicharset().get_ispunctuation(b->unichar_id()));
  if (consistency_info.InconsistentXHeight()) {
    top_choice_flags &= ~kXhtConsistentFlag;
  }
  if (!liked_by_language_model && top_choice_flags == 0) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components early pruned this entry\n");
    }
    delete ngram_info;
    delete dawg_info;
    return false;
  }

  // Compute the rest of the consistency info.
  FillConsistencyInfo(curr_col, word_end, b, parent_vse, word_res, &consistency_info);
  if (dawg_info != nullptr && consistency_info.invalid_punc) {
    consistency_info.invalid_punc = false; // do not penalize dict words
  }

  // Compute the cost of associating the blobs of the current unichar.
  AssociateStats associate_stats;
  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_, parent_vse, word_res,
                        &associate_stats);
  if (parent_vse != nullptr) {
    associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
    associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
  }

  // Create the new ViterbiStateEntry and compute the adjusted cost of the path.
  auto *new_vse = new ViterbiStateEntry(parent_vse, b, 0.0, outline_length, consistency_info,
                                        associate_stats, top_choice_flags, dawg_info, ngram_info,
                                        (language_model_debug_level > 0)
                                            ? dict_->getUnicharset().id_to_unichar(b->unichar_id())
                                            : nullptr);
  new_vse->cost = ComputeAdjustedPathCost(new_vse);
  if (language_model_debug_level >= 3) {
    tprintf("Adjusted cost = %g\n", new_vse->cost);
  }
  // ...
  // If no language model component liked this unichar, discard the entry.
  bool keep = new_vse->top_choice_flags || liked_by_language_model;
  if (!keep) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components did not like this entry\n");
    }
    delete new_vse;
    return false;
  }

  // Discard the entry if it represents a prunable path and
  // language_model_viterbi_list_max_num_prunable such entries with a lower
  // cost have already been recorded.
  if (PrunablePath(*new_vse) &&
      (curr_state->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) &&
      new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
    if (language_model_debug_level > 1) {
      tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n", new_vse->cost,
              curr_state->viterbi_state_entries_prunable_max_cost);
    }
    delete new_vse;
    return false;
  }

  // Update the best choice if needed.
  if (word_end) {
    UpdateBestChoice(new_vse, pain_points, word_res, best_choice_bundle, blamer_bundle);
    // Discard the entry if UpdateBestChoice() found flaws in it.
    if (new_vse->cost >= WERD_CHOICE::kBadRating && new_vse != best_choice_bundle->best_vse) {
      if (language_model_debug_level > 1) {
        tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
      }
      delete new_vse;
      return false;
    }
  }

  // Add the new ViterbiStateEntry to curr_state->viterbi_state_entries.
  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare, false, new_vse);
  curr_state->viterbi_state_entries_length++;
  if (PrunablePath(*new_vse)) {
    curr_state->viterbi_state_entries_prunable_length++;
  }

  // Update viterbi_state_entries_prunable_max_cost and clear the top choice
  // flags of entries that are now beaten by new_vse.
  if ((curr_state->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) ||
      new_vse->top_choice_flags) {
    int prunable_counter = language_model_viterbi_list_max_num_prunable;
    vit.set_to_list(&curr_state->viterbi_state_entries);
    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
      // ...
      if (prunable_counter > 0 && PrunablePath(*vit.data())) {
        --prunable_counter;
      }
      if (prunable_counter == 0) {
        curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
        if (language_model_debug_level > 1) {
          tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
                  curr_state->viterbi_state_entries_prunable_max_cost);
        }
        prunable_counter = -1; // stop counting
      }
    }
  }

  // Print the newly created ViterbiStateEntry.
  if (language_model_debug_level > 2) {
    new_vse->Print("New");
    if (language_model_debug_level > 5) {
      curr_state->Print("Updated viterbi list");
    }
  }
  return true;
}
void LanguageModel::GenerateTopChoiceInfo(ViterbiStateEntry *new_vse,
                                          const ViterbiStateEntry *parent_vse,
                                          LanguageModelState *lms) {
  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
  for (vit.mark_cycle_pt();
       !vit.cycled_list() && new_vse->top_choice_flags && new_vse->cost >= vit.data()->cost;
       vit.forward()) {
    // Clear the flags that are held by a lower-cost entry in the list.
    new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
  }
  if (language_model_debug_level > 2) {
    tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n", new_vse->top_choice_flags);
  }
}

LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(bool word_end, int curr_col, int curr_row,
                                                       const BLOB_CHOICE &b,
                                                       const ViterbiStateEntry *parent_vse) {
  // Initialize active_dawgs from parent_vse if possible; a path that is not
  // a dictionary word so far cannot become one.
  if (parent_vse == nullptr) {
    dawg_args_.active_dawgs = &very_beginning_active_dawgs_;
    dawg_args_.permuter = NO_PERM;
  } else {
    if (parent_vse->dawg_info == nullptr) {
      return nullptr;
    }
    dawg_args_.active_dawgs = &parent_vse->dawg_info->active_dawgs;
    dawg_args_.permuter = parent_vse->dawg_info->permuter;
  }
  // Deal with hyphenated words.
  if (word_end && dict_->has_hyphen_end(&dict_->getUnicharset(), b.unichar_id(), curr_col == 0)) {
    if (language_model_debug_level > 0) {
      tprintf("Hyphenated word found\n");
    }
    return new LanguageModelDawgInfo(dawg_args_.active_dawgs, COMPOUND_PERM);
  }
  // Deal with compound words.
  if (dict_->compound_marker(b.unichar_id())) {
    if (language_model_debug_level > 0) {
      tprintf("Found compound marker\n");
    }
    // Do not allow compound operators at the beginning or end of the word,
    // or compounding of words shorter than language_model_min_compound_length.
    if (parent_vse == nullptr || word_end ||
        parent_vse->length < language_model_min_compound_length) {
      return nullptr;
    }
    // The path terminated before the marker must be a complete word.
    bool has_word_ending = false;
    for (unsigned i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
      const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
      const Dawg *pdawg = pos.dawg_index < 0 ? nullptr : dict_->GetDawg(pos.dawg_index);
      if (pdawg == nullptr || pos.back_to_punc) {
        continue;
      }
      if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
          pdawg->end_of_word(pos.dawg_ref)) {
        has_word_ending = true;
        break;
      }
    }
    if (!has_word_ending) {
      return nullptr;
    }
    if (language_model_debug_level > 0) {
      tprintf("Compound word found\n");
    }
    return new LanguageModelDawgInfo(&beginning_active_dawgs_, COMPOUND_PERM);
  }
  // Call LetterIsOkay() with the normalized ids of the unichar, so that all
  // shapes of an apostrophe are allowed in words like don't.
  const auto &normed_ids = dict_->getUnicharset().normed_ids(b.unichar_id());
  DawgPositionVector tmp_active_dawgs;
  for (unsigned i = 0; i < normed_ids.size(); ++i) {
    if (language_model_debug_level > 2) {
      tprintf("Test Letter OK for unichar %d, normed %d\n", b.unichar_id(), normed_ids[i]);
    }
    dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i],
                        word_end && i == normed_ids.size() - 1);
    if (dawg_args_.permuter == NO_PERM) {
      break;
    } else if (i < normed_ids.size() - 1) {
      tmp_active_dawgs = *dawg_args_.updated_dawgs;
      dawg_args_.active_dawgs = &tmp_active_dawgs;
    }
    if (language_model_debug_level > 2) {
      tprintf("Letter was OK for unichar %d, normed %d\n", b.unichar_id(), normed_ids[i]);
    }
  }
  dawg_args_.active_dawgs = nullptr;
  if (dawg_args_.permuter != NO_PERM) {
    return new LanguageModelDawgInfo(dawg_args_.updated_dawgs, dawg_args_.permuter);
  }
  if (language_model_debug_level > 3) {
    tprintf("Letter %s not OK!\n", dict_->getUnicharset().id_to_unichar(b.unichar_id()));
  }
  return nullptr;
}
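
// GenerateNgramInfo() extends the n-gram state of the parent path by one
// unichar: it scores the unichar in the parent context, accumulates the
// parent costs, and trims the stored context to at most
// language_model_ngram_order unichar steps.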
LanguageModelNgramInfo *LanguageModel::GenerateNgramInfo(const char *unichar, float certainty,
                                                         float denom, int curr_col, int curr_row,
                                                         float outline_length,
                                                         const ViterbiStateEntry *parent_vse) {
  // Initialize the context with the previous word string, or take it from
  // the parent entry.
  const char *pcontext_ptr = "";
  int pcontext_unichar_step_len = 0;
  if (parent_vse == nullptr) {
    pcontext_ptr = prev_word_str_.c_str();
    pcontext_unichar_step_len = prev_word_unichar_step_len_;
  } else {
    pcontext_ptr = parent_vse->ngram_info->context.c_str();
    pcontext_unichar_step_len = parent_vse->ngram_info->context_unichar_step_len;
  }
  // Compute p(unichar | parent context).
  int unichar_step_len = 0;
  bool pruned = false;
  float ngram_cost;
  float ngram_and_classifier_cost = ComputeNgramCost(unichar, certainty, denom, pcontext_ptr,
                                                     &unichar_step_len, &pruned, &ngram_cost);
  // Normalize just the ngram_and_classifier_cost by outline_length.
  ngram_and_classifier_cost *= outline_length / language_model_ngram_rating_factor;
  // Add the costs of the parent.
  if (parent_vse != nullptr) {
    ngram_and_classifier_cost += parent_vse->ngram_info->ngram_and_classifier_cost;
    ngram_cost += parent_vse->ngram_info->ngram_cost;
  }
  // Shorten the parent context so that the total context does not exceed
  // language_model_ngram_order unichar steps.
  int num_remove = (unichar_step_len + pcontext_unichar_step_len - language_model_ngram_order);
  if (num_remove > 0) {
    pcontext_unichar_step_len -= num_remove;
  }
  while (num_remove > 0 && *pcontext_ptr != '\0') {
    pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
    --num_remove;
  }
  // ...
  // Construct and return the new LanguageModelNgramInfo.
  auto *ngram_info = new LanguageModelNgramInfo(pcontext_ptr, pcontext_unichar_step_len, pruned,
                                                ngram_cost, ngram_and_classifier_cost);
  ngram_info->context += unichar;
  ngram_info->context_unichar_step_len += unichar_step_len;
  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
  return ngram_info;
}
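
// ComputeNgramCost() turns the classifier certainty and the n-gram
// probability of unichar in context into a single path cost:
//   ngram_cost                = -log2(P_ngram(unichar | context))
//   ngram_and_classifier_cost = -log2(CertaintyScore(certainty) / denom)
//                               + ngram_cost * language_model_ngram_scale_factor
// For a multi-byte UTF-8 unichar the probability is accumulated one UTF-8
// step at a time (growing the context as it goes) and then averaged.
// Worked example with hypothetical numbers: if P_ngram = 0.25 and
// CertaintyScore(certainty)/denom = 0.5, then ngram_cost = 2 and, with the
// default scale factor of 0.03, the combined cost is 1 + 2 * 0.03 = 1.06.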
float LanguageModel::ComputeNgramCost(const char *unichar, float certainty, float denom,
                                      const char *context, int *unichar_step_len,
                                      bool *found_small_prob, float *ngram_cost) {
  const char *context_ptr = context;
  char *modified_context = nullptr;
  char *modified_context_end = nullptr;
  const char *unichar_ptr = unichar;
  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
  float prob = 0.0f;
  int step = 0;
  while (unichar_ptr < unichar_end && (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
    if (language_model_debug_level > 1) {
      tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
              dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
    }
    prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
    ++(*unichar_step_len);
    if (language_model_ngram_use_only_first_uft8_step) {
      break;
    }
    unichar_ptr += step;
    // If there are multiple UTF8 characters present in unichar, the context
    // is extended with the previously examined characters from unichar.
    if (unichar_ptr < unichar_end) {
      if (modified_context == nullptr) {
        size_t context_len = strlen(context);
        modified_context = new char[context_len + strlen(unichar_ptr) + step + 1];
        memcpy(modified_context, context, context_len);
        modified_context_end = modified_context + context_len;
        context_ptr = modified_context;
      }
      strncpy(modified_context_end, unichar_ptr - step, step);
      modified_context_end += step;
      *modified_context_end = '\0';
    }
  }
  prob /= static_cast<float>(*unichar_step_len); // normalize
  if (prob < language_model_ngram_small_prob) {
    if (language_model_debug_level > 0) {
      tprintf("Found small prob %g\n", prob);
    }
    *found_small_prob = true;
    prob = language_model_ngram_small_prob;
  }
  *ngram_cost = -1 * std::log2(prob);
  float ngram_and_classifier_cost =
      -1 * std::log2(CertaintyScore(certainty) / denom) +
      *ngram_cost * language_model_ngram_scale_factor;
  if (language_model_debug_level > 1) {
    tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar, unichar, context_ptr,
            CertaintyScore(certainty) / denom, prob, ngram_and_classifier_cost);
  }
  delete[] modified_context;
  return ngram_and_classifier_cost;
}
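
// ComputeDenom() estimates the normalizing constant for the classifier
// scores of one ratings cell: the sum of CertaintyScore() over the recorded
// choices, plus a crude estimate for the unicharset entries that were never
// classified (scored at language_model_ngram_nonmatch_score).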
float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {
  if (curr_list->empty()) {
    return 1.0f;
  }
  float denom = 0.0f;
  int len = 0;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    ASSERT_HOST(c_it.data() != nullptr);
    ++len;
    denom += CertaintyScore(c_it.data()->certainty());
  }
  assert(len != 0);
  // Since we can not (for speed) classify each segment as every character in
  // the unicharset, add a crude estimate of the score sum for the "missing"
  // classifications.
  denom += (dict_->getUnicharset().size() - len) *
           CertaintyScore(language_model_ngram_nonmatch_score);
  return denom;
}
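
// FillConsistencyInfo() accumulates the per-path consistency statistics that
// feed the penalty terms: punctuation pattern validity (via the punctuation
// dawg), letter case, script, font agreement between neighboring blobs, and
// the ratio of the expected inter-blob gap to the actual one.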
void LanguageModel::FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b,
                                        ViterbiStateEntry *parent_vse, WERD_RES *word_res,
                                        LMConsistencyInfo *consistency_info) {
  const UNICHARSET &unicharset = dict_->getUnicharset();
  UNICHAR_ID unichar_id = b->unichar_id();
  BLOB_CHOICE *parent_b = parent_vse != nullptr ? parent_vse->curr_b : nullptr;

  // Check punctuation validity against the punctuation dawg.
  if (dict_->GetPuncDawg() != nullptr && !consistency_info->invalid_punc) {
    if (dict_->compound_marker(unichar_id) && parent_b != nullptr &&
        (unicharset.get_isalpha(parent_b->unichar_id()) ||
         unicharset.get_isdigit(parent_b->unichar_id()))) {
      // Reset punc_ref for compound words.
      consistency_info->punc_ref = NO_EDGE;
    } else {
      bool is_apos = dict_->is_apostrophe(unichar_id);
      bool prev_is_numalpha =
          (parent_b != nullptr && (unicharset.get_isalpha(parent_b->unichar_id()) ||
                                   unicharset.get_isdigit(parent_b->unichar_id())));
      UNICHAR_ID pattern_unichar_id =
          (unicharset.get_isalpha(unichar_id) || unicharset.get_isdigit(unichar_id) ||
           (is_apos && prev_is_numalpha))
              ? Dawg::kPatternUnicharID
              : unichar_id;
      // ...
      NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(), consistency_info->punc_ref);
      consistency_info->punc_ref =
          (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(node, pattern_unichar_id, word_end)
                            : NO_EDGE;
      if (consistency_info->punc_ref == NO_EDGE) {
        consistency_info->invalid_punc = true;
      }
    }
  }

  // Update the case-related counters.
  if (parent_vse != nullptr && !word_end && dict_->compound_marker(unichar_id)) {
    // Reset the counters if we are dealing with a compound word.
    consistency_info->num_lower = 0;
    consistency_info->num_non_first_upper = 0;
  } else if (unicharset.get_islower(unichar_id)) {
    consistency_info->num_lower++;
  } else if ((parent_b != nullptr) && unicharset.get_isupper(unichar_id)) {
    if (unicharset.get_isupper(parent_b->unichar_id()) || consistency_info->num_lower > 0 ||
        consistency_info->num_non_first_upper > 0) {
      consistency_info->num_non_first_upper++;
    }
  }

  // Initialize consistency_info->script_id (use the script of unichar_id if
  // it is not Common; otherwise inherit the script recorded by the parent).
  consistency_info->script_id = unicharset.get_script(unichar_id);
  // ...
  if (parent_vse != nullptr &&
      (parent_vse->consistency_info.script_id != dict_->getUnicharset().common_sid())) {
    int parent_script_id = parent_vse->consistency_info.script_id;
    if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
      consistency_info->script_id = parent_script_id;
    }
    if (consistency_info->script_id != parent_script_id) {
      consistency_info->inconsistent_script = true;
    }
  }
  // ...

  // Check font and spacing consistency.
  if (fontinfo_table_->size() > 0 && parent_b != nullptr) {
    int fontinfo_id = -1;
    if (parent_b->fontinfo_id() == b->fontinfo_id() ||
        parent_b->fontinfo_id2() == b->fontinfo_id()) {
      fontinfo_id = b->fontinfo_id();
    } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
               parent_b->fontinfo_id2() == b->fontinfo_id2()) {
      fontinfo_id = b->fontinfo_id2();
    }
    if (language_model_debug_level > 1) {
      tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
              (parent_b->fontinfo_id() >= 0) ? fontinfo_table_->at(parent_b->fontinfo_id()).name
                                             : "",
              (parent_b->fontinfo_id2() >= 0) ? fontinfo_table_->at(parent_b->fontinfo_id2()).name
                                              : "",
              (b->fontinfo_id() >= 0) ? fontinfo_table_->at(b->fontinfo_id()).name : "",
              (b->fontinfo_id2() >= 0) ? fontinfo_table_->at(b->fontinfo_id2()).name : "",
              (fontinfo_id >= 0) ? fontinfo_table_->at(fontinfo_id).name : "", fontinfo_id);
    }
    if (!word_res->blob_widths.empty()) { // true if we have widths/gaps info
      bool expected_gap_found = false;
      float expected_gap = 0.0f;
      int temp_gap;
      if (fontinfo_id >= 0) { // the parent and this choice share a font
        ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
        if (fontinfo_table_->at(fontinfo_id)
                .get_spacing(parent_b->unichar_id(), unichar_id, &temp_gap)) {
          expected_gap = temp_gap;
          expected_gap_found = true;
        }
      } else {
        consistency_info->inconsistent_font = true;
        // No common font: average the expected gaps over all four fonts.
        int num_addends = 0;
        int temp_fid;
        for (int i = 0; i < 4; ++i) {
          if (i == 0) {
            temp_fid = parent_b->fontinfo_id();
          } else if (i == 1) {
            temp_fid = parent_b->fontinfo_id2();
          } else if (i == 2) {
            temp_fid = b->fontinfo_id();
          } else {
            temp_fid = b->fontinfo_id2();
          }
          ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
          if (temp_fid >= 0 && fontinfo_table_->at(temp_fid).get_spacing(parent_b->unichar_id(),
                                                                         unichar_id, &temp_gap)) {
            expected_gap += temp_gap;
            num_addends++;
          }
        }
        if (num_addends > 0) {
          expected_gap /= static_cast<float>(num_addends);
          expected_gap_found = true;
        }
      }
      if (expected_gap_found) {
        int actual_gap = word_res->GetBlobsGap(curr_col - 1);
        if (actual_gap == 0) {
          consistency_info->num_inconsistent_spaces++;
        } else {
          float gap_ratio = expected_gap / actual_gap;
          if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
            consistency_info->num_inconsistent_spaces++;
          }
        }
        if (language_model_debug_level > 1) {
          tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %d\n",
                  unicharset.id_to_unichar(parent_b->unichar_id()), parent_b->unichar_id(),
                  unicharset.id_to_unichar(unichar_id), unichar_id, curr_col, expected_gap,
                  actual_gap);
        }
      }
    }
  }
}
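
// ComputeAdjustedPathCost() converts a path into a single comparable cost.
// With a trained ParamsModel the cost is a learned combination of the
// extracted features; otherwise a multiplicative adjustment that starts at
// 1.0 and grows with each penalty (non-dictionary word, overlong compound,
// bad shape, inconsistencies) scales either the n-gram cost or the plain
// ratings sum.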
float LanguageModel::ComputeAdjustedPathCost(ViterbiStateEntry *vse) {
  ASSERT_HOST(vse != nullptr);
  if (params_model_.Initialized()) {
    float features[PTRAIN_NUM_FEATURE_TYPES];
    ExtractFeaturesFromPath(*vse, features);
    float cost = params_model_.ComputeCost(features);
    if (language_model_debug_level > 3) {
      tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
      if (language_model_debug_level >= 5) {
        for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
          tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
        }
      }
    }
    return cost * vse->outline_length;
  } else {
    float adjustment = 1.0f;
    if (vse->dawg_info == nullptr || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
      adjustment += language_model_penalty_non_freq_dict_word;
    }
    if (vse->dawg_info == nullptr) {
      adjustment += language_model_penalty_non_dict_word;
      if (vse->length > language_model_min_compound_length) {
        adjustment +=
            ((vse->length - language_model_min_compound_length) * language_model_penalty_increment);
      }
    }
    if (vse->associate_stats.shape_cost > 0) {
      adjustment += vse->associate_stats.shape_cost / static_cast<float>(vse->length);
    }
    if (language_model_ngram_on) {
      ASSERT_HOST(vse->ngram_info != nullptr);
      return vse->ngram_info->ngram_and_classifier_cost * adjustment;
    }
    adjustment += ComputeConsistencyAdjustment(vse->dawg_info, vse->consistency_info);
    return vse->ratings_sum * adjustment;
  }
}
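
// UpdateBestChoice() is called when a path reaches the end of the word: it
// builds a WERD_CHOICE from the path, reports the hypothesis to the blamer
// (for training/debugging), and promotes the word to the raw/best choice of
// the WERD_RES when it beats the current one.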
void LanguageModel::UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points,
                                     WERD_RES *word_res, BestChoiceBundle *best_choice_bundle,
                                     BlamerBundle *blamer_bundle) {
  bool truth_path;
  WERD_CHOICE *word =
      ConstructWord(vse, word_res, &best_choice_bundle->fixpt, blamer_bundle, &truth_path);
  ASSERT_HOST(word != nullptr);
  if (dict_->stopper_debug_level >= 1) {
    std::string word_str;
    word->string_and_lengths(&word_str, nullptr);
    vse->Print(word_str.c_str());
  }
  if (language_model_debug_level > 0) {
    word->print("UpdateBestChoice() constructed word");
  }
  // Record features from the current path if necessary.
  ParamsTrainingHypothesis curr_hyp;
  if (blamer_bundle != nullptr) {
    // ...
    ExtractFeaturesFromPath(*vse, curr_hyp.features);
    word->string_and_lengths(&curr_hyp.str, nullptr);
    curr_hyp.cost = vse->cost; // record cost for error rate computations
    if (language_model_debug_level > 0) {
      tprintf("Raw features extracted from %s (cost=%g) [ ", curr_hyp.str.c_str(), curr_hyp.cost);
      for (float feature : curr_hyp.features) {
        tprintf("%g ", feature);
      }
      tprintf("]\n");
    }
    // Record the current hypothesis in params_training_bundle.
    blamer_bundle->AddHypothesis(curr_hyp);
    // ...
  }
  // ...
  // Update and log the new raw_choice if needed.
  if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0) {
    tprintf("Updated raw choice\n");
  }
  // Call Dict::adjust_word() since it computes the adjust_factor used by the
  // adaption code.
  dict_->adjust_word(word, vse->dawg_info == nullptr, vse->consistency_info.xht_decision, 0.0,
                     false, language_model_debug_level > 0);
  // Hand ownership of the word over to word_res.
  if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log,
                                    dict_->stopper_debug_level >= 1, word)) {
    // The word was so bad that it was deleted.
    return;
  }
  if (word_res->best_choice == word) {
    // Word was the new best.
    if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
        AcceptablePath(*vse)) {
      acceptable_choice_found_ = true;
    }
    // Update best_choice_bundle.
    best_choice_bundle->updated = true;
    best_choice_bundle->best_vse = vse;
    if (language_model_debug_level > 0) {
      tprintf("Updated best choice\n");
      word->print_state("New state ");
    }
    // ...
    if (blamer_bundle != nullptr) {
      blamer_bundle->set_best_choice_is_dict_and_top_choice(vse->dawg_info != nullptr &&
                                                            vse->top_choice_flags);
    }
  }
#ifndef GRAPHICS_DISABLED
  if (wordrec_display_segmentations && word_res->chopped_word != nullptr) {
    word->DisplaySegmentation(word_res->chopped_word);
  }
#endif
}
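
// ExtractFeaturesFromPath() fills the ParamsModel feature vector for a path
// (starting with the word-length bucket shown below); ConstructWord() then
// walks the parent chain of a terminal entry to build the corresponding
// WERD_CHOICE.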
void LanguageModel::ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[]) {
  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
  // Record dictionary match info.
  int len = vse.length <= kMaxSmallWordUnichars ? 0 : vse.length <= kMaxMediumWordUnichars ? 1 : 2;
  // ...
}
WERD_CHOICE *LanguageModel::ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res,
                                          DANGERR *fixpt, BlamerBundle *blamer_bundle,
                                          bool *truth_path) {
  if (truth_path != nullptr) {
    *truth_path =
        (blamer_bundle != nullptr && vse->length == blamer_bundle->correct_segmentation_length());
  }
  BLOB_CHOICE *curr_b = vse->curr_b;
  ViterbiStateEntry *curr_vse = vse;
  int i;
  // ...
  // Re-compute the variance of the width-to-height ratios (since we now
  // can compute the mean over the whole word).
  float full_wh_ratio_mean = 0.0f;
  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
    vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
    full_wh_ratio_mean =
        (vse->associate_stats.full_wh_ratio_total / static_cast<float>(vse->length));
    vse->associate_stats.full_wh_ratio_var = 0.0f;
  }
  // Construct a WERD_CHOICE by tracing the parent pointers.
  auto *word = new WERD_CHOICE(word_res->uch_set, vse->length);
  word->set_length(vse->length);
  int total_blobs = 0;
  for (i = (vse->length - 1); i >= 0; --i) {
    if (blamer_bundle != nullptr && truth_path != nullptr && *truth_path &&
        !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
      *truth_path = false;
    }
    // The number of blobs used for this choice is row - col + 1.
    int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
    total_blobs += num_blobs;
    word->set_blob_choice(i, num_blobs, curr_b);
    // Update the width-to-height ratio variance. Useful for non-space
    // delimited languages to ensure that the blobs are of uniform width.
    // Skip leading and trailing punctuation when computing the variance.
    if ((full_wh_ratio_mean != 0.0f &&
         ((curr_vse != vse && curr_vse->parent_vse != nullptr) ||
          !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
      vse->associate_stats.full_wh_ratio_var +=
          pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
      if (language_model_debug_level > 2) {
        tprintf("full_wh_ratio_var += (%g-%g)^2\n", full_wh_ratio_mean,
                curr_vse->associate_stats.full_wh_ratio);
      }
    }
    // ...
    // Update the curr_* pointers.
    curr_vse = curr_vse->parent_vse;
    if (curr_vse == nullptr) {
      break;
    }
    curr_b = curr_vse->curr_b;
  }
  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids
  // ...
  // Re-adjust the shape cost to include the updated width-to-height variance.
  if (full_wh_ratio_mean != 0.0f) {
    vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
  }
  // ...
  return word;
}