tesseract  4.00.00dev
language_model.cpp
Go to the documentation of this file.
1 // File: language_model.cpp
3 // Description: Functions that utilize the knowledge about the properties,
4 // structure and statistics of the language to help recognition.
5 // Author: Daria Antonova
6 // Created: Mon Nov 11 11:26:43 PST 2009
7 //
8 // (C) Copyright 2009, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #include <math.h>
22 
23 #include "language_model.h"
24 
25 #include "dawg.h"
26 #include "intproto.h"
27 #include "helpers.h"
28 #include "lm_state.h"
29 #include "lm_pain_points.h"
30 #include "matrix.h"
31 #include "params.h"
33 
#if (defined(_MSC_VER) && _MSC_VER < 1900) || defined(ANDROID)
// Fallback for toolchains whose math library lacks log2() (pre-VS2015 MSVC
// and older Android NDKs): log2(n) = ln(n) / ln(2).
double log2(double n) {
  const double ln2 = log(2.0);
  return log(n) / ln2;
}
#endif  // (defined(_MSC_VER) && _MSC_VER < 1900) || defined(ANDROID)
39 
40 namespace tesseract {
41 
// Class-level constant bounding the average per-character ngram cost.
// NOTE(review): its consumers are not visible in this extraction — confirm
// usage against the full language_model.cpp before relying on the value.
42 const float LanguageModel::kMaxAvgNgramCost = 25.0f;
43 
// LanguageModel constructor. NOTE(review): the opening signature line was
// dropped by this doc-dump extraction; only the trailing "Dict *dict)"
// parameter is visible. Registers every language-model tunable parameter
// with the Dict's CCUtil parameter list, then initializes the non-parameter
// members (dawg_args_, fontinfo_table_, dict_, ...) below.
45  Dict *dict)
46  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
47  dict->getCCUtil()->params()),
48  BOOL_INIT_MEMBER(language_model_ngram_on, false,
49  "Turn on/off the use of character ngram model",
50  dict->getCCUtil()->params()),
51  INT_MEMBER(language_model_ngram_order, 8,
52  "Maximum order of the character ngram model",
53  dict->getCCUtil()->params()),
54  INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
55  "Maximum number of prunable (those for which"
56  " PrunablePath() is true) entries in each viterbi list"
57  " recorded in BLOB_CHOICEs",
58  dict->getCCUtil()->params()),
59  INT_MEMBER(language_model_viterbi_list_max_size, 500,
60  "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
61  dict->getCCUtil()->params()),
62  double_MEMBER(language_model_ngram_small_prob, 0.000001,
63  "To avoid overly small denominators use this as the "
64  "floor of the probability returned by the ngram model.",
65  dict->getCCUtil()->params()),
66  double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
67  "Average classifier score of a non-matching unichar.",
68  dict->getCCUtil()->params()),
69  BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
70  "Use only the first UTF8 step of the given string"
71  " when computing log probabilities.",
72  dict->getCCUtil()->params()),
73  double_MEMBER(language_model_ngram_scale_factor, 0.03,
74  "Strength of the character ngram model relative to the"
75  " character classifier ",
76  dict->getCCUtil()->params()),
77  double_MEMBER(language_model_ngram_rating_factor, 16.0,
78  "Factor to bring log-probs into the same range as ratings"
79  " when multiplied by outline length ",
80  dict->getCCUtil()->params()),
81  BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
82  "Words are delimited by space", dict->getCCUtil()->params()),
83  INT_MEMBER(language_model_min_compound_length, 3,
84  "Minimum length of compound words",
85  dict->getCCUtil()->params()),
// Consistency penalties applied to path costs; see FillConsistencyInfo()
// and the consistency_info handling in AddViterbiStateEntry().
86  double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
87  "Penalty for words not in the frequent word dictionary",
88  dict->getCCUtil()->params()),
89  double_MEMBER(language_model_penalty_non_dict_word, 0.15,
90  "Penalty for non-dictionary words",
91  dict->getCCUtil()->params()),
92  double_MEMBER(language_model_penalty_punc, 0.2,
93  "Penalty for inconsistent punctuation",
94  dict->getCCUtil()->params()),
95  double_MEMBER(language_model_penalty_case, 0.1,
96  "Penalty for inconsistent case",
97  dict->getCCUtil()->params()),
98  double_MEMBER(language_model_penalty_script, 0.5,
99  "Penalty for inconsistent script",
100  dict->getCCUtil()->params()),
101  double_MEMBER(language_model_penalty_chartype, 0.3,
102  "Penalty for inconsistent character type",
103  dict->getCCUtil()->params()),
104  // TODO(daria, rays): enable font consistency checking
105  // after improving font analysis.
106  double_MEMBER(language_model_penalty_font, 0.00,
107  "Penalty for inconsistent font",
108  dict->getCCUtil()->params()),
109  double_MEMBER(language_model_penalty_spacing, 0.05,
110  "Penalty for inconsistent spacing",
111  dict->getCCUtil()->params()),
112  double_MEMBER(language_model_penalty_increment, 0.01, "Penalty increment",
113  dict->getCCUtil()->params()),
114  INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
115  dict->getCCUtil()->params()),
116  BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
117  "Use sigmoidal score for certainty",
118  dict->getCCUtil()->params()),
// Non-parameter members. dawg_args_ owns a heap-allocated
// DawgPositionVector for updated_dawgs (freed elsewhere — the destructor is
// not visible in this extraction).
119  dawg_args_(nullptr, new DawgPositionVector(), NO_PERM),
120  fontinfo_table_(fontinfo_table),
121  dict_(dict),
122  fixed_pitch_(false),
123  max_char_wh_ratio_(0.0),
124  acceptable_choice_found_(false) {
125  ASSERT_HOST(dict_ != NULL);
126 }
127 
// InitForWord. NOTE(review): the destructor and this function's opening
// signature line were dropped by the extraction, as were several body lines
// (doc-source 130, 137, 140-143, 147, 150, 157, 160), so the step-counting
// loop below is visibly truncated. Caches per-word settings and seeds
// prev_word_str_ from the previous word's unichar string (or " " when there
// is no previous word), then walks it in UTF8 steps.
129 
131  bool fixed_pitch, float max_char_wh_ratio,
132  float rating_cert_scale) {
133  fixed_pitch_ = fixed_pitch;
134  max_char_wh_ratio_ = max_char_wh_ratio;
135  rating_cert_scale_ = rating_cert_scale;
136  acceptable_choice_found_ = false;
138 
139  // Initialize vectors with beginning DawgInfos.
144 
145  // Fill prev_word_str_ with the last language_model_ngram_order
146  // unichars from prev_word.
148  if (prev_word != NULL && prev_word->unichar_string() != NULL) {
149  prev_word_str_ = prev_word->unichar_string();
151  } else {
152  prev_word_str_ = " ";
153  }
154  const char *str_ptr = prev_word_str_.string();
155  const char *str_end = str_ptr + prev_word_str_.length();
156  int step;
// Walk the whole string in UTF8 steps; utf8_step() returning 0 would mean
// malformed UTF8, which the ASSERT below rules out.
158  while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
159  str_ptr += step;
161  }
162  ASSERT_HOST(str_ptr == str_end);
163  }
164 }
165 
170 static void ScanParentsForCaseMix(const UNICHARSET& unicharset,
171  LanguageModelState* parent_node) {
172  if (parent_node == NULL) return;
173  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
174  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
175  ViterbiStateEntry* vse = vit.data();
176  vse->competing_vse = NULL;
177  UNICHAR_ID unichar_id = vse->curr_b->unichar_id();
178  if (unicharset.get_isupper(unichar_id) ||
179  unicharset.get_islower(unichar_id)) {
180  UNICHAR_ID other_case = unicharset.get_other_case(unichar_id);
181  if (other_case == unichar_id) continue; // Not in unicharset.
182  // Find other case in same list. There could be multiple entries with
183  // the same unichar_id, but in theory, they should all point to the
184  // same BLOB_CHOICE, and that is what we will be using to decide
185  // which to keep.
186  ViterbiStateEntry_IT vit2(&parent_node->viterbi_state_entries);
187  for (vit2.mark_cycle_pt(); !vit2.cycled_list() &&
188  vit2.data()->curr_b->unichar_id() != other_case;
189  vit2.forward()) {}
190  if (!vit2.cycled_list()) {
191  vse->competing_vse = vit2.data();
192  }
193  }
194  }
195 }
196 
201 static bool HasBetterCaseVariant(const UNICHARSET& unicharset,
202  const BLOB_CHOICE* choice,
203  BLOB_CHOICE_LIST* choices) {
204  UNICHAR_ID choice_id = choice->unichar_id();
205  UNICHAR_ID other_case = unicharset.get_other_case(choice_id);
206  if (other_case == choice_id || other_case == INVALID_UNICHAR_ID)
207  return false; // Not upper or lower or not in unicharset.
208  if (unicharset.SizesDistinct(choice_id, other_case))
209  return false; // Can be separated by size.
210  BLOB_CHOICE_IT bc_it(choices);
211  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
212  BLOB_CHOICE* better_choice = bc_it.data();
213  if (better_choice->unichar_id() == other_case)
214  return true; // Found an earlier instance of other_case.
215  else if (better_choice == choice)
216  return false; // Reached the original choice.
217  }
218  return false; // Should never happen, but just in case.
219 }
220 
// UpdateState. NOTE(review): the opening signature line and a few body lines
// (doc-source 259, 276, 348) were dropped by this extraction, leaving some
// statements below visibly truncated. Processes one cell of the ratings
// matrix: combines each non-fragment BLOB_CHOICE in curr_list with each
// surviving parent ViterbiStateEntry (or with no parent at word start) and
// calls AddViterbiStateEntry() for plausible combinations. Returns true if
// any new entry was added.
248  bool just_classified,
249  int curr_col, int curr_row,
250  BLOB_CHOICE_LIST *curr_list,
251  LanguageModelState *parent_node,
252  LMPainPoints *pain_points,
253  WERD_RES *word_res,
254  BestChoiceBundle *best_choice_bundle,
255  BlamerBundle *blamer_bundle) {
256  if (language_model_debug_level > 0) {
257  tprintf("\nUpdateState: col=%d row=%d %s",
258  curr_col, curr_row, just_classified ? "just_classified" : "");
260  tprintf("(parent=%p)\n", parent_node);
261  else
262  tprintf("\n");
263  }
264  // Initialize helper variables.
265  bool word_end = (curr_row+1 >= word_res->ratings->dimension());
266  bool new_changed = false;
267  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
268  const UNICHARSET& unicharset = dict_->getUnicharset();
269  BLOB_CHOICE *first_lower = NULL;
270  BLOB_CHOICE *first_upper = NULL;
271  BLOB_CHOICE *first_digit = NULL;
272  bool has_alnum_mix = false;
273  if (parent_node != NULL) {
274  int result = SetTopParentLowerUpperDigit(parent_node);
275  if (result < 0) {
277  tprintf("No parents found to process\n");
278  return false;
279  }
280  if (result > 0)
281  has_alnum_mix = true;
282  }
283  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
284  &first_digit))
285  has_alnum_mix = false;;
286  ScanParentsForCaseMix(unicharset, parent_node);
287  if (language_model_debug_level > 3 && parent_node != NULL) {
288  parent_node->Print("Parent viterbi list");
289  }
290  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
291 
292  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
293  ViterbiStateEntry_IT vit;
294  BLOB_CHOICE_IT c_it(curr_list);
295  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
296  BLOB_CHOICE* choice = c_it.data();
297  // TODO(antonova): make sure commenting this out if ok for ngram
298  // model scoring (I think this was introduced to fix ngram model quirks).
299  // Skip NULL unichars unless it is the only choice.
300  //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
301  UNICHAR_ID unichar_id = choice->unichar_id();
302  if (unicharset.get_fragment(unichar_id)) {
303  continue; // Skip fragments.
304  }
305  // Set top choice flags.
306  LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
307  if (c_it.at_first() || !new_changed)
308  blob_choice_flags |= kSmallestRatingFlag;
309  if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
310  if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
311  if (first_digit == choice) blob_choice_flags |= kDigitFlag;
312 
313  if (parent_node == NULL) {
314  // Process the beginning of a word.
315  // If there is a better case variant that is not distinguished by size,
316  // skip this blob choice, as we have no choice but to accept the result
317  // of the character classifier to distinguish between them, even if
318  // followed by an upper case.
319  // With words like iPoc, and other CamelBackWords, the lower-upper
320  // transition can only be achieved if the classifier has the correct case
321  // as the top choice, and leaving an initial I lower down the list
322  // increases the chances of choosing IPoc simply because it doesn't
323  // include such a transition. iPoc will beat iPOC and ipoc because
324  // the other words are baseline/x-height inconsistent.
325  if (HasBetterCaseVariant(unicharset, choice, curr_list))
326  continue;
327  // Upper counts as lower at the beginning of a word.
328  if (blob_choice_flags & kUpperCaseFlag)
329  blob_choice_flags |= kLowerCaseFlag;
330  new_changed |= AddViterbiStateEntry(
331  blob_choice_flags, denom, word_end, curr_col, curr_row,
332  choice, curr_state, NULL, pain_points,
333  word_res, best_choice_bundle, blamer_bundle);
334  } else {
335  // Get viterbi entries from each parent ViterbiStateEntry.
336  vit.set_to_list(&parent_node->viterbi_state_entries);
337  int vit_counter = 0;
338  vit.mark_cycle_pt();
339  ViterbiStateEntry* parent_vse = NULL;
340  LanguageModelFlagsType top_choice_flags;
341  while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
342  c_it.data(), blob_choice_flags,
343  unicharset, word_res, &vit,
344  &top_choice_flags)) != NULL) {
345  // Skip pruned entries and do not look at prunable entries if already
346  // examined language_model_viterbi_list_max_num_prunable of those.
347  if (PrunablePath(*parent_vse) &&
349  (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
350  continue;
351  }
352  // If the parent has no alnum choice, (ie choice is the first in a
353  // string of alnum), and there is a better case variant that is not
354  // distinguished by size, skip this blob choice/parent, as with the
355  // initial blob treatment above.
356  if (!parent_vse->HasAlnumChoice(unicharset) &&
357  HasBetterCaseVariant(unicharset, choice, curr_list))
358  continue;
359  // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
360  // looks good according to the Dawgs or character ngram model.
361  new_changed |= AddViterbiStateEntry(
362  top_choice_flags, denom, word_end, curr_col, curr_row,
363  c_it.data(), curr_state, parent_vse, pain_points,
364  word_res, best_choice_bundle, blamer_bundle);
365  }
366  }
367  }
368  return new_changed;
369 }
370 
377 bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
378  BLOB_CHOICE **first_lower,
379  BLOB_CHOICE **first_upper,
380  BLOB_CHOICE **first_digit) const {
381  BLOB_CHOICE_IT c_it(curr_list);
382  const UNICHARSET &unicharset = dict_->getUnicharset();
383  BLOB_CHOICE *first_unichar = NULL;
384  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
385  UNICHAR_ID unichar_id = c_it.data()->unichar_id();
386  if (unicharset.get_fragment(unichar_id)) continue; // skip fragments
387  if (first_unichar == NULL) first_unichar = c_it.data();
388  if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
389  *first_lower = c_it.data();
390  }
391  if (*first_upper == NULL && unicharset.get_isalpha(unichar_id) &&
392  !unicharset.get_islower(unichar_id)) {
393  *first_upper = c_it.data();
394  }
395  if (*first_digit == NULL && unicharset.get_isdigit(unichar_id)) {
396  *first_digit = c_it.data();
397  }
398  }
399  ASSERT_HOST(first_unichar != NULL);
400  bool mixed = (*first_lower != NULL || *first_upper != NULL) &&
401  *first_digit != NULL;
402  if (*first_lower == NULL) *first_lower = first_unichar;
403  if (*first_upper == NULL) *first_upper = first_unichar;
404  if (*first_digit == NULL) *first_digit = first_unichar;
405  return mixed;
406 }
407 
// SetTopParentLowerUpperDigit. NOTE(review): the opening signature line and
// parts of the compound-marker condition near the end (doc-source 480, 484)
// were dropped by this extraction. Scans parent_node's viterbi list, marks
// the best-rated lower/upper/digit entries (falling back to the overall
// best entry) with the corresponding top_choice flags. Returns -1 when
// there are no parents, 1 when the list mixes alphas with digits, else 0.
418  LanguageModelState *parent_node) const {
419  if (parent_node == NULL) return -1;
420  UNICHAR_ID top_id = INVALID_UNICHAR_ID;
421  ViterbiStateEntry* top_lower = NULL;
422  ViterbiStateEntry* top_upper = NULL;
423  ViterbiStateEntry* top_digit = NULL;
424  ViterbiStateEntry* top_choice = NULL;
425  float lower_rating = 0.0f;
426  float upper_rating = 0.0f;
427  float digit_rating = 0.0f;
428  float top_rating = 0.0f;
429  const UNICHARSET &unicharset = dict_->getUnicharset();
430  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
431  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
432  ViterbiStateEntry* vse = vit.data();
433  // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
434  // back to the real character if needed.
435  ViterbiStateEntry* unichar_vse = vse;
436  UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
437  float rating = unichar_vse->curr_b->rating();
438  while (unichar_id == INVALID_UNICHAR_ID &&
439  unichar_vse->parent_vse != NULL) {
440  unichar_vse = unichar_vse->parent_vse;
441  unichar_id = unichar_vse->curr_b->unichar_id();
442  rating = unichar_vse->curr_b->rating();
443  }
// Track the lowest-rated (best) entry in each of the three categories.
444  if (unichar_id != INVALID_UNICHAR_ID) {
445  if (unicharset.get_islower(unichar_id)) {
446  if (top_lower == NULL || lower_rating > rating) {
447  top_lower = vse;
448  lower_rating = rating;
449  }
450  } else if (unicharset.get_isalpha(unichar_id)) {
451  if (top_upper == NULL || upper_rating > rating) {
452  top_upper = vse;
453  upper_rating = rating;
454  }
455  } else if (unicharset.get_isdigit(unichar_id)) {
456  if (top_digit == NULL || digit_rating > rating) {
457  top_digit = vse;
458  digit_rating = rating;
459  }
460  }
461  }
462  if (top_choice == NULL || top_rating > rating) {
463  top_choice = vse;
464  top_rating = rating;
465  top_id = unichar_id;
466  }
467  }
468  if (top_choice == NULL) return -1;
469  bool mixed = (top_lower != NULL || top_upper != NULL) &&
470  top_digit != NULL;
471  if (top_lower == NULL) top_lower = top_choice;
472  top_lower->top_choice_flags |= kLowerCaseFlag;
473  if (top_upper == NULL) top_upper = top_choice;
474  top_upper->top_choice_flags |= kUpperCaseFlag;
475  if (top_digit == NULL) top_digit = top_choice;
476  top_digit->top_choice_flags |= kDigitFlag;
477  top_choice->top_choice_flags |= kSmallestRatingFlag;
478  if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
479  (top_choice->top_choice_flags &
481  // If the compound marker top choice carries any of the top alnum flags,
482  // then give it all of them, allowing words like I-295 to be chosen.
483  top_choice->top_choice_flags |=
485  }
486  return mixed ? 1 : 0;
487 }
488 
// GetNextParentVSE. NOTE(review): the return-type/signature line and a few
// body lines (doc-source 504, 543, 545 — the second PosAndSizeAgree
// arguments and a debug guard) were dropped by this extraction. Advances
// *vse_it to the next parent entry that the current blob choice bc may
// bind to, computing *top_choice_flags for the pair. Returns the accepted
// parent ViterbiStateEntry, or NULL when the list is exhausted.
495  bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc,
496  LanguageModelFlagsType blob_choice_flags, const UNICHARSET& unicharset,
497  WERD_RES* word_res, ViterbiStateEntry_IT* vse_it,
498  LanguageModelFlagsType* top_choice_flags) const {
499  for (; !vse_it->cycled_list(); vse_it->forward()) {
500  ViterbiStateEntry* parent_vse = vse_it->data();
501  // Only consider the parent if it has been updated or
502  // if the current ratings cell has just been classified.
503  if (!just_classified && !parent_vse->updated) continue;
505  parent_vse->Print("Considering");
506  // If the parent is non-alnum, then upper counts as lower.
507  *top_choice_flags = blob_choice_flags;
508  if ((blob_choice_flags & kUpperCaseFlag) &&
509  !parent_vse->HasAlnumChoice(unicharset)) {
510  *top_choice_flags |= kLowerCaseFlag;
511  }
512  *top_choice_flags &= parent_vse->top_choice_flags;
513  UNICHAR_ID unichar_id = bc->unichar_id();
514  const BLOB_CHOICE* parent_b = parent_vse->curr_b;
515  UNICHAR_ID parent_id = parent_b->unichar_id();
516  // Digits do not bind to alphas if there is a mix in both parent and current
517  // or if the alpha is not the top choice.
518  if (unicharset.get_isdigit(unichar_id) &&
519  unicharset.get_isalpha(parent_id) &&
520  (mixed_alnum || *top_choice_flags == 0))
521  continue; // Digits don't bind to alphas.
522  // Likewise alphas do not bind to digits if there is a mix in both or if
523  // the digit is not the top choice.
524  if (unicharset.get_isalpha(unichar_id) &&
525  unicharset.get_isdigit(parent_id) &&
526  (mixed_alnum || *top_choice_flags == 0))
527  continue; // Alphas don't bind to digits.
528  // If there is a case mix of the same alpha in the parent list, then
529  // competing_vse is non-null and will be used to determine whether
530  // or not to bind the current blob choice.
531  if (parent_vse->competing_vse != NULL) {
532  const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
533  UNICHAR_ID other_id = competing_b->unichar_id();
534  if (language_model_debug_level >= 5) {
535  tprintf("Parent %s has competition %s\n",
536  unicharset.id_to_unichar(parent_id),
537  unicharset.id_to_unichar(other_id));
538  }
539  if (unicharset.SizesDistinct(parent_id, other_id)) {
540  // If other_id matches bc wrt position and size, and parent_id, doesn't,
541  // don't bind to the current parent.
542  if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
544  !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
546  continue; // Competing blobchoice has a better vertical match.
547  }
548  }
549  vse_it->forward();
550  return parent_vse; // This one is good!
551  }
552  return NULL; // Ran out of possibilities.
553 }
554 
// AddViterbiStateEntry. NOTE(review): the opening signature line and
// numerous body lines (e.g. doc-source 571, 573, 581, 593, 596, 598, 621,
// 661, 688-689, 694, 720, 726, 746, 756) were dropped by this extraction,
// leaving several conditions and calls visibly truncated. Builds a new
// ViterbiStateEntry for blob choice b under parent_vse: consults the Dawg
// and ngram language-model components, checks path consistency, computes
// the adjusted path cost, prunes hopeless entries (deleting them), and
// inserts survivors sorted into curr_state->viterbi_state_entries.
// Returns true iff a new entry was recorded.
556  LanguageModelFlagsType top_choice_flags,
557  float denom,
558  bool word_end,
559  int curr_col, int curr_row,
560  BLOB_CHOICE *b,
561  LanguageModelState *curr_state,
562  ViterbiStateEntry *parent_vse,
563  LMPainPoints *pain_points,
564  WERD_RES *word_res,
565  BestChoiceBundle *best_choice_bundle,
566  BlamerBundle *blamer_bundle) {
567  ViterbiStateEntry_IT vit;
568  if (language_model_debug_level > 1) {
569  tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
570  " certainty=%.4f top_choice_flags=0x%x",
572  b->rating(), b->certainty(), top_choice_flags);
574  tprintf(" parent_vse=%p\n", parent_vse);
575  else
576  tprintf("\n");
577  }
578  // Check whether the list is full.
579  if (curr_state != NULL &&
580  curr_state->viterbi_state_entries_length >=
582  if (language_model_debug_level > 1) {
583  tprintf("AddViterbiStateEntry: viterbi list is full!\n");
584  }
585  return false;
586  }
587 
588  // Invoke Dawg language model component.
589  LanguageModelDawgInfo *dawg_info =
590  GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
591 
592  float outline_length =
594  // Invoke Ngram language model component.
595  LanguageModelNgramInfo *ngram_info = NULL;
597  ngram_info = GenerateNgramInfo(
599  denom, curr_col, curr_row, outline_length, parent_vse);
600  ASSERT_HOST(ngram_info != NULL);
601  }
602  bool liked_by_language_model = dawg_info != NULL ||
603  (ngram_info != NULL && !ngram_info->pruned);
604  // Quick escape if not liked by the language model, can't be consistent
605  // xheight, and not top choice.
606  if (!liked_by_language_model && top_choice_flags == 0) {
607  if (language_model_debug_level > 1) {
608  tprintf("Language model components very early pruned this entry\n");
609  }
610  delete ngram_info;
611  delete dawg_info;
612  return false;
613  }
614 
615  // Check consistency of the path and set the relevant consistency_info.
616  LMConsistencyInfo consistency_info(
617  parent_vse != NULL ? &parent_vse->consistency_info : NULL);
618  // Start with just the x-height consistency, as it provides significant
619  // pruning opportunity.
620  consistency_info.ComputeXheightConsistency(
622  // Turn off xheight consistent flag if not consistent.
623  if (consistency_info.InconsistentXHeight()) {
624  top_choice_flags &= ~kXhtConsistentFlag;
625  }
626 
627  // Quick escape if not liked by the language model, not consistent xheight,
628  // and not top choice.
629  if (!liked_by_language_model && top_choice_flags == 0) {
630  if (language_model_debug_level > 1) {
631  tprintf("Language model components early pruned this entry\n");
632  }
633  delete ngram_info;
634  delete dawg_info;
635  return false;
636  }
637 
638  // Compute the rest of the consistency info.
639  FillConsistencyInfo(curr_col, word_end, b, parent_vse,
640  word_res, &consistency_info);
641  if (dawg_info != NULL && consistency_info.invalid_punc) {
642  consistency_info.invalid_punc = false; // do not penalize dict words
643  }
644 
645  // Compute cost of associating the blobs that represent the current unichar.
646  AssociateStats associate_stats;
647  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
648  parent_vse, word_res, &associate_stats);
649  if (parent_vse != NULL) {
650  associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
651  associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
652  }
653 
654  // Create the new ViterbiStateEntry compute the adjusted cost of the path.
655  ViterbiStateEntry *new_vse = new ViterbiStateEntry(
656  parent_vse, b, 0.0, outline_length,
657  consistency_info, associate_stats, top_choice_flags, dawg_info,
658  ngram_info, (language_model_debug_level > 0) ?
659  dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL);
660  new_vse->cost = ComputeAdjustedPathCost(new_vse);
662  tprintf("Adjusted cost = %g\n", new_vse->cost);
663 
664  // Invoke Top Choice language model component to make the final adjustments
665  // to new_vse->top_choice_flags.
666  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
667  GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
668  }
669 
670  // If language model components did not like this unichar - return.
671  bool keep = new_vse->top_choice_flags || liked_by_language_model;
672  if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths
673  consistency_info.inconsistent_script) { // with inconsistent script
674  keep = false;
675  }
676  if (!keep) {
677  if (language_model_debug_level > 1) {
678  tprintf("Language model components did not like this entry\n");
679  }
680  delete new_vse;
681  return false;
682  }
683 
684  // Discard this entry if it represents a prunable path and
685  // language_model_viterbi_list_max_num_prunable such entries with a lower
686  // cost have already been recorded.
687  if (PrunablePath(*new_vse) &&
690  new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
691  if (language_model_debug_level > 1) {
692  tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
693  new_vse->cost,
695  }
696  delete new_vse;
697  return false;
698  }
699 
700  // Update best choice if needed.
701  if (word_end) {
702  UpdateBestChoice(new_vse, pain_points, word_res,
703  best_choice_bundle, blamer_bundle);
704  // Discard the entry if UpdateBestChoice() found flaws in it.
705  if (new_vse->cost >= WERD_CHOICE::kBadRating &&
706  new_vse != best_choice_bundle->best_vse) {
707  if (language_model_debug_level > 1) {
708  tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
709  }
710  delete new_vse;
711  return false;
712  }
713  }
714 
715  // Add the new ViterbiStateEntry and to curr_state->viterbi_state_entries.
716  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
717  false, new_vse);
718  curr_state->viterbi_state_entries_length++;
719  if (PrunablePath(*new_vse)) {
721  }
722 
723  // Update lms->viterbi_state_entries_prunable_max_cost and clear
724  // top_choice_flags of entries with ratings_sum than new_vse->ratings_sum.
725  if ((curr_state->viterbi_state_entries_prunable_length >=
727  new_vse->top_choice_flags) {
728  ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
729  int prunable_counter = language_model_viterbi_list_max_num_prunable;
730  vit.set_to_list(&(curr_state->viterbi_state_entries));
731  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
732  ViterbiStateEntry *curr_vse = vit.data();
733  // Clear the appropriate top choice flags of the entries in the
734  // list that have cost higher thank new_entry->cost
735  // (since they will not be top choices any more).
736  if (curr_vse->top_choice_flags && curr_vse != new_vse &&
737  curr_vse->cost > new_vse->cost) {
738  curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
739  }
740  if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
741  // Update curr_state->viterbi_state_entries_prunable_max_cost.
742  if (prunable_counter == 0) {
743  curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
744  if (language_model_debug_level > 1) {
745  tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
747  }
748  prunable_counter = -1; // stop counting
749  }
750  }
751  }
752 
753  // Print the newly created ViterbiStateEntry.
754  if (language_model_debug_level > 2) {
755  new_vse->Print("New");
757  curr_state->Print("Updated viterbi list");
758  }
759 
760  return true;
761 }
762 
// GenerateTopChoiceInfo. NOTE(review): the opening signature line (taking
// the new_vse parameter) was dropped by this extraction. Clears from
// new_vse->top_choice_flags every flag already held by an entry in lms's
// viterbi list whose cost is no greater than new_vse's, so only genuinely
// best-in-category entries keep their flags.
764  const ViterbiStateEntry *parent_vse,
765  LanguageModelState *lms) {
766  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
767  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
768  new_vse->cost >= vit.data()->cost; vit.forward()) {
769  // Clear the appropriate flags if the list already contains
770  // a top choice entry with a lower cost.
771  new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
772  }
773  if (language_model_debug_level > 2) {
774  tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
775  new_vse->top_choice_flags);
776  }
777 }
778 
// GenerateDawgInfo. NOTE(review): the opening signature line and several
// body lines (e.g. doc-source 787-788, 791, 798, 810-811, 831, 840, 843,
// 854, 860-861, 864) were dropped by this extraction, so parts of the
// active-dawgs setup, hyphen handling, compound-length check and the final
// dawg_info construction are visibly truncated. Runs the Dawg component:
// advances dawg_args_ through b's normalized unichar IDs via
// Dict::LetterIsOkay(), handling hyphenated and compound words, and
// returns a LanguageModelDawgInfo, or NULL when no dictionary path
// accepts the letter.
780  bool word_end,
781  int curr_col, int curr_row,
782  const BLOB_CHOICE &b,
783  const ViterbiStateEntry *parent_vse) {
784  // Initialize active_dawgs from parent_vse if it is not NULL.
785  // Otherwise use very_beginning_active_dawgs_.
786  if (parent_vse == NULL) {
789  } else {
790  if (parent_vse->dawg_info == NULL) return NULL; // not a dict word path
792  dawg_args_.permuter = parent_vse->dawg_info->permuter;
793  }
794 
795  // Deal with hyphenated words.
796  if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
797  if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
799  }
800 
801  // Deal with compound words.
802  if (dict_->compound_marker(b.unichar_id()) &&
803  (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
804  if (language_model_debug_level > 0) tprintf("Found compound marker\n");
805  // Do not allow compound operators at the beginning and end of the word.
806  // Do not allow more than one compound operator per word.
807  // Do not allow compounding of words with lengths shorter than
808  // language_model_min_compound_length
809  if (parent_vse == NULL || word_end ||
812  return NULL;
813 
814  int i;
815  // Check a that the path terminated before the current character is a word.
816  bool has_word_ending = false;
817  for (i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
818  const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
819  const Dawg *pdawg = pos.dawg_index < 0
820  ? NULL : dict_->GetDawg(pos.dawg_index);
821  if (pdawg == NULL || pos.back_to_punc) continue;;
822  if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
823  pdawg->end_of_word(pos.dawg_ref)) {
824  has_word_ending = true;
825  break;
826  }
827  }
828  if (!has_word_ending) return NULL;
829 
830  if (language_model_debug_level > 0) tprintf("Compound word found\n");
832  } // done dealing with compound words
833 
834  LanguageModelDawgInfo *dawg_info = NULL;
835 
836  // Call LetterIsOkay().
837  // Use the normalized IDs so that all shapes of ' can be allowed in words
838  // like don't.
839  const GenericVector<UNICHAR_ID>& normed_ids =
841  DawgPositionVector tmp_active_dawgs;
842  for (int i = 0; i < normed_ids.size(); ++i) {
844  tprintf("Test Letter OK for unichar %d, normed %d\n",
845  b.unichar_id(), normed_ids[i]);
846  dict_->LetterIsOkay(&dawg_args_, normed_ids[i],
847  word_end && i == normed_ids.size() - 1);
848  if (dawg_args_.permuter == NO_PERM) {
849  break;
850  } else if (i < normed_ids.size() - 1) {
// Feed the updated dawgs back in as the active set for the next
// normalized component of this unichar.
851  tmp_active_dawgs = *dawg_args_.updated_dawgs;
852  dawg_args_.active_dawgs = &tmp_active_dawgs;
853  }
855  tprintf("Letter was OK for unichar %d, normed %d\n",
856  b.unichar_id(), normed_ids[i]);
857  }
858  dawg_args_.active_dawgs = nullptr;
859  if (dawg_args_.permuter != NO_PERM) {
862  } else if (language_model_debug_level > 3) {
863  tprintf("Letter %s not OK!\n",
865  }
866 
867  return dawg_info;
868 }
869 
// GenerateNgramInfo. NOTE(review): the opening signature line and a few
// body lines (doc-source 883, 901, 907, 918, 923) were dropped by this
// extraction — notably the LanguageModelNgramInfo construction itself.
// Runs the ngram component: scores unichar against the parent context via
// ComputeNgramCost(), normalizes by outline_length, accumulates the
// parent's costs, trims the context to the ngram order, and returns a new
// heap-allocated LanguageModelNgramInfo.
871  const char *unichar, float certainty, float denom,
872  int curr_col, int curr_row, float outline_length,
873  const ViterbiStateEntry *parent_vse) {
874  // Initialize parent context.
875  const char *pcontext_ptr = "";
876  int pcontext_unichar_step_len = 0;
877  if (parent_vse == NULL) {
// Word start: context is the tail of the previous word cached by
// InitForWord().
878  pcontext_ptr = prev_word_str_.string();
879  pcontext_unichar_step_len = prev_word_unichar_step_len_;
880  } else {
881  pcontext_ptr = parent_vse->ngram_info->context.string();
882  pcontext_unichar_step_len =
884  }
885  // Compute p(unichar | parent context).
886  int unichar_step_len = 0;
887  bool pruned = false;
888  float ngram_cost;
889  float ngram_and_classifier_cost =
890  ComputeNgramCost(unichar, certainty, denom,
891  pcontext_ptr, &unichar_step_len,
892  &pruned, &ngram_cost);
893  // Normalize just the ngram_and_classifier_cost by outline_length.
894  // The ngram_cost is used by the params_model, so it needs to be left as-is,
895  // and the params model cost will be normalized by outline_length.
896  ngram_and_classifier_cost *=
897  outline_length / language_model_ngram_rating_factor;
898  // Add the ngram_cost of the parent.
899  if (parent_vse != NULL) {
900  ngram_and_classifier_cost +=
902  ngram_cost += parent_vse->ngram_info->ngram_cost;
903  }
904 
905  // Shorten parent context string by unichar_step_len unichars.
906  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
908  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
909  while (num_remove > 0 && *pcontext_ptr != '\0') {
910  pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
911  --num_remove;
912  }
913 
914  // Decide whether to prune this ngram path and update changed accordingly.
915  if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;
916 
917  // Construct and return the new LanguageModelNgramInfo.
919  pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
920  ngram_and_classifier_cost);
921  ngram_info->context += unichar;
922  ngram_info->context_unichar_step_len += unichar_step_len;
924  return ngram_info;
925 }
926 
// Computes the cost of appending |unichar| (which may span several UTF-8
// characters) to |context| under the character-ngram model, combined with
// the classifier certainty normalized by |denom|.
// Outputs:
//   *unichar_step_len - number of UTF-8 steps consumed from unichar
//   *found_small_prob - set true when the averaged probability fell below
//                       language_model_ngram_small_prob
//   *ngram_cost       - the pure ngram cost, -log2(averaged probability)
// Returns the combined ngram-and-classifier cost.
927 float LanguageModel::ComputeNgramCost(const char *unichar,
928  float certainty,
929  float denom,
930  const char *context,
931  int *unichar_step_len,
932  bool *found_small_prob,
933  float *ngram_cost) {
934  const char *context_ptr = context;
// modified_context is lazily heap-allocated only when unichar contains more
// than one UTF-8 character; freed via delete[] at the end of the function.
935  char *modified_context = NULL;
936  char *modified_context_end = NULL;
937  const char *unichar_ptr = unichar;
938  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
939  float prob = 0.0f;
940  int step = 0;
// Walk unichar one UTF-8 character at a time, summing per-step probabilities.
941  while (unichar_ptr < unichar_end &&
942  (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
943  if (language_model_debug_level > 1) {
944  tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
945  dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
946  }
947  prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
948  ++(*unichar_step_len);
// (doc-dump gap: original line 949 missing — presumably the early break
//  guarded by language_model_ngram_use_only_first_uft8_step — TODO confirm)
950  unichar_ptr += step;
951  // If there are multiple UTF8 characters present in unichar, context is
952  // updated to include the previously examined characters from str,
953  // unless use_only_first_uft8_step is true.
954  if (unichar_ptr < unichar_end) {
955  if (modified_context == NULL) {
956  int context_len = strlen(context);
957  modified_context =
958  new char[context_len + strlen(unichar_ptr) + step + 1];
959  strncpy(modified_context, context, context_len);
960  modified_context_end = modified_context + context_len;
961  context_ptr = modified_context;
962  }
// Append the character just consumed so the next step sees it as context.
963  strncpy(modified_context_end, unichar_ptr - step, step);
964  modified_context_end += step;
965  *modified_context_end = '\0';
966  }
967  }
968  prob /= static_cast<float>(*unichar_step_len); // normalize
969  if (prob < language_model_ngram_small_prob) {
970  if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
971  *found_small_prob = true;
// (doc-dump gap: original line 972 missing — presumably clamping prob to
//  language_model_ngram_small_prob — TODO confirm)
973  }
974  *ngram_cost = -1.0*log2(prob);
975  float ngram_and_classifier_cost =
976  -1.0*log2(CertaintyScore(certainty)/denom) +
977  *ngram_cost * language_model_ngram_scale_factor;
978  if (language_model_debug_level > 1) {
979  tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
980  unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
981  ngram_and_classifier_cost);
982  }
983  delete[] modified_context;
984  return ngram_and_classifier_cost;
985 }
986 
// Computes the normalizing denominator used by the ngram model: the sum of
// CertaintyScore() over every choice in curr_list, plus a crude estimate of
// the contribution of the unicharset classes absent from the list.
// Returns 1.0f for an empty list.
987 float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {
988  if (curr_list->empty()) return 1.0f;
989  float denom = 0.0f;
990  int len = 0;
991  BLOB_CHOICE_IT c_it(curr_list);
992  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
993  ASSERT_HOST(c_it.data() != NULL);
994  ++len;
995  denom += CertaintyScore(c_it.data()->certainty());
996  }
// Guaranteed non-zero: the empty-list case returned above.
997  assert(len != 0);
998  // The ideal situation would be to have the classifier scores for
999  // classifying each position as each of the characters in the unicharset.
1000  // Since we can not do this because of speed, we add a very crude estimate
1001  // of what these scores for the "missing" classifications would sum up to.
1002  denom += (dict_->getUnicharset().size() - len) *
// (doc-dump gap: original line 1003 missing — the per-missing-class score
//  factor, presumably CertaintyScore(language_model_ngram_nonmatch_score)
//  — TODO confirm)
1004 
1005  return denom;
1006 }
1007 
// Updates consistency_info with punctuation / case / script / char-type /
// font / spacing statistics for appending blob choice |b| to the path ending
// at |parent_vse|. These counters later feed ComputeConsistencyAdjustment().
// NOTE(review): the original line 1008 declaring the function —
// void LanguageModel::FillConsistencyInfo(...) per the cross-reference
// index — was eaten by the doc extraction; the parameters below belong to it.
1009  int curr_col,
1010  bool word_end,
1011  BLOB_CHOICE *b,
1012  ViterbiStateEntry *parent_vse,
1013  WERD_RES *word_res,
1014  LMConsistencyInfo *consistency_info) {
1015  const UNICHARSET &unicharset = dict_->getUnicharset();
1016  UNICHAR_ID unichar_id = b->unichar_id();
1017  BLOB_CHOICE* parent_b = parent_vse != NULL ? parent_vse->curr_b : NULL;
1018 
1019  // Check punctuation validity.
1020  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
1021  if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
1022  if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
1023  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1024  unicharset.get_isdigit(parent_b->unichar_id()))) {
1025  // reset punc_ref for compound words
1026  consistency_info->punc_ref = NO_EDGE;
1027  } else {
1028  bool is_apos = dict_->is_apostrophe(unichar_id);
1029  bool prev_is_numalpha = (parent_b != NULL &&
1030  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1031  unicharset.get_isdigit(parent_b->unichar_id())));
// Alnum chars (and apostrophes following alnum) are matched against the
// punc dawg's generic pattern id rather than their own id.
1032  UNICHAR_ID pattern_unichar_id =
1033  (unicharset.get_isalpha(unichar_id) ||
1034  unicharset.get_isdigit(unichar_id) ||
1035  (is_apos && prev_is_numalpha)) ?
1036  Dawg::kPatternUnicharID : unichar_id;
1037  if (consistency_info->punc_ref == NO_EDGE ||
1038  pattern_unichar_id != Dawg::kPatternUnicharID ||
1039  dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
// (doc-dump gap: original lines 1040-1041 missing — presumably the
//  Dawg::kPatternUnicharID comparison and the
//  NODE_REF node = Dict::GetStartingNode(...) line — TODO confirm)
1042  consistency_info->punc_ref);
1043  consistency_info->punc_ref =
1044  (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
1045  node, pattern_unichar_id, word_end) : NO_EDGE;
1046  if (consistency_info->punc_ref == NO_EDGE) {
1047  consistency_info->invalid_punc = true;
1048  }
1049  }
1050  }
1051  }
1052 
1053  // Update case related counters.
1054  if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
1055  // Reset counters if we are dealing with a compound word.
1056  consistency_info->num_lower = 0;
1057  consistency_info->num_non_first_upper = 0;
1058  }
1059  else if (unicharset.get_islower(unichar_id)) {
1060  consistency_info->num_lower++;
1061  } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
// An upper-case char counts as "non-first upper" once the word already has
// any lower-case char, a previous non-first upper, or an upper-case parent.
1062  if (unicharset.get_isupper(parent_b->unichar_id()) ||
1063  consistency_info->num_lower > 0 ||
1064  consistency_info->num_non_first_upper > 0) {
1065  consistency_info->num_non_first_upper++;
1066  }
1067  }
1068 
1069  // Initialize consistency_info->script_id (use script of unichar_id
1070  // if it is not Common, use script id recorded by the parent otherwise).
1071  // Set inconsistent_script to true if the script of the current unichar
1072  // is not consistent with that of the parent.
1073  consistency_info->script_id = unicharset.get_script(unichar_id);
1074  // Hiragana and Katakana can mix with Han.
// (doc-dump gap: original line 1075 missing — presumably the guard that the
//  unicharset has a Han script id — TODO confirm)
1076  if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
1077  consistency_info->script_id == unicharset.hiragana_sid()) ||
1078  (unicharset.katakana_sid() != unicharset.null_sid() &&
1079  consistency_info->script_id == unicharset.katakana_sid())) {
1080  consistency_info->script_id = dict_->getUnicharset().han_sid();
1081  }
1082  }
1083 
1084  if (parent_vse != NULL &&
1085  (parent_vse->consistency_info.script_id !=
1086  dict_->getUnicharset().common_sid())) {
1087  int parent_script_id = parent_vse->consistency_info.script_id;
1088  // If script_id is Common, use script id of the parent instead.
1089  if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
1090  consistency_info->script_id = parent_script_id;
1091  }
1092  if (consistency_info->script_id != parent_script_id) {
1093  consistency_info->inconsistent_script = true;
1094  }
1095  }
1096 
1097  // Update chartype related counters.
1098  if (unicharset.get_isalpha(unichar_id)) {
1099  consistency_info->num_alphas++;
1100  } else if (unicharset.get_isdigit(unichar_id)) {
1101  consistency_info->num_digits++;
1102  } else if (!unicharset.get_ispunctuation(unichar_id)) {
1103  consistency_info->num_other++;
1104  }
1105 
1106  // Check font and spacing consistency.
1107  if (fontinfo_table_->size() > 0 && parent_b != NULL) {
// Look for a font shared between the parent and current blob choices.
1108  int fontinfo_id = -1;
1109  if (parent_b->fontinfo_id() == b->fontinfo_id() ||
1110  parent_b->fontinfo_id2() == b->fontinfo_id()) {
1111  fontinfo_id = b->fontinfo_id();
1112  } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
1113  parent_b->fontinfo_id2() == b->fontinfo_id2()) {
1114  fontinfo_id = b->fontinfo_id2();
1115  }
1116  if(language_model_debug_level > 1) {
1117  tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1118  (parent_b->fontinfo_id() >= 0) ?
1119  fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
1120  (parent_b->fontinfo_id2() >= 0) ?
1121  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
1122  (b->fontinfo_id() >= 0) ?
1123  fontinfo_table_->get(b->fontinfo_id()).name : "",
1124  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1125  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1126  fontinfo_id);
1127  }
1128  if (!word_res->blob_widths.empty()) { // if we have widths/gaps info
1129  bool expected_gap_found = false;
1130  float expected_gap;
1131  int temp_gap;
1132  if (fontinfo_id >= 0) { // found a common font
1133  ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
1134  if (fontinfo_table_->get(fontinfo_id).get_spacing(
1135  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1136  expected_gap = temp_gap;
1137  expected_gap_found = true;
1138  }
1139  } else {
1140  consistency_info->inconsistent_font = true;
1141  // Get an average of the expected gaps in each font
1142  int num_addends = 0;
1143  expected_gap = 0;
1144  int temp_fid;
// Iterate over the four candidate fonts: both of the parent's and both of
// the current blob's font ids.
1145  for (int i = 0; i < 4; ++i) {
1146  if (i == 0) {
1147  temp_fid = parent_b->fontinfo_id();
1148  } else if (i == 1) {
1149  temp_fid = parent_b->fontinfo_id2();
1150  } else if (i == 2) {
1151  temp_fid = b->fontinfo_id();
1152  } else {
1153  temp_fid = b->fontinfo_id2();
1154  }
1155  ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
1156  if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
1157  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1158  expected_gap += temp_gap;
1159  num_addends++;
1160  }
1161  }
1162  expected_gap_found = (num_addends > 0);
1163  if (num_addends > 0) {
1164  expected_gap /= static_cast<float>(num_addends);
1165  }
1166  }
1167  if (expected_gap_found) {
1168  float actual_gap =
1169  static_cast<float>(word_res->GetBlobsGap(curr_col-1));
1170  float gap_ratio = expected_gap / actual_gap;
1171  // TODO(rays) The gaps seem to be way off most of the time, saved by
1172  // the error here that the ratio was compared to 1/2, when it should
1173  // have been 0.5f. Find the source of the gaps discrepancy and put
1174  // the 0.5f here in place of 0.0f.
1175  // Test on 2476595.sj, pages 0 to 6. (In French.)
1176  if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
1177  consistency_info->num_inconsistent_spaces++;
1178  }
1179  if (language_model_debug_level > 1) {
1180  tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
1181  unicharset.id_to_unichar(parent_b->unichar_id()),
1182  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
1183  unichar_id, curr_col, expected_gap, actual_gap);
1184  }
1185  }
1186  }
1187  }
1188 }
1189 
// Returns the adjusted cost of the path ending at |vse|. When a trained
// params model is available, the cost is its prediction scaled by the path's
// outline length; otherwise a hand-tuned adjustment multiplier is applied to
// either the ngram cost or the raw ratings sum.
// NOTE(review): the original line 1190 with the signature —
// float LanguageModel::ComputeAdjustedPathCost(ViterbiStateEntry *vse) per
// the cross-reference index — was eaten by the doc extraction.
1191  ASSERT_HOST(vse != NULL);
1192  if (params_model_.Initialized()) {
// Trained path: score the extracted feature vector with the params model.
1193  float features[PTRAIN_NUM_FEATURE_TYPES];
1194  ExtractFeaturesFromPath(*vse, features);
1195  float cost = params_model_.ComputeCost(features);
1196  if (language_model_debug_level > 3) {
1197  tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
1198  if (language_model_debug_level >= 5) {
1199  for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
1200  tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
1201  }
1202  }
1203  }
1204  return cost * vse->outline_length;
1205  } else {
// Heuristic path: build a multiplicative adjustment factor.
1206  float adjustment = 1.0f;
1207  if (vse->dawg_info == NULL || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
// (doc-dump gap: original line 1208 missing — presumably
//  adjustment += language_model_penalty_non_freq_dict_word; — TODO confirm)
1209  }
1210  if (vse->dawg_info == NULL) {
// (doc-dump gap: original lines 1211-1212 missing — presumably the
//  non-dict-word penalty and the compound-length guard — TODO confirm)
1213  adjustment += ((vse->length - language_model_min_compound_length) *
// (doc-dump gap: original line 1214 missing — the per-extra-char penalty
//  factor — TODO confirm)
1215  }
1216  }
1217  if (vse->associate_stats.shape_cost > 0) {
1218  adjustment += vse->associate_stats.shape_cost /
1219  static_cast<float>(vse->length);
1220  }
// (doc-dump gap: original line 1221 missing — presumably
//  if (language_model_ngram_on) { — TODO confirm)
1222  ASSERT_HOST(vse->ngram_info != NULL);
1223  return vse->ngram_info->ngram_and_classifier_cost * adjustment;
1224  } else {
1225  adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
1226  vse->consistency_info);
1227  return vse->ratings_sum * adjustment;
1228  }
1229  }
1230 }
1231 
// Constructs a WERD_CHOICE from the path ending at |vse|, records training
// features / blamer hypotheses if requested, and promotes the word to
// word_res's raw and best choice lists when it beats the current bests.
// NOTE(review): the original line 1232 declaring the function —
// void LanguageModel::UpdateBestChoice(...) per the cross-reference index —
// was eaten by the doc extraction; the parameters below belong to it.
1233  ViterbiStateEntry *vse,
1234  LMPainPoints *pain_points,
1235  WERD_RES *word_res,
1236  BestChoiceBundle *best_choice_bundle,
1237  BlamerBundle *blamer_bundle) {
1238  bool truth_path;
1239  WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
1240  blamer_bundle, &truth_path);
1241  ASSERT_HOST(word != NULL);
1242  if (dict_->stopper_debug_level >= 1) {
1243  STRING word_str;
1244  word->string_and_lengths(&word_str, NULL);
1245  vse->Print(word_str.string());
1246  }
1247  if (language_model_debug_level > 0) {
1248  word->print("UpdateBestChoice() constructed word");
1249  }
1250  // Record features from the current path if necessary.
1251  ParamsTrainingHypothesis curr_hyp;
1252  if (blamer_bundle != NULL) {
1253  if (vse->dawg_info != NULL) vse->dawg_info->permuter =
1254  static_cast<PermuterType>(word->permuter());
1255  ExtractFeaturesFromPath(*vse, curr_hyp.features);
1256  word->string_and_lengths(&(curr_hyp.str), NULL);
1257  curr_hyp.cost = vse->cost; // record cost for error rate computations
1258  if (language_model_debug_level > 0) {
1259  tprintf("Raw features extracted from %s (cost=%g) [ ",
1260  curr_hyp.str.string(), curr_hyp.cost);
1261  for (int deb_i = 0; deb_i < PTRAIN_NUM_FEATURE_TYPES; ++deb_i) {
1262  tprintf("%g ", curr_hyp.features[deb_i]);
1263  }
1264  tprintf("]\n");
1265  }
1266  // Record the current hypothesis in params_training_bundle.
1267  blamer_bundle->AddHypothesis(curr_hyp);
1268  if (truth_path)
1269  blamer_bundle->UpdateBestRating(word->rating());
1270  }
1271  if (blamer_bundle != NULL && blamer_bundle->GuidedSegsearchStillGoing()) {
1272  // The word was constructed solely for blamer_bundle->AddHypothesis, so
1273  // we no longer need it.
1274  delete word;
1275  return;
1276  }
1277  if (word_res->chopped_word != NULL && !word_res->chopped_word->blobs.empty())
1278  word->SetScriptPositions(false, word_res->chopped_word);
1279  // Update and log new raw_choice if needed.
1280  if (word_res->raw_choice == NULL ||
1281  word->rating() < word_res->raw_choice->rating()) {
1282  if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
1283  tprintf("Updated raw choice\n");
1284  }
1285  // Set the modified rating for best choice to vse->cost and log best choice.
1286  word->set_rating(vse->cost);
1287  // Call LogNewChoice() for best choice from Dict::adjust_word() since it
1288  // computes adjust_factor that is used by the adaption code (e.g. by
1289  // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
1290  // Note: the rating of the word is not adjusted.
1291  dict_->adjust_word(word, vse->dawg_info == NULL,
1292  vse->consistency_info.xht_decision, 0.0,
1293  false, language_model_debug_level > 0);
1294  // Hand ownership of the word over to the word_res.
// (doc-dump gap: original line 1295 missing — presumably the
//  if (!word_res->LogNewCookedChoice(...) call opening — TODO confirm)
1296  dict_->stopper_debug_level >= 1, word)) {
1297  // The word was so bad that it was deleted.
1298  return;
1299  }
1300  if (word_res->best_choice == word) {
1301  // Word was the new best.
// (doc-dump gap: original line 1302 missing — presumably the
//  AcceptableChoice(...) condition combined with — TODO confirm)
1303  AcceptablePath(*vse)) {
1304  acceptable_choice_found_ = true;
1305  }
1306  // Update best_choice_bundle.
1307  best_choice_bundle->updated = true;
1308  best_choice_bundle->best_vse = vse;
1309  if (language_model_debug_level > 0) {
1310  tprintf("Updated best choice\n");
1311  word->print_state("New state ");
1312  }
1313  // Update hyphen state if we are dealing with a dictionary word.
1314  if (vse->dawg_info != NULL) {
1315  if (dict_->has_hyphen_end(*word)) {
// (doc-dump gap: original line 1316 missing — presumably
//  dict_->set_hyphen_word(...) — TODO confirm)
1317  } else {
1318  dict_->reset_hyphen_vars(true);
1319  }
1320  }
1321 
1322  if (blamer_bundle != NULL) {
// (doc-dump gap: original line 1323 missing — presumably
//  blamer_bundle->set_best_choice_is_dict_and_top_choice( — TODO confirm)
1324  vse->dawg_info != NULL && vse->top_choice_flags);
1325  }
1326  }
1327  if (wordrec_display_segmentations && word_res->chopped_word != NULL) {
1328  word->DisplaySegmentation(word_res->chopped_word);
1329  }
1330 }
1331 
// Fills |features| (of size PTRAIN_NUM_FEATURE_TYPES) with the params-model
// training features for the path |vse|: dictionary-match category, shape
// cost, ngram cost, consistency counters and per-char rating.
// NOTE(review): the original line 1332 declaring the function —
// void LanguageModel::ExtractFeaturesFromPath(...) per the cross-reference
// index — was eaten by the doc extraction.
1333  const ViterbiStateEntry &vse, float features[]) {
1334  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
1335  // Record dictionary match info.
// len selects the short/medium/long bucket offset for the word-length
// dependent features below.
1336  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
1337  vse.length <= kMaxMediumWordUnichars ? 1 : 2;
1338  if (vse.dawg_info != NULL) {
1339  int permuter = vse.dawg_info->permuter;
1340  if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
1341  if (vse.consistency_info.num_digits == vse.length) {
1342  features[PTRAIN_DIGITS_SHORT+len] = 1.0;
1343  } else {
1344  features[PTRAIN_NUM_SHORT+len] = 1.0;
1345  }
1346  } else if (permuter == DOC_DAWG_PERM) {
1347  features[PTRAIN_DOC_SHORT+len] = 1.0;
1348  } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
1349  permuter == COMPOUND_PERM) {
1350  features[PTRAIN_DICT_SHORT+len] = 1.0;
1351  } else if (permuter == FREQ_DAWG_PERM) {
1352  features[PTRAIN_FREQ_SHORT+len] = 1.0;
1353  }
1354  }
1355  // Record shape cost feature (normalized by path length).
1356  features[PTRAIN_SHAPE_COST_PER_CHAR] =
1357  vse.associate_stats.shape_cost / static_cast<float>(vse.length);
1358  // Record ngram cost. (normalized by the path length).
1359  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
1360  if (vse.ngram_info != NULL) {
1361  features[PTRAIN_NGRAM_COST_PER_CHAR] =
1362  vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
1363  }
1364  // Record consistency-related features.
1365  // Disabled this feature due to its poor performance.
1366  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
// (doc-dump gap: original lines 1367-1368 missing — presumably the
//  PTRAIN_NUM_BAD_CASE and PTRAIN_XHEIGHT_CONSISTENCY assignments
//  — TODO confirm)
1369  features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == NULL ?
// (doc-dump gap: original line 1370 missing — the NumInconsistentChartype
//  expression / 0.0 alternative, presumably — TODO confirm)
1371  features[PTRAIN_NUM_BAD_SPACING] =
// (doc-dump gap: original line 1372 missing — presumably
//  vse.consistency_info.NumInconsistentSpaces(); — TODO confirm)
1373  // Disabled this feature for now due to its poor performance.
1374  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
1375 
1376  // Classifier-related features.
1377  features[PTRAIN_RATING_PER_CHAR] =
1378  vse.ratings_sum / static_cast<float>(vse.outline_length);
1379 }
1380 
// Builds a WERD_CHOICE by walking the parent_vse chain backwards from |vse|,
// recording each blob choice, recomputing the width/height ratio variance,
// setting the permuter and checking for dangerous ambiguities.
// If truth_path is non-NULL it is set to whether the path matches the
// blamer's correct segmentation.
// NOTE(review): the original line 1381 declaring the function —
// WERD_CHOICE *LanguageModel::ConstructWord(...) per the cross-reference
// index — was eaten by the doc extraction; the parameters below belong to it.
1382  ViterbiStateEntry *vse,
1383  WERD_RES *word_res,
1384  DANGERR *fixpt,
1385  BlamerBundle *blamer_bundle,
1386  bool *truth_path) {
1387  if (truth_path != NULL) {
1388  *truth_path =
1389  (blamer_bundle != NULL &&
1390  vse->length == blamer_bundle->correct_segmentation_length());
1391  }
1392  BLOB_CHOICE *curr_b = vse->curr_b;
1393  ViterbiStateEntry *curr_vse = vse;
1394 
1395  int i;
1396  bool compound = dict_->hyphenated(); // treat hyphenated words as compound
1397 
1398  // Re-compute the variance of the width-to-height ratios (since we now
1399  // can compute the mean over the whole word).
1400  float full_wh_ratio_mean = 0.0f;
1401  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
// (doc-dump gap: original line 1402 missing here — TODO confirm)
1403  full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
1404  static_cast<float>(vse->length));
1405  vse->associate_stats.full_wh_ratio_var = 0.0f;
1406  }
1407 
1408  // Construct a WERD_CHOICE by tracing parent pointers.
1409  WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length);
1410  word->set_length(vse->length);
1411  int total_blobs = 0;
// Walk from the last unichar back to the first, filling word slots i..0.
1412  for (i = (vse->length-1); i >= 0; --i) {
1413  if (blamer_bundle != NULL && truth_path != NULL && *truth_path &&
1414  !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
1415  *truth_path = false;
1416  }
1417  // The number of blobs used for this choice is row - col + 1.
1418  int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
1419  total_blobs += num_blobs;
1420  word->set_blob_choice(i, num_blobs, curr_b);
1421  // Update the width-to-height ratio variance. Useful for non-space
1422  // delimited languages to ensure that the blobs are of uniform width.
1423  // Skip leading and trailing punctuation when computing the variance.
1424  if ((full_wh_ratio_mean != 0.0f &&
1425  ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
1426  !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
// (doc-dump gap: original line 1427 missing — presumably
//  vse->associate_stats.full_wh_ratio_var += — TODO confirm)
1428  pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
1429  if (language_model_debug_level > 2) {
1430  tprintf("full_wh_ratio_var += (%g-%g)^2\n",
1431  full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
1432  }
1433  }
1434 
1435  // Mark the word as compound if compound permuter was set for any of
1436  // the unichars on the path (usually this will happen for unichars
1437  // that are compounding operators, like "-" and "/").
1438  if (!compound && curr_vse->dawg_info &&
1439  curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
1440 
1441  // Update curr_* pointers.
1442  curr_vse = curr_vse->parent_vse;
1443  if (curr_vse == NULL) break;
1444  curr_b = curr_vse->curr_b;
1445  }
1446  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids.
1447  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
1448  // Re-adjust shape cost to include the updated width-to-height variance.
1449  if (full_wh_ratio_mean != 0.0f) {
// (doc-dump gap: original line 1450 missing — the shape_cost update using
//  full_wh_ratio_var, presumably — TODO confirm)
1451  }
1452 
1453  word->set_rating(vse->ratings_sum);
1454  word->set_certainty(vse->min_certainty);
// (doc-dump gap: original lines 1455-1456 missing — presumably the
//  word->set_x_heights(...) call — TODO confirm)
1457  if (vse->dawg_info != NULL) {
1458  word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
1459  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
1460  word->set_permuter(NGRAM_PERM);
1461  } else if (vse->top_choice_flags) {
// (doc-dump gap: original line 1462 missing — presumably
//  word->set_permuter(TOP_CHOICE_PERM); — TODO confirm)
1463  } else {
1464  word->set_permuter(NO_PERM);
1465  }
1466  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
1467  word_res->ratings));
1468  return word;
1469 }
1470 
1471 } // namespace tesseract
uinT8 permuter() const
Definition: ratngs.h:342
bool empty() const
Definition: genericvector.h:91
static const float kBadRating
Definition: ratngs.h:271
int language_model_viterbi_list_max_num_prunable
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37
PointerVector< LanguageModelState > beam
Definition: lm_state.h:231
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:570
int han_sid() const
Definition: unicharset.h:887
int viterbi_state_entries_length
Total number of entries in viterbi_state_entries.
Definition: lm_state.h:211
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:672
DawgPositionVector active_dawgs
Definition: lm_state.h:64
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612
int LetterIsOkay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:359
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
AssociateStats associate_stats
Definition: lm_state.h:170
TWERD * chopped_word
Definition: pageres.h:201
static const LanguageModelFlagsType kSmallestRatingFlag
XHeightConsistencyEnum xht_decision
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
const UNICHARSET * uch_set
Definition: pageres.h:192
float features[PTRAIN_NUM_FEATURE_TYPES]
int dimension() const
Definition: matrix.h:528
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
void set_rating(float new_val)
Definition: ratngs.h:365
virtual bool end_of_word(EDGE_REF edge_ref) const =0
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:414
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
LanguageModelDawgInfo * dawg_info
Definition: lm_state.h:178
void print_state(const char *msg) const
Definition: ratngs.cpp:741
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:143
ViterbiStateEntry_LIST viterbi_state_entries
Storage for the Viterbi state.
Definition: lm_state.h:206
DawgType type() const
Definition: dawg.h:128
PermuterType
Definition: ratngs.h:238
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:126
bool language_model_ngram_space_delimited_language
DawgPositionVector * updated_dawgs
Definition: dict.h:81
int katakana_sid() const
Definition: unicharset.h:889
inT64 NODE_REF
Definition: dawg.h:56
static const LanguageModelFlagsType kXhtConsistentFlag
Struct to store information maintained by various language model components.
Definition: lm_state.h:193
float x_height
Definition: pageres.h:295
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
ViterbiStateEntry * best_vse
Best ViterbiStateEntry and BLOB_CHOICE.
Definition: lm_state.h:233
float ComputeCost(const float features[]) const
void SetScriptPositions(bool small_caps, TWERD *word)
Definition: ratngs.cpp:531
MATRIX * ratings
Definition: pageres.h:215
float rating() const
Definition: ratngs.h:323
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
LMConsistencyInfo consistency_info
Definition: lm_state.h:169
int size() const
Definition: genericvector.h:72
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
WERD_CHOICE * best_choice
Definition: pageres.h:219
static const LanguageModelFlagsType kUpperCaseFlag
void print() const
Definition: ratngs.h:576
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:50
void set_dangerous_ambig_found_(bool value)
Definition: ratngs.h:362
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:501
inT16 fontinfo_id2() const
Definition: ratngs.h:88
bool hyphenated() const
Returns true if we&#39;ve recorded the beginning of a hyphenated word.
Definition: dict.h:126
bool language_model_ngram_use_only_first_uft8_step
bool HasAlnumChoice(const UNICHARSET &unicharset)
Definition: lm_state.h:141
#define tprintf(...)
Definition: tprintf.h:31
PermuterType permuter
Definition: dict.h:82
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
int hiragana_sid() const
Definition: unicharset.h:888
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:733
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:732
int viterbi_state_entries_prunable_length
Number and max cost of prunable paths in viterbi_state_entries.
Definition: lm_state.h:208
double language_model_penalty_non_dict_word
DawgPositionVector * active_dawgs
Definition: dict.h:80
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:117
static const LanguageModelFlagsType kDigitFlag
int common_sid() const
Definition: unicharset.h:883
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
DawgPositionVector beginning_active_dawgs_
ViterbiStateEntry * competing_vse
Definition: lm_state.h:160
void Print(const char *msg) const
Definition: lm_state.cpp:27
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
int size() const
Definition: unicharset.h:338
const char * string() const
Definition: strngs.cpp:198
bool AcceptablePath(const ViterbiStateEntry &vse)
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:131
int stopper_debug_level
Definition: dict.h:622
void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc)
Definition: cluster.h:45
float ngram_cost
-ln(P_ngram_model(path))
Definition: lm_state.h:84
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
Definition: associate.h:80
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:151
Definition: strngs.h:45
void set_permuter(uinT8 perm)
Definition: ratngs.h:371
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:154
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:420
double language_model_penalty_non_freq_dict_word
LanguageModelFlagsType top_choice_flags
Definition: lm_state.h:174
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:504
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:483
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:315
LanguageModelNgramInfo * ngram_info
Definition: lm_state.h:182
inT16 fontinfo_id() const
Definition: ratngs.h:85
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:108
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:135
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:49
WERD_CHOICE * raw_choice
Definition: pageres.h:224
void set_certainty(float new_val)
Definition: ratngs.h:368
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:133
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:372
const Dawg * GetPuncDawg() const
Return the points to the punctuation dawg.
Definition: dict.h:416
float viterbi_state_entries_prunable_max_cost
Definition: lm_state.h:209
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
Definition: lm_state.h:227
bool updated
Flag to indicate whether anything was changed.
Definition: lm_state.h:225
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
float rating() const
Definition: ratngs.h:79
const MATRIX_COORD & matrix_cell()
Definition: ratngs.h:114
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:293
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:303
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596
float ngram_and_classifier_cost
-[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
Definition: lm_state.h:86
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:834
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:300
bool PrunablePath(const ViterbiStateEntry &vse)
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:430
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
void set_x_heights(float min_height, float max_height)
Definition: ratngs.h:338
ViterbiStateEntry * parent_vse
Definition: lm_state.h:157
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
EDGE_REF dawg_ref
Definition: dawg.h:374
const UnicityTable< FontInfo > * fontinfo_table_
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:309
int correct_segmentation_length() const
Definition: blamer.h:126
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:136
GenericVector< int > blob_widths
Definition: pageres.h:205
void UpdateBestRating(float rating)
Definition: blamer.h:122
double language_model_ngram_nonmatch_score
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
void set_length(int len)
Definition: ratngs.h:377
DawgPositionVector very_beginning_active_dawgs_
int tessedit_truncate_wordchoice_log
Definition: dict.h:628
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:750
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
static const float kMaxAvgNgramCost
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
float certainty() const
Definition: ratngs.h:82
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:587
float CertaintyScore(float cert)
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
int null_sid() const
Definition: unicharset.h:882
BLOB_CHOICE * curr_b
Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).
Definition: lm_state.h:156
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:215
inT32 length() const
Definition: strngs.cpp:193
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:662
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288
int UNICHAR_ID
Definition: unichar.h:35
void Print(const char *msg)
Definition: lm_state.cpp:70
static const LanguageModelFlagsType kLowerCaseFlag
const STRING & unichar_string() const
Definition: ratngs.h:537