language_model.cpp
// File:        language_model.cpp
// Description: Functions that utilize the knowledge about the properties,
//              structure and statistics of the language to help recognition.
// Author:      Daria Antonova
// Created:     Mon Nov 11 11:26:43 PST 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

#include <math.h>

#include "language_model.h"

#include "dawg.h"
#include "freelist.h"
#include "intproto.h"
#include "helpers.h"
#include "lm_state.h"
#include "lm_pain_points.h"
#include "matrix.h"
#include "params.h"
#include "params_training_featdef.h"

#if defined(_MSC_VER) || defined(ANDROID)
double log2(double n) {
  return log(n) / log(2.0);
}
#endif  // _MSC_VER || ANDROID
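// Sanity check of the fallback above (illustrative note, not in the original
// source): since log_b(x) = ln(x) / ln(b), for example
// log2(8.0) == log(8.0) / log(2.0) == 3.0.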

namespace tesseract {

const float LanguageModel::kMaxAvgNgramCost = 25.0f;

LanguageModel::LanguageModel(const UnicityTable<FontInfo> *fontinfo_table,
                             Dict *dict)
  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
               dict->getCCUtil()->params()),
    BOOL_INIT_MEMBER(language_model_ngram_on, false,
                     "Turn on/off the use of character ngram model",
                     dict->getCCUtil()->params()),
    INT_MEMBER(language_model_ngram_order, 8,
               "Maximum order of the character ngram model",
               dict->getCCUtil()->params()),
    INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
               "Maximum number of prunable (those for which"
               " PrunablePath() is true) entries in each viterbi list"
               " recorded in BLOB_CHOICEs",
               dict->getCCUtil()->params()),
    INT_MEMBER(language_model_viterbi_list_max_size, 500,
               "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
               dict->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_small_prob, 0.000001,
                  "To avoid overly small denominators use this as the "
                  "floor of the probability returned by the ngram model.",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
                  "Average classifier score of a non-matching unichar.",
                  dict->getCCUtil()->params()),
    BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
                "Use only the first UTF8 step of the given string"
                " when computing log probabilities.",
                dict->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_scale_factor, 0.03,
                  "Strength of the character ngram model relative to the"
                  " character classifier ",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_ngram_rating_factor, 16.0,
                  "Factor to bring log-probs into the same range as ratings"
                  " when multiplied by outline length ",
                  dict->getCCUtil()->params()),
    BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
                "Words are delimited by space",
                dict->getCCUtil()->params()),
    INT_MEMBER(language_model_min_compound_length, 3,
               "Minimum length of compound words",
               dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
                  "Penalty for words not in the frequent word dictionary",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_non_dict_word, 0.15,
                  "Penalty for non-dictionary words",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_punc, 0.2,
                  "Penalty for inconsistent punctuation",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_case, 0.1,
                  "Penalty for inconsistent case",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_script, 0.5,
                  "Penalty for inconsistent script",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_chartype, 0.3,
                  "Penalty for inconsistent character type",
                  dict->getCCUtil()->params()),
    // TODO(daria, rays): enable font consistency checking
    // after improving font analysis.
    double_MEMBER(language_model_penalty_font, 0.00,
                  "Penalty for inconsistent font",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_spacing, 0.05,
                  "Penalty for inconsistent spacing",
                  dict->getCCUtil()->params()),
    double_MEMBER(language_model_penalty_increment, 0.01,
                  "Penalty increment",
                  dict->getCCUtil()->params()),
    INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
               dict->getCCUtil()->params()),
    BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
                     "Use sigmoidal score for certainty",
                     dict->getCCUtil()->params()),
    fontinfo_table_(fontinfo_table), dict_(dict),
    fixed_pitch_(false), max_char_wh_ratio_(0.0),
    acceptable_choice_found_(false) {
  ASSERT_HOST(dict_ != NULL);
  dawg_args_ = new DawgArgs(NULL, new DawgPositionVector(), NO_PERM);
  very_beginning_active_dawgs_ = new DawgPositionVector();
  beginning_active_dawgs_ = new DawgPositionVector();
}

LanguageModel::~LanguageModel() {
  delete very_beginning_active_dawgs_;
  delete beginning_active_dawgs_;
  delete dawg_args_->updated_dawgs;
  delete dawg_args_;
}

void LanguageModel::InitForWord(const WERD_CHOICE *prev_word,
                                bool fixed_pitch, float max_char_wh_ratio,
                                float rating_cert_scale) {
  fixed_pitch_ = fixed_pitch;
  max_char_wh_ratio_ = max_char_wh_ratio;
  rating_cert_scale_ = rating_cert_scale;
  acceptable_choice_found_ = false;
  correct_segmentation_explored_ = false;

  // Initialize vectors with beginning DawgInfos.
  very_beginning_active_dawgs_->clear();
  dict_->init_active_dawgs(very_beginning_active_dawgs_, false);
  beginning_active_dawgs_->clear();
  dict_->default_dawgs(beginning_active_dawgs_, false);

  // Fill prev_word_str_ with the last language_model_ngram_order
  // unichars from prev_word.
  if (language_model_ngram_on) {
    if (prev_word != NULL && prev_word->unichar_string() != NULL) {
      prev_word_str_ = prev_word->unichar_string();
      if (language_model_ngram_space_delimited_language) prev_word_str_ += ' ';
    } else {
      prev_word_str_ = " ";
    }
    const char *str_ptr = prev_word_str_.string();
    const char *str_end = str_ptr + prev_word_str_.length();
    int step;
    prev_word_unichar_step_len_ = 0;
    while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
      str_ptr += step;
      ++prev_word_unichar_step_len_;
    }
    ASSERT_HOST(str_ptr == str_end);
  }
}
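// Illustrative caller sketch (hypothetical, not part of this file): a word
// recognizer would typically do
//   lm->InitForWord(prev_word, /*fixed_pitch=*/false,
//                   /*max_char_wh_ratio=*/2.0f, /*rating_cert_scale=*/-1.0f);
// once per word and then drive UpdateState() over the ratings matrix; the
// numeric arguments here are made-up examples, not recommended defaults.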

static void ScanParentsForCaseMix(const UNICHARSET& unicharset,
                                  LanguageModelState* parent_node) {
  if (parent_node == NULL) return;
  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
    ViterbiStateEntry* vse = vit.data();
    vse->competing_vse = NULL;
    UNICHAR_ID unichar_id = vse->curr_b->unichar_id();
    if (unicharset.get_isupper(unichar_id) ||
        unicharset.get_islower(unichar_id)) {
      UNICHAR_ID other_case = unicharset.get_other_case(unichar_id);
      if (other_case == unichar_id) continue;  // Not in unicharset.
      // Find other case in same list. There could be multiple entries with
      // the same unichar_id, but in theory, they should all point to the
      // same BLOB_CHOICE, and that is what we will be using to decide
      // which to keep.
      ViterbiStateEntry_IT vit2(&parent_node->viterbi_state_entries);
      for (vit2.mark_cycle_pt(); !vit2.cycled_list() &&
           vit2.data()->curr_b->unichar_id() != other_case;
           vit2.forward()) {}
      if (!vit2.cycled_list()) {
        vse->competing_vse = vit2.data();
      }
    }
  }
}

static bool HasBetterCaseVariant(const UNICHARSET& unicharset,
                                 const BLOB_CHOICE* choice,
                                 BLOB_CHOICE_LIST* choices) {
  UNICHAR_ID choice_id = choice->unichar_id();
  UNICHAR_ID other_case = unicharset.get_other_case(choice_id);
  if (other_case == choice_id || other_case == INVALID_UNICHAR_ID)
    return false;  // Not upper or lower or not in unicharset.
  if (unicharset.SizesDistinct(choice_id, other_case))
    return false;  // Can be separated by size.
  BLOB_CHOICE_IT bc_it(choices);
  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
    BLOB_CHOICE* better_choice = bc_it.data();
    if (better_choice->unichar_id() == other_case)
      return true;  // Found an earlier instance of other_case.
    else if (better_choice == choice)
      return false;  // Reached the original choice.
  }
  return false;  // Should never happen, but just in case.
}
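// Worked example for HasBetterCaseVariant (illustrative, assuming 'I' and 'i'
// are not size-distinct in the unicharset): for a rating-sorted list
// {'I' (0.9), 'i' (1.2), '1' (1.5)}, the call with choice='i' returns true,
// because its case variant 'I' appears earlier in the list; the call with
// choice='I' returns false, since the scan reaches the original choice first.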

bool LanguageModel::UpdateState(
    bool just_classified,
    int curr_col, int curr_row,
    BLOB_CHOICE_LIST *curr_list,
    LanguageModelState *parent_node,
    LMPainPoints *pain_points,
    WERD_RES *word_res,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  if (language_model_debug_level > 0) {
    tprintf("\nUpdateState: col=%d row=%d %s",
            curr_col, curr_row, just_classified ? "just_classified" : "");
    if (language_model_debug_level > 5)
      tprintf("(parent=%p)\n", parent_node);
    else
      tprintf("\n");
  }
  // Initialize helper variables.
  bool word_end = (curr_row + 1 >= word_res->ratings->dimension());
  bool new_changed = false;
  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
  const UNICHARSET& unicharset = dict_->getUnicharset();
  BLOB_CHOICE *first_lower = NULL;
  BLOB_CHOICE *first_upper = NULL;
  BLOB_CHOICE *first_digit = NULL;
  bool has_alnum_mix = false;
  if (parent_node != NULL) {
    int result = SetTopParentLowerUpperDigit(parent_node);
    if (result < 0) {
      if (language_model_debug_level > 0)
        tprintf("No parents found to process\n");
      return false;
    }
    if (result > 0)
      has_alnum_mix = true;
  }
  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
                             &first_digit))
    has_alnum_mix = false;
  ScanParentsForCaseMix(unicharset, parent_node);
  if (language_model_debug_level > 3 && parent_node != NULL) {
    parent_node->Print("Parent viterbi list");
  }
  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];

  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
  ViterbiStateEntry_IT vit;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    BLOB_CHOICE* choice = c_it.data();
    // TODO(antonova): make sure commenting this out is ok for ngram
    // model scoring (I think this was introduced to fix ngram model quirks).
    // Skip NULL unichars unless it is the only choice.
    //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
    UNICHAR_ID unichar_id = choice->unichar_id();
    if (unicharset.get_fragment(unichar_id)) {
      continue;  // Skip fragments.
    }
    // Set top choice flags.
    LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
    if (c_it.at_first() || !new_changed)
      blob_choice_flags |= kSmallestRatingFlag;
    if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
    if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
    if (first_digit == choice) blob_choice_flags |= kDigitFlag;

    if (parent_node == NULL) {
      // Process the beginning of a word.
      // If there is a better case variant that is not distinguished by size,
      // skip this blob choice, as we have no choice but to accept the result
      // of the character classifier to distinguish between them, even if
      // followed by an upper case.
      // With words like iPoc, and other CamelBackWords, the lower-upper
      // transition can only be achieved if the classifier has the correct case
      // as the top choice, and leaving an initial I lower down the list
      // increases the chances of choosing IPoc simply because it doesn't
      // include such a transition. iPoc will beat iPOC and ipoc because
      // the other words are baseline/x-height inconsistent.
      if (HasBetterCaseVariant(unicharset, choice, curr_list))
        continue;
      // Upper counts as lower at the beginning of a word.
      if (blob_choice_flags & kUpperCaseFlag)
        blob_choice_flags |= kLowerCaseFlag;
      new_changed |= AddViterbiStateEntry(
          blob_choice_flags, denom, word_end, curr_col, curr_row,
          choice, curr_state, NULL, pain_points,
          word_res, best_choice_bundle, blamer_bundle);
    } else {
      // Get viterbi entries from each parent ViterbiStateEntry.
      vit.set_to_list(&parent_node->viterbi_state_entries);
      int vit_counter = 0;
      vit.mark_cycle_pt();
      ViterbiStateEntry* parent_vse = NULL;
      LanguageModelFlagsType top_choice_flags;
      while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
                                            c_it.data(), blob_choice_flags,
                                            unicharset, word_res, &vit,
                                            &top_choice_flags)) != NULL) {
        // Skip pruned entries and do not look at prunable entries if already
        // examined language_model_viterbi_list_max_num_prunable of those.
        if (PrunablePath(*parent_vse) &&
            (++vit_counter > language_model_viterbi_list_max_num_prunable ||
             (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
          continue;
        }
        // If the parent has no alnum choice, (ie choice is the first in a
        // string of alnum), and there is a better case variant that is not
        // distinguished by size, skip this blob choice/parent, as with the
        // initial blob treatment above.
        if (!parent_vse->HasAlnumChoice(unicharset) &&
            HasBetterCaseVariant(unicharset, choice, curr_list))
          continue;
        // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
        // looks good according to the Dawgs or character ngram model.
        new_changed |= AddViterbiStateEntry(
            top_choice_flags, denom, word_end, curr_col, curr_row,
            c_it.data(), curr_state, parent_vse, pain_points,
            word_res, best_choice_bundle, blamer_bundle);
      }
    }
  }
  return new_changed;
}
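// Caller sketch (hypothetical, for illustration only): the segmentation search
// invokes UpdateState() once per classified ratings-matrix cell, roughly
//   lm->UpdateState(just_classified, col, row,
//                   word_res->ratings->get(col, row),
//                   col == 0 ? NULL : best_choice_bundle->beam[col - 1],
//                   pain_points, word_res, best_choice_bundle, blamer_bundle);
// so parent_node is NULL exactly at the first column of the word.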

bool LanguageModel::GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list,
                                          BLOB_CHOICE **first_lower,
                                          BLOB_CHOICE **first_upper,
                                          BLOB_CHOICE **first_digit) const {
  BLOB_CHOICE_IT c_it(curr_list);
  const UNICHARSET &unicharset = dict_->getUnicharset();
  BLOB_CHOICE *first_unichar = NULL;
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    UNICHAR_ID unichar_id = c_it.data()->unichar_id();
    if (unicharset.get_fragment(unichar_id)) continue;  // skip fragments
    if (first_unichar == NULL) first_unichar = c_it.data();
    if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
      *first_lower = c_it.data();
    }
    if (*first_upper == NULL && unicharset.get_isalpha(unichar_id) &&
        !unicharset.get_islower(unichar_id)) {
      *first_upper = c_it.data();
    }
    if (*first_digit == NULL && unicharset.get_isdigit(unichar_id)) {
      *first_digit = c_it.data();
    }
  }
  ASSERT_HOST(first_unichar != NULL);
  bool mixed = (*first_lower != NULL || *first_upper != NULL) &&
      *first_digit != NULL;
  if (*first_lower == NULL) *first_lower = first_unichar;
  if (*first_upper == NULL) *first_upper = first_unichar;
  if (*first_digit == NULL) *first_digit = first_unichar;
  return mixed;
}
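// Illustrative example: for a rating-sorted list {'5', 'S', 's'} the function
// sets *first_digit='5', *first_upper='S', *first_lower='s' and returns true,
// since the list mixes alpha and digit interpretations of the same blob.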

int LanguageModel::SetTopParentLowerUpperDigit(
    LanguageModelState *parent_node) const {
  if (parent_node == NULL) return -1;
  UNICHAR_ID top_id = INVALID_UNICHAR_ID;
  ViterbiStateEntry* top_lower = NULL;
  ViterbiStateEntry* top_upper = NULL;
  ViterbiStateEntry* top_digit = NULL;
  ViterbiStateEntry* top_choice = NULL;
  float lower_rating = 0.0f;
  float upper_rating = 0.0f;
  float digit_rating = 0.0f;
  float top_rating = 0.0f;
  const UNICHARSET &unicharset = dict_->getUnicharset();
  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
    ViterbiStateEntry* vse = vit.data();
    // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
    // back to the real character if needed.
    ViterbiStateEntry* unichar_vse = vse;
    UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
    float rating = unichar_vse->curr_b->rating();
    while (unichar_id == INVALID_UNICHAR_ID &&
           unichar_vse->parent_vse != NULL) {
      unichar_vse = unichar_vse->parent_vse;
      unichar_id = unichar_vse->curr_b->unichar_id();
      rating = unichar_vse->curr_b->rating();
    }
    if (unichar_id != INVALID_UNICHAR_ID) {
      if (unicharset.get_islower(unichar_id)) {
        if (top_lower == NULL || lower_rating > rating) {
          top_lower = vse;
          lower_rating = rating;
        }
      } else if (unicharset.get_isalpha(unichar_id)) {
        if (top_upper == NULL || upper_rating > rating) {
          top_upper = vse;
          upper_rating = rating;
        }
      } else if (unicharset.get_isdigit(unichar_id)) {
        if (top_digit == NULL || digit_rating > rating) {
          top_digit = vse;
          digit_rating = rating;
        }
      }
    }
    if (top_choice == NULL || top_rating > rating) {
      top_choice = vse;
      top_rating = rating;
      top_id = unichar_id;
    }
  }
  if (top_choice == NULL) return -1;
  bool mixed = (top_lower != NULL || top_upper != NULL) &&
      top_digit != NULL;
  if (top_lower == NULL) top_lower = top_choice;
  top_lower->top_choice_flags |= kLowerCaseFlag;
  if (top_upper == NULL) top_upper = top_choice;
  top_upper->top_choice_flags |= kUpperCaseFlag;
  if (top_digit == NULL) top_digit = top_choice;
  top_digit->top_choice_flags |= kDigitFlag;
  top_choice->top_choice_flags |= kSmallestRatingFlag;
  if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
      (top_choice->top_choice_flags &
       (kLowerCaseFlag | kUpperCaseFlag | kDigitFlag))) {
    // If the compound marker top choice carries any of the top alnum flags,
    // then give it all of them, allowing words like I-295 to be chosen.
    top_choice->top_choice_flags |=
        (kLowerCaseFlag | kUpperCaseFlag | kDigitFlag);
  }
  return mixed ? 1 : 0;
}

ViterbiStateEntry* LanguageModel::GetNextParentVSE(
    bool just_classified, bool mixed_alnum, const BLOB_CHOICE* bc,
    LanguageModelFlagsType blob_choice_flags, const UNICHARSET& unicharset,
    WERD_RES* word_res, ViterbiStateEntry_IT* vse_it,
    LanguageModelFlagsType* top_choice_flags) const {
  for (; !vse_it->cycled_list(); vse_it->forward()) {
    ViterbiStateEntry* parent_vse = vse_it->data();
    // Only consider the parent if it has been updated or
    // if the current ratings cell has just been classified.
    if (!just_classified && !parent_vse->updated) continue;
    if (language_model_debug_level > 2)
      parent_vse->Print("Considering");
    // If the parent is non-alnum, then upper counts as lower.
    *top_choice_flags = blob_choice_flags;
    if ((blob_choice_flags & kUpperCaseFlag) &&
        !parent_vse->HasAlnumChoice(unicharset)) {
      *top_choice_flags |= kLowerCaseFlag;
    }
    *top_choice_flags &= parent_vse->top_choice_flags;
    UNICHAR_ID unichar_id = bc->unichar_id();
    const BLOB_CHOICE* parent_b = parent_vse->curr_b;
    UNICHAR_ID parent_id = parent_b->unichar_id();
    // Digits do not bind to alphas if there is a mix in both parent and
    // current or if the alpha is not the top choice.
    if (unicharset.get_isdigit(unichar_id) &&
        unicharset.get_isalpha(parent_id) &&
        (mixed_alnum || *top_choice_flags == 0))
      continue;  // Digits don't bind to alphas.
    // Likewise alphas do not bind to digits if there is a mix in both or if
    // the digit is not the top choice.
    if (unicharset.get_isalpha(unichar_id) &&
        unicharset.get_isdigit(parent_id) &&
        (mixed_alnum || *top_choice_flags == 0))
      continue;  // Alphas don't bind to digits.
    // If there is a case mix of the same alpha in the parent list, then
    // competing_vse is non-null and will be used to determine whether
    // or not to bind the current blob choice.
    if (parent_vse->competing_vse != NULL) {
      const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
      UNICHAR_ID other_id = competing_b->unichar_id();
      if (language_model_debug_level >= 5) {
        tprintf("Parent %s has competition %s\n",
                unicharset.id_to_unichar(parent_id),
                unicharset.id_to_unichar(other_id));
      }
      if (unicharset.SizesDistinct(parent_id, other_id)) {
        // If other_id matches bc wrt position and size, and parent_id
        // doesn't, don't bind to the current parent.
        if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
                                language_model_debug_level >= 5) &&
            !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
                                 language_model_debug_level >= 5))
          continue;  // Competing blobchoice has a better vertical match.
      }
    }
    vse_it->forward();
    return parent_vse;  // This one is good!
  }
  return NULL;  // Ran out of possibilities.
}

bool LanguageModel::AddViterbiStateEntry(
    LanguageModelFlagsType top_choice_flags,
    float denom,
    bool word_end,
    int curr_col, int curr_row,
    BLOB_CHOICE *b,
    LanguageModelState *curr_state,
    ViterbiStateEntry *parent_vse,
    LMPainPoints *pain_points,
    WERD_RES *word_res,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  ViterbiStateEntry_IT vit;
  if (language_model_debug_level > 1) {
    tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
            " certainty=%.4f top_choice_flags=0x%x",
            dict_->getUnicharset().id_to_unichar(b->unichar_id()),
            b->rating(), b->certainty(), top_choice_flags);
    if (language_model_debug_level > 5)
      tprintf(" parent_vse=%p\n", parent_vse);
    else
      tprintf("\n");
  }
  // Check whether the list is full.
  if (curr_state != NULL &&
      curr_state->viterbi_state_entries_length >=
          language_model_viterbi_list_max_size) {
    if (language_model_debug_level > 1) {
      tprintf("AddViterbiStateEntry: viterbi list is full!\n");
    }
    return false;
  }

  // Invoke Dawg language model component.
  LanguageModelDawgInfo *dawg_info =
      GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);

  float outline_length =
      AssociateUtils::ComputeOutlineLength(rating_cert_scale_, *b);
  // Invoke Ngram language model component.
  LanguageModelNgramInfo *ngram_info = NULL;
  if (language_model_ngram_on) {
    ngram_info = GenerateNgramInfo(
        dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(),
        denom, curr_col, curr_row, outline_length, parent_vse);
    ASSERT_HOST(ngram_info != NULL);
  }
  bool liked_by_language_model = dawg_info != NULL ||
      (ngram_info != NULL && !ngram_info->pruned);
  // Quick escape if not liked by the language model, can't be consistent
  // xheight, and not top choice.
  if (!liked_by_language_model && top_choice_flags == 0) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components very early pruned this entry\n");
    }
    delete ngram_info;
    delete dawg_info;
    return false;
  }

  // Check consistency of the path and set the relevant consistency_info.
  LMConsistencyInfo consistency_info(
      parent_vse != NULL ? &parent_vse->consistency_info : NULL);
  // Start with just the x-height consistency, as it provides significant
  // pruning opportunity.
  consistency_info.ComputeXheightConsistency(
      b, dict_->getUnicharset().get_ispunctuation(b->unichar_id()));
  // Turn off xheight consistent flag if not consistent.
  if (consistency_info.InconsistentXHeight()) {
    top_choice_flags &= ~kXhtConsistentFlag;
  }

  // Quick escape if not liked by the language model, not consistent xheight,
  // and not top choice.
  if (!liked_by_language_model && top_choice_flags == 0) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components early pruned this entry\n");
    }
    delete ngram_info;
    delete dawg_info;
    return false;
  }

  // Compute the rest of the consistency info.
  FillConsistencyInfo(curr_col, word_end, b, parent_vse,
                      word_res, &consistency_info);
  if (dawg_info != NULL && consistency_info.invalid_punc) {
    consistency_info.invalid_punc = false;  // do not penalize dict words
  }

  // Compute cost of associating the blobs that represent the current unichar.
  AssociateStats associate_stats;
  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
                        parent_vse, word_res, &associate_stats);
  if (parent_vse != NULL) {
    associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
    associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
  }

  // Create the new ViterbiStateEntry and compute the adjusted cost of the path.
  ViterbiStateEntry *new_vse = new ViterbiStateEntry(
      parent_vse, b, 0.0, outline_length,
      consistency_info, associate_stats, top_choice_flags, dawg_info,
      ngram_info, (language_model_debug_level > 0) ?
          dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL);
  new_vse->cost = ComputeAdjustedPathCost(new_vse);
  if (language_model_debug_level >= 3)
    tprintf("Adjusted cost = %g\n", new_vse->cost);

  // Invoke Top Choice language model component to make the final adjustments
  // to new_vse->top_choice_flags.
  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
    GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
  }

  // If language model components did not like this unichar - return.
  bool keep = new_vse->top_choice_flags || liked_by_language_model;
  if (!(top_choice_flags & kSmallestRatingFlag) &&  // no non-top choice paths
      consistency_info.inconsistent_script) {       // with inconsistent script
    keep = false;
  }
  if (!keep) {
    if (language_model_debug_level > 1) {
      tprintf("Language model components did not like this entry\n");
    }
    delete new_vse;
    return false;
  }

  // Discard this entry if it represents a prunable path and
  // language_model_viterbi_list_max_num_prunable such entries with a lower
  // cost have already been recorded.
  if (PrunablePath(*new_vse) &&
      (curr_state->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) &&
      new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
    if (language_model_debug_level > 1) {
      tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
              new_vse->cost,
              curr_state->viterbi_state_entries_prunable_max_cost);
    }
    delete new_vse;
    return false;
  }

  // Update best choice if needed.
  if (word_end) {
    UpdateBestChoice(new_vse, pain_points, word_res,
                     best_choice_bundle, blamer_bundle);
    // Discard the entry if UpdateBestChoice() found flaws in it.
    if (new_vse->cost >= WERD_CHOICE::kBadRating &&
        new_vse != best_choice_bundle->best_vse) {
      if (language_model_debug_level > 1) {
        tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
      }
      delete new_vse;
      return false;
    }
  }

  // Add the new ViterbiStateEntry to curr_state->viterbi_state_entries.
  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
                                               false, new_vse);
  curr_state->viterbi_state_entries_length++;
  if (PrunablePath(*new_vse)) {
    curr_state->viterbi_state_entries_prunable_length++;
  }

  // Update lms->viterbi_state_entries_prunable_max_cost and clear
  // top_choice_flags of entries with cost higher than new_vse->cost.
  if ((curr_state->viterbi_state_entries_prunable_length >=
       language_model_viterbi_list_max_num_prunable) ||
      new_vse->top_choice_flags) {
    ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
    int prunable_counter = language_model_viterbi_list_max_num_prunable;
    vit.set_to_list(&(curr_state->viterbi_state_entries));
    for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
      ViterbiStateEntry *curr_vse = vit.data();
      // Clear the appropriate top choice flags of the entries in the
      // list that have cost higher than new_vse->cost
      // (since they will not be top choices any more).
      if (curr_vse->top_choice_flags && curr_vse != new_vse &&
          curr_vse->cost > new_vse->cost) {
        curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
      }
      if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
      // Update curr_state->viterbi_state_entries_prunable_max_cost.
      if (prunable_counter == 0) {
        curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
        if (language_model_debug_level > 1) {
          tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
                  curr_state->viterbi_state_entries_prunable_max_cost);
        }
        prunable_counter = -1;  // stop counting
      }
    }
  }

  // Print the newly created ViterbiStateEntry.
  if (language_model_debug_level > 2) {
    new_vse->Print("New");
    if (language_model_debug_level > 5)
      curr_state->Print("Updated viterbi list");
  }

  return true;
}
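// Flow summary (restating the code above): a candidate entry survives only if
// (a) the viterbi list is not full, (b) a dawg match or an unpruned ngram
// model liked it or it still carries top-choice flags, and (c) when the
// prunable quota is exhausted, its cost beats
// viterbi_state_entries_prunable_max_cost; word-ending entries must also
// survive UpdateBestChoice().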

void LanguageModel::GenerateTopChoiceInfo(ViterbiStateEntry *new_vse,
                                          const ViterbiStateEntry *parent_vse,
                                          LanguageModelState *lms) {
  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
       new_vse->cost >= vit.data()->cost; vit.forward()) {
    // Clear the appropriate flags if the list already contains
    // a top choice entry with a lower cost.
    new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
  }
  if (language_model_debug_level > 2) {
    tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
            new_vse->top_choice_flags);
  }
}

LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
    bool word_end,
    int curr_col, int curr_row,
    const BLOB_CHOICE &b,
    const ViterbiStateEntry *parent_vse) {
  // Initialize active_dawgs from parent_vse if it is not NULL.
  // Otherwise use very_beginning_active_dawgs_.
  if (parent_vse == NULL) {
    dawg_args_->active_dawgs = very_beginning_active_dawgs_;
    dawg_args_->permuter = NO_PERM;
  } else {
    if (parent_vse->dawg_info == NULL) return NULL;  // not a dict word path
    dawg_args_->active_dawgs = parent_vse->dawg_info->active_dawgs;
    dawg_args_->permuter = parent_vse->dawg_info->permuter;
  }

  // Deal with hyphenated words.
  if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
    if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
    return new LanguageModelDawgInfo(dawg_args_->active_dawgs,
                                     COMPOUND_PERM);
  }

  // Deal with compound words.
  if (dict_->compound_marker(b.unichar_id()) &&
      (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
    if (language_model_debug_level > 0) tprintf("Found compound marker\n");
    // Do not allow compound operators at the beginning and end of the word.
    // Do not allow more than one compound operator per word.
    // Do not allow compounding of words with lengths shorter than
    // language_model_min_compound_length.
    if (parent_vse == NULL || word_end ||
        dict_->compound_marker(parent_vse->curr_b->unichar_id()) ||
        parent_vse->length < language_model_min_compound_length) return NULL;

    int i;
    // Check that the path terminated before the current character is a word.
    bool has_word_ending = false;
    for (i = 0; i < parent_vse->dawg_info->active_dawgs->size(); ++i) {
      const DawgPosition &pos = (*parent_vse->dawg_info->active_dawgs)[i];
      const Dawg *pdawg = pos.dawg_index < 0
          ? NULL : dict_->GetDawg(pos.dawg_index);
      if (pdawg == NULL || pos.back_to_punc) continue;
      if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
          pdawg->end_of_word(pos.dawg_ref)) {
        has_word_ending = true;
        break;
      }
    }
    if (!has_word_ending) return NULL;

    if (language_model_debug_level > 0) tprintf("Compound word found\n");
    return new LanguageModelDawgInfo(beginning_active_dawgs_, COMPOUND_PERM);
  }  // done dealing with compound words

  LanguageModelDawgInfo *dawg_info = NULL;

  // Call LetterIsOkay().
  // Use the normalized IDs so that all shapes of ' can be allowed in words
  // like don't.
  const GenericVector<UNICHAR_ID>& normed_ids =
      dict_->getUnicharset().normed_ids(b.unichar_id());
  DawgPositionVector tmp_active_dawgs;
  for (int i = 0; i < normed_ids.size(); ++i) {
    if (language_model_debug_level > 2)
      tprintf("Test Letter OK for unichar %d, normed %d\n",
              b.unichar_id(), normed_ids[i]);
    dict_->LetterIsOkay(dawg_args_, normed_ids[i],
                        word_end && i == normed_ids.size() - 1);
    if (dawg_args_->permuter == NO_PERM) {
      break;
    } else if (i < normed_ids.size() - 1) {
      tmp_active_dawgs = *dawg_args_->updated_dawgs;
      dawg_args_->active_dawgs = &tmp_active_dawgs;
    }
    if (language_model_debug_level > 2)
      tprintf("Letter was OK for unichar %d, normed %d\n",
              b.unichar_id(), normed_ids[i]);
  }
  dawg_args_->active_dawgs = NULL;
  if (dawg_args_->permuter != NO_PERM) {
    dawg_info = new LanguageModelDawgInfo(dawg_args_->updated_dawgs,
                                          dawg_args_->permuter);
  } else if (language_model_debug_level > 3) {
    tprintf("Letter %s not OK!\n",
            dict_->getUnicharset().id_to_unichar(b.unichar_id()));
  }

  return dawg_info;
}

LanguageModelNgramInfo *LanguageModel::GenerateNgramInfo(
    const char *unichar, float certainty, float denom,
    int curr_col, int curr_row, float outline_length,
    const ViterbiStateEntry *parent_vse) {
  // Initialize parent context.
  const char *pcontext_ptr = "";
  int pcontext_unichar_step_len = 0;
  if (parent_vse == NULL) {
    pcontext_ptr = prev_word_str_.string();
    pcontext_unichar_step_len = prev_word_unichar_step_len_;
  } else {
    pcontext_ptr = parent_vse->ngram_info->context.string();
    pcontext_unichar_step_len =
        parent_vse->ngram_info->context_unichar_step_len;
  }
  // Compute p(unichar | parent context).
  int unichar_step_len = 0;
  bool pruned = false;
  float ngram_cost;
  float ngram_and_classifier_cost =
      ComputeNgramCost(unichar, certainty, denom,
                       pcontext_ptr, &unichar_step_len,
                       &pruned, &ngram_cost);
  // Normalize just the ngram_and_classifier_cost by outline_length.
  // The ngram_cost is used by the params_model, so it needs to be left as-is,
  // and the params model cost will be normalized by outline_length.
  ngram_and_classifier_cost *=
      outline_length / language_model_ngram_rating_factor;
  // Add the ngram_cost of the parent.
  if (parent_vse != NULL) {
    ngram_and_classifier_cost +=
        parent_vse->ngram_info->ngram_and_classifier_cost;
    ngram_cost += parent_vse->ngram_info->ngram_cost;
  }

  // Shorten parent context string by unichar_step_len unichars.
  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
                    language_model_ngram_order);
  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
  while (num_remove > 0 && *pcontext_ptr != '\0') {
    pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
    --num_remove;
  }

  // Decide whether to prune this ngram path and update changed accordingly.
  if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;

  // Construct and return the new LanguageModelNgramInfo.
  LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
      pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
      ngram_and_classifier_cost);
  ngram_info->context += unichar;
  ngram_info->context_unichar_step_len += unichar_step_len;
  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
  return ngram_info;
}

float LanguageModel::ComputeNgramCost(const char *unichar,
                                      float certainty,
                                      float denom,
                                      const char *context,
                                      int *unichar_step_len,
                                      bool *found_small_prob,
                                      float *ngram_cost) {
  const char *context_ptr = context;
  char *modified_context = NULL;
  char *modified_context_end = NULL;
  const char *unichar_ptr = unichar;
  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
  float prob = 0.0f;
  int step = 0;
  while (unichar_ptr < unichar_end &&
         (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
    if (language_model_debug_level > 1) {
      tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
              dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
    }
    prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
    ++(*unichar_step_len);
    if (language_model_ngram_use_only_first_uft8_step) break;
    unichar_ptr += step;
    // If there are multiple UTF8 characters present in unichar, context is
    // updated to include the previously examined characters from str,
    // unless use_only_first_uft8_step is true.
    if (unichar_ptr < unichar_end) {
      if (modified_context == NULL) {
        int context_len = strlen(context);
        modified_context =
            new char[context_len + strlen(unichar_ptr) + step + 1];
        strncpy(modified_context, context, context_len);
        modified_context_end = modified_context + context_len;
        context_ptr = modified_context;
      }
      strncpy(modified_context_end, unichar_ptr - step, step);
      modified_context_end += step;
      *modified_context_end = '\0';
    }
  }
  prob /= static_cast<float>(*unichar_step_len);  // normalize
  if (prob < language_model_ngram_small_prob) {
    if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
    *found_small_prob = true;
    prob = language_model_ngram_small_prob;
  }
  *ngram_cost = -1.0 * log2(prob);
  float ngram_and_classifier_cost =
      -1.0 * log2(CertaintyScore(certainty) / denom) +
      *ngram_cost * language_model_ngram_scale_factor;
  if (language_model_debug_level > 1) {
    tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
            unichar, context_ptr, CertaintyScore(certainty) / denom, prob,
            ngram_and_classifier_cost);
  }
  if (modified_context != NULL) delete[] modified_context;
  return ngram_and_classifier_cost;
}
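// Worked example (illustrative numbers, not from the original source): with
// CertaintyScore(certainty)/denom = 0.25 and a normalized prob of 0.01,
// *ngram_cost = -log2(0.01) ~= 6.64, and with the default
// language_model_ngram_scale_factor of 0.03 the returned value is
// -log2(0.25) + 6.64 * 0.03 = 2.0 + 0.2 ~= 2.2.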

float LanguageModel::ComputeDenom(BLOB_CHOICE_LIST *curr_list) {
  if (curr_list->empty()) return 1.0f;
  float denom = 0.0f;
  int len = 0;
  BLOB_CHOICE_IT c_it(curr_list);
  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
    ASSERT_HOST(c_it.data() != NULL);
    ++len;
    denom += CertaintyScore(c_it.data()->certainty());
  }
  assert(len != 0);
  // The ideal situation would be to have the classifier scores for
  // classifying each position as each of the characters in the unicharset.
  // Since we can not do this because of speed, we add a very crude estimate
  // of what these scores for the "missing" classifications would sum up to.
  denom += (dict_->getUnicharset().size() - len) *
      CertaintyScore(language_model_ngram_nonmatch_score);

  return denom;
}
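// Worked example (illustrative numbers): for a 2-choice list whose
// CertaintyScore values sum to 1.5 and a unicharset of 100 classes, the
// result is 1.5 + 98 * CertaintyScore(language_model_ngram_nonmatch_score),
// i.e. each of the 98 "missing" classifier scores is approximated by the
// non-match score.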

void LanguageModel::FillConsistencyInfo(
    int curr_col,
    bool word_end,
    BLOB_CHOICE *b,
    ViterbiStateEntry *parent_vse,
    WERD_RES *word_res,
    LMConsistencyInfo *consistency_info) {
  const UNICHARSET &unicharset = dict_->getUnicharset();
  UNICHAR_ID unichar_id = b->unichar_id();
  BLOB_CHOICE* parent_b = parent_vse != NULL ? parent_vse->curr_b : NULL;

  // Check punctuation validity.
  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
  if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
    if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
        (unicharset.get_isalpha(parent_b->unichar_id()) ||
         unicharset.get_isdigit(parent_b->unichar_id()))) {
      // reset punc_ref for compound words
      consistency_info->punc_ref = NO_EDGE;
    } else {
      bool is_apos = dict_->is_apostrophe(unichar_id);
      bool prev_is_numalpha = (parent_b != NULL &&
          (unicharset.get_isalpha(parent_b->unichar_id()) ||
           unicharset.get_isdigit(parent_b->unichar_id())));
      UNICHAR_ID pattern_unichar_id =
          (unicharset.get_isalpha(unichar_id) ||
           unicharset.get_isdigit(unichar_id) ||
           (is_apos && prev_is_numalpha)) ?
          Dawg::kPatternUnicharID : unichar_id;
      if (consistency_info->punc_ref == NO_EDGE ||
          pattern_unichar_id != Dawg::kPatternUnicharID ||
          dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
              Dawg::kPatternUnicharID) {
        NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(),
                                              consistency_info->punc_ref);
        consistency_info->punc_ref =
            (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
                node, pattern_unichar_id, word_end) : NO_EDGE;
        if (consistency_info->punc_ref == NO_EDGE) {
          consistency_info->invalid_punc = true;
        }
      }
    }
  }

  // Update case related counters.
  if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
    // Reset counters if we are dealing with a compound word.
    consistency_info->num_lower = 0;
    consistency_info->num_non_first_upper = 0;
  } else if (unicharset.get_islower(unichar_id)) {
    consistency_info->num_lower++;
  } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
    if (unicharset.get_isupper(parent_b->unichar_id()) ||
        consistency_info->num_lower > 0 ||
        consistency_info->num_non_first_upper > 0) {
      consistency_info->num_non_first_upper++;
    }
  }

  // Initialize consistency_info->script_id (use script of unichar_id
  // if it is not Common, use script id recorded by the parent otherwise).
  // Set inconsistent_script to true if the script of the current unichar
  // is not consistent with that of the parent.
  consistency_info->script_id = unicharset.get_script(unichar_id);
  // Hiragana and Katakana can mix with Han.
  if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {
    if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.hiragana_sid()) ||
        (unicharset.katakana_sid() != unicharset.null_sid() &&
         consistency_info->script_id == unicharset.katakana_sid())) {
      consistency_info->script_id = dict_->getUnicharset().han_sid();
    }
  }

  if (parent_vse != NULL &&
      (parent_vse->consistency_info.script_id !=
       dict_->getUnicharset().common_sid())) {
    int parent_script_id = parent_vse->consistency_info.script_id;
    // If script_id is Common, use script id of the parent instead.
    if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
      consistency_info->script_id = parent_script_id;
    }
    if (consistency_info->script_id != parent_script_id) {
      consistency_info->inconsistent_script = true;
    }
  }

  // Update chartype related counters.
  if (unicharset.get_isalpha(unichar_id)) {
    consistency_info->num_alphas++;
  } else if (unicharset.get_isdigit(unichar_id)) {
    consistency_info->num_digits++;
  } else if (!unicharset.get_ispunctuation(unichar_id)) {
    consistency_info->num_other++;
  }

  // Check font and spacing consistency.
  if (fontinfo_table_->size() > 0 && parent_b != NULL) {
    int fontinfo_id = -1;
    if (parent_b->fontinfo_id() == b->fontinfo_id() ||
        parent_b->fontinfo_id2() == b->fontinfo_id()) {
      fontinfo_id = b->fontinfo_id();
    } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
               parent_b->fontinfo_id2() == b->fontinfo_id2()) {
      fontinfo_id = b->fontinfo_id2();
    }
    if (language_model_debug_level > 1) {
      tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
              (parent_b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id()).name : "",
              (parent_b->fontinfo_id2() >= 0) ?
                  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
              (b->fontinfo_id() >= 0) ?
                  fontinfo_table_->get(b->fontinfo_id()).name : "",
              (b->fontinfo_id2() >= 0) ?
                  fontinfo_table_->get(b->fontinfo_id2()).name : "",
              (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
              fontinfo_id);
    }
    if (!word_res->blob_widths.empty()) {  // if we have widths/gaps info
      bool expected_gap_found = false;
      float expected_gap;
      int temp_gap;
      if (fontinfo_id >= 0) {  // found a common font
        ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
        if (fontinfo_table_->get(fontinfo_id).get_spacing(
            parent_b->unichar_id(), unichar_id, &temp_gap)) {
          expected_gap = temp_gap;
          expected_gap_found = true;
        }
      } else {
        consistency_info->inconsistent_font = true;
        // Get an average of the expected gaps in each font
        int num_addends = 0;
        expected_gap = 0;
        int temp_fid;
        for (int i = 0; i < 4; ++i) {
          if (i == 0) {
            temp_fid = parent_b->fontinfo_id();
          } else if (i == 1) {
            temp_fid = parent_b->fontinfo_id2();
          } else if (i == 2) {
            temp_fid = b->fontinfo_id();
          } else {
            temp_fid = b->fontinfo_id2();
          }
          ASSERT_HOST(temp_fid < 0 || temp_fid < fontinfo_table_->size());
          if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
              parent_b->unichar_id(), unichar_id, &temp_gap)) {
            expected_gap += temp_gap;
            num_addends++;
          }
        }
        expected_gap_found = (num_addends > 0);
        if (num_addends > 0) {
          expected_gap /= static_cast<float>(num_addends);
        }
      }
      if (expected_gap_found) {
        float actual_gap =
            static_cast<float>(word_res->GetBlobsGap(curr_col - 1));
        float gap_ratio = expected_gap / actual_gap;
        // TODO(rays) The gaps seem to be way off most of the time, saved by
        // the error here that the ratio was compared to 1/2, when it should
        // have been 0.5f. Find the source of the gaps discrepancy and put
        // the 0.5f here in place of 0.0f.
        // Test on 2476595.sj, pages 0 to 6. (In French.)
        if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
          consistency_info->num_inconsistent_spaces++;
        }
        if (language_model_debug_level > 1) {
          tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
                  unicharset.id_to_unichar(parent_b->unichar_id()),
                  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
                  unichar_id, curr_col, expected_gap, actual_gap);
        }
      }
    }
  }
}

float LanguageModel::ComputeAdjustedPathCost(ViterbiStateEntry *vse) {
  ASSERT_HOST(vse != NULL);
  if (params_model_.Initialized()) {
    float features[PTRAIN_NUM_FEATURE_TYPES];
    ExtractFeaturesFromPath(*vse, features);
    float cost = params_model_.ComputeCost(features);
    if (language_model_debug_level > 3) {
      tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
      if (language_model_debug_level >= 5) {
        for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
          tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
        }
      }
    }
    return cost * vse->outline_length;
  } else {
    float adjustment = 1.0f;
    if (vse->dawg_info == NULL || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
      adjustment += language_model_penalty_non_freq_dict_word;
    }
    if (vse->dawg_info == NULL) {
      adjustment += language_model_penalty_non_dict_word;
      if (vse->length > language_model_min_compound_length) {
        adjustment += ((vse->length - language_model_min_compound_length) *
                       language_model_penalty_increment);
      }
    }
    if (vse->associate_stats.shape_cost > 0) {
      adjustment += vse->associate_stats.shape_cost /
          static_cast<float>(vse->length);
    }
    if (language_model_ngram_on) {
      ASSERT_HOST(vse->ngram_info != NULL);
      return vse->ngram_info->ngram_and_classifier_cost * adjustment;
    } else {
      adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
                                                 vse->consistency_info);
      return vse->ratings_sum * adjustment;
    }
  }
}
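// Worked example for the heuristic branch (illustrative numbers, default
// parameter values, no shape cost, ngram model off): a 6-unichar
// non-dictionary path with ratings_sum = 10 gets
//   adjustment = 1 + 0.1 (non-freq) + 0.15 (non-dict) + (6 - 3) * 0.01 = 1.28
// before ComputeConsistencyAdjustment() is added, giving 10 * 1.28 = 12.8.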

void LanguageModel::UpdateBestChoice(
    ViterbiStateEntry *vse,
    LMPainPoints *pain_points,
    WERD_RES *word_res,
    BestChoiceBundle *best_choice_bundle,
    BlamerBundle *blamer_bundle) {
  bool truth_path;
  WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
                                    blamer_bundle, &truth_path);
  ASSERT_HOST(word != NULL);
  if (dict_->stopper_debug_level >= 1) {
    STRING word_str;
    word->string_and_lengths(&word_str, NULL);
    vse->Print(word_str.string());
  }
  if (language_model_debug_level > 0) {
    word->print("UpdateBestChoice() constructed word");
  }
  // Record features from the current path if necessary.
  ParamsTrainingHypothesis curr_hyp;
  if (blamer_bundle != NULL) {
    if (vse->dawg_info != NULL) vse->dawg_info->permuter =
        static_cast<PermuterType>(word->permuter());
    ExtractFeaturesFromPath(*vse, curr_hyp.features);
    word->string_and_lengths(&(curr_hyp.str), NULL);
    curr_hyp.cost = vse->cost;  // record cost for error rate computations
    if (language_model_debug_level > 0) {
      tprintf("Raw features extracted from %s (cost=%g) [ ",
              curr_hyp.str.string(), curr_hyp.cost);
      for (int deb_i = 0; deb_i < PTRAIN_NUM_FEATURE_TYPES; ++deb_i) {
        tprintf("%g ", curr_hyp.features[deb_i]);
      }
      tprintf("]\n");
    }
    // Record the current hypothesis in params_training_bundle.
    blamer_bundle->AddHypothesis(curr_hyp);
    if (truth_path)
      blamer_bundle->UpdateBestRating(word->rating());
  }
  if (blamer_bundle != NULL && blamer_bundle->GuidedSegsearchStillGoing()) {
    // The word was constructed solely for blamer_bundle->AddHypothesis, so
    // we no longer need it.
    delete word;
    return;
  }
  if (word_res->chopped_word != NULL && !word_res->chopped_word->blobs.empty())
    word->SetScriptPositions(false, word_res->chopped_word);
  // Update and log new raw_choice if needed.
  if (word_res->raw_choice == NULL ||
      word->rating() < word_res->raw_choice->rating()) {
    if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
      tprintf("Updated raw choice\n");
  }
  // Set the modified rating for best choice to vse->cost and log best choice.
  word->set_rating(vse->cost);
  // Call LogNewChoice() for best choice from Dict::adjust_word() since it
  // computes adjust_factor that is used by the adaption code (e.g. by
  // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
  // Note: the rating of the word is not adjusted.
  dict_->adjust_word(word, vse->dawg_info == NULL,
                     vse->consistency_info.xht_decision, 0.0,
                     false, language_model_debug_level > 0);
  // Hand ownership of the word over to the word_res.
  if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log,
                                    dict_->stopper_debug_level >= 1, word)) {
    // The word was so bad that it was deleted.
    return;
  }
  if (word_res->best_choice == word) {
    // Word was the new best.
    if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
        AcceptablePath(*vse)) {
      acceptable_choice_found_ = true;
    }
    // Update best_choice_bundle.
    best_choice_bundle->updated = true;
    best_choice_bundle->best_vse = vse;
    if (language_model_debug_level > 0) {
      tprintf("Updated best choice\n");
      word->print_state("New state ");
    }
    // Update hyphen state if we are dealing with a dictionary word.
    if (vse->dawg_info != NULL) {
      if (dict_->has_hyphen_end(*word)) {
        dict_->set_hyphen_word(*word, *(dawg_args_->active_dawgs));
      } else {
        dict_->reset_hyphen_vars(true);
      }
    }

    if (blamer_bundle != NULL) {
      blamer_bundle->set_best_choice_is_dict_and_top_choice(
          vse->dawg_info != NULL && vse->top_choice_flags);
    }
  }
  if (wordrec_display_segmentations && word_res->chopped_word != NULL) {
    word->DisplaySegmentation(word_res->chopped_word);
  }
}

void LanguageModel::ExtractFeaturesFromPath(
    const ViterbiStateEntry &vse, float features[]) {
  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
  // Record dictionary match info.
  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
      vse.length <= kMaxMediumWordUnichars ? 1 : 2;
  if (vse.dawg_info != NULL) {
    int permuter = vse.dawg_info->permuter;
    if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
      if (vse.consistency_info.num_digits == vse.length) {
        features[PTRAIN_DIGITS_SHORT+len] = 1.0;
      } else {
        features[PTRAIN_NUM_SHORT+len] = 1.0;
      }
    } else if (permuter == DOC_DAWG_PERM) {
      features[PTRAIN_DOC_SHORT+len] = 1.0;
    } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
               permuter == COMPOUND_PERM) {
      features[PTRAIN_DICT_SHORT+len] = 1.0;
    } else if (permuter == FREQ_DAWG_PERM) {
      features[PTRAIN_FREQ_SHORT+len] = 1.0;
    }
  }
  // Record shape cost feature (normalized by path length).
  features[PTRAIN_SHAPE_COST_PER_CHAR] =
      vse.associate_stats.shape_cost / static_cast<float>(vse.length);
  // Record ngram cost (normalized by the path length).
  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
  if (vse.ngram_info != NULL) {
    features[PTRAIN_NGRAM_COST_PER_CHAR] =
        vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
  }
  // Record consistency-related features.
  // Disabled this feature for now due to its poor performance.
  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
  features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
  features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
  features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == NULL ?
      vse.consistency_info.NumInconsistentChartype() : 0.0;
  features[PTRAIN_NUM_BAD_SPACING] =
      vse.consistency_info.NumInconsistentSpaces();
  // Disabled this feature for now due to its poor performance.
  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;

  // Classifier-related features.
  features[PTRAIN_RATING_PER_CHAR] =
      vse.ratings_sum / static_cast<float>(vse.outline_length);
}
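// Illustrative example: a path whose length falls into the middle bucket
// (len index 1) and that matched the system dawg sets
// features[PTRAIN_DICT_SHORT + 1] = 1.0 while the other dictionary-match
// features stay 0; the remaining features are per-character normalized costs
// and consistency counts.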

WERD_CHOICE *LanguageModel::ConstructWord(
    ViterbiStateEntry *vse,
    WERD_RES *word_res,
    DANGERR *fixpt,
    BlamerBundle *blamer_bundle,
    bool *truth_path) {
  if (truth_path != NULL) {
    *truth_path =
        (blamer_bundle != NULL &&
         vse->length == blamer_bundle->correct_segmentation_length());
  }
  BLOB_CHOICE *curr_b = vse->curr_b;
  ViterbiStateEntry *curr_vse = vse;

  int i;
  bool compound = dict_->hyphenated();  // treat hyphenated words as compound

  // Re-compute the variance of the width-to-height ratios (since we now
  // can compute the mean over the whole word).
  float full_wh_ratio_mean = 0.0f;
  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
    vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
    full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
                          static_cast<float>(vse->length));
    vse->associate_stats.full_wh_ratio_var = 0.0f;
  }

  // Construct a WERD_CHOICE by tracing parent pointers.
  WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length);
  word->set_length(vse->length);
  int total_blobs = 0;
  for (i = (vse->length - 1); i >= 0; --i) {
    if (blamer_bundle != NULL && truth_path != NULL && *truth_path &&
        !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
      *truth_path = false;
    }
    // The number of blobs used for this choice is row - col + 1.
    int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
    total_blobs += num_blobs;
    word->set_blob_choice(i, num_blobs, curr_b);
    // Update the width-to-height ratio variance. Useful for non-space
    // delimited languages to ensure that the blobs are of uniform width.
    // Skip leading and trailing punctuation when computing the variance.
    if ((full_wh_ratio_mean != 0.0f &&
         ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
          !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
      vse->associate_stats.full_wh_ratio_var +=
          pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
      if (language_model_debug_level > 2) {
        tprintf("full_wh_ratio_var += (%g-%g)^2\n",
                full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
      }
    }

    // Mark the word as compound if compound permuter was set for any of
    // the unichars on the path (usually this will happen for unichars
    // that are compounding operators, like "-" and "/").
    if (!compound && curr_vse->dawg_info &&
        curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;

    // Update curr_* pointers.
    curr_vse = curr_vse->parent_vse;
    if (curr_vse == NULL) break;
    curr_b = curr_vse->curr_b;
  }
  ASSERT_HOST(i == 0);  // check that we recorded all the unichar ids.
  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
  // Re-adjust shape cost to include the updated width-to-height variance.
  if (full_wh_ratio_mean != 0.0f) {
    vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
  }

  word->set_rating(vse->ratings_sum);
  word->set_certainty(vse->min_certainty);
  word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
                      vse->consistency_info.BodyMaxXHeight());
  if (vse->dawg_info != NULL) {
    word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
    word->set_permuter(NGRAM_PERM);
  } else if (vse->top_choice_flags) {
    word->set_permuter(TOP_CHOICE_PERM);
  } else {
    word->set_permuter(NO_PERM);
  }
  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
                                                            word_res->ratings));
  return word;
}

}  // namespace tesseract
Definition: unicharset.h:611
bool HasAlnumChoice(const UNICHARSET &unicharset)
Definition: lm_state.h:145
name_table name
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:107
AssociateStats associate_stats
Definition: lm_state.h:174
const UNICHARSET * uch_set
Definition: pageres.h:192
int NumInconsistentChartype() const
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:152
uinT8 permuter() const
Definition: ratngs.h:343
XHeightConsistencyEnum xht_decision
const Dawg * GetPuncDawg() const
Return the points to the punctuation dawg.
Definition: dict.h:408
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
WERD_CHOICE * raw_choice
Definition: pageres.h:224
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:301
int stopper_debug_level
Definition: dict.h:612
DawgPositionVector * very_beginning_active_dawgs_
DawgPositionVector * active_dawgs
Definition: lm_state.h:68
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
int UNICHAR_ID
Definition: unichar.h:33
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
static const LanguageModelFlagsType kUpperCaseFlag
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:130
int LetterIsOkay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:350
inT16 fontinfo_id2() const
Definition: ratngs.h:88
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc)
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:631
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:131
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:363
static const LanguageModelFlagsType kLowerCaseFlag
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
Definition: lm_state.h:231
int language_model_viterbi_list_max_num_prunable
float viterbi_state_entries_prunable_max_cost
Definition: lm_state.h:213
int viterbi_state_entries_length
Total number of entries in viterbi_state_entries.
Definition: lm_state.h:215
bool empty() const
Definition: genericvector.h:84
PointerVector< LanguageModelState > beam
Definition: lm_state.h:235
Definition: cluster.h:45
ViterbiStateEntry * competing_vse
Definition: lm_state.h:164
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
Definition: associate.h:82
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:290
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:154
int common_sid() const
Definition: unicharset.h:832
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
DawgType type() const
Definition: dawg.h:127
static const LanguageModelFlagsType kXhtConsistentFlag
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:132
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
bool AcceptablePath(const ViterbiStateEntry &vse)
bool updated
Flag to indicate whether anything was changed.
Definition: lm_state.h:229
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:406
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:310
DawgPositionVector * updated_dawgs
Definition: dict.h:82
void print() const
Definition: ratngs.h:563
Definition: strngs.h:44
void Print(const char *msg) const
Definition: lm_state.cpp:27
Struct to store information maintained by various language model components.
Definition: lm_state.h:197
#define NULL
Definition: host.h:144
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:49
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
float ngram_cost
-ln(P_ngram_model(path))
Definition: lm_state.h:88
int tessedit_truncate_wordchoice_log
Definition: dict.h:618
GenericVector< int > blob_widths
Definition: pageres.h:205
inT64 NODE_REF
Definition: dawg.h:55
ViterbiStateEntry_LIST viterbi_state_entries
Storage for the Viterbi state.
Definition: lm_state.h:210
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:142
int size() const
Definition: unicharset.h:297
const char * string() const
Definition: strngs.cpp:193
int correct_segmentation_length() const
Definition: blamer.h:126
float certainty() const
Definition: ratngs.h:82
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:116
bool language_model_ngram_use_only_first_uft8_step
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
const MATRIX_COORD & matrix_cell()
Definition: ratngs.h:114
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:412
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:625
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
float features[PTRAIN_NUM_FEATURE_TYPES]
void set_rating(float new_val)
Definition: ratngs.h:366
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:219
void set_dangerous_ambig_found_(bool value)
Definition: ratngs.h:363
PermuterType permuter
Definition: dict.h:83