All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract::LanguageModel Class Reference

#include <language_model.h>

Public Member Functions

 LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
 
 ~LanguageModel ()
 
void InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
 
bool UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
bool AcceptableChoiceFound ()
 
void SetAcceptableChoiceFound (bool val)
 
ParamsModel & getParamsModel ()
 

Static Public Member Functions

static void ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[])
 

Public Attributes

int language_model_debug_level = 0
 
bool language_model_ngram_on = false
 
int language_model_ngram_order = 8
 
int language_model_viterbi_list_max_num_prunable = 10
 
int language_model_viterbi_list_max_size = 500
 
double language_model_ngram_small_prob = 0.000001
 
double language_model_ngram_nonmatch_score = -40.0
 
bool language_model_ngram_use_only_first_uft8_step = false
 
double language_model_ngram_scale_factor = 0.03
 
double language_model_ngram_rating_factor = 16.0
 
bool language_model_ngram_space_delimited_language = true
 
int language_model_min_compound_length = 3
 
double language_model_penalty_non_freq_dict_word = 0.1
 
double language_model_penalty_non_dict_word = 0.15
 
double language_model_penalty_punc = 0.2
 
double language_model_penalty_case = 0.1
 
double language_model_penalty_script = 0.5
 
double language_model_penalty_chartype = 0.3
 
double language_model_penalty_font = 0.00
 
double language_model_penalty_spacing = 0.05
 
double language_model_penalty_increment = 0.01
 
int wordrec_display_segmentations = 0
 
bool language_model_use_sigmoidal_certainty = false
 

Static Public Attributes

static const LanguageModelFlagsType kSmallestRatingFlag = 0x1
 
static const LanguageModelFlagsType kLowerCaseFlag = 0x2
 
static const LanguageModelFlagsType kUpperCaseFlag = 0x4
 
static const LanguageModelFlagsType kDigitFlag = 0x8
 
static const LanguageModelFlagsType kXhtConsistentFlag = 0x10
 
static const float kMaxAvgNgramCost = 25.0f
 

Protected Member Functions

float CertaintyScore (float cert)
 
float ComputeAdjustment (int num_problems, float penalty)
 
float ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
 
float ComputeAdjustedPathCost (ViterbiStateEntry *vse)
 
bool GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
 
int SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const
 
ViterbiStateEntry * GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
 
bool AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
 
LanguageModelDawgInfo * GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
 
LanguageModelNgramInfo * GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
 
float ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
 
float ComputeDenom (BLOB_CHOICE_LIST *curr_list)
 
void FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
 
void UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
WERD_CHOICE * ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
 
void ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
 
bool PrunablePath (const ViterbiStateEntry &vse)
 
bool AcceptablePath (const ViterbiStateEntry &vse)
 

Protected Attributes

DawgArgs * dawg_args_
 
float rating_cert_scale_
 
const UnicityTable< FontInfo > * fontinfo_table_
 
Dict * dict_
 
bool fixed_pitch_
 
float max_char_wh_ratio_
 
STRING prev_word_str_
 
int prev_word_unichar_step_len_
 
DawgPositionVector * very_beginning_active_dawgs_
 
DawgPositionVector * beginning_active_dawgs_
 
bool acceptable_choice_found_
 
bool correct_segmentation_explored_
 
ParamsModel params_model_
 

Detailed Description

Definition at line 42 of file language_model.h.

Constructor & Destructor Documentation

tesseract::LanguageModel::LanguageModel ( const UnicityTable< FontInfo > *  fontinfo_table,
Dict *  dict 
)

Definition at line 45 of file language_model.cpp.

47  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
48  dict->getCCUtil()->params()),
50  "Turn on/off the use of character ngram model",
51  dict->getCCUtil()->params()),
53  "Maximum order of the character ngram model",
54  dict->getCCUtil()->params()),
56  "Maximum number of prunable (those for which"
57  " PrunablePath() is true) entries in each viterbi list"
58  " recorded in BLOB_CHOICEs",
59  dict->getCCUtil()->params()),
61  "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
62  dict->getCCUtil()->params()),
64  "To avoid overly small denominators use this as the "
65  "floor of the probability returned by the ngram model.",
66  dict->getCCUtil()->params()),
68  "Average classifier score of a non-matching unichar.",
69  dict->getCCUtil()->params()),
71  "Use only the first UTF8 step of the given string"
72  " when computing log probabilities.",
73  dict->getCCUtil()->params()),
75  "Strength of the character ngram model relative to the"
76  " character classifier ",
77  dict->getCCUtil()->params()),
79  "Factor to bring log-probs into the same range as ratings"
80  " when multiplied by outline length ",
81  dict->getCCUtil()->params()),
83  "Words are delimited by space",
84  dict->getCCUtil()->params()),
86  "Minimum length of compound words",
87  dict->getCCUtil()->params()),
89  "Penalty for words not in the frequent word dictionary",
90  dict->getCCUtil()->params()),
92  "Penalty for non-dictionary words",
93  dict->getCCUtil()->params()),
95  "Penalty for inconsistent punctuation",
96  dict->getCCUtil()->params()),
98  "Penalty for inconsistent case",
99  dict->getCCUtil()->params()),
101  "Penalty for inconsistent script",
102  dict->getCCUtil()->params()),
104  "Penalty for inconsistent character type",
105  dict->getCCUtil()->params()),
106  // TODO(daria, rays): enable font consistency checking
107  // after improving font analysis.
109  "Penalty for inconsistent font",
110  dict->getCCUtil()->params()),
112  "Penalty for inconsistent spacing",
113  dict->getCCUtil()->params()),
115  "Penalty increment",
116  dict->getCCUtil()->params()),
117  INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
118  dict->getCCUtil()->params()),
120  "Use sigmoidal score for certainty",
121  dict->getCCUtil()->params()),
122  fontinfo_table_(fontinfo_table), dict_(dict),
123  fixed_pitch_(false), max_char_wh_ratio_(0.0),
124  acceptable_choice_found_(false) {
125  ASSERT_HOST(dict_ != NULL);
126  dawg_args_ = new DawgArgs(NULL, new DawgPositionVector(), NO_PERM);
127  very_beginning_active_dawgs_ = new DawgPositionVector();
128  beginning_active_dawgs_ = new DawgPositionVector();
129 }
double language_model_ngram_nonmatch_score
const UnicityTable< FontInfo > * fontinfo_table_
bool language_model_ngram_space_delimited_language
double language_model_penalty_non_freq_dict_word
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:304
#define ASSERT_HOST(x)
Definition: errcode.h:84
DawgPositionVector * beginning_active_dawgs_
double language_model_penalty_non_dict_word
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:316
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:301
DawgPositionVector * very_beginning_active_dawgs_
int language_model_viterbi_list_max_num_prunable
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:310
#define NULL
Definition: host.h:144
bool language_model_ngram_use_only_first_uft8_step
tesseract::LanguageModel::~LanguageModel ( )

Definition at line 131 of file language_model.cpp.

131  {
134  delete dawg_args_->updated_dawgs;
135  delete dawg_args_;
136 }
DawgPositionVector * beginning_active_dawgs_
DawgPositionVector * very_beginning_active_dawgs_
DawgPositionVector * updated_dawgs
Definition: dict.h:82

Member Function Documentation

bool tesseract::LanguageModel::AcceptableChoiceFound ( )
inline

Definition at line 95 of file language_model.h.

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry &  vse)
inlineprotected

Definition at line 301 of file language_model.h.

301  {
302  return (vse.dawg_info != NULL || vse.Consistent() ||
303  (vse.ngram_info != NULL && !vse.ngram_info->pruned));
304  }
#define NULL
Definition: host.h:144
bool tesseract::LanguageModel::AddViterbiStateEntry ( LanguageModelFlagsType  top_choice_flags,
float  denom,
bool  word_end,
int  curr_col,
int  curr_row,
BLOB_CHOICE *  b,
LanguageModelState *  curr_state,
ViterbiStateEntry *  parent_vse,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 563 of file language_model.cpp.

574  {
575  ViterbiStateEntry_IT vit;
576  if (language_model_debug_level > 1) {
577  tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
578  " certainty=%.4f top_choice_flags=0x%x",
580  b->rating(), b->certainty(), top_choice_flags);
582  tprintf(" parent_vse=%p\n", parent_vse);
583  else
584  tprintf("\n");
585  }
586  // Check whether the list is full.
587  if (curr_state != NULL &&
588  curr_state->viterbi_state_entries_length >=
590  if (language_model_debug_level > 1) {
591  tprintf("AddViterbiStateEntry: viterbi list is full!\n");
592  }
593  return false;
594  }
595 
596  // Invoke Dawg language model component.
597  LanguageModelDawgInfo *dawg_info =
598  GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
599 
600  float outline_length =
602  // Invoke Ngram language model component.
603  LanguageModelNgramInfo *ngram_info = NULL;
605  ngram_info = GenerateNgramInfo(
607  denom, curr_col, curr_row, outline_length, parent_vse);
608  ASSERT_HOST(ngram_info != NULL);
609  }
610  bool liked_by_language_model = dawg_info != NULL ||
611  (ngram_info != NULL && !ngram_info->pruned);
612  // Quick escape if not liked by the language model, can't be consistent
613  // xheight, and not top choice.
614  if (!liked_by_language_model && top_choice_flags == 0) {
615  if (language_model_debug_level > 1) {
616  tprintf("Language model components very early pruned this entry\n");
617  }
618  delete ngram_info;
619  delete dawg_info;
620  return false;
621  }
622 
623  // Check consistency of the path and set the relevant consistency_info.
624  LMConsistencyInfo consistency_info(
625  parent_vse != NULL ? &parent_vse->consistency_info : NULL);
626  // Start with just the x-height consistency, as it provides significant
627  // pruning opportunity.
628  consistency_info.ComputeXheightConsistency(
630  // Turn off xheight consistent flag if not consistent.
631  if (consistency_info.InconsistentXHeight()) {
632  top_choice_flags &= ~kXhtConsistentFlag;
633  }
634 
635  // Quick escape if not liked by the language model, not consistent xheight,
636  // and not top choice.
637  if (!liked_by_language_model && top_choice_flags == 0) {
638  if (language_model_debug_level > 1) {
639  tprintf("Language model components early pruned this entry\n");
640  }
641  delete ngram_info;
642  delete dawg_info;
643  return false;
644  }
645 
646  // Compute the rest of the consistency info.
647  FillConsistencyInfo(curr_col, word_end, b, parent_vse,
648  word_res, &consistency_info);
649  if (dawg_info != NULL && consistency_info.invalid_punc) {
650  consistency_info.invalid_punc = false; // do not penalize dict words
651  }
652 
653  // Compute cost of associating the blobs that represent the current unichar.
654  AssociateStats associate_stats;
655  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
656  parent_vse, word_res, &associate_stats);
657  if (parent_vse != NULL) {
658  associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
659  associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
660  }
661 
662  // Create the new ViterbiStateEntry compute the adjusted cost of the path.
663  ViterbiStateEntry *new_vse = new ViterbiStateEntry(
664  parent_vse, b, 0.0, outline_length,
665  consistency_info, associate_stats, top_choice_flags, dawg_info,
666  ngram_info, (language_model_debug_level > 0) ?
668  new_vse->cost = ComputeAdjustedPathCost(new_vse);
670  tprintf("Adjusted cost = %g\n", new_vse->cost);
671 
672  // Invoke Top Choice language model component to make the final adjustments
673  // to new_vse->top_choice_flags.
674  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
675  GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
676  }
677 
678  // If language model components did not like this unichar - return.
679  bool keep = new_vse->top_choice_flags || liked_by_language_model;
680  if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths
681  consistency_info.inconsistent_script) { // with inconsistent script
682  keep = false;
683  }
684  if (!keep) {
685  if (language_model_debug_level > 1) {
686  tprintf("Language model components did not like this entry\n");
687  }
688  delete new_vse;
689  return false;
690  }
691 
692  // Discard this entry if it represents a prunable path and
693  // language_model_viterbi_list_max_num_prunable such entries with a lower
694  // cost have already been recorded.
695  if (PrunablePath(*new_vse) &&
696  (curr_state->viterbi_state_entries_prunable_length >=
698  new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
699  if (language_model_debug_level > 1) {
700  tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
701  new_vse->cost,
702  curr_state->viterbi_state_entries_prunable_max_cost);
703  }
704  delete new_vse;
705  return false;
706  }
707 
708  // Update best choice if needed.
709  if (word_end) {
710  UpdateBestChoice(new_vse, pain_points, word_res,
711  best_choice_bundle, blamer_bundle);
712  // Discard the entry if UpdateBestChoice() found flaws in it.
713  if (new_vse->cost >= WERD_CHOICE::kBadRating &&
714  new_vse != best_choice_bundle->best_vse) {
715  if (language_model_debug_level > 1) {
716  tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
717  }
718  delete new_vse;
719  return false;
720  }
721  }
722 
723  // Add the new ViterbiStateEntry and to curr_state->viterbi_state_entries.
724  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
725  false, new_vse);
726  curr_state->viterbi_state_entries_length++;
727  if (PrunablePath(*new_vse)) {
728  curr_state->viterbi_state_entries_prunable_length++;
729  }
730 
731  // Update lms->viterbi_state_entries_prunable_max_cost and clear
732  // top_choice_flags of entries with ratings_sum than new_vse->ratings_sum.
733  if ((curr_state->viterbi_state_entries_prunable_length >=
735  new_vse->top_choice_flags) {
736  ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
737  int prunable_counter = language_model_viterbi_list_max_num_prunable;
738  vit.set_to_list(&(curr_state->viterbi_state_entries));
739  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
740  ViterbiStateEntry *curr_vse = vit.data();
741  // Clear the appropriate top choice flags of the entries in the
743  // list that have cost higher than new_entry->cost
743  // (since they will not be top choices any more).
744  if (curr_vse->top_choice_flags && curr_vse != new_vse &&
745  curr_vse->cost > new_vse->cost) {
746  curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
747  }
748  if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
749  // Update curr_state->viterbi_state_entries_prunable_max_cost.
750  if (prunable_counter == 0) {
751  curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
752  if (language_model_debug_level > 1) {
753  tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
754  curr_state->viterbi_state_entries_prunable_max_cost);
755  }
756  prunable_counter = -1; // stop counting
757  }
758  }
759  }
760 
761  // Print the newly created ViterbiStateEntry.
762  if (language_model_debug_level > 2) {
763  new_vse->Print("New");
765  curr_state->Print("Updated viterbi list");
766  }
767 
768  return true;
769 }
static const float kBadRating
Definition: ratngs.h:273
#define tprintf(...)
Definition: tprintf.h:31
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool PrunablePath(const ViterbiStateEntry &vse)
static const LanguageModelFlagsType kSmallestRatingFlag
float rating() const
Definition: ratngs.h:79
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:130
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
int language_model_viterbi_list_max_num_prunable
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
Definition: associate.h:82
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
static const LanguageModelFlagsType kXhtConsistentFlag
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
#define NULL
Definition: host.h:144
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
float certainty() const
Definition: ratngs.h:82
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
float tesseract::LanguageModel::CertaintyScore ( float  cert)
inlineprotected

Definition at line 104 of file language_model.h.

104  {
106  // cert is assumed to be between 0 and -dict_->certainty_scale.
107  // If you enable language_model_use_sigmoidal_certainty, you
108  // need to adjust language_model_ngram_nonmatch_score as well.
109  cert = -cert / dict_->certainty_scale;
110  return 1.0f / (1.0f + exp(10.0f * cert));
111  } else {
112  return (-1.0f / cert);
113  }
114  }
double certainty_scale
Definition: dict.h:601
float tesseract::LanguageModel::ComputeAdjustedPathCost ( ViterbiStateEntry *  vse)
protected

Definition at line 1198 of file language_model.cpp.

1198  {
1199  ASSERT_HOST(vse != NULL);
1200  if (params_model_.Initialized()) {
1201  float features[PTRAIN_NUM_FEATURE_TYPES];
1202  ExtractFeaturesFromPath(*vse, features);
1203  float cost = params_model_.ComputeCost(features);
1204  if (language_model_debug_level > 3) {
1205  tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
1206  if (language_model_debug_level >= 5) {
1207  for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
1208  tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
1209  }
1210  }
1211  }
1212  return cost * vse->outline_length;
1213  } else {
1214  float adjustment = 1.0f;
1215  if (vse->dawg_info == NULL || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
1217  }
1218  if (vse->dawg_info == NULL) {
1220  if (vse->length > language_model_min_compound_length) {
1221  adjustment += ((vse->length - language_model_min_compound_length) *
1223  }
1224  }
1225  if (vse->associate_stats.shape_cost > 0) {
1226  adjustment += vse->associate_stats.shape_cost /
1227  static_cast<float>(vse->length);
1228  }
1230  ASSERT_HOST(vse->ngram_info != NULL);
1231  return vse->ngram_info->ngram_and_classifier_cost * adjustment;
1232  } else {
1233  adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
1234  vse->consistency_info);
1235  return vse->ratings_sum * adjustment;
1236  }
1237  }
1238 }
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
#define tprintf(...)
Definition: tprintf.h:31
float ComputeCost(const float features[]) const
double language_model_penalty_non_freq_dict_word
#define ASSERT_HOST(x)
Definition: errcode.h:84
double language_model_penalty_non_dict_word
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
#define NULL
Definition: host.h:144
float tesseract::LanguageModel::ComputeAdjustment ( int  num_problems,
float  penalty 
)
inlineprotected

Definition at line 116 of file language_model.h.

116  {
117  if (num_problems == 0) return 0.0f;
118  if (num_problems == 1) return penalty;
119  return (penalty + (language_model_penalty_increment *
120  static_cast<float>(num_problems-1)));
121  }
void tesseract::LanguageModel::ComputeAssociateStats ( int  col,
int  row,
float  max_char_wh_ratio,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
AssociateStats *  associate_stats 
)
inlineprotected

Definition at line 272 of file language_model.h.

276  {
278  col, row,
279  (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
280  (parent_vse != NULL) ? parent_vse->length : 0,
281  fixed_pitch_, max_char_wh_ratio,
282  word_res, language_model_debug_level > 2, associate_stats);
283  }
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
Definition: associate.cpp:37
#define NULL
Definition: host.h:144
float tesseract::LanguageModel::ComputeConsistencyAdjustment ( const LanguageModelDawgInfo *  dawg_info,
const LMConsistencyInfo &  consistency_info 
)
inlineprotected

Definition at line 127 of file language_model.h.

129  {
130  if (dawg_info != NULL) {
131  return ComputeAdjustment(consistency_info.NumInconsistentCase(),
133  (consistency_info.inconsistent_script ?
135  }
136  return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
138  ComputeAdjustment(consistency_info.NumInconsistentCase(),
140  ComputeAdjustment(consistency_info.NumInconsistentChartype(),
142  ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
144  (consistency_info.inconsistent_script ?
146  (consistency_info.inconsistent_font ?
148  }
float ComputeAdjustment(int num_problems, float penalty)
#define NULL
Definition: host.h:144
float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST *  curr_list)
protected

Definition at line 995 of file language_model.cpp.

995  {
996  if (curr_list->empty()) return 1.0f;
997  float denom = 0.0f;
998  int len = 0;
999  BLOB_CHOICE_IT c_it(curr_list);
1000  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
1001  ASSERT_HOST(c_it.data() != NULL);
1002  ++len;
1003  denom += CertaintyScore(c_it.data()->certainty());
1004  }
1005  assert(len != 0);
1006  // The ideal situation would be to have the classifier scores for
1007  // classifying each position as each of the characters in the unicharset.
1008  // Since we can not do this because of speed, we add a very crude estimate
1009  // of what these scores for the "missing" classifications would sum up to.
1010  denom += (dict_->getUnicharset().size() - len) *
1012 
1013  return denom;
1014 }
double language_model_ngram_nonmatch_score
float CertaintyScore(float cert)
#define ASSERT_HOST(x)
Definition: errcode.h:84
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
#define NULL
Definition: host.h:144
int size() const
Definition: unicharset.h:297
float tesseract::LanguageModel::ComputeNgramCost ( const char *  unichar,
float  certainty,
float  denom,
const char *  context,
int *  unichar_step_len,
bool *  found_small_prob,
float *  ngram_prob 
)
protected

Definition at line 935 of file language_model.cpp.

941  {
942  const char *context_ptr = context;
943  char *modified_context = NULL;
944  char *modified_context_end = NULL;
945  const char *unichar_ptr = unichar;
946  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
947  float prob = 0.0f;
948  int step = 0;
949  while (unichar_ptr < unichar_end &&
950  (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
951  if (language_model_debug_level > 1) {
952  tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
953  dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
954  }
955  prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
956  ++(*unichar_step_len);
958  unichar_ptr += step;
959  // If there are multiple UTF8 characters present in unichar, context is
960  // updated to include the previously examined characters from str,
961  // unless use_only_first_uft8_step is true.
962  if (unichar_ptr < unichar_end) {
963  if (modified_context == NULL) {
964  int context_len = strlen(context);
965  modified_context =
966  new char[context_len + strlen(unichar_ptr) + step + 1];
967  strncpy(modified_context, context, context_len);
968  modified_context_end = modified_context + context_len;
969  context_ptr = modified_context;
970  }
971  strncpy(modified_context_end, unichar_ptr - step, step);
972  modified_context_end += step;
973  *modified_context_end = '\0';
974  }
975  }
976  prob /= static_cast<float>(*unichar_step_len); // normalize
977  if (prob < language_model_ngram_small_prob) {
978  if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
979  *found_small_prob = true;
981  }
982  *ngram_cost = -1.0*log2(prob);
983  float ngram_and_classifier_cost =
984  -1.0*log2(CertaintyScore(certainty)/denom) +
985  *ngram_cost * language_model_ngram_scale_factor;
986  if (language_model_debug_level > 1) {
987  tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
988  unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
989  ngram_and_classifier_cost);
990  }
991  if (modified_context != NULL) delete[] modified_context;
992  return ngram_and_classifier_cost;
993 }
float CertaintyScore(float cert)
#define tprintf(...)
Definition: tprintf.h:31
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:363
#define NULL
Definition: host.h:144
bool language_model_ngram_use_only_first_uft8_step
WERD_CHOICE * tesseract::LanguageModel::ConstructWord ( ViterbiStateEntry *  vse,
WERD_RES *  word_res,
DANGERR *  fixpt,
BlamerBundle *  blamer_bundle,
bool *  truth_path 
)
protected

Definition at line 1389 of file language_model.cpp.

1394  {
1395  if (truth_path != NULL) {
1396  *truth_path =
1397  (blamer_bundle != NULL &&
1398  vse->length == blamer_bundle->correct_segmentation_length());
1399  }
1400  BLOB_CHOICE *curr_b = vse->curr_b;
1401  ViterbiStateEntry *curr_vse = vse;
1402 
1403  int i;
1404  bool compound = dict_->hyphenated(); // treat hyphenated words as compound
1405 
1406  // Re-compute the variance of the width-to-height ratios (since we now
1407  // can compute the mean over the whole word).
1408  float full_wh_ratio_mean = 0.0f;
1409  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
1410  vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
1411  full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
1412  static_cast<float>(vse->length));
1413  vse->associate_stats.full_wh_ratio_var = 0.0f;
1414  }
1415 
1416  // Construct a WERD_CHOICE by tracing parent pointers.
1417  WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length);
1418  word->set_length(vse->length);
1419  int total_blobs = 0;
1420  for (i = (vse->length-1); i >= 0; --i) {
1421  if (blamer_bundle != NULL && truth_path != NULL && *truth_path &&
1422  !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
1423  *truth_path = false;
1424  }
1425  // The number of blobs used for this choice is row - col + 1.
1426  int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
1427  total_blobs += num_blobs;
1428  word->set_blob_choice(i, num_blobs, curr_b);
1429  // Update the width-to-height ratio variance. Useful non-space delimited
1430  // languages to ensure that the blobs are of uniform width.
1431  // Skip leading and trailing punctuation when computing the variance.
1432  if ((full_wh_ratio_mean != 0.0f &&
1433  ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
1434  !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
1435  vse->associate_stats.full_wh_ratio_var +=
1436  pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
1437  if (language_model_debug_level > 2) {
1438  tprintf("full_wh_ratio_var += (%g-%g)^2\n",
1439  full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
1440  }
1441  }
1442 
1443  // Mark the word as compound if compound permuter was set for any of
1444  // the unichars on the path (usually this will happen for unichars
1445  // that are compounding operators, like "-" and "/").
1446  if (!compound && curr_vse->dawg_info &&
1447  curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
1448 
1449  // Update curr_* pointers.
1450  curr_vse = curr_vse->parent_vse;
1451  if (curr_vse == NULL) break;
1452  curr_b = curr_vse->curr_b;
1453  }
1454  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids.
1455  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
1456  // Re-adjust shape cost to include the updated width-to-height variance.
1457  if (full_wh_ratio_mean != 0.0f) {
1458  vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
1459  }
1460 
1461  word->set_rating(vse->ratings_sum);
1462  word->set_certainty(vse->min_certainty);
1463  word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
1464  vse->consistency_info.BodyMaxXHeight());
1465  if (vse->dawg_info != NULL) {
1466  word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
1467  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
1468  word->set_permuter(NGRAM_PERM);
1469  } else if (vse->top_choice_flags) {
1471  } else {
1472  word->set_permuter(NO_PERM);
1473  }
1474  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
1475  word_res->ratings));
1476  return word;
1477 }
void set_certainty(float new_val)
Definition: ratngs.h:369
MATRIX * ratings
Definition: pageres.h:215
#define tprintf(...)
Definition: tprintf.h:31
void set_x_heights(float min_height, float max_height)
Definition: ratngs.h:339
void set_permuter(uinT8 perm)
Definition: ratngs.h:372
void set_length(int len)
Definition: ratngs.h:378
int dimension() const
Definition: matrix.h:247
#define ASSERT_HOST(x)
Definition: errcode.h:84
const UNICHARSET * uch_set
Definition: pageres.h:192
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:152
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:131
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:290
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
#define NULL
Definition: host.h:144
int correct_segmentation_length() const
Definition: blamer.h:126
const MATRIX_COORD & matrix_cell()
Definition: ratngs.h:114
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
void set_rating(float new_val)
Definition: ratngs.h:366
void set_dangerous_ambig_found_(bool value)
Definition: ratngs.h:363
void tesseract::LanguageModel::ExtractFeaturesFromPath ( const ViterbiStateEntry vse,
float  features[] 
)
static

Definition at line 1340 of file language_model.cpp.

1341  {
1342  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
1343  // Record dictionary match info.
1344  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
1345  vse.length <= kMaxMediumWordUnichars ? 1 : 2;
1346  if (vse.dawg_info != NULL) {
1347  int permuter = vse.dawg_info->permuter;
1348  if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
1349  if (vse.consistency_info.num_digits == vse.length) {
1350  features[PTRAIN_DIGITS_SHORT+len] = 1.0;
1351  } else {
1352  features[PTRAIN_NUM_SHORT+len] = 1.0;
1353  }
1354  } else if (permuter == DOC_DAWG_PERM) {
1355  features[PTRAIN_DOC_SHORT+len] = 1.0;
1356  } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
1357  permuter == COMPOUND_PERM) {
1358  features[PTRAIN_DICT_SHORT+len] = 1.0;
1359  } else if (permuter == FREQ_DAWG_PERM) {
1360  features[PTRAIN_FREQ_SHORT+len] = 1.0;
1361  }
1362  }
1363  // Record shape cost feature (normalized by path length).
1364  features[PTRAIN_SHAPE_COST_PER_CHAR] =
1365  vse.associate_stats.shape_cost / static_cast<float>(vse.length);
1366  // Record ngram cost. (normalized by the path length).
1367  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
1368  if (vse.ngram_info != NULL) {
1369  features[PTRAIN_NGRAM_COST_PER_CHAR] =
1370  vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
1371  }
1372  // Record consistency-related features.
1373  // Disabled this feature for now due to its poor performance.
1374  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
1375  features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
1376  features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
1377  features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == NULL ?
1378  vse.consistency_info.NumInconsistentChartype() : 0.0;
1379  features[PTRAIN_NUM_BAD_SPACING] =
1380  vse.consistency_info.NumInconsistentSpaces();
1381  // Disabled this feature for now due to its poor performance.
1382  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
1383 
1384  // Classifier-related features.
1385  features[PTRAIN_RATING_PER_CHAR] =
1386  vse.ratings_sum / static_cast<float>(vse.outline_length);
1387 }
#define NULL
Definition: host.h:144
void tesseract::LanguageModel::FillConsistencyInfo ( int  curr_col,
bool  word_end,
BLOB_CHOICE b,
ViterbiStateEntry parent_vse,
WERD_RES word_res,
LMConsistencyInfo consistency_info 
)
protected

Definition at line 1016 of file language_model.cpp.

1022  {
1023  const UNICHARSET &unicharset = dict_->getUnicharset();
1024  UNICHAR_ID unichar_id = b->unichar_id();
1025  BLOB_CHOICE* parent_b = parent_vse != NULL ? parent_vse->curr_b : NULL;
1026 
1027  // Check punctuation validity.
1028  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
1029  if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
1030  if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
1031  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1032  unicharset.get_isdigit(parent_b->unichar_id()))) {
1033  // reset punc_ref for compound words
1034  consistency_info->punc_ref = NO_EDGE;
1035  } else {
1036  bool is_apos = dict_->is_apostrophe(unichar_id);
1037  bool prev_is_numalpha = (parent_b != NULL &&
1038  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1039  unicharset.get_isdigit(parent_b->unichar_id())));
1040  UNICHAR_ID pattern_unichar_id =
1041  (unicharset.get_isalpha(unichar_id) ||
1042  unicharset.get_isdigit(unichar_id) ||
1043  (is_apos && prev_is_numalpha)) ?
1044  Dawg::kPatternUnicharID : unichar_id;
1045  if (consistency_info->punc_ref == NO_EDGE ||
1046  pattern_unichar_id != Dawg::kPatternUnicharID ||
1047  dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
1050  consistency_info->punc_ref);
1051  consistency_info->punc_ref =
1052  (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
1053  node, pattern_unichar_id, word_end) : NO_EDGE;
1054  if (consistency_info->punc_ref == NO_EDGE) {
1055  consistency_info->invalid_punc = true;
1056  }
1057  }
1058  }
1059  }
1060 
1061  // Update case related counters.
1062  if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
1063  // Reset counters if we are dealing with a compound word.
1064  consistency_info->num_lower = 0;
1065  consistency_info->num_non_first_upper = 0;
1066  }
1067  else if (unicharset.get_islower(unichar_id)) {
1068  consistency_info->num_lower++;
1069  } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
1070  if (unicharset.get_isupper(parent_b->unichar_id()) ||
1071  consistency_info->num_lower > 0 ||
1072  consistency_info->num_non_first_upper > 0) {
1073  consistency_info->num_non_first_upper++;
1074  }
1075  }
1076 
1077  // Initialize consistency_info->script_id (use script of unichar_id
1078  // if it is not Common, use script id recorded by the parent otherwise).
1079  // Set inconsistent_script to true if the script of the current unichar
1080  // is not consistent with that of the parent.
1081  consistency_info->script_id = unicharset.get_script(unichar_id);
1082  // Hiragana and Katakana can mix with Han.
1084  if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
1085  consistency_info->script_id == unicharset.hiragana_sid()) ||
1086  (unicharset.katakana_sid() != unicharset.null_sid() &&
1087  consistency_info->script_id == unicharset.katakana_sid())) {
1088  consistency_info->script_id = dict_->getUnicharset().han_sid();
1089  }
1090  }
1091 
1092  if (parent_vse != NULL &&
1093  (parent_vse->consistency_info.script_id !=
1094  dict_->getUnicharset().common_sid())) {
1095  int parent_script_id = parent_vse->consistency_info.script_id;
1096  // If script_id is Common, use script id of the parent instead.
1097  if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
1098  consistency_info->script_id = parent_script_id;
1099  }
1100  if (consistency_info->script_id != parent_script_id) {
1101  consistency_info->inconsistent_script = true;
1102  }
1103  }
1104 
1105  // Update chartype related counters.
1106  if (unicharset.get_isalpha(unichar_id)) {
1107  consistency_info->num_alphas++;
1108  } else if (unicharset.get_isdigit(unichar_id)) {
1109  consistency_info->num_digits++;
1110  } else if (!unicharset.get_ispunctuation(unichar_id)) {
1111  consistency_info->num_other++;
1112  }
1113 
1114  // Check font and spacing consistency.
1115  if (fontinfo_table_->size() > 0 && parent_b != NULL) {
1116  int fontinfo_id = -1;
1117  if (parent_b->fontinfo_id() == b->fontinfo_id() ||
1118  parent_b->fontinfo_id2() == b->fontinfo_id()) {
1119  fontinfo_id = b->fontinfo_id();
1120  } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
1121  parent_b->fontinfo_id2() == b->fontinfo_id2()) {
1122  fontinfo_id = b->fontinfo_id2();
1123  }
1124  if(language_model_debug_level > 1) {
1125  tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1126  (parent_b->fontinfo_id() >= 0) ?
1127  fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
1128  (parent_b->fontinfo_id2() >= 0) ?
1129  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
1130  (b->fontinfo_id() >= 0) ?
1131  fontinfo_table_->get(b->fontinfo_id()).name : "",
1132  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1133  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1134  fontinfo_id);
1135  }
1136  if (!word_res->blob_widths.empty()) { // if we have widths/gaps info
1137  bool expected_gap_found = false;
1138  float expected_gap;
1139  int temp_gap;
1140  if (fontinfo_id >= 0) { // found a common font
1141  ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
1142  if (fontinfo_table_->get(fontinfo_id).get_spacing(
1143  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1144  expected_gap = temp_gap;
1145  expected_gap_found = true;
1146  }
1147  } else {
1148  consistency_info->inconsistent_font = true;
1149  // Get an average of the expected gaps in each font
1150  int num_addends = 0;
1151  expected_gap = 0;
1152  int temp_fid;
1153  for (int i = 0; i < 4; ++i) {
1154  if (i == 0) {
1155  temp_fid = parent_b->fontinfo_id();
1156  } else if (i == 1) {
1157  temp_fid = parent_b->fontinfo_id2();
1158  } else if (i == 2) {
1159  temp_fid = b->fontinfo_id();
1160  } else {
1161  temp_fid = b->fontinfo_id2();
1162  }
1163  ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
1164  if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
1165  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1166  expected_gap += temp_gap;
1167  num_addends++;
1168  }
1169  }
1170  expected_gap_found = (num_addends > 0);
1171  if (num_addends > 0) {
1172  expected_gap /= static_cast<float>(num_addends);
1173  }
1174  }
1175  if (expected_gap_found) {
1176  float actual_gap =
1177  static_cast<float>(word_res->GetBlobsGap(curr_col-1));
1178  float gap_ratio = expected_gap / actual_gap;
1179  // TODO(rays) The gaps seem to be way off most of the time, saved by
1180  // the error here that the ratio was compared to 1/2, when it should
1181  // have been 0.5f. Find the source of the gaps discrepancy and put
1182  // the 0.5f here in place of 0.0f.
1183  // Test on 2476595.sj, pages 0 to 6. (In French.)
1184  if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
1185  consistency_info->num_inconsistent_spaces++;
1186  }
1187  if (language_model_debug_level > 1) {
1188  tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
1189  unicharset.id_to_unichar(parent_b->unichar_id()),
1190  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
1191  unichar_id, curr_col, expected_gap, actual_gap);
1192  }
1193  }
1194  }
1195  }
1196 }
int katakana_sid() const
Definition: unicharset.h:838
int hiragana_sid() const
Definition: unicharset.h:837
const UnicityTable< FontInfo > * fontinfo_table_
int null_sid() const
Definition: unicharset.h:831
#define tprintf(...)
Definition: tprintf.h:31
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
int han_sid() const
Definition: unicharset.h:836
#define ASSERT_HOST(x)
Definition: errcode.h:84
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:125
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:732
inT16 fontinfo_id() const
Definition: ratngs.h:85
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:611
name_table name
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:107
const Dawg * GetPuncDawg() const
Returns a pointer to the punctuation dawg.
Definition: dict.h:408
int UNICHAR_ID
Definition: unichar.h:33
inT16 fontinfo_id2() const
Definition: ratngs.h:88
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
bool empty() const
Definition: genericvector.h:84
int common_sid() const
Definition: unicharset.h:832
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
#define NULL
Definition: host.h:144
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
GenericVector< int > blob_widths
Definition: pageres.h:205
inT64 NODE_REF
Definition: dawg.h:55
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:116
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:412
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo ( bool  word_end,
int  curr_col,
int  curr_row,
const BLOB_CHOICE b,
const ViterbiStateEntry parent_vse 
)
protected

Definition at line 787 of file language_model.cpp.

791  {
792  // Initialize active_dawgs from parent_vse if it is not NULL.
793  // Otherwise use very_beginning_active_dawgs_.
794  if (parent_vse == NULL) {
797  } else {
798  if (parent_vse->dawg_info == NULL) return NULL; // not a dict word path
799  dawg_args_->active_dawgs = parent_vse->dawg_info->active_dawgs;
800  dawg_args_->permuter = parent_vse->dawg_info->permuter;
801  }
802 
803  // Deal with hyphenated words.
804  if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
805  if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
806  return new LanguageModelDawgInfo(dawg_args_->active_dawgs,
807  COMPOUND_PERM);
808  }
809 
810  // Deal with compound words.
811  if (dict_->compound_marker(b.unichar_id()) &&
812  (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
813  if (language_model_debug_level > 0) tprintf("Found compound marker\n");
814  // Do not allow compound operators at the beginning and end of the word.
815  // Do not allow more than one compound operator per word.
816  // Do not allow compounding of words with lengths shorter than
817  // language_model_min_compound_length
818  if (parent_vse == NULL || word_end ||
820  parent_vse->length < language_model_min_compound_length) return NULL;
821 
822  int i;
823  // Check that the path terminated before the current character is a word.
824  bool has_word_ending = false;
825  for (i = 0; i < parent_vse->dawg_info->active_dawgs->size(); ++i) {
826  const DawgPosition &pos = (*parent_vse->dawg_info->active_dawgs)[i];
827  const Dawg *pdawg = pos.dawg_index < 0
828  ? NULL : dict_->GetDawg(pos.dawg_index);
829  if (pdawg == NULL || pos.back_to_punc) continue;;
830  if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
831  pdawg->end_of_word(pos.dawg_ref)) {
832  has_word_ending = true;
833  break;
834  }
835  }
836  if (!has_word_ending) return NULL;
837 
838  if (language_model_debug_level > 0) tprintf("Compound word found\n");
839  return new LanguageModelDawgInfo(beginning_active_dawgs_, COMPOUND_PERM);
840  } // done dealing with compound words
841 
842  LanguageModelDawgInfo *dawg_info = NULL;
843 
844  // Call LetterIsOkay().
845  // Use the normalized IDs so that all shapes of ' can be allowed in words
846  // like don't.
847  const GenericVector<UNICHAR_ID>& normed_ids =
849  DawgPositionVector tmp_active_dawgs;
850  for (int i = 0; i < normed_ids.size(); ++i) {
852  tprintf("Test Letter OK for unichar %d, normed %d\n",
853  b.unichar_id(), normed_ids[i]);
854  dict_->LetterIsOkay(dawg_args_, normed_ids[i],
855  word_end && i == normed_ids.size() - 1);
856  if (dawg_args_->permuter == NO_PERM) {
857  break;
858  } else if (i < normed_ids.size() - 1) {
859  tmp_active_dawgs = *dawg_args_->updated_dawgs;
860  dawg_args_->active_dawgs = &tmp_active_dawgs;
861  }
863  tprintf("Letter was OK for unichar %d, normed %d\n",
864  b.unichar_id(), normed_ids[i]);
865  }
867  if (dawg_args_->permuter != NO_PERM) {
868  dawg_info = new LanguageModelDawgInfo(dawg_args_->updated_dawgs,
870  } else if (language_model_debug_level > 3) {
871  tprintf("Letter %s not OK!\n",
873  }
874 
875  return dawg_info;
876 }
DawgPositionVector * active_dawgs
Definition: dict.h:81
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783
DawgPositionVector * beginning_active_dawgs_
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:107
DawgPositionVector * very_beginning_active_dawgs_
int LetterIsOkay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:350
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:406
DawgPositionVector * updated_dawgs
Definition: dict.h:82
#define NULL
Definition: host.h:144
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:142
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
PermuterType permuter
Definition: dict.h:83
LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo ( const char *  unichar,
float  certainty,
float  denom,
int  curr_col,
int  curr_row,
float  outline_length,
const ViterbiStateEntry parent_vse 
)
protected

Definition at line 878 of file language_model.cpp.

881  {
882  // Initialize parent context.
883  const char *pcontext_ptr = "";
884  int pcontext_unichar_step_len = 0;
885  if (parent_vse == NULL) {
886  pcontext_ptr = prev_word_str_.string();
887  pcontext_unichar_step_len = prev_word_unichar_step_len_;
888  } else {
889  pcontext_ptr = parent_vse->ngram_info->context.string();
890  pcontext_unichar_step_len =
891  parent_vse->ngram_info->context_unichar_step_len;
892  }
893  // Compute p(unichar | parent context).
894  int unichar_step_len = 0;
895  bool pruned = false;
896  float ngram_cost;
897  float ngram_and_classifier_cost =
898  ComputeNgramCost(unichar, certainty, denom,
899  pcontext_ptr, &unichar_step_len,
900  &pruned, &ngram_cost);
901  // Normalize just the ngram_and_classifier_cost by outline_length.
902  // The ngram_cost is used by the params_model, so it needs to be left as-is,
903  // and the params model cost will be normalized by outline_length.
904  ngram_and_classifier_cost *=
905  outline_length / language_model_ngram_rating_factor;
906  // Add the ngram_cost of the parent.
907  if (parent_vse != NULL) {
908  ngram_and_classifier_cost +=
909  parent_vse->ngram_info->ngram_and_classifier_cost;
910  ngram_cost += parent_vse->ngram_info->ngram_cost;
911  }
912 
913  // Shorten parent context string by unichar_step_len unichars.
914  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
916  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
917  while (num_remove > 0 && *pcontext_ptr != '\0') {
918  pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
919  --num_remove;
920  }
921 
922  // Decide whether to prune this ngram path and update changed accordingly.
923  if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;
924 
925  // Construct and return the new LanguageModelNgramInfo.
926  LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
927  pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
928  ngram_and_classifier_cost);
929  ngram_info->context += unichar;
930  ngram_info->context_unichar_step_len += unichar_step_len;
931  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
932  return ngram_info;
933 }
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
#define NULL
Definition: host.h:144
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
const char * string() const
Definition: strngs.cpp:193
void tesseract::LanguageModel::GenerateTopChoiceInfo ( ViterbiStateEntry new_vse,
const ViterbiStateEntry parent_vse,
LanguageModelState lms 
)
protected

Definition at line 771 of file language_model.cpp.

773  {
774  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
775  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
776  new_vse->cost >= vit.data()->cost; vit.forward()) {
777  // Clear the appropriate flags if the list already contains
778  // a top choice entry with a lower cost.
779  new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
780  }
781  if (language_model_debug_level > 2) {
782  tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
783  new_vse->top_choice_flags);
784  }
785 }
#define tprintf(...)
Definition: tprintf.h:31
ViterbiStateEntry * tesseract::LanguageModel::GetNextParentVSE ( bool  just_classified,
bool  mixed_alnum,
const BLOB_CHOICE bc,
LanguageModelFlagsType  blob_choice_flags,
const UNICHARSET unicharset,
WERD_RES word_res,
ViterbiStateEntry_IT *  vse_it,
LanguageModelFlagsType top_choice_flags 
) const
protected

Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.

Definition at line 502 of file language_model.cpp.

506  {
507  for (; !vse_it->cycled_list(); vse_it->forward()) {
508  ViterbiStateEntry* parent_vse = vse_it->data();
509  // Only consider the parent if it has been updated or
510  // if the current ratings cell has just been classified.
511  if (!just_classified && !parent_vse->updated) continue;
513  parent_vse->Print("Considering");
514  // If the parent is non-alnum, then upper counts as lower.
515  *top_choice_flags = blob_choice_flags;
516  if ((blob_choice_flags & kUpperCaseFlag) &&
517  !parent_vse->HasAlnumChoice(unicharset)) {
518  *top_choice_flags |= kLowerCaseFlag;
519  }
520  *top_choice_flags &= parent_vse->top_choice_flags;
521  UNICHAR_ID unichar_id = bc->unichar_id();
522  const BLOB_CHOICE* parent_b = parent_vse->curr_b;
523  UNICHAR_ID parent_id = parent_b->unichar_id();
524  // Digits do not bind to alphas if there is a mix in both parent and current
525  // or if the alpha is not the top choice.
526  if (unicharset.get_isdigit(unichar_id) &&
527  unicharset.get_isalpha(parent_id) &&
528  (mixed_alnum || *top_choice_flags == 0))
529  continue; // Digits don't bind to alphas.
530  // Likewise alphas do not bind to digits if there is a mix in both or if
531  // the digit is not the top choice.
532  if (unicharset.get_isalpha(unichar_id) &&
533  unicharset.get_isdigit(parent_id) &&
534  (mixed_alnum || *top_choice_flags == 0))
535  continue; // Alphas don't bind to digits.
536  // If there is a case mix of the same alpha in the parent list, then
537  // competing_vse is non-null and will be used to determine whether
538  // or not to bind the current blob choice.
539  if (parent_vse->competing_vse != NULL) {
540  const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
541  UNICHAR_ID other_id = competing_b->unichar_id();
542  if (language_model_debug_level >= 5) {
543  tprintf("Parent %s has competition %s\n",
544  unicharset.id_to_unichar(parent_id),
545  unicharset.id_to_unichar(other_id));
546  }
547  if (unicharset.SizesDistinct(parent_id, other_id)) {
548  // If other_id matches bc wrt position and size, and parent_id doesn't,
549  // don't bind to the current parent.
550  if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
552  !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
554  continue; // Competing blobchoice has a better vertical match.
555  }
556  }
557  vse_it->forward();
558  return parent_vse; // This one is good!
559  }
560  return NULL; // Ran out of possibilities.
561 }
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:472
#define tprintf(...)
Definition: tprintf.h:31
float x_height
Definition: pageres.h:295
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
int UNICHAR_ID
Definition: unichar.h:33
static const LanguageModelFlagsType kUpperCaseFlag
static const LanguageModelFlagsType kLowerCaseFlag
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:132
#define NULL
Definition: host.h:144
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
ParamsModel& tesseract::LanguageModel::getParamsModel ( )
inline

Definition at line 100 of file language_model.h.

100 { return params_model_; }
bool tesseract::LanguageModel::GetTopLowerUpperDigit ( BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE **  first_lower,
BLOB_CHOICE **  first_upper,
BLOB_CHOICE **  first_digit 
) const
protected

Finds the first lower and upper case letter and first digit in curr_list. For non-upper/lower languages, alpha counts as upper. Uses the first character in the list in place of empty results. Returns true if both alpha and digits are found.

Definition at line 385 of file language_model.cpp.

388  {
389  BLOB_CHOICE_IT c_it(curr_list);
390  const UNICHARSET &unicharset = dict_->getUnicharset();
391  BLOB_CHOICE *first_unichar = NULL;
392  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
393  UNICHAR_ID unichar_id = c_it.data()->unichar_id();
394  if (unicharset.get_fragment(unichar_id)) continue; // skip fragments
395  if (first_unichar == NULL) first_unichar = c_it.data();
396  if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
397  *first_lower = c_it.data();
398  }
399  if (*first_upper == NULL && unicharset.get_isalpha(unichar_id) &&
400  !unicharset.get_islower(unichar_id)) {
401  *first_upper = c_it.data();
402  }
403  if (*first_digit == NULL && unicharset.get_isdigit(unichar_id)) {
404  *first_digit = c_it.data();
405  }
406  }
407  ASSERT_HOST(first_unichar != NULL);
408  bool mixed = (*first_lower != NULL || *first_upper != NULL) &&
409  *first_digit != NULL;
410  if (*first_lower == NULL) *first_lower = first_unichar;
411  if (*first_upper == NULL) *first_upper = first_unichar;
412  if (*first_digit == NULL) *first_digit = first_unichar;
413  return mixed;
414 }
#define ASSERT_HOST(x)
Definition: errcode.h:84
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
int UNICHAR_ID
Definition: unichar.h:33
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
Definition: cluster.h:45
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
#define NULL
Definition: host.h:144
void tesseract::LanguageModel::InitForWord ( const WERD_CHOICE prev_word,
bool  fixed_pitch,
float  max_char_wh_ratio,
float  rating_cert_scale 
)

Definition at line 138 of file language_model.cpp.

140  {
141  fixed_pitch_ = fixed_pitch;
142  max_char_wh_ratio_ = max_char_wh_ratio;
143  rating_cert_scale_ = rating_cert_scale;
144  acceptable_choice_found_ = false;
146 
147  // Initialize vectors with beginning DawgInfos.
152 
153  // Fill prev_word_str_ with the last language_model_ngram_order
154  // unichars from prev_word.
156  if (prev_word != NULL && prev_word->unichar_string() != NULL) {
157  prev_word_str_ = prev_word->unichar_string();
159  } else {
160  prev_word_str_ = " ";
161  }
162  const char *str_ptr = prev_word_str_.string();
163  const char *str_end = str_ptr + prev_word_str_.length();
164  int step;
166  while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
167  str_ptr += step;
169  }
170  ASSERT_HOST(str_ptr == str_end);
171  }
172 }
bool language_model_ngram_space_delimited_language
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:523
inT32 length() const
Definition: strngs.cpp:188
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:540
#define ASSERT_HOST(x)
Definition: errcode.h:84
const STRING & unichar_string() const
Definition: ratngs.h:524
DawgPositionVector * beginning_active_dawgs_
DawgPositionVector * very_beginning_active_dawgs_
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
bool tesseract::LanguageModel::PrunablePath ( const ViterbiStateEntry vse)
inlineprotected

Definition at line 291 of file language_model.h.

291  {
292  if (vse.top_choice_flags) return false;
293  if (vse.dawg_info != NULL &&
294  (vse.dawg_info->permuter == SYSTEM_DAWG_PERM ||
295  vse.dawg_info->permuter == USER_DAWG_PERM ||
296  vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
297  return true;
298  }
#define NULL
Definition: host.h:144
void tesseract::LanguageModel::SetAcceptableChoiceFound ( bool  val)
inline

Definition at line 96 of file language_model.h.

96  {
98  }
int tesseract::LanguageModel::SetTopParentLowerUpperDigit ( LanguageModelState parent_node) const
protected

Forces there to be at least one entry in the overall set of the viterbi_state_entries of each element of parent_node that has the top_choice_flag set for lower, upper and digit using the same rules as GetTopLowerUpperDigit, setting the flag on the first found suitable candidate, whether or not the flag is set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.

Definition at line 425 of file language_model.cpp.

426  {
427  if (parent_node == NULL) return -1;
428  UNICHAR_ID top_id = INVALID_UNICHAR_ID;
429  ViterbiStateEntry* top_lower = NULL;
430  ViterbiStateEntry* top_upper = NULL;
431  ViterbiStateEntry* top_digit = NULL;
432  ViterbiStateEntry* top_choice = NULL;
433  float lower_rating = 0.0f;
434  float upper_rating = 0.0f;
435  float digit_rating = 0.0f;
436  float top_rating = 0.0f;
437  const UNICHARSET &unicharset = dict_->getUnicharset();
438  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
439  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
440  ViterbiStateEntry* vse = vit.data();
441  // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
442  // back to the real character if needed.
443  ViterbiStateEntry* unichar_vse = vse;
444  UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
445  float rating = unichar_vse->curr_b->rating();
446  while (unichar_id == INVALID_UNICHAR_ID &&
447  unichar_vse->parent_vse != NULL) {
448  unichar_vse = unichar_vse->parent_vse;
449  unichar_id = unichar_vse->curr_b->unichar_id();
450  rating = unichar_vse->curr_b->rating();
451  }
452  if (unichar_id != INVALID_UNICHAR_ID) {
453  if (unicharset.get_islower(unichar_id)) {
454  if (top_lower == NULL || lower_rating > rating) {
455  top_lower = vse;
456  lower_rating = rating;
457  }
458  } else if (unicharset.get_isalpha(unichar_id)) {
459  if (top_upper == NULL || upper_rating > rating) {
460  top_upper = vse;
461  upper_rating = rating;
462  }
463  } else if (unicharset.get_isdigit(unichar_id)) {
464  if (top_digit == NULL || digit_rating > rating) {
465  top_digit = vse;
466  digit_rating = rating;
467  }
468  }
469  }
470  if (top_choice == NULL || top_rating > rating) {
471  top_choice = vse;
472  top_rating = rating;
473  top_id = unichar_id;
474  }
475  }
476  if (top_choice == NULL) return -1;
477  bool mixed = (top_lower != NULL || top_upper != NULL) &&
478  top_digit != NULL;
479  if (top_lower == NULL) top_lower = top_choice;
480  top_lower->top_choice_flags |= kLowerCaseFlag;
481  if (top_upper == NULL) top_upper = top_choice;
482  top_upper->top_choice_flags |= kUpperCaseFlag;
483  if (top_digit == NULL) top_digit = top_choice;
484  top_digit->top_choice_flags |= kDigitFlag;
485  top_choice->top_choice_flags |= kSmallestRatingFlag;
486  if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
487  (top_choice->top_choice_flags &
489  // If the compound marker top choice carries any of the top alnum flags,
490  // then give it all of them, allowing words like I-295 to be chosen.
491  top_choice->top_choice_flags |=
493  }
494  return mixed ? 1 : 0;
495 }
static const LanguageModelFlagsType kDigitFlag
static const LanguageModelFlagsType kSmallestRatingFlag
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:107
int UNICHAR_ID
Definition: unichar.h:33
static const LanguageModelFlagsType kUpperCaseFlag
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
static const LanguageModelFlagsType kLowerCaseFlag
Definition: cluster.h:45
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
#define NULL
Definition: host.h:144
void tesseract::LanguageModel::UpdateBestChoice ( ViterbiStateEntry vse,
LMPainPoints pain_points,
WERD_RES word_res,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)
protected

Definition at line 1240 of file language_model.cpp.

1245  {
1246  bool truth_path;
1247  WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
1248  blamer_bundle, &truth_path);
1249  ASSERT_HOST(word != NULL);
1250  if (dict_->stopper_debug_level >= 1) {
1251  STRING word_str;
1252  word->string_and_lengths(&word_str, NULL);
1253  vse->Print(word_str.string());
1254  }
1255  if (language_model_debug_level > 0) {
1256  word->print("UpdateBestChoice() constructed word");
1257  }
1258  // Record features from the current path if necessary.
1259  ParamsTrainingHypothesis curr_hyp;
1260  if (blamer_bundle != NULL) {
1261  if (vse->dawg_info != NULL) vse->dawg_info->permuter =
1262  static_cast<PermuterType>(word->permuter());
1263  ExtractFeaturesFromPath(*vse, curr_hyp.features);
1264  word->string_and_lengths(&(curr_hyp.str), NULL);
1265  curr_hyp.cost = vse->cost; // record cost for error rate computations
1266  if (language_model_debug_level > 0) {
1267  tprintf("Raw features extracted from %s (cost=%g) [ ",
1268  curr_hyp.str.string(), curr_hyp.cost);
1269  for (int deb_i = 0; deb_i < PTRAIN_NUM_FEATURE_TYPES; ++deb_i) {
1270  tprintf("%g ", curr_hyp.features[deb_i]);
1271  }
1272  tprintf("]\n");
1273  }
1274  // Record the current hypothesis in params_training_bundle.
1275  blamer_bundle->AddHypothesis(curr_hyp);
1276  if (truth_path)
1277  blamer_bundle->UpdateBestRating(word->rating());
1278  }
1279  if (blamer_bundle != NULL && blamer_bundle->GuidedSegsearchStillGoing()) {
1280  // The word was constructed solely for blamer_bundle->AddHypothesis, so
1281  // we no longer need it.
1282  delete word;
1283  return;
1284  }
1285  if (word_res->chopped_word != NULL && !word_res->chopped_word->blobs.empty())
1286  word->SetScriptPositions(false, word_res->chopped_word);
1287  // Update and log new raw_choice if needed.
1288  if (word_res->raw_choice == NULL ||
1289  word->rating() < word_res->raw_choice->rating()) {
1290  if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
1291  tprintf("Updated raw choice\n");
1292  }
1293  // Set the modified rating for best choice to vse->cost and log best choice.
1294  word->set_rating(vse->cost);
1295  // Call LogNewChoice() for best choice from Dict::adjust_word() since it
1296  // computes adjust_factor that is used by the adaption code (e.g. by
1297  // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
1298  // Note: the rating of the word is not adjusted.
1299  dict_->adjust_word(word, vse->dawg_info == NULL,
1300  vse->consistency_info.xht_decision, 0.0,
1301  false, language_model_debug_level > 0);
1302  // Hand ownership of the word over to the word_res.
1304  dict_->stopper_debug_level >= 1, word)) {
1305  // The word was so bad that it was deleted.
1306  return;
1307  }
1308  if (word_res->best_choice == word) {
1309  // Word was the new best.
1310  if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
1311  AcceptablePath(*vse)) {
1312  acceptable_choice_found_ = true;
1313  }
1314  // Update best_choice_bundle.
1315  best_choice_bundle->updated = true;
1316  best_choice_bundle->best_vse = vse;
1317  if (language_model_debug_level > 0) {
1318  tprintf("Updated best choice\n");
1319  word->print_state("New state ");
1320  }
1321  // Update hyphen state if we are dealing with a dictionary word.
1322  if (vse->dawg_info != NULL) {
1323  if (dict_->has_hyphen_end(*word)) {
1325  } else {
1326  dict_->reset_hyphen_vars(true);
1327  }
1328  }
1329 
1330  if (blamer_bundle != NULL) {
1332  vse->dawg_info != NULL && vse->top_choice_flags);
1333  }
1334  }
1335  if (wordrec_display_segmentations && word_res->chopped_word != NULL) {
1336  word->DisplaySegmentation(word_res->chopped_word);
1337  }
1338 }
DawgPositionVector * active_dawgs
Definition: dict.h:81
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:747
float rating() const
Definition: ratngs.h:324
void SetScriptPositions(bool small_caps, TWERD *word)
Definition: ratngs.cpp:528
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
WERD_CHOICE * best_choice
Definition: pageres.h:219
TWERD * chopped_word
Definition: pageres.h:201
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612
#define tprintf(...)
Definition: tprintf.h:31
PermuterType
Definition: ratngs.h:240
void UpdateBestRating(float rating)
Definition: blamer.h:122
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
void print_state(const char *msg) const
Definition: ratngs.cpp:738
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:51
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:501
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:135
uinT8 permuter() const
Definition: ratngs.h:343
WERD_CHOICE * raw_choice
Definition: pageres.h:224
int stopper_debug_level
Definition: dict.h:612
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
bool empty() const
Definition: genericvector.h:84
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:154
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
bool AcceptablePath(const ViterbiStateEntry &vse)
void print() const
Definition: ratngs.h:563
Definition: strngs.h:44
#define NULL
Definition: host.h:144
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:49
int tessedit_truncate_wordchoice_log
Definition: dict.h:618
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:142
const char * string() const
Definition: strngs.cpp:193
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:625
void set_rating(float new_val)
Definition: ratngs.h:366
bool tesseract::LanguageModel::UpdateState ( bool  just_classified,
int  curr_col,
int  curr_row,
BLOB_CHOICE_LIST *  curr_list,
LanguageModelState parent_node,
LMPainPoints pain_points,
WERD_RES word_res,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)

UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices on parent_list with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.

This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:

  • paths that are liked by the language model: either a DAWG or the n-gram model, where active.
  • paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper case word and the top lower-case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.

GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.

Of course it isn't quite that simple as Title Case is handled by allowing lower case to continue an upper case initial, but it has to be detected in the combiner so it knows which upper case letters are initial alphas.

Definition at line 255 of file language_model.cpp.

263  {
264  if (language_model_debug_level > 0) {
265  tprintf("\nUpdateState: col=%d row=%d %s",
266  curr_col, curr_row, just_classified ? "just_classified" : "");
268  tprintf("(parent=%p)\n", parent_node);
269  else
270  tprintf("\n");
271  }
272  // Initialize helper variables.
273  bool word_end = (curr_row+1 >= word_res->ratings->dimension());
274  bool new_changed = false;
275  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
276  const UNICHARSET& unicharset = dict_->getUnicharset();
277  BLOB_CHOICE *first_lower = NULL;
278  BLOB_CHOICE *first_upper = NULL;
279  BLOB_CHOICE *first_digit = NULL;
280  bool has_alnum_mix = false;
281  if (parent_node != NULL) {
282  int result = SetTopParentLowerUpperDigit(parent_node);
283  if (result < 0) {
285  tprintf("No parents found to process\n");
286  return false;
287  }
288  if (result > 0)
289  has_alnum_mix = true;
290  }
291  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
292  &first_digit))
293  has_alnum_mix = false;;
294  ScanParentsForCaseMix(unicharset, parent_node);
295  if (language_model_debug_level > 3 && parent_node != NULL) {
296  parent_node->Print("Parent viterbi list");
297  }
298  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
299 
300  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
301  ViterbiStateEntry_IT vit;
302  BLOB_CHOICE_IT c_it(curr_list);
303  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
304  BLOB_CHOICE* choice = c_it.data();
305  // TODO(antonova): make sure commenting this out if ok for ngram
306  // model scoring (I think this was introduced to fix ngram model quirks).
307  // Skip NULL unichars unless it is the only choice.
308  //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
309  UNICHAR_ID unichar_id = choice->unichar_id();
310  if (unicharset.get_fragment(unichar_id)) {
311  continue; // Skip fragments.
312  }
313  // Set top choice flags.
314  LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
315  if (c_it.at_first() || !new_changed)
316  blob_choice_flags |= kSmallestRatingFlag;
317  if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
318  if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
319  if (first_digit == choice) blob_choice_flags |= kDigitFlag;
320 
321  if (parent_node == NULL) {
322  // Process the beginning of a word.
323  // If there is a better case variant that is not distinguished by size,
324  // skip this blob choice, as we have no choice but to accept the result
325  // of the character classifier to distinguish between them, even if
326  // followed by an upper case.
327  // With words like iPoc, and other CamelBackWords, the lower-upper
328  // transition can only be achieved if the classifier has the correct case
329  // as the top choice, and leaving an initial I lower down the list
330  // increases the chances of choosing IPoc simply because it doesn't
331  // include such a transition. iPoc will beat iPOC and ipoc because
332  // the other words are baseline/x-height inconsistent.
333  if (HasBetterCaseVariant(unicharset, choice, curr_list))
334  continue;
335  // Upper counts as lower at the beginning of a word.
336  if (blob_choice_flags & kUpperCaseFlag)
337  blob_choice_flags |= kLowerCaseFlag;
338  new_changed |= AddViterbiStateEntry(
339  blob_choice_flags, denom, word_end, curr_col, curr_row,
340  choice, curr_state, NULL, pain_points,
341  word_res, best_choice_bundle, blamer_bundle);
342  } else {
343  // Get viterbi entries from each parent ViterbiStateEntry.
344  vit.set_to_list(&parent_node->viterbi_state_entries);
345  int vit_counter = 0;
346  vit.mark_cycle_pt();
347  ViterbiStateEntry* parent_vse = NULL;
348  LanguageModelFlagsType top_choice_flags;
349  while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
350  c_it.data(), blob_choice_flags,
351  unicharset, word_res, &vit,
352  &top_choice_flags)) != NULL) {
353  // Skip pruned entries and do not look at prunable entries if already
354  // examined language_model_viterbi_list_max_num_prunable of those.
355  if (PrunablePath(*parent_vse) &&
357  (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
358  continue;
359  }
360  // If the parent has no alnum choice, (ie choice is the first in a
361  // string of alnum), and there is a better case variant that is not
362  // distinguished by size, skip this blob choice/parent, as with the
363  // initial blob treatment above.
364  if (!parent_vse->HasAlnumChoice(unicharset) &&
365  HasBetterCaseVariant(unicharset, choice, curr_list))
366  continue;
367  // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
368  // looks good according to the Dawgs or character ngram model.
369  new_changed |= AddViterbiStateEntry(
370  top_choice_flags, denom, word_end, curr_col, curr_row,
371  c_it.data(), curr_state, parent_vse, pain_points,
372  word_res, best_choice_bundle, blamer_bundle);
373  }
374  }
375  }
376  return new_changed;
377 }
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
MATRIX * ratings
Definition: pageres.h:215
static const LanguageModelFlagsType kDigitFlag
#define tprintf(...)
Definition: tprintf.h:31
int dimension() const
Definition: matrix.h:247
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
bool PrunablePath(const ViterbiStateEntry &vse)
static const LanguageModelFlagsType kSmallestRatingFlag
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
int UNICHAR_ID
Definition: unichar.h:33
static const LanguageModelFlagsType kUpperCaseFlag
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
static const LanguageModelFlagsType kLowerCaseFlag
int language_model_viterbi_list_max_num_prunable
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
static const LanguageModelFlagsType kXhtConsistentFlag
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37
#define NULL
Definition: host.h:144
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76

Member Data Documentation

bool tesseract::LanguageModel::acceptable_choice_found_
protected

Definition at line 408 of file language_model.h.

DawgPositionVector* tesseract::LanguageModel::beginning_active_dawgs_
protected

Definition at line 396 of file language_model.h.

bool tesseract::LanguageModel::correct_segmentation_explored_
protected

Definition at line 410 of file language_model.h.

DawgArgs* tesseract::LanguageModel::dawg_args_
protected

Definition at line 356 of file language_model.h.

Dict* tesseract::LanguageModel::dict_
protected

Definition at line 375 of file language_model.h.

bool tesseract::LanguageModel::fixed_pitch_
protected

Definition at line 382 of file language_model.h.

const UnicityTable<FontInfo>* tesseract::LanguageModel::fontinfo_table_
protected

Definition at line 371 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kDigitFlag = 0x8
static

Definition at line 48 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kLowerCaseFlag = 0x2
static

Definition at line 46 of file language_model.h.

const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f
static

Definition at line 53 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kSmallestRatingFlag = 0x1
static

Definition at line 45 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kUpperCaseFlag = 0x4
static

Definition at line 47 of file language_model.h.

const LanguageModelFlagsType tesseract::LanguageModel::kXhtConsistentFlag = 0x10
static

Definition at line 49 of file language_model.h.

int tesseract::LanguageModel::language_model_debug_level = 0

"Language model debug level"

Definition at line 308 of file language_model.h.

int tesseract::LanguageModel::language_model_min_compound_length = 3

"Minimum length of compound words"

Definition at line 335 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0

"Average classifier score of a non-matching unichar"

Definition at line 322 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_on = false

"Turn on/off the use of character ngram model"

Definition at line 310 of file language_model.h.

int tesseract::LanguageModel::language_model_ngram_order = 8

"Maximum order of the character ngram model"

Definition at line 312 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_rating_factor = 16.0

"Factor to bring log-probs into the same range as ratings" " when multiplied by outline length "

Definition at line 331 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03

"Strength of the character ngram model relative to the" " character classifier "

Definition at line 328 of file language_model.h.

double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001

"To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model"

Definition at line 320 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true

"Words are delimited by space"

Definition at line 333 of file language_model.h.

bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string" " when computing log probabilities"

Definition at line 325 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_case = 0.1

"Penalty for inconsistent case"

Definition at line 344 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_chartype = 0.3

"Penalty for inconsistent character type"

Definition at line 348 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_font = 0.00

"Penalty for inconsistent font"

Definition at line 350 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_increment = 0.01

"Penalty increment"

Definition at line 353 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15

"Penalty for non-dictionary words"

Definition at line 340 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1

"Penalty for words not in the frequent word dictionary"

Definition at line 338 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_punc = 0.2

"Penalty for inconsistent punctuation"

Definition at line 342 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_script = 0.5

"Penalty for inconsistent script"

Definition at line 346 of file language_model.h.

double tesseract::LanguageModel::language_model_penalty_spacing = 0.05

"Penalty for inconsistent spacing"

Definition at line 352 of file language_model.h.

bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false

"Use sigmoidal score for certainty"

Definition at line 356 of file language_model.h.

int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10

"Maximum number of prunable (those for which PrunablePath() is" " true) entries in each viterbi list recorded in BLOB_CHOICEs"

Definition at line 315 of file language_model.h.

int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500

"Maximum size of viterbi lists recorded in BLOB_CHOICEs"

Definition at line 317 of file language_model.h.

float tesseract::LanguageModel::max_char_wh_ratio_
protected

Definition at line 385 of file language_model.h.

ParamsModel tesseract::LanguageModel::params_model_
protected

Definition at line 413 of file language_model.h.

STRING tesseract::LanguageModel::prev_word_str_
protected

Definition at line 392 of file language_model.h.

int tesseract::LanguageModel::prev_word_unichar_step_len_
protected

Definition at line 393 of file language_model.h.

float tesseract::LanguageModel::rating_cert_scale_
protected

Definition at line 366 of file language_model.h.

DawgPositionVector* tesseract::LanguageModel::very_beginning_active_dawgs_
protected

Definition at line 395 of file language_model.h.

int tesseract::LanguageModel::wordrec_display_segmentations = 0

"Display Segmentations"

Definition at line 354 of file language_model.h.


The documentation for this class was generated from the following files: