tesseract  4.00.00dev
tesseract::LanguageModel Class Reference

#include <language_model.h>

Public Member Functions

 LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
 
 ~LanguageModel ()
 
void InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
 
bool UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
bool AcceptableChoiceFound ()
 
void SetAcceptableChoiceFound (bool val)
 
ParamsModel & getParamsModel ()
 

Static Public Member Functions

static void ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[])
 

Public Attributes

int language_model_debug_level = 0
 
bool language_model_ngram_on = false
 
int language_model_ngram_order = 8
 
int language_model_viterbi_list_max_num_prunable = 10
 
int language_model_viterbi_list_max_size = 500
 
double language_model_ngram_small_prob = 0.000001
 
double language_model_ngram_nonmatch_score = -40.0
 
bool language_model_ngram_use_only_first_uft8_step = false
 
double language_model_ngram_scale_factor = 0.03
 
double language_model_ngram_rating_factor = 16.0
 
bool language_model_ngram_space_delimited_language = true
 
int language_model_min_compound_length = 3
 
double language_model_penalty_non_freq_dict_word = 0.1
 
double language_model_penalty_non_dict_word = 0.15
 
double language_model_penalty_punc = 0.2
 
double language_model_penalty_case = 0.1
 
double language_model_penalty_script = 0.5
 
double language_model_penalty_chartype = 0.3
 
double language_model_penalty_font = 0.00
 
double language_model_penalty_spacing = 0.05
 
double language_model_penalty_increment = 0.01
 
int wordrec_display_segmentations = 0
 
bool language_model_use_sigmoidal_certainty = false
 

Static Public Attributes

static const LanguageModelFlagsType kSmallestRatingFlag = 0x1
 
static const LanguageModelFlagsType kLowerCaseFlag = 0x2
 
static const LanguageModelFlagsType kUpperCaseFlag = 0x4
 
static const LanguageModelFlagsType kDigitFlag = 0x8
 
static const LanguageModelFlagsType kXhtConsistentFlag = 0x10
 
static const float kMaxAvgNgramCost = 25.0f
 

Protected Member Functions

float CertaintyScore (float cert)
 
float ComputeAdjustment (int num_problems, float penalty)
 
float ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
 
float ComputeAdjustedPathCost (ViterbiStateEntry *vse)
 
bool GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
 
int SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const
 
ViterbiStateEntry * GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
 
bool AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
 
LanguageModelDawgInfo * GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
 
LanguageModelNgramInfo * GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
 
float ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
 
float ComputeDenom (BLOB_CHOICE_LIST *curr_list)
 
void FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
 
void UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
WERD_CHOICE * ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
 
void ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
 
bool PrunablePath (const ViterbiStateEntry &vse)
 
bool AcceptablePath (const ViterbiStateEntry &vse)
 

Protected Attributes

DawgArgs dawg_args_
 
float rating_cert_scale_
 
const UnicityTable< FontInfo > * fontinfo_table_
 
Dict * dict_
 
bool fixed_pitch_
 
float max_char_wh_ratio_
 
STRING prev_word_str_
 
int prev_word_unichar_step_len_
 
DawgPositionVector very_beginning_active_dawgs_
 
DawgPositionVector beginning_active_dawgs_
 
bool acceptable_choice_found_
 
bool correct_segmentation_explored_
 
ParamsModel params_model_
 

Detailed Description

Definition at line 42 of file language_model.h.

Constructor & Destructor Documentation

◆ LanguageModel()

tesseract::LanguageModel::LanguageModel ( const UnicityTable< FontInfo > *  fontinfo_table,
Dict *  dict 
)

Definition at line 44 of file language_model.cpp.

46  : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
47  dict->getCCUtil()->params()),
49  "Turn on/off the use of character ngram model",
50  dict->getCCUtil()->params()),
52  "Maximum order of the character ngram model",
53  dict->getCCUtil()->params()),
55  "Maximum number of prunable (those for which"
56  " PrunablePath() is true) entries in each viterbi list"
57  " recorded in BLOB_CHOICEs",
58  dict->getCCUtil()->params()),
60  "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
61  dict->getCCUtil()->params()),
63  "To avoid overly small denominators use this as the "
64  "floor of the probability returned by the ngram model.",
65  dict->getCCUtil()->params()),
67  "Average classifier score of a non-matching unichar.",
68  dict->getCCUtil()->params()),
70  "Use only the first UTF8 step of the given string"
71  " when computing log probabilities.",
72  dict->getCCUtil()->params()),
74  "Strength of the character ngram model relative to the"
75  " character classifier ",
76  dict->getCCUtil()->params()),
78  "Factor to bring log-probs into the same range as ratings"
79  " when multiplied by outline length ",
80  dict->getCCUtil()->params()),
82  "Words are delimited by space", dict->getCCUtil()->params()),
84  "Minimum length of compound words",
85  dict->getCCUtil()->params()),
87  "Penalty for words not in the frequent word dictionary",
88  dict->getCCUtil()->params()),
90  "Penalty for non-dictionary words",
91  dict->getCCUtil()->params()),
93  "Penalty for inconsistent punctuation",
94  dict->getCCUtil()->params()),
96  "Penalty for inconsistent case",
97  dict->getCCUtil()->params()),
99  "Penalty for inconsistent script",
100  dict->getCCUtil()->params()),
102  "Penalty for inconsistent character type",
103  dict->getCCUtil()->params()),
104  // TODO(daria, rays): enable font consistency checking
105  // after improving font analysis.
107  "Penalty for inconsistent font",
108  dict->getCCUtil()->params()),
110  "Penalty for inconsistent spacing",
111  dict->getCCUtil()->params()),
112  double_MEMBER(language_model_penalty_increment, 0.01, "Penalty increment",
113  dict->getCCUtil()->params()),
114  INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations",
115  dict->getCCUtil()->params()),
117  "Use sigmoidal score for certainty",
118  dict->getCCUtil()->params()),
119  dawg_args_(nullptr, new DawgPositionVector(), NO_PERM),
120  fontinfo_table_(fontinfo_table),
121  dict_(dict),
122  fixed_pitch_(false),
123  max_char_wh_ratio_(0.0),
124  acceptable_choice_found_(false) {
125  ASSERT_HOST(dict_ != NULL);
126 }
int language_model_viterbi_list_max_num_prunable
bool language_model_ngram_space_delimited_language
bool language_model_ngram_use_only_first_uft8_step
double language_model_penalty_non_dict_word
double language_model_penalty_non_freq_dict_word
#define ASSERT_HOST(x)
Definition: errcode.h:84
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:315
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:303
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:300
const UnicityTable< FontInfo > * fontinfo_table_
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:309
double language_model_ngram_nonmatch_score

◆ ~LanguageModel()

tesseract::LanguageModel::~LanguageModel ( )

Definition at line 128 of file language_model.cpp.

128 { delete dawg_args_.updated_dawgs; }
DawgPositionVector * updated_dawgs
Definition: dict.h:81

Member Function Documentation

◆ AcceptableChoiceFound()

bool tesseract::LanguageModel::AcceptableChoiceFound ( )
inline

Definition at line 95 of file language_model.h.

◆ AcceptablePath()

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry &  vse)
inlineprotected

Definition at line 301 of file language_model.h.

301  {
302  return (vse.dawg_info != NULL || vse.Consistent() ||
303  (vse.ngram_info != NULL && !vse.ngram_info->pruned));
304  }

◆ AddViterbiStateEntry()

bool tesseract::LanguageModel::AddViterbiStateEntry ( LanguageModelFlagsType  top_choice_flags,
float  denom,
bool  word_end,
int  curr_col,
int  curr_row,
BLOB_CHOICE *  b,
LanguageModelState *  curr_state,
ViterbiStateEntry *  parent_vse,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 555 of file language_model.cpp.

566  {
567  ViterbiStateEntry_IT vit;
568  if (language_model_debug_level > 1) {
569  tprintf("AddViterbiStateEntry for unichar %s rating=%.4f"
570  " certainty=%.4f top_choice_flags=0x%x",
572  b->rating(), b->certainty(), top_choice_flags);
574  tprintf(" parent_vse=%p\n", parent_vse);
575  else
576  tprintf("\n");
577  }
578  // Check whether the list is full.
579  if (curr_state != NULL &&
580  curr_state->viterbi_state_entries_length >=
582  if (language_model_debug_level > 1) {
583  tprintf("AddViterbiStateEntry: viterbi list is full!\n");
584  }
585  return false;
586  }
587 
588  // Invoke Dawg language model component.
589  LanguageModelDawgInfo *dawg_info =
590  GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
591 
592  float outline_length =
594  // Invoke Ngram language model component.
595  LanguageModelNgramInfo *ngram_info = NULL;
597  ngram_info = GenerateNgramInfo(
599  denom, curr_col, curr_row, outline_length, parent_vse);
600  ASSERT_HOST(ngram_info != NULL);
601  }
602  bool liked_by_language_model = dawg_info != NULL ||
603  (ngram_info != NULL && !ngram_info->pruned);
604  // Quick escape if not liked by the language model, can't be consistent
605  // xheight, and not top choice.
606  if (!liked_by_language_model && top_choice_flags == 0) {
607  if (language_model_debug_level > 1) {
608  tprintf("Language model components very early pruned this entry\n");
609  }
610  delete ngram_info;
611  delete dawg_info;
612  return false;
613  }
614 
615  // Check consistency of the path and set the relevant consistency_info.
616  LMConsistencyInfo consistency_info(
617  parent_vse != NULL ? &parent_vse->consistency_info : NULL);
618  // Start with just the x-height consistency, as it provides significant
619  // pruning opportunity.
620  consistency_info.ComputeXheightConsistency(
622  // Turn off xheight consistent flag if not consistent.
623  if (consistency_info.InconsistentXHeight()) {
624  top_choice_flags &= ~kXhtConsistentFlag;
625  }
626 
627  // Quick escape if not liked by the language model, not consistent xheight,
628  // and not top choice.
629  if (!liked_by_language_model && top_choice_flags == 0) {
630  if (language_model_debug_level > 1) {
631  tprintf("Language model components early pruned this entry\n");
632  }
633  delete ngram_info;
634  delete dawg_info;
635  return false;
636  }
637 
638  // Compute the rest of the consistency info.
639  FillConsistencyInfo(curr_col, word_end, b, parent_vse,
640  word_res, &consistency_info);
641  if (dawg_info != NULL && consistency_info.invalid_punc) {
642  consistency_info.invalid_punc = false; // do not penalize dict words
643  }
644 
645  // Compute cost of associating the blobs that represent the current unichar.
646  AssociateStats associate_stats;
647  ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_,
648  parent_vse, word_res, &associate_stats);
649  if (parent_vse != NULL) {
650  associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
651  associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
652  }
653 
654  // Create the new ViterbiStateEntry and compute the adjusted cost of the path.
655  ViterbiStateEntry *new_vse = new ViterbiStateEntry(
656  parent_vse, b, 0.0, outline_length,
657  consistency_info, associate_stats, top_choice_flags, dawg_info,
658  ngram_info, (language_model_debug_level > 0) ?
659  dict_->getUnicharset().id_to_unichar(b->unichar_id()) : NULL);
660  new_vse->cost = ComputeAdjustedPathCost(new_vse);
662  tprintf("Adjusted cost = %g\n", new_vse->cost);
663 
664  // Invoke Top Choice language model component to make the final adjustments
665  // to new_vse->top_choice_flags.
666  if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
667  GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
668  }
669 
670  // If language model components did not like this unichar - return.
671  bool keep = new_vse->top_choice_flags || liked_by_language_model;
672  if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths
673  consistency_info.inconsistent_script) { // with inconsistent script
674  keep = false;
675  }
676  if (!keep) {
677  if (language_model_debug_level > 1) {
678  tprintf("Language model components did not like this entry\n");
679  }
680  delete new_vse;
681  return false;
682  }
683 
684  // Discard this entry if it represents a prunable path and
685  // language_model_viterbi_list_max_num_prunable such entries with a lower
686  // cost have already been recorded.
687  if (PrunablePath(*new_vse) &&
688  (curr_state->viterbi_state_entries_prunable_length >=
690  new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
691  if (language_model_debug_level > 1) {
692  tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n",
693  new_vse->cost,
694  curr_state->viterbi_state_entries_prunable_max_cost);
695  }
696  delete new_vse;
697  return false;
698  }
699 
700  // Update best choice if needed.
701  if (word_end) {
702  UpdateBestChoice(new_vse, pain_points, word_res,
703  best_choice_bundle, blamer_bundle);
704  // Discard the entry if UpdateBestChoice() found flaws in it.
705  if (new_vse->cost >= WERD_CHOICE::kBadRating &&
706  new_vse != best_choice_bundle->best_vse) {
707  if (language_model_debug_level > 1) {
708  tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
709  }
710  delete new_vse;
711  return false;
712  }
713  }
714 
715  // Add the new ViterbiStateEntry to curr_state->viterbi_state_entries.
716  curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare,
717  false, new_vse);
718  curr_state->viterbi_state_entries_length++;
719  if (PrunablePath(*new_vse)) {
720  curr_state->viterbi_state_entries_prunable_length++;
721  }
722 
723  // Update lms->viterbi_state_entries_prunable_max_cost and clear
724  // top_choice_flags of entries with ratings_sum higher than new_vse->ratings_sum.
725  if ((curr_state->viterbi_state_entries_prunable_length >=
727  new_vse->top_choice_flags) {
728  ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
729  int prunable_counter = language_model_viterbi_list_max_num_prunable;
730  vit.set_to_list(&(curr_state->viterbi_state_entries));
731  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
732  ViterbiStateEntry *curr_vse = vit.data();
733  // Clear the appropriate top choice flags of the entries in the
734  // list that have cost higher than new_entry->cost
735  // (since they will not be top choices any more).
736  if (curr_vse->top_choice_flags && curr_vse != new_vse &&
737  curr_vse->cost > new_vse->cost) {
738  curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
739  }
740  if (prunable_counter > 0 && PrunablePath(*curr_vse)) --prunable_counter;
741  // Update curr_state->viterbi_state_entries_prunable_max_cost.
742  if (prunable_counter == 0) {
743  curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
744  if (language_model_debug_level > 1) {
745  tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
746  curr_state->viterbi_state_entries_prunable_max_cost);
747  }
748  prunable_counter = -1; // stop counting
749  }
750  }
751  }
752 
753  // Print the newly created ViterbiStateEntry.
754  if (language_model_debug_level > 2) {
755  new_vse->Print("New");
757  curr_state->Print("Updated viterbi list");
758  }
759 
760  return true;
761 }
static const float kBadRating
Definition: ratngs.h:271
int language_model_viterbi_list_max_num_prunable
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
static const LanguageModelFlagsType kSmallestRatingFlag
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
static int Compare(const void *e1, const void *e2)
Definition: lm_state.h:126
static const LanguageModelFlagsType kXhtConsistentFlag
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
#define tprintf(...)
Definition: tprintf.h:31
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
Definition: associate.h:80
#define ASSERT_HOST(x)
Definition: errcode.h:84
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
float rating() const
Definition: ratngs.h:79
bool PrunablePath(const ViterbiStateEntry &vse)
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
float certainty() const
Definition: ratngs.h:82
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ CertaintyScore()

float tesseract::LanguageModel::CertaintyScore ( float  cert)
inlineprotected

Definition at line 104 of file language_model.h.

104  {
106  // cert is assumed to be between 0 and -dict_->certainty_scale.
107  // If you enable language_model_use_sigmoidal_certainty, you
108  // need to adjust language_model_ngram_nonmatch_score as well.
109  cert = -cert / dict_->certainty_scale;
110  return 1.0f / (1.0f + exp(10.0f * cert));
111  } else {
112  return (-1.0f / cert);
113  }
114  }
double certainty_scale
Definition: dict.h:611

◆ ComputeAdjustedPathCost()

float tesseract::LanguageModel::ComputeAdjustedPathCost ( ViterbiStateEntry *  vse)
protected

Definition at line 1190 of file language_model.cpp.

1190  {
1191  ASSERT_HOST(vse != NULL);
1192  if (params_model_.Initialized()) {
1193  float features[PTRAIN_NUM_FEATURE_TYPES];
1194  ExtractFeaturesFromPath(*vse, features);
1195  float cost = params_model_.ComputeCost(features);
1196  if (language_model_debug_level > 3) {
1197  tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
1198  if (language_model_debug_level >= 5) {
1199  for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
1200  tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
1201  }
1202  }
1203  }
1204  return cost * vse->outline_length;
1205  } else {
1206  float adjustment = 1.0f;
1207  if (vse->dawg_info == NULL || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
1209  }
1210  if (vse->dawg_info == NULL) {
1212  if (vse->length > language_model_min_compound_length) {
1213  adjustment += ((vse->length - language_model_min_compound_length) *
1215  }
1216  }
1217  if (vse->associate_stats.shape_cost > 0) {
1218  adjustment += vse->associate_stats.shape_cost /
1219  static_cast<float>(vse->length);
1220  }
1222  ASSERT_HOST(vse->ngram_info != NULL);
1223  return vse->ngram_info->ngram_and_classifier_cost * adjustment;
1224  } else {
1225  adjustment += ComputeConsistencyAdjustment(vse->dawg_info,
1226  vse->consistency_info);
1227  return vse->ratings_sum * adjustment;
1228  }
1229  }
1230 }
float ComputeCost(const float features[]) const
#define tprintf(...)
Definition: tprintf.h:31
double language_model_penalty_non_dict_word
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
double language_model_penalty_non_freq_dict_word
#define ASSERT_HOST(x)
Definition: errcode.h:84
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])

◆ ComputeAdjustment()

float tesseract::LanguageModel::ComputeAdjustment ( int  num_problems,
float  penalty 
)
inlineprotected

Definition at line 116 of file language_model.h.

116  {
117  if (num_problems == 0) return 0.0f;
118  if (num_problems == 1) return penalty;
119  return (penalty + (language_model_penalty_increment *
120  static_cast<float>(num_problems-1)));
121  }

◆ ComputeAssociateStats()

void tesseract::LanguageModel::ComputeAssociateStats ( int  col,
int  row,
float  max_char_wh_ratio,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
AssociateStats *  associate_stats 
)
inlineprotected

Definition at line 272 of file language_model.h.

276  {
278  col, row,
279  (parent_vse != NULL) ? &(parent_vse->associate_stats) : NULL,
280  (parent_vse != NULL) ? parent_vse->length : 0,
281  fixed_pitch_, max_char_wh_ratio,
282  word_res, language_model_debug_level > 2, associate_stats);
283  }
static void ComputeStats(int col, int row, const AssociateStats *parent_stats, int parent_path_length, bool fixed_pitch, float max_char_wh_ratio, WERD_RES *word_res, bool debug, AssociateStats *stats)
Definition: associate.cpp:37

◆ ComputeConsistencyAdjustment()

float tesseract::LanguageModel::ComputeConsistencyAdjustment ( const LanguageModelDawgInfo *  dawg_info,
const LMConsistencyInfo &  consistency_info 
)
inlineprotected

Definition at line 127 of file language_model.h.

129  {
130  if (dawg_info != NULL) {
131  return ComputeAdjustment(consistency_info.NumInconsistentCase(),
133  (consistency_info.inconsistent_script ?
135  }
136  return (ComputeAdjustment(consistency_info.NumInconsistentPunc(),
138  ComputeAdjustment(consistency_info.NumInconsistentCase(),
140  ComputeAdjustment(consistency_info.NumInconsistentChartype(),
142  ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
144  (consistency_info.inconsistent_script ?
146  (consistency_info.inconsistent_font ?
148  }
float ComputeAdjustment(int num_problems, float penalty)

◆ ComputeDenom()

float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST *  curr_list)
protected

Definition at line 987 of file language_model.cpp.

987  {
988  if (curr_list->empty()) return 1.0f;
989  float denom = 0.0f;
990  int len = 0;
991  BLOB_CHOICE_IT c_it(curr_list);
992  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
993  ASSERT_HOST(c_it.data() != NULL);
994  ++len;
995  denom += CertaintyScore(c_it.data()->certainty());
996  }
997  assert(len != 0);
998  // The ideal situation would be to have the classifier scores for
999  // classifying each position as each of the characters in the unicharset.
1000  // Since we can not do this because of speed, we add a very crude estimate
1001  // of what these scores for the "missing" classifications would sum up to.
1002  denom += (dict_->getUnicharset().size() - len) *
1004 
1005  return denom;
1006 }
int size() const
Definition: unicharset.h:338
#define ASSERT_HOST(x)
Definition: errcode.h:84
double language_model_ngram_nonmatch_score
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
float CertaintyScore(float cert)

◆ ComputeNgramCost()

float tesseract::LanguageModel::ComputeNgramCost ( const char *  unichar,
float  certainty,
float  denom,
const char *  context,
int *  unichar_step_len,
bool *  found_small_prob,
float *  ngram_prob 
)
protected

Definition at line 927 of file language_model.cpp.

933  {
934  const char *context_ptr = context;
935  char *modified_context = NULL;
936  char *modified_context_end = NULL;
937  const char *unichar_ptr = unichar;
938  const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
939  float prob = 0.0f;
940  int step = 0;
941  while (unichar_ptr < unichar_end &&
942  (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
943  if (language_model_debug_level > 1) {
944  tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
945  dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
946  }
947  prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
948  ++(*unichar_step_len);
950  unichar_ptr += step;
951  // If there are multiple UTF8 characters present in unichar, context is
952  // updated to include the previously examined characters from str,
953  // unless use_only_first_uft8_step is true.
954  if (unichar_ptr < unichar_end) {
955  if (modified_context == NULL) {
956  int context_len = strlen(context);
957  modified_context =
958  new char[context_len + strlen(unichar_ptr) + step + 1];
959  strncpy(modified_context, context, context_len);
960  modified_context_end = modified_context + context_len;
961  context_ptr = modified_context;
962  }
963  strncpy(modified_context_end, unichar_ptr - step, step);
964  modified_context_end += step;
965  *modified_context_end = '\0';
966  }
967  }
968  prob /= static_cast<float>(*unichar_step_len); // normalize
969  if (prob < language_model_ngram_small_prob) {
970  if (language_model_debug_level > 0) tprintf("Found small prob %g\n", prob);
971  *found_small_prob = true;
973  }
974  *ngram_cost = -1.0*log2(prob);
975  float ngram_and_classifier_cost =
976  -1.0*log2(CertaintyScore(certainty)/denom) +
977  *ngram_cost * language_model_ngram_scale_factor;
978  if (language_model_debug_level > 1) {
979  tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
980  unichar, context_ptr, CertaintyScore(certainty)/denom, prob,
981  ngram_and_classifier_cost);
982  }
983  delete[] modified_context;
984  return ngram_and_classifier_cost;
985 }
bool language_model_ngram_use_only_first_uft8_step
#define tprintf(...)
Definition: tprintf.h:31
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:372
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:136
float CertaintyScore(float cert)

◆ ConstructWord()

WERD_CHOICE * tesseract::LanguageModel::ConstructWord ( ViterbiStateEntry *  vse,
WERD_RES *  word_res,
DANGERR *  fixpt,
BlamerBundle *  blamer_bundle,
bool *  truth_path 
)
protected

Definition at line 1381 of file language_model.cpp.

1386  {
1387  if (truth_path != NULL) {
1388  *truth_path =
1389  (blamer_bundle != NULL &&
1390  vse->length == blamer_bundle->correct_segmentation_length());
1391  }
1392  BLOB_CHOICE *curr_b = vse->curr_b;
1393  ViterbiStateEntry *curr_vse = vse;
1394 
1395  int i;
1396  bool compound = dict_->hyphenated(); // treat hyphenated words as compound
1397 
1398  // Re-compute the variance of the width-to-height ratios (since we now
1399  // can compute the mean over the whole word).
1400  float full_wh_ratio_mean = 0.0f;
1401  if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
1402  vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
1403  full_wh_ratio_mean = (vse->associate_stats.full_wh_ratio_total /
1404  static_cast<float>(vse->length));
1405  vse->associate_stats.full_wh_ratio_var = 0.0f;
1406  }
1407 
1408  // Construct a WERD_CHOICE by tracing parent pointers.
1409  WERD_CHOICE *word = new WERD_CHOICE(word_res->uch_set, vse->length);
1410  word->set_length(vse->length);
1411  int total_blobs = 0;
1412  for (i = (vse->length-1); i >= 0; --i) {
1413  if (blamer_bundle != NULL && truth_path != NULL && *truth_path &&
1414  !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
1415  *truth_path = false;
1416  }
1417  // The number of blobs used for this choice is row - col + 1.
1418  int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
1419  total_blobs += num_blobs;
1420  word->set_blob_choice(i, num_blobs, curr_b);
1421  // Update the width-to-height ratio variance. Useful for non-space delimited
1422  // languages to ensure that the blobs are of uniform width.
1423  // Skip leading and trailing punctuation when computing the variance.
1424  if ((full_wh_ratio_mean != 0.0f &&
1425  ((curr_vse != vse && curr_vse->parent_vse != NULL) ||
1426  !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
1427  vse->associate_stats.full_wh_ratio_var +=
1428  pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
1429  if (language_model_debug_level > 2) {
1430  tprintf("full_wh_ratio_var += (%g-%g)^2\n",
1431  full_wh_ratio_mean, curr_vse->associate_stats.full_wh_ratio);
1432  }
1433  }
1434 
1435  // Mark the word as compound if compound permuter was set for any of
1436  // the unichars on the path (usually this will happen for unichars
1437  // that are compounding operators, like "-" and "/").
1438  if (!compound && curr_vse->dawg_info &&
1439  curr_vse->dawg_info->permuter == COMPOUND_PERM) compound = true;
1440 
1441  // Update curr_* pointers.
1442  curr_vse = curr_vse->parent_vse;
1443  if (curr_vse == NULL) break;
1444  curr_b = curr_vse->curr_b;
1445  }
1446  ASSERT_HOST(i == 0); // check that we recorded all the unichar ids.
1447  ASSERT_HOST(total_blobs == word_res->ratings->dimension());
1448  // Re-adjust shape cost to include the updated width-to-height variance.
1449  if (full_wh_ratio_mean != 0.0f) {
1450  vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
1451  }
1452 
1453  word->set_rating(vse->ratings_sum);
1454  word->set_certainty(vse->min_certainty);
1455  word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
1456  vse->consistency_info.BodyMaxXHeight());
1457  if (vse->dawg_info != NULL) {
1458  word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
1459  } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
1460  word->set_permuter(NGRAM_PERM);
1461  } else if (vse->top_choice_flags) {
1463  } else {
1464  word->set_permuter(NO_PERM);
1465  }
1466  word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true,
1467  word_res->ratings));
1468  return word;
1469 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
const UNICHARSET * uch_set
Definition: pageres.h:192
int dimension() const
Definition: matrix.h:528
void set_rating(float new_val)
Definition: ratngs.h:365
MATRIX * ratings
Definition: pageres.h:215
void set_dangerous_ambig_found_(bool value)
Definition: ratngs.h:362
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:126
#define tprintf(...)
Definition: tprintf.h:31
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
Definition: blamer.h:131
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:151
void set_permuter(uinT8 perm)
Definition: ratngs.h:371
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_certainty(float new_val)
Definition: ratngs.h:368
const MATRIX_COORD & matrix_cell()
Definition: ratngs.h:114
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:293
void set_x_heights(float min_height, float max_height)
Definition: ratngs.h:338
int correct_segmentation_length() const
Definition: blamer.h:126
void set_length(int len)
Definition: ratngs.h:377
const UNICHARSET & getUnicharset() const
Definition: dict.h:97

◆ ExtractFeaturesFromPath()

void tesseract::LanguageModel::ExtractFeaturesFromPath ( const ViterbiStateEntry vse,
float  features[] 
)
static

Definition at line 1332 of file language_model.cpp.

1333  {
1334  memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
1335  // Record dictionary match info.
1336  int len = vse.length <= kMaxSmallWordUnichars ? 0 :
1337  vse.length <= kMaxMediumWordUnichars ? 1 : 2;
1338  if (vse.dawg_info != NULL) {
1339  int permuter = vse.dawg_info->permuter;
1340  if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
1341  if (vse.consistency_info.num_digits == vse.length) {
1342  features[PTRAIN_DIGITS_SHORT+len] = 1.0;
1343  } else {
1344  features[PTRAIN_NUM_SHORT+len] = 1.0;
1345  }
1346  } else if (permuter == DOC_DAWG_PERM) {
1347  features[PTRAIN_DOC_SHORT+len] = 1.0;
1348  } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
1349  permuter == COMPOUND_PERM) {
1350  features[PTRAIN_DICT_SHORT+len] = 1.0;
1351  } else if (permuter == FREQ_DAWG_PERM) {
1352  features[PTRAIN_FREQ_SHORT+len] = 1.0;
1353  }
1354  }
1355  // Record shape cost feature (normalized by path length).
1356  features[PTRAIN_SHAPE_COST_PER_CHAR] =
1357  vse.associate_stats.shape_cost / static_cast<float>(vse.length);
1358  // Record ngram cost. (normalized by the path length).
1359  features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0;
1360  if (vse.ngram_info != NULL) {
1361  features[PTRAIN_NGRAM_COST_PER_CHAR] =
1362  vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
1363  }
1364  // Record consistency-related features.
1365  // Disabled this feature for now due to its poor performance.
1366  // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
1367  features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
1368  features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
1369  features[PTRAIN_NUM_BAD_CHAR_TYPE] = vse.dawg_info == NULL ?
1370  vse.consistency_info.NumInconsistentChartype() : 0.0;
1371  features[PTRAIN_NUM_BAD_SPACING] =
1372  vse.consistency_info.NumInconsistentSpaces();
1373  // Disabled this feature for now due to its poor performance.
1374  // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
1375 
1376  // Classifier-related features.
1377  features[PTRAIN_RATING_PER_CHAR] =
1378  vse.ratings_sum / static_cast<float>(vse.outline_length);
1379 }

◆ FillConsistencyInfo()

void tesseract::LanguageModel::FillConsistencyInfo ( int  curr_col,
bool  word_end,
BLOB_CHOICE b,
ViterbiStateEntry parent_vse,
WERD_RES word_res,
LMConsistencyInfo consistency_info 
)
protected

Definition at line 1008 of file language_model.cpp.

1014  {
1015  const UNICHARSET &unicharset = dict_->getUnicharset();
1016  UNICHAR_ID unichar_id = b->unichar_id();
1017  BLOB_CHOICE* parent_b = parent_vse != NULL ? parent_vse->curr_b : NULL;
1018 
1019  // Check punctuation validity.
1020  if (unicharset.get_ispunctuation(unichar_id)) consistency_info->num_punc++;
1021  if (dict_->GetPuncDawg() != NULL && !consistency_info->invalid_punc) {
1022  if (dict_->compound_marker(unichar_id) && parent_b != NULL &&
1023  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1024  unicharset.get_isdigit(parent_b->unichar_id()))) {
1025  // reset punc_ref for compound words
1026  consistency_info->punc_ref = NO_EDGE;
1027  } else {
1028  bool is_apos = dict_->is_apostrophe(unichar_id);
1029  bool prev_is_numalpha = (parent_b != NULL &&
1030  (unicharset.get_isalpha(parent_b->unichar_id()) ||
1031  unicharset.get_isdigit(parent_b->unichar_id())));
1032  UNICHAR_ID pattern_unichar_id =
1033  (unicharset.get_isalpha(unichar_id) ||
1034  unicharset.get_isdigit(unichar_id) ||
1035  (is_apos && prev_is_numalpha)) ?
1036  Dawg::kPatternUnicharID : unichar_id;
1037  if (consistency_info->punc_ref == NO_EDGE ||
1038  pattern_unichar_id != Dawg::kPatternUnicharID ||
1039  dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
1042  consistency_info->punc_ref);
1043  consistency_info->punc_ref =
1044  (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
1045  node, pattern_unichar_id, word_end) : NO_EDGE;
1046  if (consistency_info->punc_ref == NO_EDGE) {
1047  consistency_info->invalid_punc = true;
1048  }
1049  }
1050  }
1051  }
1052 
1053  // Update case related counters.
1054  if (parent_vse != NULL && !word_end && dict_->compound_marker(unichar_id)) {
1055  // Reset counters if we are dealing with a compound word.
1056  consistency_info->num_lower = 0;
1057  consistency_info->num_non_first_upper = 0;
1058  }
1059  else if (unicharset.get_islower(unichar_id)) {
1060  consistency_info->num_lower++;
1061  } else if ((parent_b != NULL) && unicharset.get_isupper(unichar_id)) {
1062  if (unicharset.get_isupper(parent_b->unichar_id()) ||
1063  consistency_info->num_lower > 0 ||
1064  consistency_info->num_non_first_upper > 0) {
1065  consistency_info->num_non_first_upper++;
1066  }
1067  }
1068 
1069  // Initialize consistency_info->script_id (use script of unichar_id
1070  // if it is not Common, use script id recorded by the parent otherwise).
1071  // Set inconsistent_script to true if the script of the current unichar
1072  // is not consistent with that of the parent.
1073  consistency_info->script_id = unicharset.get_script(unichar_id);
1074  // Hiragana and Katakana can mix with Han.
1076  if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
1077  consistency_info->script_id == unicharset.hiragana_sid()) ||
1078  (unicharset.katakana_sid() != unicharset.null_sid() &&
1079  consistency_info->script_id == unicharset.katakana_sid())) {
1080  consistency_info->script_id = dict_->getUnicharset().han_sid();
1081  }
1082  }
1083 
1084  if (parent_vse != NULL &&
1085  (parent_vse->consistency_info.script_id !=
1086  dict_->getUnicharset().common_sid())) {
1087  int parent_script_id = parent_vse->consistency_info.script_id;
1088  // If script_id is Common, use script id of the parent instead.
1089  if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
1090  consistency_info->script_id = parent_script_id;
1091  }
1092  if (consistency_info->script_id != parent_script_id) {
1093  consistency_info->inconsistent_script = true;
1094  }
1095  }
1096 
1097  // Update chartype related counters.
1098  if (unicharset.get_isalpha(unichar_id)) {
1099  consistency_info->num_alphas++;
1100  } else if (unicharset.get_isdigit(unichar_id)) {
1101  consistency_info->num_digits++;
1102  } else if (!unicharset.get_ispunctuation(unichar_id)) {
1103  consistency_info->num_other++;
1104  }
1105 
1106  // Check font and spacing consistency.
1107  if (fontinfo_table_->size() > 0 && parent_b != NULL) {
1108  int fontinfo_id = -1;
1109  if (parent_b->fontinfo_id() == b->fontinfo_id() ||
1110  parent_b->fontinfo_id2() == b->fontinfo_id()) {
1111  fontinfo_id = b->fontinfo_id();
1112  } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
1113  parent_b->fontinfo_id2() == b->fontinfo_id2()) {
1114  fontinfo_id = b->fontinfo_id2();
1115  }
1116  if(language_model_debug_level > 1) {
1117  tprintf("pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1118  (parent_b->fontinfo_id() >= 0) ?
1119  fontinfo_table_->get(parent_b->fontinfo_id()).name : "" ,
1120  (parent_b->fontinfo_id2() >= 0) ?
1121  fontinfo_table_->get(parent_b->fontinfo_id2()).name : "",
1122  (b->fontinfo_id() >= 0) ?
1123  fontinfo_table_->get(b->fontinfo_id()).name : "",
1124  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1125  (fontinfo_id >= 0) ? fontinfo_table_->get(fontinfo_id).name : "",
1126  fontinfo_id);
1127  }
1128  if (!word_res->blob_widths.empty()) { // if we have widths/gaps info
1129  bool expected_gap_found = false;
1130  float expected_gap;
1131  int temp_gap;
1132  if (fontinfo_id >= 0) { // found a common font
1133  ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
1134  if (fontinfo_table_->get(fontinfo_id).get_spacing(
1135  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1136  expected_gap = temp_gap;
1137  expected_gap_found = true;
1138  }
1139  } else {
1140  consistency_info->inconsistent_font = true;
1141  // Get an average of the expected gaps in each font
1142  int num_addends = 0;
1143  expected_gap = 0;
1144  int temp_fid;
1145  for (int i = 0; i < 4; ++i) {
1146  if (i == 0) {
1147  temp_fid = parent_b->fontinfo_id();
1148  } else if (i == 1) {
1149  temp_fid = parent_b->fontinfo_id2();
1150  } else if (i == 2) {
1151  temp_fid = b->fontinfo_id();
1152  } else {
1153  temp_fid = b->fontinfo_id2();
1154  }
1155  ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
1156  if (temp_fid >= 0 && fontinfo_table_->get(temp_fid).get_spacing(
1157  parent_b->unichar_id(), unichar_id, &temp_gap)) {
1158  expected_gap += temp_gap;
1159  num_addends++;
1160  }
1161  }
1162  expected_gap_found = (num_addends > 0);
1163  if (num_addends > 0) {
1164  expected_gap /= static_cast<float>(num_addends);
1165  }
1166  }
1167  if (expected_gap_found) {
1168  float actual_gap =
1169  static_cast<float>(word_res->GetBlobsGap(curr_col-1));
1170  float gap_ratio = expected_gap / actual_gap;
1171  // TODO(rays) The gaps seem to be way off most of the time, saved by
1172  // the error here that the ratio was compared to 1/2, when it should
1173  // have been 0.5f. Find the source of the gaps discrepancy and put
1174  // the 0.5f here in place of 0.0f.
1175  // Test on 2476595.sj, pages 0 to 6. (In French.)
1176  if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
1177  consistency_info->num_inconsistent_spaces++;
1178  }
1179  if (language_model_debug_level > 1) {
1180  tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %g\n",
1181  unicharset.id_to_unichar(parent_b->unichar_id()),
1182  parent_b->unichar_id(), unicharset.id_to_unichar(unichar_id),
1183  unichar_id, curr_col, expected_gap, actual_gap);
1184  }
1185  }
1186  }
1187  }
1188 }
bool empty() const
Definition: genericvector.h:91
int han_sid() const
Definition: unicharset.h:887
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
int katakana_sid() const
Definition: unicharset.h:889
inT64 NODE_REF
Definition: dawg.h:56
inT16 fontinfo_id2() const
Definition: ratngs.h:88
#define tprintf(...)
Definition: tprintf.h:31
int hiragana_sid() const
Definition: unicharset.h:888
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:732
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:117
int common_sid() const
Definition: unicharset.h:883
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:420
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:504
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 fontinfo_id() const
Definition: ratngs.h:85
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:108
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
const Dawg * GetPuncDawg() const
Return the points to the punctuation dawg.
Definition: dict.h:416
const UnicityTable< FontInfo > * fontinfo_table_
GenericVector< int > blob_widths
Definition: pageres.h:205
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
int null_sid() const
Definition: unicharset.h:882
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:662
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288
int UNICHAR_ID
Definition: unichar.h:35

◆ GenerateDawgInfo()

LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo ( bool  word_end,
int  curr_col,
int  curr_row,
const BLOB_CHOICE b,
const ViterbiStateEntry parent_vse 
)
protected

Definition at line 779 of file language_model.cpp.

783  {
784  // Initialize active_dawgs from parent_vse if it is not NULL.
785  // Otherwise use very_beginning_active_dawgs_.
786  if (parent_vse == NULL) {
789  } else {
790  if (parent_vse->dawg_info == NULL) return NULL; // not a dict word path
791  dawg_args_.active_dawgs = &parent_vse->dawg_info->active_dawgs;
792  dawg_args_.permuter = parent_vse->dawg_info->permuter;
793  }
794 
795  // Deal with hyphenated words.
796  if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
797  if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
798  return new LanguageModelDawgInfo(dawg_args_.active_dawgs, COMPOUND_PERM);
799  }
800 
801  // Deal with compound words.
802  if (dict_->compound_marker(b.unichar_id()) &&
803  (parent_vse == NULL || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
804  if (language_model_debug_level > 0) tprintf("Found compound marker\n");
805  // Do not allow compound operators at the beginning and end of the word.
806  // Do not allow more than one compound operator per word.
807  // Do not allow compounding of words with lengths shorter than
808  // language_model_min_compound_length
809  if (parent_vse == NULL || word_end ||
811  parent_vse->length < language_model_min_compound_length)
812  return NULL;
813 
814  int i;
815  // Check that the path terminated before the current character is a word.
816  bool has_word_ending = false;
817  for (i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
818  const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
819  const Dawg *pdawg = pos.dawg_index < 0
820  ? NULL : dict_->GetDawg(pos.dawg_index);
821  if (pdawg == NULL || pos.back_to_punc) continue;;
822  if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
823  pdawg->end_of_word(pos.dawg_ref)) {
824  has_word_ending = true;
825  break;
826  }
827  }
828  if (!has_word_ending) return NULL;
829 
830  if (language_model_debug_level > 0) tprintf("Compound word found\n");
831  return new LanguageModelDawgInfo(&beginning_active_dawgs_, COMPOUND_PERM);
832  } // done dealing with compound words
833 
834  LanguageModelDawgInfo *dawg_info = NULL;
835 
836  // Call LetterIsOkay().
837  // Use the normalized IDs so that all shapes of ' can be allowed in words
838  // like don't.
839  const GenericVector<UNICHAR_ID>& normed_ids =
841  DawgPositionVector tmp_active_dawgs;
842  for (int i = 0; i < normed_ids.size(); ++i) {
844  tprintf("Test Letter OK for unichar %d, normed %d\n",
845  b.unichar_id(), normed_ids[i]);
846  dict_->LetterIsOkay(&dawg_args_, normed_ids[i],
847  word_end && i == normed_ids.size() - 1);
848  if (dawg_args_.permuter == NO_PERM) {
849  break;
850  } else if (i < normed_ids.size() - 1) {
851  tmp_active_dawgs = *dawg_args_.updated_dawgs;
852  dawg_args_.active_dawgs = &tmp_active_dawgs;
853  }
855  tprintf("Letter was OK for unichar %d, normed %d\n",
856  b.unichar_id(), normed_ids[i]);
857  }
858  dawg_args_.active_dawgs = nullptr;
859  if (dawg_args_.permuter != NO_PERM) {
860  dawg_info = new LanguageModelDawgInfo(dawg_args_.updated_dawgs,
862  } else if (language_model_debug_level > 3) {
863  tprintf("Letter %s not OK!\n",
865  }
866 
867  return dawg_info;
868 }
int LetterIsOkay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:359
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:414
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:143
DawgPositionVector * updated_dawgs
Definition: dict.h:81
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31
PermuterType permuter
Definition: dict.h:82
DawgPositionVector * active_dawgs
Definition: dict.h:80
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
DawgPositionVector beginning_active_dawgs_
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:108
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:834
DawgPositionVector very_beginning_active_dawgs_
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ GenerateNgramInfo()

LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo ( const char *  unichar,
float  certainty,
float  denom,
int  curr_col,
int  curr_row,
float  outline_length,
const ViterbiStateEntry parent_vse 
)
protected

Definition at line 870 of file language_model.cpp.

873  {
874  // Initialize parent context.
875  const char *pcontext_ptr = "";
876  int pcontext_unichar_step_len = 0;
877  if (parent_vse == NULL) {
878  pcontext_ptr = prev_word_str_.string();
879  pcontext_unichar_step_len = prev_word_unichar_step_len_;
880  } else {
881  pcontext_ptr = parent_vse->ngram_info->context.string();
882  pcontext_unichar_step_len =
883  parent_vse->ngram_info->context_unichar_step_len;
884  }
885  // Compute p(unichar | parent context).
886  int unichar_step_len = 0;
887  bool pruned = false;
888  float ngram_cost;
889  float ngram_and_classifier_cost =
890  ComputeNgramCost(unichar, certainty, denom,
891  pcontext_ptr, &unichar_step_len,
892  &pruned, &ngram_cost);
893  // Normalize just the ngram_and_classifier_cost by outline_length.
894  // The ngram_cost is used by the params_model, so it needs to be left as-is,
895  // and the params model cost will be normalized by outline_length.
896  ngram_and_classifier_cost *=
897  outline_length / language_model_ngram_rating_factor;
898  // Add the ngram_cost of the parent.
899  if (parent_vse != NULL) {
900  ngram_and_classifier_cost +=
901  parent_vse->ngram_info->ngram_and_classifier_cost;
902  ngram_cost += parent_vse->ngram_info->ngram_cost;
903  }
904 
905  // Shorten parent context string by unichar_step_len unichars.
906  int num_remove = (unichar_step_len + pcontext_unichar_step_len -
908  if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
909  while (num_remove > 0 && *pcontext_ptr != '\0') {
910  pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
911  --num_remove;
912  }
913 
914  // Decide whether to prune this ngram path and update changed accordingly.
915  if (parent_vse != NULL && parent_vse->ngram_info->pruned) pruned = true;
916 
917  // Construct and return the new LanguageModelNgramInfo.
918  LanguageModelNgramInfo *ngram_info = new LanguageModelNgramInfo(
919  pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
920  ngram_and_classifier_cost);
921  ngram_info->context += unichar;
922  ngram_info->context_unichar_step_len += unichar_step_len;
923  assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
924  return ngram_info;
925 }
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
const char * string() const
Definition: strngs.cpp:198
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:136

◆ GenerateTopChoiceInfo()

void tesseract::LanguageModel::GenerateTopChoiceInfo ( ViterbiStateEntry new_vse,
const ViterbiStateEntry parent_vse,
LanguageModelState lms 
)
protected

Definition at line 763 of file language_model.cpp.

765  {
766  ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
767  for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->top_choice_flags &&
768  new_vse->cost >= vit.data()->cost; vit.forward()) {
769  // Clear the appropriate flags if the list already contains
770  // a top choice entry with a lower cost.
771  new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
772  }
773  if (language_model_debug_level > 2) {
774  tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
775  new_vse->top_choice_flags);
776  }
777 }
#define tprintf(...)
Definition: tprintf.h:31

◆ GetNextParentVSE()

ViterbiStateEntry * tesseract::LanguageModel::GetNextParentVSE ( bool  just_classified,
bool  mixed_alnum,
const BLOB_CHOICE bc,
LanguageModelFlagsType  blob_choice_flags,
const UNICHARSET unicharset,
WERD_RES word_res,
ViterbiStateEntry_IT *  vse_it,
LanguageModelFlagsType top_choice_flags 
) const
protected

Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.

Definition at line 494 of file language_model.cpp.

498  {
499  for (; !vse_it->cycled_list(); vse_it->forward()) {
500  ViterbiStateEntry* parent_vse = vse_it->data();
501  // Only consider the parent if it has been updated or
502  // if the current ratings cell has just been classified.
503  if (!just_classified && !parent_vse->updated) continue;
505  parent_vse->Print("Considering");
506  // If the parent is non-alnum, then upper counts as lower.
507  *top_choice_flags = blob_choice_flags;
508  if ((blob_choice_flags & kUpperCaseFlag) &&
509  !parent_vse->HasAlnumChoice(unicharset)) {
510  *top_choice_flags |= kLowerCaseFlag;
511  }
512  *top_choice_flags &= parent_vse->top_choice_flags;
513  UNICHAR_ID unichar_id = bc->unichar_id();
514  const BLOB_CHOICE* parent_b = parent_vse->curr_b;
515  UNICHAR_ID parent_id = parent_b->unichar_id();
516  // Digits do not bind to alphas if there is a mix in both parent and current
517  // or if the alpha is not the top choice.
518  if (unicharset.get_isdigit(unichar_id) &&
519  unicharset.get_isalpha(parent_id) &&
520  (mixed_alnum || *top_choice_flags == 0))
521  continue; // Digits don't bind to alphas.
522  // Likewise alphas do not bind to digits if there is a mix in both or if
523  // the digit is not the top choice.
524  if (unicharset.get_isalpha(unichar_id) &&
525  unicharset.get_isdigit(parent_id) &&
526  (mixed_alnum || *top_choice_flags == 0))
527  continue; // Alphas don't bind to digits.
528  // If there is a case mix of the same alpha in the parent list, then
529  // competing_vse is non-null and will be used to determine whether
530  // or not to bind the current blob choice.
531  if (parent_vse->competing_vse != NULL) {
532  const BLOB_CHOICE* competing_b = parent_vse->competing_vse->curr_b;
533  UNICHAR_ID other_id = competing_b->unichar_id();
534  if (language_model_debug_level >= 5) {
535  tprintf("Parent %s has competition %s\n",
536  unicharset.id_to_unichar(parent_id),
537  unicharset.id_to_unichar(other_id));
538  }
539  if (unicharset.SizesDistinct(parent_id, other_id)) {
540  // If other_id matches bc wrt position and size, and parent_id, doesn't,
541  // don't bind to the current parent.
542  if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
544  !bc->PosAndSizeAgree(*parent_b, word_res->x_height,
546  continue; // Competing blobchoice has a better vertical match.
547  }
548  }
549  vse_it->forward();
550  return parent_vse; // This one is good!
551  }
552  return NULL; // Ran out of possibilities.
553 }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
float x_height
Definition: pageres.h:295
static const LanguageModelFlagsType kUpperCaseFlag
#define tprintf(...)
Definition: tprintf.h:31
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:483
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:133
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288
int UNICHAR_ID
Definition: unichar.h:35
static const LanguageModelFlagsType kLowerCaseFlag

◆ getParamsModel()

ParamsModel& tesseract::LanguageModel::getParamsModel ( )
inline

Definition at line 100 of file language_model.h.

100 { return params_model_; }

◆ GetTopLowerUpperDigit()

bool tesseract::LanguageModel::GetTopLowerUpperDigit ( BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE **  first_lower,
BLOB_CHOICE **  first_upper,
BLOB_CHOICE **  first_digit 
) const
protected

Finds the first lower and upper case letter and first digit in curr_list. For non-upper/lower languages, alpha counts as upper. Uses the first character in the list in place of empty results. Returns true if both alpha and digits are found.

Definition at line 377 of file language_model.cpp.

380  {
381  BLOB_CHOICE_IT c_it(curr_list);
382  const UNICHARSET &unicharset = dict_->getUnicharset();
383  BLOB_CHOICE *first_unichar = NULL;
384  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
385  UNICHAR_ID unichar_id = c_it.data()->unichar_id();
386  if (unicharset.get_fragment(unichar_id)) continue; // skip fragments
387  if (first_unichar == NULL) first_unichar = c_it.data();
388  if (*first_lower == NULL && unicharset.get_islower(unichar_id)) {
389  *first_lower = c_it.data();
390  }
391  if (*first_upper == NULL && unicharset.get_isalpha(unichar_id) &&
392  !unicharset.get_islower(unichar_id)) {
393  *first_upper = c_it.data();
394  }
395  if (*first_digit == NULL && unicharset.get_isdigit(unichar_id)) {
396  *first_digit = c_it.data();
397  }
398  }
399  ASSERT_HOST(first_unichar != NULL);
400  bool mixed = (*first_lower != NULL || *first_upper != NULL) &&
401  *first_digit != NULL;
402  if (*first_lower == NULL) *first_lower = first_unichar;
403  if (*first_upper == NULL) *first_upper = first_unichar;
404  if (*first_digit == NULL) *first_digit = first_unichar;
405  return mixed;
406 }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:733
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
Definition: cluster.h:45
#define ASSERT_HOST(x)
Definition: errcode.h:84
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int UNICHAR_ID
Definition: unichar.h:35

◆ InitForWord()

void tesseract::LanguageModel::InitForWord ( const WERD_CHOICE prev_word,
bool  fixed_pitch,
float  max_char_wh_ratio,
float  rating_cert_scale 
)

Definition at line 130 of file language_model.cpp.

132  {
133  fixed_pitch_ = fixed_pitch;
134  max_char_wh_ratio_ = max_char_wh_ratio;
135  rating_cert_scale_ = rating_cert_scale;
136  acceptable_choice_found_ = false;
138 
139  // Initialize vectors with beginning DawgInfos.
144 
145  // Fill prev_word_str_ with the last language_model_ngram_order
146  // unichars from prev_word.
148  if (prev_word != NULL && prev_word->unichar_string() != NULL) {
149  prev_word_str_ = prev_word->unichar_string();
151  } else {
152  prev_word_str_ = " ";
153  }
154  const char *str_ptr = prev_word_str_.string();
155  const char *str_end = str_ptr + prev_word_str_.length();
156  int step;
158  while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
159  str_ptr += step;
161  }
162  ASSERT_HOST(str_ptr == str_end);
163  }
164 }
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:570
bool language_model_ngram_space_delimited_language
DawgPositionVector beginning_active_dawgs_
const char * string() const
Definition: strngs.cpp:198
#define ASSERT_HOST(x)
Definition: errcode.h:84
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:136
DawgPositionVector very_beginning_active_dawgs_
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:587
inT32 length() const
Definition: strngs.cpp:193
const STRING & unichar_string() const
Definition: ratngs.h:537

◆ PrunablePath()

bool tesseract::LanguageModel::PrunablePath ( const ViterbiStateEntry vse)
inlineprotected

Definition at line 291 of file language_model.h.

291  {
292  if (vse.top_choice_flags) return false;
293  if (vse.dawg_info != NULL &&
294  (vse.dawg_info->permuter == SYSTEM_DAWG_PERM ||
295  vse.dawg_info->permuter == USER_DAWG_PERM ||
296  vse.dawg_info->permuter == FREQ_DAWG_PERM)) return false;
297  return true;
298  }

◆ SetAcceptableChoiceFound()

void tesseract::LanguageModel::SetAcceptableChoiceFound ( bool  val)
inline

Definition at line 96 of file language_model.h.

96  {
98  }

◆ SetTopParentLowerUpperDigit()

int tesseract::LanguageModel::SetTopParentLowerUpperDigit ( LanguageModelState parent_node) const
protected

Forces there to be at least one entry in the overall set of the viterbi_state_entries of each element of parent_node that has the top_choice_flag set for lower, upper and digit using the same rules as GetTopLowerUpperDigit, setting the flag on the first found suitable candidate, whether or not the flag is set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.

Definition at line 417 of file language_model.cpp.

418  {
419  if (parent_node == NULL) return -1;
420  UNICHAR_ID top_id = INVALID_UNICHAR_ID;
421  ViterbiStateEntry* top_lower = NULL;
422  ViterbiStateEntry* top_upper = NULL;
423  ViterbiStateEntry* top_digit = NULL;
424  ViterbiStateEntry* top_choice = NULL;
425  float lower_rating = 0.0f;
426  float upper_rating = 0.0f;
427  float digit_rating = 0.0f;
428  float top_rating = 0.0f;
429  const UNICHARSET &unicharset = dict_->getUnicharset();
430  ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
431  for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
432  ViterbiStateEntry* vse = vit.data();
433  // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
434  // back to the real character if needed.
435  ViterbiStateEntry* unichar_vse = vse;
436  UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
437  float rating = unichar_vse->curr_b->rating();
438  while (unichar_id == INVALID_UNICHAR_ID &&
439  unichar_vse->parent_vse != NULL) {
440  unichar_vse = unichar_vse->parent_vse;
441  unichar_id = unichar_vse->curr_b->unichar_id();
442  rating = unichar_vse->curr_b->rating();
443  }
444  if (unichar_id != INVALID_UNICHAR_ID) {
445  if (unicharset.get_islower(unichar_id)) {
446  if (top_lower == NULL || lower_rating > rating) {
447  top_lower = vse;
448  lower_rating = rating;
449  }
450  } else if (unicharset.get_isalpha(unichar_id)) {
451  if (top_upper == NULL || upper_rating > rating) {
452  top_upper = vse;
453  upper_rating = rating;
454  }
455  } else if (unicharset.get_isdigit(unichar_id)) {
456  if (top_digit == NULL || digit_rating > rating) {
457  top_digit = vse;
458  digit_rating = rating;
459  }
460  }
461  }
462  if (top_choice == NULL || top_rating > rating) {
463  top_choice = vse;
464  top_rating = rating;
465  top_id = unichar_id;
466  }
467  }
468  if (top_choice == NULL) return -1;
469  bool mixed = (top_lower != NULL || top_upper != NULL) &&
470  top_digit != NULL;
471  if (top_lower == NULL) top_lower = top_choice;
472  top_lower->top_choice_flags |= kLowerCaseFlag;
473  if (top_upper == NULL) top_upper = top_choice;
474  top_upper->top_choice_flags |= kUpperCaseFlag;
475  if (top_digit == NULL) top_digit = top_choice;
476  top_digit->top_choice_flags |= kDigitFlag;
477  top_choice->top_choice_flags |= kSmallestRatingFlag;
478  if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
479  (top_choice->top_choice_flags &
481  // If the compound marker top choice carries any of the top alnum flags,
482  // then give it all of them, allowing words like I-295 to be chosen.
483  top_choice->top_choice_flags |=
485  }
486  return mixed ? 1 : 0;
487 }
static const LanguageModelFlagsType kSmallestRatingFlag
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
static const LanguageModelFlagsType kUpperCaseFlag
static const LanguageModelFlagsType kDigitFlag
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
Definition: cluster.h:45
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:108
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int UNICHAR_ID
Definition: unichar.h:35
static const LanguageModelFlagsType kLowerCaseFlag

◆ UpdateBestChoice()

void tesseract::LanguageModel::UpdateBestChoice ( ViterbiStateEntry vse,
LMPainPoints pain_points,
WERD_RES word_res,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)
protected

Definition at line 1232 of file language_model.cpp.

1237  {
1238  bool truth_path;
1239  WERD_CHOICE *word = ConstructWord(vse, word_res, &best_choice_bundle->fixpt,
1240  blamer_bundle, &truth_path);
1241  ASSERT_HOST(word != NULL);
1242  if (dict_->stopper_debug_level >= 1) {
1243  STRING word_str;
1244  word->string_and_lengths(&word_str, NULL);
1245  vse->Print(word_str.string());
1246  }
1247  if (language_model_debug_level > 0) {
1248  word->print("UpdateBestChoice() constructed word");
1249  }
1250  // Record features from the current path if necessary.
1251  ParamsTrainingHypothesis curr_hyp;
1252  if (blamer_bundle != NULL) {
1253  if (vse->dawg_info != NULL) vse->dawg_info->permuter =
1254  static_cast<PermuterType>(word->permuter());
1255  ExtractFeaturesFromPath(*vse, curr_hyp.features);
1256  word->string_and_lengths(&(curr_hyp.str), NULL);
1257  curr_hyp.cost = vse->cost; // record cost for error rate computations
1258  if (language_model_debug_level > 0) {
1259  tprintf("Raw features extracted from %s (cost=%g) [ ",
1260  curr_hyp.str.string(), curr_hyp.cost);
1261  for (int deb_i = 0; deb_i < PTRAIN_NUM_FEATURE_TYPES; ++deb_i) {
1262  tprintf("%g ", curr_hyp.features[deb_i]);
1263  }
1264  tprintf("]\n");
1265  }
1266  // Record the current hypothesis in params_training_bundle.
1267  blamer_bundle->AddHypothesis(curr_hyp);
1268  if (truth_path)
1269  blamer_bundle->UpdateBestRating(word->rating());
1270  }
1271  if (blamer_bundle != NULL && blamer_bundle->GuidedSegsearchStillGoing()) {
1272  // The word was constructed solely for blamer_bundle->AddHypothesis, so
1273  // we no longer need it.
1274  delete word;
1275  return;
1276  }
1277  if (word_res->chopped_word != NULL && !word_res->chopped_word->blobs.empty())
1278  word->SetScriptPositions(false, word_res->chopped_word);
1279  // Update and log new raw_choice if needed.
1280  if (word_res->raw_choice == NULL ||
1281  word->rating() < word_res->raw_choice->rating()) {
1282  if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0)
1283  tprintf("Updated raw choice\n");
1284  }
1285  // Set the modified rating for best choice to vse->cost and log best choice.
1286  word->set_rating(vse->cost);
1287  // Call LogNewChoice() for best choice from Dict::adjust_word() since it
1288  // computes adjust_factor that is used by the adaption code (e.g. by
1289  // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
1290  // Note: the rating of the word is not adjusted.
1291  dict_->adjust_word(word, vse->dawg_info == NULL,
1292  vse->consistency_info.xht_decision, 0.0,
1293  false, language_model_debug_level > 0);
1294  // Hand ownership of the word over to the word_res.
1295  if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log,
1296  dict_->stopper_debug_level >= 1, word)) {
1297  // The word was so bad that it was deleted.
1298  return;
1299  }
1300  if (word_res->best_choice == word) {
1301  // Word was the new best.
1302  if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
1303  AcceptablePath(*vse)) {
1304  acceptable_choice_found_ = true;
1305  }
1306  // Update best_choice_bundle.
1307  best_choice_bundle->updated = true;
1308  best_choice_bundle->best_vse = vse;
1309  if (language_model_debug_level > 0) {
1310  tprintf("Updated best choice\n");
1311  word->print_state("New state ");
1312  }
1313  // Update hyphen state if we are dealing with a dictionary word.
1314  if (vse->dawg_info != NULL) {
1315  if (dict_->has_hyphen_end(*word)) {
1316  dict_->set_hyphen_word(*word, *(dawg_args_.active_dawgs));
1317  } else {
1318  dict_->reset_hyphen_vars(true);
1319  }
1320  }
1321 
1322  if (blamer_bundle != NULL) {
1323  blamer_bundle->set_best_choice_is_dict_and_top_choice(
1324  vse->dawg_info != NULL && vse->top_choice_flags);
1325  }
1326  }
1327  if (wordrec_display_segmentations && word_res->chopped_word != NULL) {
1328  word->DisplaySegmentation(word_res->chopped_word);
1329  }
1330 }
uinT8 permuter() const
Definition: ratngs.h:342
bool empty() const
Definition: genericvector.h:91
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:672
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612
TWERD * chopped_word
Definition: pageres.h:201
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
void set_rating(float new_val)
Definition: ratngs.h:365
void print_state(const char *msg) const
Definition: ratngs.cpp:741
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:143
PermuterType
Definition: ratngs.h:238
void SetScriptPositions(bool small_caps, TWERD *word)
Definition: ratngs.cpp:531
float rating() const
Definition: ratngs.h:323
WERD_CHOICE * best_choice
Definition: pageres.h:219
void print() const
Definition: ratngs.h:576
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
Definition: stopper.cpp:50
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:501
#define tprintf(...)
Definition: tprintf.h:31
DawgPositionVector * active_dawgs
Definition: dict.h:80
const char * string() const
Definition: strngs.cpp:198
bool AcceptablePath(const ViterbiStateEntry &vse)
int stopper_debug_level
Definition: dict.h:622
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
Definition: strngs.h:45
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
Definition: blamer.h:154
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_best_choice_is_dict_and_top_choice(bool value)
Definition: blamer.h:135
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
Definition: hyphen.cpp:49
WERD_CHOICE * raw_choice
Definition: pageres.h:224
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:430
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
void UpdateBestRating(float rating)
Definition: blamer.h:122
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
int tessedit_truncate_wordchoice_log
Definition: dict.h:628
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:750

◆ UpdateState()

bool tesseract::LanguageModel::UpdateState ( bool  just_classified,
int  curr_col,
int  curr_row,
BLOB_CHOICE_LIST *  curr_list,
LanguageModelState parent_node,
LMPainPoints pain_points,
WERD_RES word_res,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle 
)

UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices on parent_list with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.

This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:

  • paths that are liked by the language model: either a DAWG or the n-gram model, where active.
  • paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper case word and the top lower- case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.

GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.

Of course it isn't quite that simple as Title Case is handled by allowing lower case to continue an upper case initial, but it has to be detected in the combiner so it knows which upper case letters are initial alphas.

Definition at line 247 of file language_model.cpp.

255  {
256  if (language_model_debug_level > 0) {
257  tprintf("\nUpdateState: col=%d row=%d %s",
258  curr_col, curr_row, just_classified ? "just_classified" : "");
259  if (parent_node != NULL)
260  tprintf("(parent=%p)\n", parent_node);
261  else
262  tprintf("\n");
263  }
264  // Initialize helper variables.
265  bool word_end = (curr_row+1 >= word_res->ratings->dimension());
266  bool new_changed = false;
267  float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
268  const UNICHARSET& unicharset = dict_->getUnicharset();
269  BLOB_CHOICE *first_lower = NULL;
270  BLOB_CHOICE *first_upper = NULL;
271  BLOB_CHOICE *first_digit = NULL;
272  bool has_alnum_mix = false;
273  if (parent_node != NULL) {
274  int result = SetTopParentLowerUpperDigit(parent_node);
275  if (result < 0) {
276  if (language_model_debug_level > 0)
277  tprintf("No parents found to process\n");
278  return false;
279  }
280  if (result > 0)
281  has_alnum_mix = true;
282  }
283  if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper,
284  &first_digit))
285  has_alnum_mix = false;;
286  ScanParentsForCaseMix(unicharset, parent_node);
287  if (language_model_debug_level > 3 && parent_node != NULL) {
288  parent_node->Print("Parent viterbi list");
289  }
290  LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
291 
292  // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
293  ViterbiStateEntry_IT vit;
294  BLOB_CHOICE_IT c_it(curr_list);
295  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
296  BLOB_CHOICE* choice = c_it.data();
297  // TODO(antonova): make sure commenting this out if ok for ngram
298  // model scoring (I think this was introduced to fix ngram model quirks).
299  // Skip NULL unichars unless it is the only choice.
300  //if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
301  UNICHAR_ID unichar_id = choice->unichar_id();
302  if (unicharset.get_fragment(unichar_id)) {
303  continue; // Skip fragments.
304  }
305  // Set top choice flags.
306  LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
307  if (c_it.at_first() || !new_changed)
308  blob_choice_flags |= kSmallestRatingFlag;
309  if (first_lower == choice) blob_choice_flags |= kLowerCaseFlag;
310  if (first_upper == choice) blob_choice_flags |= kUpperCaseFlag;
311  if (first_digit == choice) blob_choice_flags |= kDigitFlag;
312 
313  if (parent_node == NULL) {
314  // Process the beginning of a word.
315  // If there is a better case variant that is not distinguished by size,
316  // skip this blob choice, as we have no choice but to accept the result
317  // of the character classifier to distinguish between them, even if
318  // followed by an upper case.
319  // With words like iPoc, and other CamelBackWords, the lower-upper
320  // transition can only be achieved if the classifier has the correct case
321  // as the top choice, and leaving an initial I lower down the list
322  // increases the chances of choosing IPoc simply because it doesn't
323  // include such a transition. iPoc will beat iPOC and ipoc because
324  // the other words are baseline/x-height inconsistent.
325  if (HasBetterCaseVariant(unicharset, choice, curr_list))
326  continue;
327  // Upper counts as lower at the beginning of a word.
328  if (blob_choice_flags & kUpperCaseFlag)
329  blob_choice_flags |= kLowerCaseFlag;
330  new_changed |= AddViterbiStateEntry(
331  blob_choice_flags, denom, word_end, curr_col, curr_row,
332  choice, curr_state, NULL, pain_points,
333  word_res, best_choice_bundle, blamer_bundle);
334  } else {
335  // Get viterbi entries from each parent ViterbiStateEntry.
336  vit.set_to_list(&parent_node->viterbi_state_entries);
337  int vit_counter = 0;
338  vit.mark_cycle_pt();
339  ViterbiStateEntry* parent_vse = NULL;
340  LanguageModelFlagsType top_choice_flags;
341  while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix,
342  c_it.data(), blob_choice_flags,
343  unicharset, word_res, &vit,
344  &top_choice_flags)) != NULL) {
345  // Skip pruned entries and do not look at prunable entries if already
346  // examined language_model_viterbi_list_max_num_prunable of those.
347  if (PrunablePath(*parent_vse) &&
348  (++vit_counter > language_model_viterbi_list_max_num_prunable ||
349  (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
350  continue;
351  }
352  // If the parent has no alnum choice, (ie choice is the first in a
353  // string of alnum), and there is a better case variant that is not
354  // distinguished by size, skip this blob choice/parent, as with the
355  // initial blob treatment above.
356  if (!parent_vse->HasAlnumChoice(unicharset) &&
357  HasBetterCaseVariant(unicharset, choice, curr_list))
358  continue;
359  // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
360  // looks good according to the Dawgs or character ngram model.
361  new_changed |= AddViterbiStateEntry(
362  top_choice_flags, denom, word_end, curr_col, curr_row,
363  c_it.data(), curr_state, parent_vse, pain_points,
364  word_res, best_choice_bundle, blamer_bundle);
365  }
366  }
367  }
368  return new_changed;
369 }
int language_model_viterbi_list_max_num_prunable
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
Definition: lm_state.h:37
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
static const LanguageModelFlagsType kSmallestRatingFlag
int dimension() const
Definition: matrix.h:528
static const LanguageModelFlagsType kXhtConsistentFlag
MATRIX * ratings
Definition: pageres.h:215
static const LanguageModelFlagsType kUpperCaseFlag
#define tprintf(...)
Definition: tprintf.h:31
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:733
static const LanguageModelFlagsType kDigitFlag
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
bool PrunablePath(const ViterbiStateEntry &vse)
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
int UNICHAR_ID
Definition: unichar.h:35
static const LanguageModelFlagsType kLowerCaseFlag

Member Data Documentation

◆ acceptable_choice_found_

bool tesseract::LanguageModel::acceptable_choice_found_
protected

Definition at line 408 of file language_model.h.

◆ beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::beginning_active_dawgs_
protected

Definition at line 396 of file language_model.h.

◆ correct_segmentation_explored_

bool tesseract::LanguageModel::correct_segmentation_explored_
protected

Definition at line 410 of file language_model.h.

◆ dawg_args_

DawgArgs tesseract::LanguageModel::dawg_args_
protected

Definition at line 356 of file language_model.h.

◆ dict_

Dict* tesseract::LanguageModel::dict_
protected

Definition at line 375 of file language_model.h.

◆ fixed_pitch_

bool tesseract::LanguageModel::fixed_pitch_
protected

Definition at line 382 of file language_model.h.

◆ fontinfo_table_

const UnicityTable<FontInfo>* tesseract::LanguageModel::fontinfo_table_
protected

Definition at line 371 of file language_model.h.

◆ kDigitFlag

const LanguageModelFlagsType tesseract::LanguageModel::kDigitFlag = 0x8
static

Definition at line 48 of file language_model.h.

◆ kLowerCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kLowerCaseFlag = 0x2
static

Definition at line 46 of file language_model.h.

◆ kMaxAvgNgramCost

const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f
static

Definition at line 53 of file language_model.h.

◆ kSmallestRatingFlag

const LanguageModelFlagsType tesseract::LanguageModel::kSmallestRatingFlag = 0x1
static

Definition at line 45 of file language_model.h.

◆ kUpperCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kUpperCaseFlag = 0x4
static

Definition at line 47 of file language_model.h.

◆ kXhtConsistentFlag

const LanguageModelFlagsType tesseract::LanguageModel::kXhtConsistentFlag = 0x10
static

Definition at line 49 of file language_model.h.

◆ language_model_debug_level

int tesseract::LanguageModel::language_model_debug_level = 0

"Language model debug level"

Definition at line 308 of file language_model.h.

◆ language_model_min_compound_length

int tesseract::LanguageModel::language_model_min_compound_length = 3

"Minimum length of compound words"

Definition at line 335 of file language_model.h.

◆ language_model_ngram_nonmatch_score

double tesseract::LanguageModel::language_model_ngram_nonmatch_score = -40.0

"Average classifier score of a non-matching unichar"

Definition at line 322 of file language_model.h.

◆ language_model_ngram_on

bool tesseract::LanguageModel::language_model_ngram_on = false

"Turn on/off the use of character ngram model"

Definition at line 310 of file language_model.h.

◆ language_model_ngram_order

int tesseract::LanguageModel::language_model_ngram_order = 8

"Maximum order of the character ngram model"

Definition at line 312 of file language_model.h.

◆ language_model_ngram_rating_factor

double tesseract::LanguageModel::language_model_ngram_rating_factor = 16.0

"Factor to bring log-probs into the same range as ratings" " when multiplied by outline length "

Definition at line 331 of file language_model.h.

◆ language_model_ngram_scale_factor

double tesseract::LanguageModel::language_model_ngram_scale_factor = 0.03

"Strength of the character ngram model relative to the" " character classifier "

Definition at line 328 of file language_model.h.

◆ language_model_ngram_small_prob

double tesseract::LanguageModel::language_model_ngram_small_prob = 0.000001

"To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model"

Definition at line 320 of file language_model.h.

◆ language_model_ngram_space_delimited_language

bool tesseract::LanguageModel::language_model_ngram_space_delimited_language = true

"Words are delimited by space"

Definition at line 333 of file language_model.h.

◆ language_model_ngram_use_only_first_uft8_step

bool tesseract::LanguageModel::language_model_ngram_use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string" " when computing log probabilities"

Definition at line 325 of file language_model.h.

◆ language_model_penalty_case

double tesseract::LanguageModel::language_model_penalty_case = 0.1

"Penalty for inconsistent case"

Definition at line 344 of file language_model.h.

◆ language_model_penalty_chartype

double tesseract::LanguageModel::language_model_penalty_chartype = 0.3

"Penalty for inconsistent character type"

Definition at line 348 of file language_model.h.

◆ language_model_penalty_font

double tesseract::LanguageModel::language_model_penalty_font = 0.00

"Penalty for inconsistent font"

Definition at line 350 of file language_model.h.

◆ language_model_penalty_increment

double tesseract::LanguageModel::language_model_penalty_increment = 0.01

"Penalty increment"

Definition at line 353 of file language_model.h.

◆ language_model_penalty_non_dict_word

double tesseract::LanguageModel::language_model_penalty_non_dict_word = 0.15

"Penalty for non-dictionary words"

Definition at line 340 of file language_model.h.

◆ language_model_penalty_non_freq_dict_word

double tesseract::LanguageModel::language_model_penalty_non_freq_dict_word = 0.1

"Penalty for words not in the frequent word dictionary"

Definition at line 338 of file language_model.h.

◆ language_model_penalty_punc

double tesseract::LanguageModel::language_model_penalty_punc = 0.2

"Penalty for inconsistent punctuation"

Definition at line 342 of file language_model.h.

◆ language_model_penalty_script

double tesseract::LanguageModel::language_model_penalty_script = 0.5

"Penalty for inconsistent script"

Definition at line 346 of file language_model.h.

◆ language_model_penalty_spacing

double tesseract::LanguageModel::language_model_penalty_spacing = 0.05

"Penalty for inconsistent spacing"

Definition at line 352 of file language_model.h.

◆ language_model_use_sigmoidal_certainty

bool tesseract::LanguageModel::language_model_use_sigmoidal_certainty = false

"Use sigmoidal score for certainty"

Definition at line 356 of file language_model.h.

◆ language_model_viterbi_list_max_num_prunable

int tesseract::LanguageModel::language_model_viterbi_list_max_num_prunable = 10

"Maximum number of prunable (those for which PrunablePath() is" " true) entries in each viterbi list recorded in BLOB_CHOICEs"

Definition at line 315 of file language_model.h.

◆ language_model_viterbi_list_max_size

int tesseract::LanguageModel::language_model_viterbi_list_max_size = 500

"Maximum size of viterbi lists recorded in BLOB_CHOICEs"

Definition at line 317 of file language_model.h.

◆ max_char_wh_ratio_

float tesseract::LanguageModel::max_char_wh_ratio_
protected

Definition at line 385 of file language_model.h.

◆ params_model_

ParamsModel tesseract::LanguageModel::params_model_
protected

Definition at line 413 of file language_model.h.

◆ prev_word_str_

STRING tesseract::LanguageModel::prev_word_str_
protected

Definition at line 392 of file language_model.h.

◆ prev_word_unichar_step_len_

int tesseract::LanguageModel::prev_word_unichar_step_len_
protected

Definition at line 393 of file language_model.h.

◆ rating_cert_scale_

float tesseract::LanguageModel::rating_cert_scale_
protected

Definition at line 366 of file language_model.h.

◆ very_beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::very_beginning_active_dawgs_
protected

Definition at line 395 of file language_model.h.

◆ wordrec_display_segmentations

int tesseract::LanguageModel::wordrec_display_segmentations = 0

"Display Segmentations"

Definition at line 354 of file language_model.h.


The documentation for this class was generated from the following files: