tesseract v5.3.3.20231005
tesseract::LanguageModel Class Reference

#include <language_model.h>

Public Member Functions

 LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
 
 ~LanguageModel ()
 
void InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
 
bool UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
bool AcceptableChoiceFound ()
 
void SetAcceptableChoiceFound (bool val)
 
ParamsModel & getParamsModel ()
 
 INT_VAR_H (language_model_debug_level)
 
 BOOL_VAR_H (language_model_ngram_on)
 
 INT_VAR_H (language_model_ngram_order)
 
 INT_VAR_H (language_model_viterbi_list_max_num_prunable)
 
 INT_VAR_H (language_model_viterbi_list_max_size)
 
 double_VAR_H (language_model_ngram_small_prob)
 
 double_VAR_H (language_model_ngram_nonmatch_score)
 
 BOOL_VAR_H (language_model_ngram_use_only_first_uft8_step)
 
 double_VAR_H (language_model_ngram_scale_factor)
 
 double_VAR_H (language_model_ngram_rating_factor)
 
 BOOL_VAR_H (language_model_ngram_space_delimited_language)
 
 INT_VAR_H (language_model_min_compound_length)
 
 double_VAR_H (language_model_penalty_non_freq_dict_word)
 
 double_VAR_H (language_model_penalty_non_dict_word)
 
 double_VAR_H (language_model_penalty_punc)
 
 double_VAR_H (language_model_penalty_case)
 
 double_VAR_H (language_model_penalty_script)
 
 double_VAR_H (language_model_penalty_chartype)
 
 double_VAR_H (language_model_penalty_font)
 
 double_VAR_H (language_model_penalty_spacing)
 
 double_VAR_H (language_model_penalty_increment)
 
 INT_VAR_H (wordrec_display_segmentations)
 
 BOOL_VAR_H (language_model_use_sigmoidal_certainty)
 

Static Public Member Functions

static void ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[])
 

Static Public Attributes

static const LanguageModelFlagsType kSmallestRatingFlag = 0x1
 
static const LanguageModelFlagsType kLowerCaseFlag = 0x2
 
static const LanguageModelFlagsType kUpperCaseFlag = 0x4
 
static const LanguageModelFlagsType kDigitFlag = 0x8
 
static const LanguageModelFlagsType kXhtConsistentFlag = 0x10
 
static const float kMaxAvgNgramCost = 25.0f
 

Protected Member Functions

float CertaintyScore (float cert)
 
float ComputeAdjustment (int num_problems, float penalty)
 
float ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
 
float ComputeAdjustedPathCost (ViterbiStateEntry *vse)
 
bool GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
 
int SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const
 
ViterbiStateEntry * GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
 
bool AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
 
LanguageModelDawgInfo * GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
 
LanguageModelNgramInfo * GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
 
float ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
 
float ComputeDenom (BLOB_CHOICE_LIST *curr_list)
 
void FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
 
void UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
WERD_CHOICE * ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
 
void ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
 
bool PrunablePath (const ViterbiStateEntry &vse)
 
bool AcceptablePath (const ViterbiStateEntry &vse)
 

Protected Attributes

DawgArgs dawg_args_
 
float rating_cert_scale_ = 0.0f
 
const UnicityTable< FontInfo > * fontinfo_table_ = nullptr
 
Dict * dict_ = nullptr
 
bool fixed_pitch_ = false
 
float max_char_wh_ratio_ = 0.0f
 
std::string prev_word_str_
 
int prev_word_unichar_step_len_ = 0
 
DawgPositionVector very_beginning_active_dawgs_
 
DawgPositionVector beginning_active_dawgs_
 
bool acceptable_choice_found_ = false
 
bool correct_segmentation_explored_ = false
 
ParamsModel params_model_
 

Detailed Description

Definition at line 51 of file language_model.h.

Constructor & Destructor Documentation

◆ LanguageModel()

tesseract::LanguageModel::LanguageModel ( const UnicityTable< FontInfo > *  fontinfo_table,
Dict *  dict 
)

Definition at line 53 of file language_model.cpp.

54 : INT_MEMBER(language_model_debug_level, 0, "Language model debug level",
55 dict->getCCUtil()->params())
56 , BOOL_INIT_MEMBER(language_model_ngram_on, false,
57 "Turn on/off the use of character ngram model", dict->getCCUtil()->params())
58 , INT_MEMBER(language_model_ngram_order, 8, "Maximum order of the character ngram model",
59 dict->getCCUtil()->params())
60 , INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
61 "Maximum number of prunable (those for which"
62 " PrunablePath() is true) entries in each viterbi list"
63 " recorded in BLOB_CHOICEs",
64 dict->getCCUtil()->params())
65 , INT_MEMBER(language_model_viterbi_list_max_size, 500,
66 "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
67 dict->getCCUtil()->params())
68 , double_MEMBER(language_model_ngram_small_prob, 0.000001,
69 "To avoid overly small denominators use this as the "
70 "floor of the probability returned by the ngram model.",
71 dict->getCCUtil()->params())
72 , double_MEMBER(language_model_ngram_nonmatch_score, -40.0,
73 "Average classifier score of a non-matching unichar.",
74 dict->getCCUtil()->params())
75 , BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
76 "Use only the first UTF8 step of the given string"
77 " when computing log probabilities.",
78 dict->getCCUtil()->params())
79 , double_MEMBER(language_model_ngram_scale_factor, 0.03,
80 "Strength of the character ngram model relative to the"
81 " character classifier ",
82 dict->getCCUtil()->params())
83 , double_MEMBER(language_model_ngram_rating_factor, 16.0,
84 "Factor to bring log-probs into the same range as ratings"
85 " when multiplied by outline length ",
86 dict->getCCUtil()->params())
87 , BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
88 "Words are delimited by space", dict->getCCUtil()->params())
89 , INT_MEMBER(language_model_min_compound_length, 3, "Minimum length of compound words",
90 dict->getCCUtil()->params())
91 , double_MEMBER(language_model_penalty_non_freq_dict_word, 0.1,
92 "Penalty for words not in the frequent word dictionary",
93 dict->getCCUtil()->params())
94 , double_MEMBER(language_model_penalty_non_dict_word, 0.15, "Penalty for non-dictionary words",
95 dict->getCCUtil()->params())
96 , double_MEMBER(language_model_penalty_punc, 0.2, "Penalty for inconsistent punctuation",
97 dict->getCCUtil()->params())
98 , double_MEMBER(language_model_penalty_case, 0.1, "Penalty for inconsistent case",
99 dict->getCCUtil()->params())
100 , double_MEMBER(language_model_penalty_script, 0.5, "Penalty for inconsistent script",
101 dict->getCCUtil()->params())
102 , double_MEMBER(language_model_penalty_chartype, 0.3, "Penalty for inconsistent character type",
103 dict->getCCUtil()->params())
104 ,
105 // TODO(daria, rays): enable font consistency checking
106 // after improving font analysis.
107 double_MEMBER(language_model_penalty_font, 0.00, "Penalty for inconsistent font",
108 dict->getCCUtil()->params())
109 , double_MEMBER(language_model_penalty_spacing, 0.05, "Penalty for inconsistent spacing",
110 dict->getCCUtil()->params())
111 , double_MEMBER(language_model_penalty_increment, 0.01, "Penalty increment",
112 dict->getCCUtil()->params())
113 , INT_MEMBER(wordrec_display_segmentations, 0, "Display Segmentations (ScrollView)",
114 dict->getCCUtil()->params())
115 , BOOL_INIT_MEMBER(language_model_use_sigmoidal_certainty, false,
116 "Use sigmoidal score for certainty", dict->getCCUtil()->params())
117 , dawg_args_(nullptr, new DawgPositionVector(), NO_PERM)
118 , fontinfo_table_(fontinfo_table)
119 , dict_(dict) {
120 ASSERT_HOST(dict_ != nullptr);
121}
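A minimal construction sketch (hypothetical; the fontinfo_table and dict instances and all argument values below are illustrative only, not prescribed by this API):

// Hypothetical usage sketch: both arguments must outlive the LanguageModel,
// and dict must be non-null (the constructor asserts this).
tesseract::LanguageModel lm(&fontinfo_table, &dict);
// Reset per-word state before each word (see InitForWord(); values illustrative).
lm.InitForWord(nullptr /* no previous word */, /*fixed_pitch=*/false,
               /*max_char_wh_ratio=*/2.0f, /*rating_cert_scale=*/-1.0f);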

◆ ~LanguageModel()

tesseract::LanguageModel::~LanguageModel ( )

Definition at line 123 of file language_model.cpp.

123 {
124  delete dawg_args_.updated_dawgs;
125}

Member Function Documentation

◆ AcceptableChoiceFound()

bool tesseract::LanguageModel::AcceptableChoiceFound ( )
inline

Definition at line 96 of file language_model.h.

96 {
97  return acceptable_choice_found_;
98 }

◆ AcceptablePath()

bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry &  vse)
inlineprotected

Definition at line 285 of file language_model.h.

285 {
286 return (vse.dawg_info != nullptr || vse.Consistent() ||
287 (vse.ngram_info != nullptr && !vse.ngram_info->pruned));
288 }

◆ AddViterbiStateEntry()

bool tesseract::LanguageModel::AddViterbiStateEntry ( LanguageModelFlagsType  top_choice_flags,
float  denom,
bool  word_end,
int  curr_col,
int  curr_row,
BLOB_CHOICE *  b,
LanguageModelState *  curr_state,
ViterbiStateEntry *  parent_vse,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 577 of file language_model.cpp.

582 {
583 ViterbiStateEntry_IT vit;
584 if (language_model_debug_level > 1) {
585 tprintf(
586 "AddViterbiStateEntry for unichar %s rating=%.4f"
587 " certainty=%.4f top_choice_flags=0x%x",
588 dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->rating(), b->certainty(),
589 top_choice_flags);
590 if (language_model_debug_level > 5) {
591 tprintf(" parent_vse=%p\n", static_cast<void *>(parent_vse));
592 } else {
593 tprintf("\n");
594 }
595 }
596 ASSERT_HOST(curr_state != nullptr);
597 // Check whether the list is full.
598 if (curr_state->viterbi_state_entries_length >= language_model_viterbi_list_max_size) {
599 if (language_model_debug_level > 1) {
600 tprintf("AddViterbiStateEntry: viterbi list is full!\n");
601 }
602 return false;
603 }
604
605 // Invoke Dawg language model component.
606 LanguageModelDawgInfo *dawg_info = GenerateDawgInfo(word_end, curr_col, curr_row, *b, parent_vse);
607
608  float outline_length = AssociateUtils::ComputeOutlineLength(rating_cert_scale_, *b);
609  // Invoke Ngram language model component.
610 LanguageModelNgramInfo *ngram_info = nullptr;
611 if (language_model_ngram_on) {
612 ngram_info =
613 GenerateNgramInfo(dict_->getUnicharset().id_to_unichar(b->unichar_id()), b->certainty(),
614 denom, curr_col, curr_row, outline_length, parent_vse);
615 ASSERT_HOST(ngram_info != nullptr);
616 }
617 bool liked_by_language_model =
618 dawg_info != nullptr || (ngram_info != nullptr && !ngram_info->pruned);
619 // Quick escape if not liked by the language model, can't be consistent
620 // xheight, and not top choice.
621 if (!liked_by_language_model && top_choice_flags == 0) {
622 if (language_model_debug_level > 1) {
623 tprintf("Language model components very early pruned this entry\n");
624 }
625 delete ngram_info;
626 delete dawg_info;
627 return false;
628 }
629
630 // Check consistency of the path and set the relevant consistency_info.
631 LMConsistencyInfo consistency_info(parent_vse != nullptr ? &parent_vse->consistency_info
632 : nullptr);
633 // Start with just the x-height consistency, as it provides significant
634 // pruning opportunity.
635 consistency_info.ComputeXheightConsistency(
636 b, dict_->getUnicharset().get_ispunctuation(b->unichar_id()));
637 // Turn off xheight consistent flag if not consistent.
638 if (consistency_info.InconsistentXHeight()) {
639 top_choice_flags &= ~kXhtConsistentFlag;
640 }
641
642 // Quick escape if not liked by the language model, not consistent xheight,
643 // and not top choice.
644 if (!liked_by_language_model && top_choice_flags == 0) {
645 if (language_model_debug_level > 1) {
646 tprintf("Language model components early pruned this entry\n");
647 }
648 delete ngram_info;
649 delete dawg_info;
650 return false;
651 }
652
653 // Compute the rest of the consistency info.
654 FillConsistencyInfo(curr_col, word_end, b, parent_vse, word_res, &consistency_info);
655 if (dawg_info != nullptr && consistency_info.invalid_punc) {
656 consistency_info.invalid_punc = false; // do not penalize dict words
657 }
658
659 // Compute cost of associating the blobs that represent the current unichar.
660 AssociateStats associate_stats;
661 ComputeAssociateStats(curr_col, curr_row, max_char_wh_ratio_, parent_vse, word_res,
662 &associate_stats);
663 if (parent_vse != nullptr) {
664 associate_stats.shape_cost += parent_vse->associate_stats.shape_cost;
665 associate_stats.bad_shape |= parent_vse->associate_stats.bad_shape;
666 }
667
668 // Create the new ViterbiStateEntry and compute the adjusted cost of the path.
669 auto *new_vse = new ViterbiStateEntry(parent_vse, b, 0.0, outline_length, consistency_info,
670 associate_stats, top_choice_flags, dawg_info, ngram_info,
671 (language_model_debug_level > 0)
672 ? dict_->getUnicharset().id_to_unichar(b->unichar_id())
673 : nullptr);
674 new_vse->cost = ComputeAdjustedPathCost(new_vse);
675 if (language_model_debug_level >= 3) {
676 tprintf("Adjusted cost = %g\n", new_vse->cost);
677 }
678
679 // Invoke Top Choice language model component to make the final adjustments
680 // to new_vse->top_choice_flags.
681 if (!curr_state->viterbi_state_entries.empty() && new_vse->top_choice_flags) {
682 GenerateTopChoiceInfo(new_vse, parent_vse, curr_state);
683 }
684
685 // If language model components did not like this unichar - return.
686 bool keep = new_vse->top_choice_flags || liked_by_language_model;
687 if (!(top_choice_flags & kSmallestRatingFlag) && // no non-top choice paths
688 consistency_info.inconsistent_script) { // with inconsistent script
689 keep = false;
690 }
691 if (!keep) {
692 if (language_model_debug_level > 1) {
693 tprintf("Language model components did not like this entry\n");
694 }
695 delete new_vse;
696 return false;
697 }
698
699 // Discard this entry if it represents a prunable path and
700 // language_model_viterbi_list_max_num_prunable such entries with a lower
701 // cost have already been recorded.
702 if (PrunablePath(*new_vse) &&
703 (curr_state->viterbi_state_entries_prunable_length >=
704 language_model_viterbi_list_max_num_prunable) &&
705 new_vse->cost >= curr_state->viterbi_state_entries_prunable_max_cost) {
706 if (language_model_debug_level > 1) {
707 tprintf("Discarded ViterbiEntry with high cost %g max cost %g\n", new_vse->cost,
708 curr_state->viterbi_state_entries_prunable_max_cost);
709 }
710 delete new_vse;
711 return false;
712 }
713
714 // Update best choice if needed.
715 if (word_end) {
716 UpdateBestChoice(new_vse, pain_points, word_res, best_choice_bundle, blamer_bundle);
717 // Discard the entry if UpdateBestChoice() found flaws in it.
718 if (new_vse->cost >= WERD_CHOICE::kBadRating && new_vse != best_choice_bundle->best_vse) {
719 if (language_model_debug_level > 1) {
720 tprintf("Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
721 }
722 delete new_vse;
723 return false;
724 }
725 }
726
727 // Add the new ViterbiStateEntry to curr_state->viterbi_state_entries.
728 curr_state->viterbi_state_entries.add_sorted(ViterbiStateEntry::Compare, false, new_vse);
729 curr_state->viterbi_state_entries_length++;
730 if (PrunablePath(*new_vse)) {
731 curr_state->viterbi_state_entries_prunable_length++;
732 }
733
734 // Update lms->viterbi_state_entries_prunable_max_cost and clear
735 // top_choice_flags of entries with ratings_sum higher than new_vse->ratings_sum.
736 if ((curr_state->viterbi_state_entries_prunable_length >=
737 language_model_viterbi_list_max_num_prunable) ||
738 new_vse->top_choice_flags) {
739 ASSERT_HOST(!curr_state->viterbi_state_entries.empty());
740 int prunable_counter = language_model_viterbi_list_max_num_prunable;
741 vit.set_to_list(&(curr_state->viterbi_state_entries));
742 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
743 ViterbiStateEntry *curr_vse = vit.data();
744 // Clear the appropriate top choice flags of the entries in the
745 // list that have cost higher than new_entry->cost
746 // (since they will not be top choices any more).
747 if (curr_vse->top_choice_flags && curr_vse != new_vse && curr_vse->cost > new_vse->cost) {
748 curr_vse->top_choice_flags &= ~(new_vse->top_choice_flags);
749 }
750 if (prunable_counter > 0 && PrunablePath(*curr_vse)) {
751 --prunable_counter;
752 }
753 // Update curr_state->viterbi_state_entries_prunable_max_cost.
754 if (prunable_counter == 0) {
755 curr_state->viterbi_state_entries_prunable_max_cost = vit.data()->cost;
756 if (language_model_debug_level > 1) {
757 tprintf("Set viterbi_state_entries_prunable_max_cost to %g\n",
758 curr_state->viterbi_state_entries_prunable_max_cost);
759 }
760 prunable_counter = -1; // stop counting
761 }
762 }
763 }
764
765 // Print the newly created ViterbiStateEntry.
766 if (language_model_debug_level > 2) {
767 new_vse->Print("New");
768 if (language_model_debug_level > 5) {
769 curr_state->Print("Updated viterbi list");
770 }
771 }
772
773 return true;
774}

◆ BOOL_VAR_H() [1/4]

tesseract::LanguageModel::BOOL_VAR_H ( language_model_ngram_on  )

◆ BOOL_VAR_H() [2/4]

tesseract::LanguageModel::BOOL_VAR_H ( language_model_ngram_space_delimited_language  )

◆ BOOL_VAR_H() [3/4]

tesseract::LanguageModel::BOOL_VAR_H ( language_model_ngram_use_only_first_uft8_step  )

◆ BOOL_VAR_H() [4/4]

tesseract::LanguageModel::BOOL_VAR_H ( language_model_use_sigmoidal_certainty  )

◆ CertaintyScore()

float tesseract::LanguageModel::CertaintyScore ( float  cert)
inlineprotected

Definition at line 108 of file language_model.h.

108 {
109 if (language_model_use_sigmoidal_certainty) {
110 // cert is assumed to be between 0 and -dict_->certainty_scale.
111 // If you enable language_model_use_sigmoidal_certainty, you
112 // need to adjust language_model_ngram_nonmatch_score as well.
113 cert = -cert / dict_->certainty_scale;
114 return 1.0f / (1.0f + exp(10.0f * cert));
115 } else {
116 return (-1.0f / cert);
117 }
118 }
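In formula form, with a (negative) input certainty $c$ and $s$ = `dict_->certainty_scale`, the inline code above computes

$$
\text{CertaintyScore}(c)=\begin{cases}\dfrac{1}{1+e^{-10c/s}} & \text{if }\texttt{language\_model\_use\_sigmoidal\_certainty}\\[1ex]-\dfrac{1}{c} & \text{otherwise}\end{cases}
$$

so in both branches a certainty closer to zero (more confident) yields a larger score.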

◆ ComputeAdjustedPathCost()

float tesseract::LanguageModel::ComputeAdjustedPathCost ( ViterbiStateEntry *  vse)
protected

Definition at line 1196 of file language_model.cpp.

1196 {
1197 ASSERT_HOST(vse != nullptr);
1198 if (params_model_.Initialized()) {
1199 float features[PTRAIN_NUM_FEATURE_TYPES];
1200 ExtractFeaturesFromPath(*vse, features);
1201 float cost = params_model_.ComputeCost(features);
1202 if (language_model_debug_level > 3) {
1203 tprintf("ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
1204 if (language_model_debug_level >= 5) {
1205 for (int f = 0; f < PTRAIN_NUM_FEATURE_TYPES; ++f) {
1206 tprintf("%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
1207 }
1208 }
1209 }
1210 return cost * vse->outline_length;
1211 } else {
1212 float adjustment = 1.0f;
1213 if (vse->dawg_info == nullptr || vse->dawg_info->permuter != FREQ_DAWG_PERM) {
1214 adjustment += language_model_penalty_non_freq_dict_word;
1215 }
1216 if (vse->dawg_info == nullptr) {
1217 adjustment += language_model_penalty_non_dict_word;
1218 if (vse->length > language_model_min_compound_length) {
1219 adjustment +=
1220 ((vse->length - language_model_min_compound_length) * language_model_penalty_increment);
1221 }
1222 }
1223 if (vse->associate_stats.shape_cost > 0) {
1224 adjustment += vse->associate_stats.shape_cost / static_cast<float>(vse->length);
1225 }
1226 if (language_model_ngram_on) {
1227 ASSERT_HOST(vse->ngram_info != nullptr);
1228 return vse->ngram_info->ngram_and_classifier_cost * adjustment;
1229 } else {
1230 adjustment += ComputeConsistencyAdjustment(vse->dawg_info, vse->consistency_info);
1231 return vse->ratings_sum * adjustment;
1232 }
1233 }
1234}
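Summarizing the branches above: when no trained ParamsModel is loaded, a multiplicative adjustment starts at 1.0 and accumulates the non-frequent-word, non-dictionary-word, compound-length and shape penalties shown, then scales one of two base costs:

$$
\text{cost}=\begin{cases}\text{ComputeCost}(\text{features})\cdot\text{outline\_length} & \text{ParamsModel initialized}\\\text{ngram\_and\_classifier\_cost}\cdot\text{adjustment} & \texttt{language\_model\_ngram\_on}\\\text{ratings\_sum}\cdot(\text{adjustment}+\text{ComputeConsistencyAdjustment}(\dots)) & \text{otherwise}\end{cases}
$$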

◆ ComputeAdjustment()

float tesseract::LanguageModel::ComputeAdjustment ( int  num_problems,
float  penalty 
)
inlineprotected

Definition at line 120 of file language_model.h.

120 {
121 if (num_problems == 0) {
122 return 0.0f;
123 }
124 if (num_problems == 1) {
125 return penalty;
126 }
127 return (penalty + (language_model_penalty_increment * static_cast<float>(num_problems - 1)));
128 }
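Equivalently, for $n$ problems with base penalty $p$:

$$
\text{ComputeAdjustment}(n,p)=\begin{cases}0 & n=0\\p & n=1\\p+(n-1)\cdot\texttt{language\_model\_penalty\_increment} & n>1\end{cases}
$$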

◆ ComputeAssociateStats()

void tesseract::LanguageModel::ComputeAssociateStats ( int  col,
int  row,
float  max_char_wh_ratio,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
AssociateStats *  associate_stats 
)
inlineprotected

Definition at line 257 of file language_model.h.

259 {
260  AssociateUtils::ComputeStats(
261  col, row, (parent_vse != nullptr) ? &(parent_vse->associate_stats) : nullptr,
262 (parent_vse != nullptr) ? parent_vse->length : 0, fixed_pitch_, max_char_wh_ratio, word_res,
263 language_model_debug_level > 2, associate_stats);
264 }

◆ ComputeConsistencyAdjustment()

float tesseract::LanguageModel::ComputeConsistencyAdjustment ( const LanguageModelDawgInfo *  dawg_info,
const LMConsistencyInfo &  consistency_info 
)
inlineprotected

Definition at line 134 of file language_model.h.

135 {
136 if (dawg_info != nullptr) {
137 return ComputeAdjustment(consistency_info.NumInconsistentCase(),
138 language_model_penalty_case) +
139 (consistency_info.inconsistent_script ? language_model_penalty_script : 0.0f);
140 }
141 return (ComputeAdjustment(consistency_info.NumInconsistentPunc(), language_model_penalty_punc) +
142 ComputeAdjustment(consistency_info.NumInconsistentCase(), language_model_penalty_case) +
143 ComputeAdjustment(consistency_info.NumInconsistentChartype(),
144 language_model_penalty_chartype) +
145 ComputeAdjustment(consistency_info.NumInconsistentSpaces(),
146 language_model_penalty_spacing) +
147 (consistency_info.inconsistent_script ? language_model_penalty_script : 0.0f) +
148 (consistency_info.inconsistent_font ? language_model_penalty_font : 0.0f));
149 }
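Writing $A(n,p)$ for ComputeAdjustment(n, p) and $[x]$ for an indicator that is 1 when inconsistency flag $x$ is set, the code above reduces to

$$
\text{adj}=\begin{cases}A(n_{\text{case}},p_{\text{case}})+[\text{script}]\,p_{\text{script}} & \text{dictionary path (dawg\_info}\ne\text{nullptr)}\\A(n_{\text{punc}},p_{\text{punc}})+A(n_{\text{case}},p_{\text{case}})+A(n_{\text{chartype}},p_{\text{chartype}})+A(n_{\text{spacing}},p_{\text{spacing}})+[\text{script}]\,p_{\text{script}}+[\text{font}]\,p_{\text{font}} & \text{otherwise}\end{cases}
$$

so dictionary paths are charged only the case and script penalties.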

◆ ComputeDenom()

float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST *  curr_list)
protected

Definition at line 998 of file language_model.cpp.

998 {
999 if (curr_list->empty()) {
1000 return 1.0f;
1001 }
1002 float denom = 0.0f;
1003 int len = 0;
1004 BLOB_CHOICE_IT c_it(curr_list);
1005 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
1006 ASSERT_HOST(c_it.data() != nullptr);
1007 ++len;
1008 denom += CertaintyScore(c_it.data()->certainty());
1009 }
1010 assert(len != 0);
1011 // The ideal situation would be to have the classifier scores for
1012 // classifying each position as each of the characters in the unicharset.
1013 // Since we cannot do this because of speed, we add a very crude estimate
1014 // of what these scores for the "missing" classifications would sum up to.
1015 denom +=
1016 (dict_->getUnicharset().size() - len) * CertaintyScore(language_model_ngram_nonmatch_score);
1017
1018 return denom;
1019}
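With $n$ the number of choices in curr_list and $|U|$ the unicharset size, the returned value is

$$
\text{denom}=\sum_{i=1}^{n}\text{CertaintyScore}(c_i)+(|U|-n)\cdot\text{CertaintyScore}(\texttt{language\_model\_ngram\_nonmatch\_score})
$$

where the second term is the crude stand-in, described in the comment above, for the scores of the unichars that were never classified at this position.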

◆ ComputeNgramCost()

float tesseract::LanguageModel::ComputeNgramCost ( const char *  unichar,
float  certainty,
float  denom,
const char *  context,
int *  unichar_step_len,
bool *  found_small_prob,
float *  ngram_prob 
)
protected

Definition at line 942 of file language_model.cpp.

944 {
945 const char *context_ptr = context;
946 char *modified_context = nullptr;
947 char *modified_context_end = nullptr;
948 const char *unichar_ptr = unichar;
949 const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
950 float prob = 0.0f;
951 int step = 0;
952 while (unichar_ptr < unichar_end && (step = UNICHAR::utf8_step(unichar_ptr)) > 0) {
953 if (language_model_debug_level > 1) {
954 tprintf("prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
955 dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step));
956 }
957 prob += dict_->ProbabilityInContext(context_ptr, -1, unichar_ptr, step);
958 ++(*unichar_step_len);
959 if (language_model_ngram_use_only_first_uft8_step) {
960 break;
961 }
962 unichar_ptr += step;
963 // If there are multiple UTF8 characters present in unichar, context is
964 // updated to include the previously examined characters from str,
965 // unless use_only_first_uft8_step is true.
966 if (unichar_ptr < unichar_end) {
967 if (modified_context == nullptr) {
968 size_t context_len = strlen(context);
969 modified_context = new char[context_len + strlen(unichar_ptr) + step + 1];
970 memcpy(modified_context, context, context_len);
971 modified_context_end = modified_context + context_len;
972 context_ptr = modified_context;
973 }
974 strncpy(modified_context_end, unichar_ptr - step, step);
975 modified_context_end += step;
976 *modified_context_end = '\0';
977 }
978 }
979 prob /= static_cast<float>(*unichar_step_len); // normalize
980 if (prob < language_model_ngram_small_prob) {
981 if (language_model_debug_level > 0) {
982 tprintf("Found small prob %g\n", prob);
983 }
984 *found_small_prob = true;
985 prob = language_model_ngram_small_prob;
986 }
987 *ngram_cost = -1 * std::log2(prob);
988 float ngram_and_classifier_cost = -1 * std::log2(CertaintyScore(certainty) / denom) +
989 *ngram_cost * language_model_ngram_scale_factor;
990 if (language_model_debug_level > 1) {
991 tprintf("-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar, unichar, context_ptr,
992 CertaintyScore(certainty) / denom, prob, ngram_and_classifier_cost);
993 }
994 delete[] modified_context;
995 return ngram_and_classifier_cost;
996}
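Collecting the steps above, with $p$ the per-step-averaged context probability floored at language_model_ngram_small_prob, the function stores $-\log_2 p$ through its last output parameter and returns

$$
\text{ngram\_and\_classifier\_cost}=-\log_2\!\frac{\text{CertaintyScore}(\text{certainty})}{\text{denom}}+\bigl(-\log_2 p\bigr)\cdot\texttt{language\_model\_ngram\_scale\_factor}
$$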

◆ ConstructWord()

WERD_CHOICE * tesseract::LanguageModel::ConstructWord ( ViterbiStateEntry *  vse,
WERD_RES *  word_res,
DANGERR *  fixpt,
BlamerBundle *  blamer_bundle,
bool *  truth_path 
)
protected

Definition at line 1386 of file language_model.cpp.

1388 {
1389 if (truth_path != nullptr) {
1390 *truth_path =
1391 (blamer_bundle != nullptr && vse->length == blamer_bundle->correct_segmentation_length());
1392 }
1393 BLOB_CHOICE *curr_b = vse->curr_b;
1394 ViterbiStateEntry *curr_vse = vse;
1395
1396 int i;
1397 bool compound = dict_->hyphenated(); // treat hyphenated words as compound
1398
1399 // Re-compute the variance of the width-to-height ratios (since we now
1400 // can compute the mean over the whole word).
1401 float full_wh_ratio_mean = 0.0f;
1402 if (vse->associate_stats.full_wh_ratio_var != 0.0f) {
1403 vse->associate_stats.shape_cost -= vse->associate_stats.full_wh_ratio_var;
1404 full_wh_ratio_mean =
1405 (vse->associate_stats.full_wh_ratio_total / static_cast<float>(vse->length));
1406 vse->associate_stats.full_wh_ratio_var = 0.0f;
1407 }
1408
1409 // Construct a WERD_CHOICE by tracing parent pointers.
1410 auto *word = new WERD_CHOICE(word_res->uch_set, vse->length);
1411 word->set_length(vse->length);
1412 int total_blobs = 0;
1413 for (i = (vse->length - 1); i >= 0; --i) {
1414 if (blamer_bundle != nullptr && truth_path != nullptr && *truth_path &&
1415 !blamer_bundle->MatrixPositionCorrect(i, curr_b->matrix_cell())) {
1416 *truth_path = false;
1417 }
1418 // The number of blobs used for this choice is row - col + 1.
1419 int num_blobs = curr_b->matrix_cell().row - curr_b->matrix_cell().col + 1;
1420 total_blobs += num_blobs;
1421 word->set_blob_choice(i, num_blobs, curr_b);
1422 // Update the width-to-height ratio variance. Useful for non-space delimited
1423 // languages to ensure that the blobs are of uniform width.
1424 // Skip leading and trailing punctuation when computing the variance.
1425 if ((full_wh_ratio_mean != 0.0f &&
1426 ((curr_vse != vse && curr_vse->parent_vse != nullptr) ||
1427 !dict_->getUnicharset().get_ispunctuation(curr_b->unichar_id())))) {
1428 vse->associate_stats.full_wh_ratio_var +=
1429 pow(full_wh_ratio_mean - curr_vse->associate_stats.full_wh_ratio, 2);
1430 if (language_model_debug_level > 2) {
1431 tprintf("full_wh_ratio_var += (%g-%g)^2\n", full_wh_ratio_mean,
1432 curr_vse->associate_stats.full_wh_ratio);
1433 }
1434 }
1435
1436 // Mark the word as compound if compound permuter was set for any of
1437 // the unichars on the path (usually this will happen for unichars
1438 // that are compounding operators, like "-" and "/").
1439 if (!compound && curr_vse->dawg_info && curr_vse->dawg_info->permuter == COMPOUND_PERM) {
1440 compound = true;
1441 }
1442
1443 // Update curr_* pointers.
1444 curr_vse = curr_vse->parent_vse;
1445 if (curr_vse == nullptr) {
1446 break;
1447 }
1448 curr_b = curr_vse->curr_b;
1449 }
1450 ASSERT_HOST(i == 0); // check that we recorded all the unichar ids.
1451 ASSERT_HOST(total_blobs == word_res->ratings->dimension());
1452 // Re-adjust shape cost to include the updated width-to-height variance.
1453 if (full_wh_ratio_mean != 0.0f) {
1454 vse->associate_stats.shape_cost += vse->associate_stats.full_wh_ratio_var;
1455 }
1456
1457 word->set_rating(vse->ratings_sum);
1458 word->set_certainty(vse->min_certainty);
1459 word->set_x_heights(vse->consistency_info.BodyMinXHeight(),
1460 vse->consistency_info.BodyMaxXHeight());
1461 if (vse->dawg_info != nullptr) {
1462 word->set_permuter(compound ? COMPOUND_PERM : vse->dawg_info->permuter);
1463 } else if (language_model_ngram_on && !vse->ngram_info->pruned) {
1464 word->set_permuter(NGRAM_PERM);
1465 } else if (vse->top_choice_flags) {
1466 word->set_permuter(TOP_CHOICE_PERM);
1467 } else {
1468 word->set_permuter(NO_PERM);
1469 }
1470 word->set_dangerous_ambig_found_(!dict_->NoDangerousAmbig(word, fixpt, true, word_res->ratings));
1471 return word;
1472}
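The width-to-height ratio variance recomputed in the loop above is a plain sum of squared deviations from the whole-word mean (it is not divided by the count):

$$
\text{full\_wh\_ratio\_var}=\sum_i(\bar r-r_i)^2,\qquad \bar r=\frac{\text{full\_wh\_ratio\_total}}{\text{length}}
$$

with $r_i$ the full_wh_ratio of the $i$-th path entry, and leading/trailing punctuation skipped.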

◆ double_VAR_H() [1/13]

tesseract::LanguageModel::double_VAR_H ( language_model_ngram_nonmatch_score  )

◆ double_VAR_H() [2/13]

tesseract::LanguageModel::double_VAR_H ( language_model_ngram_rating_factor  )

◆ double_VAR_H() [3/13]

tesseract::LanguageModel::double_VAR_H ( language_model_ngram_scale_factor  )

◆ double_VAR_H() [4/13]

tesseract::LanguageModel::double_VAR_H ( language_model_ngram_small_prob  )

◆ double_VAR_H() [5/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_case  )

◆ double_VAR_H() [6/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_chartype  )

◆ double_VAR_H() [7/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_font  )

◆ double_VAR_H() [8/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_increment  )

◆ double_VAR_H() [9/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_non_dict_word  )

◆ double_VAR_H() [10/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_non_freq_dict_word  )

◆ double_VAR_H() [11/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_punc  )

◆ double_VAR_H() [12/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_script  )

◆ double_VAR_H() [13/13]

tesseract::LanguageModel::double_VAR_H ( language_model_penalty_spacing  )

◆ ExtractFeaturesFromPath()

void tesseract::LanguageModel::ExtractFeaturesFromPath ( const ViterbiStateEntry &  vse,
float  features[] 
)
static

Definition at line 1336 of file language_model.cpp.

1336 {
1337 memset(features, 0, sizeof(float) * PTRAIN_NUM_FEATURE_TYPES);
1338 // Record dictionary match info.
1339 int len = vse.length <= kMaxSmallWordUnichars ? 0 : vse.length <= kMaxMediumWordUnichars ? 1 : 2;
1340 if (vse.dawg_info != nullptr) {
1341 int permuter = vse.dawg_info->permuter;
1342 if (permuter == NUMBER_PERM || permuter == USER_PATTERN_PERM) {
1343 if (vse.consistency_info.num_digits == vse.length) {
1344 features[PTRAIN_DIGITS_SHORT + len] = 1.0f;
1345 } else {
1346 features[PTRAIN_NUM_SHORT + len] = 1.0f;
1347 }
1348 } else if (permuter == DOC_DAWG_PERM) {
1349 features[PTRAIN_DOC_SHORT + len] = 1.0f;
1350 } else if (permuter == SYSTEM_DAWG_PERM || permuter == USER_DAWG_PERM ||
1351 permuter == COMPOUND_PERM) {
1352 features[PTRAIN_DICT_SHORT + len] = 1.0f;
1353 } else if (permuter == FREQ_DAWG_PERM) {
1354 features[PTRAIN_FREQ_SHORT + len] = 1.0f;
1355 }
1356 }
1357 // Record shape cost feature (normalized by path length).
1358 features[PTRAIN_SHAPE_COST_PER_CHAR] =
1359 vse.associate_stats.shape_cost / static_cast<float>(vse.length);
1360 // Record ngram cost. (normalized by the path length).
1361 features[PTRAIN_NGRAM_COST_PER_CHAR] = 0.0f;
1362 if (vse.ngram_info != nullptr) {
1363 features[PTRAIN_NGRAM_COST_PER_CHAR] =
1364 vse.ngram_info->ngram_cost / static_cast<float>(vse.length);
1365 }
1366 // Record consistency-related features.
1367 // Disabled this feature for now due to its poor performance.
1368 // features[PTRAIN_NUM_BAD_PUNC] = vse.consistency_info.NumInconsistentPunc();
1369 features[PTRAIN_NUM_BAD_CASE] = vse.consistency_info.NumInconsistentCase();
1370 features[PTRAIN_XHEIGHT_CONSISTENCY] = vse.consistency_info.xht_decision;
1371 features[PTRAIN_NUM_BAD_CHAR_TYPE] =
1372 vse.dawg_info == nullptr ? vse.consistency_info.NumInconsistentChartype() : 0.0f;
1373 features[PTRAIN_NUM_BAD_SPACING] = vse.consistency_info.NumInconsistentSpaces();
1374 // Disabled this feature for now due to its poor performance.
1375 // features[PTRAIN_NUM_BAD_FONT] = vse.consistency_info.inconsistent_font;
1376
1377 // Classifier-related features.
1378 if (vse.outline_length > 0.0f) {
1379 features[PTRAIN_RATING_PER_CHAR] = vse.ratings_sum / vse.outline_length;
1380 } else {
1381 // Avoid FP division by 0.
1382 features[PTRAIN_RATING_PER_CHAR] = 0.0f;
1383 }
1384}
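The normalized features recorded above are, in formula form:

$$
f_{\text{shape}}=\frac{\text{shape\_cost}}{\text{length}},\qquad f_{\text{ngram}}=\frac{\text{ngram\_cost}}{\text{length}},\qquad f_{\text{rating}}=\frac{\text{ratings\_sum}}{\text{outline\_length}}
$$

with $f_{\text{rating}}$ forced to 0 when outline_length is 0 to avoid division by zero.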

◆ FillConsistencyInfo()

void tesseract::LanguageModel::FillConsistencyInfo ( int  curr_col,
bool  word_end,
BLOB_CHOICE *  b,
ViterbiStateEntry *  parent_vse,
WERD_RES *  word_res,
LMConsistencyInfo *  consistency_info 
)
protected

Definition at line 1021 of file language_model.cpp.

1023 {
1024 const UNICHARSET &unicharset = dict_->getUnicharset();
1025 UNICHAR_ID unichar_id = b->unichar_id();
1026 BLOB_CHOICE *parent_b = parent_vse != nullptr ? parent_vse->curr_b : nullptr;
1027
1028 // Check punctuation validity.
1029 if (unicharset.get_ispunctuation(unichar_id)) {
1030 consistency_info->num_punc++;
1031 }
1032 if (dict_->GetPuncDawg() != nullptr && !consistency_info->invalid_punc) {
1033 if (dict_->compound_marker(unichar_id) && parent_b != nullptr &&
1034 (unicharset.get_isalpha(parent_b->unichar_id()) ||
1035 unicharset.get_isdigit(parent_b->unichar_id()))) {
1036 // reset punc_ref for compound words
1037 consistency_info->punc_ref = NO_EDGE;
1038 } else {
1039 bool is_apos = dict_->is_apostrophe(unichar_id);
1040 bool prev_is_numalpha =
1041 (parent_b != nullptr && (unicharset.get_isalpha(parent_b->unichar_id()) ||
1042 unicharset.get_isdigit(parent_b->unichar_id())));
1043 UNICHAR_ID pattern_unichar_id =
1044 (unicharset.get_isalpha(unichar_id) || unicharset.get_isdigit(unichar_id) ||
1045 (is_apos && prev_is_numalpha))
1046 ? Dawg::kPatternUnicharID
1047 : unichar_id;
1048 if (consistency_info->punc_ref == NO_EDGE || pattern_unichar_id != Dawg::kPatternUnicharID ||
1049 dict_->GetPuncDawg()->edge_letter(consistency_info->punc_ref) !=
1050 Dawg::kPatternUnicharID) {
1051 NODE_REF node = Dict::GetStartingNode(dict_->GetPuncDawg(), consistency_info->punc_ref);
1052 consistency_info->punc_ref = (node != NO_EDGE) ? dict_->GetPuncDawg()->edge_char_of(
1053 node, pattern_unichar_id, word_end)
1054 : NO_EDGE;
1055 if (consistency_info->punc_ref == NO_EDGE) {
1056 consistency_info->invalid_punc = true;
1057 }
1058 }
1059 }
1060 }
1061
1062 // Update case related counters.
1063 if (parent_vse != nullptr && !word_end && dict_->compound_marker(unichar_id)) {
1064 // Reset counters if we are dealing with a compound word.
1065 consistency_info->num_lower = 0;
1066 consistency_info->num_non_first_upper = 0;
1067 } else if (unicharset.get_islower(unichar_id)) {
1068 consistency_info->num_lower++;
1069 } else if ((parent_b != nullptr) && unicharset.get_isupper(unichar_id)) {
1070 if (unicharset.get_isupper(parent_b->unichar_id()) || consistency_info->num_lower > 0 ||
1071 consistency_info->num_non_first_upper > 0) {
1072 consistency_info->num_non_first_upper++;
1073 }
1074 }
1075
1076 // Initialize consistency_info->script_id (use script of unichar_id
1077 // if it is not Common, use script id recorded by the parent otherwise).
1078 // Set inconsistent_script to true if the script of the current unichar
1079 // is not consistent with that of the parent.
1080 consistency_info->script_id = unicharset.get_script(unichar_id);
1081 // Hiragana and Katakana can mix with Han.
1082 if (dict_->getUnicharset().han_sid() != dict_->getUnicharset().null_sid()) {
1083 if ((unicharset.hiragana_sid() != unicharset.null_sid() &&
1084 consistency_info->script_id == unicharset.hiragana_sid()) ||
1085 (unicharset.katakana_sid() != unicharset.null_sid() &&
1086 consistency_info->script_id == unicharset.katakana_sid())) {
1087 consistency_info->script_id = dict_->getUnicharset().han_sid();
1088 }
1089 }
1090
1091 if (parent_vse != nullptr &&
1092 (parent_vse->consistency_info.script_id != dict_->getUnicharset().common_sid())) {
1093 int parent_script_id = parent_vse->consistency_info.script_id;
1094 // If script_id is Common, use script id of the parent instead.
1095 if (consistency_info->script_id == dict_->getUnicharset().common_sid()) {
1096 consistency_info->script_id = parent_script_id;
1097 }
1098 if (consistency_info->script_id != parent_script_id) {
1099 consistency_info->inconsistent_script = true;
1100 }
1101 }
1102
1103 // Update chartype related counters.
1104 if (unicharset.get_isalpha(unichar_id)) {
1105 consistency_info->num_alphas++;
1106 } else if (unicharset.get_isdigit(unichar_id)) {
1107 consistency_info->num_digits++;
1108 } else if (!unicharset.get_ispunctuation(unichar_id)) {
1109 consistency_info->num_other++;
1110 }
1111
1112 // Check font and spacing consistency.
1113 if (fontinfo_table_->size() > 0 && parent_b != nullptr) {
1114 int fontinfo_id = -1;
1115 if (parent_b->fontinfo_id() == b->fontinfo_id() ||
1116 parent_b->fontinfo_id2() == b->fontinfo_id()) {
1117 fontinfo_id = b->fontinfo_id();
1118 } else if (parent_b->fontinfo_id() == b->fontinfo_id2() ||
1119 parent_b->fontinfo_id2() == b->fontinfo_id2()) {
1120 fontinfo_id = b->fontinfo_id2();
1121 }
1122 if (language_model_debug_level > 1) {
1123 tprintf(
1124 "pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1125 (parent_b->fontinfo_id() >= 0) ? fontinfo_table_->at(parent_b->fontinfo_id()).name : "",
1126 (parent_b->fontinfo_id2() >= 0) ? fontinfo_table_->at(parent_b->fontinfo_id2()).name
1127 : "",
1128 (b->fontinfo_id() >= 0) ? fontinfo_table_->at(b->fontinfo_id()).name : "",
1129 (fontinfo_id >= 0) ? fontinfo_table_->at(fontinfo_id).name : "",
1130 (fontinfo_id >= 0) ? fontinfo_table_->at(fontinfo_id).name : "", fontinfo_id);
1131 }
1132 if (!word_res->blob_widths.empty()) { // if we have widths/gaps info
1133 bool expected_gap_found = false;
1134 float expected_gap = 0.0f;
1135 int temp_gap;
1136 if (fontinfo_id >= 0) { // found a common font
1137 ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
1138 if (fontinfo_table_->at(fontinfo_id)
1139 .get_spacing(parent_b->unichar_id(), unichar_id, &temp_gap)) {
1140 expected_gap = temp_gap;
1141 expected_gap_found = true;
1142 }
1143 } else {
1144 consistency_info->inconsistent_font = true;
1145 // Get an average of the expected gaps in each font
1146 int num_addends = 0;
1147 int temp_fid;
1148 for (int i = 0; i < 4; ++i) {
1149 if (i == 0) {
1150 temp_fid = parent_b->fontinfo_id();
1151 } else if (i == 1) {
1152 temp_fid = parent_b->fontinfo_id2();
1153 } else if (i == 2) {
1154 temp_fid = b->fontinfo_id();
1155 } else {
1156 temp_fid = b->fontinfo_id2();
1157 }
1158 ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
1159 if (temp_fid >= 0 && fontinfo_table_->at(temp_fid).get_spacing(parent_b->unichar_id(),
1160 unichar_id, &temp_gap)) {
1161 expected_gap += temp_gap;
1162 num_addends++;
1163 }
1164 }
1165 if (num_addends > 0) {
1166 expected_gap /= static_cast<float>(num_addends);
1167 expected_gap_found = true;
1168 }
1169 }
1170 if (expected_gap_found) {
1171 int actual_gap = word_res->GetBlobsGap(curr_col - 1);
1172 if (actual_gap == 0) {
1173 consistency_info->num_inconsistent_spaces++;
1174 } else {
1175 float gap_ratio = expected_gap / actual_gap;
1176 // TODO(rays) The gaps seem to be way off most of the time, saved by
1177 // the error here that the ratio was compared to 1/2, when it should
1178 // have been 0.5f. Find the source of the gaps discrepancy and put
1179 // the 0.5f here in place of 0.0f.
1180 // Test on 2476595.sj, pages 0 to 6. (In French.)
1181 if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
1182 consistency_info->num_inconsistent_spaces++;
1183 }
1184 }
1185 if (language_model_debug_level > 1) {
1186 tprintf("spacing for %s(%d) %s(%d) col %d: expected %g actual %d\n",
1187 unicharset.id_to_unichar(parent_b->unichar_id()), parent_b->unichar_id(),
1188 unicharset.id_to_unichar(unichar_id), unichar_id, curr_col, expected_gap,
1189 actual_gap);
1190 }
1191 }
1192 }
1193 }
1194}
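The spacing test at the end of the function reduces to

$$
\text{inconsistent space}\iff \text{actual\_gap}=0\;\lor\;\frac{\text{expected\_gap}}{\text{actual\_gap}}\notin[0.0,\,2.0]
$$

where, per the TODO in the code, the intended lower bound is 0.5 rather than 0.0.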

◆ GenerateDawgInfo()

LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo ( bool  word_end,
int  curr_col,
int  curr_row,
const BLOB_CHOICE &  b,
const ViterbiStateEntry *  parent_vse 
)
protected

Definition at line 792 of file language_model.cpp.

794 {
795 // Initialize active_dawgs from parent_vse if it is not nullptr.
796 // Otherwise use very_beginning_active_dawgs_.
797 if (parent_vse == nullptr) {
798 dawg_args_.active_dawgs = &very_beginning_active_dawgs_;
799 dawg_args_.permuter = NO_PERM;
800 } else {
801 if (parent_vse->dawg_info == nullptr) {
802 return nullptr; // not a dict word path
803 }
804 dawg_args_.active_dawgs = &parent_vse->dawg_info->active_dawgs;
805 dawg_args_.permuter = parent_vse->dawg_info->permuter;
806 }
807
808 // Deal with hyphenated words.
809 if (word_end && dict_->has_hyphen_end(&dict_->getUnicharset(), b.unichar_id(), curr_col == 0)) {
810 if (language_model_debug_level > 0) {
811 tprintf("Hyphenated word found\n");
812 }
813 return new LanguageModelDawgInfo(dawg_args_.active_dawgs, COMPOUND_PERM);
814 }
815
816 // Deal with compound words.
817 if (dict_->compound_marker(b.unichar_id()) &&
818 (parent_vse == nullptr || parent_vse->dawg_info->permuter != NUMBER_PERM)) {
819 if (language_model_debug_level > 0) {
820 tprintf("Found compound marker\n");
821 }
822 // Do not allow compound operators at the beginning and end of the word.
823 // Do not allow more than one compound operator per word.
824 // Do not allow compounding of words with lengths shorter than
825 // language_model_min_compound_length
826 if (parent_vse == nullptr || word_end || dawg_args_.permuter == COMPOUND_PERM ||
827 parent_vse->length < language_model_min_compound_length) {
828 return nullptr;
829 }
830
831 // Check that the path terminated before the current character is a word.
832 bool has_word_ending = false;
833 for (unsigned i = 0; i < parent_vse->dawg_info->active_dawgs.size(); ++i) {
834 const DawgPosition &pos = parent_vse->dawg_info->active_dawgs[i];
835 const Dawg *pdawg = pos.dawg_index < 0 ? nullptr : dict_->GetDawg(pos.dawg_index);
836 if (pdawg == nullptr || pos.back_to_punc) {
837 continue;
838 };
839 if (pdawg->type() == DAWG_TYPE_WORD && pos.dawg_ref != NO_EDGE &&
840 pdawg->end_of_word(pos.dawg_ref)) {
841 has_word_ending = true;
842 break;
843 }
844 }
845 if (!has_word_ending) {
846 return nullptr;
847 }
848
849 if (language_model_debug_level > 0) {
850 tprintf("Compound word found\n");
851 }
852 return new LanguageModelDawgInfo(&beginning_active_dawgs_, COMPOUND_PERM);
853 } // done dealing with compound words
854
855 LanguageModelDawgInfo *dawg_info = nullptr;
856
857 // Call LetterIsOkay().
858 // Use the normalized IDs so that all shapes of ' can be allowed in words
859 // like don't.
860 const auto &normed_ids = dict_->getUnicharset().normed_ids(b.unichar_id());
861 DawgPositionVector tmp_active_dawgs;
862 for (unsigned i = 0; i < normed_ids.size(); ++i) {
863 if (language_model_debug_level > 2) {
864 tprintf("Test Letter OK for unichar %d, normed %d\n", b.unichar_id(), normed_ids[i]);
865 }
866 dict_->LetterIsOkay(&dawg_args_, dict_->getUnicharset(), normed_ids[i],
867 word_end && i == normed_ids.size() - 1);
868 if (dawg_args_.permuter == NO_PERM) {
869 break;
870 } else if (i < normed_ids.size() - 1) {
871 tmp_active_dawgs = *dawg_args_.updated_dawgs;
872 dawg_args_.active_dawgs = &tmp_active_dawgs;
873 }
874 if (language_model_debug_level > 2) {
875 tprintf("Letter was OK for unichar %d, normed %d\n", b.unichar_id(), normed_ids[i]);
876 }
877 }
878 dawg_args_.active_dawgs = nullptr;
879 if (dawg_args_.permuter != NO_PERM) {
880 dawg_info = new LanguageModelDawgInfo(dawg_args_.updated_dawgs, dawg_args_.permuter);
881 } else if (language_model_debug_level > 3) {
882 tprintf("Letter %s not OK!\n", dict_->getUnicharset().id_to_unichar(b.unichar_id()));
883 }
884
885 return dawg_info;
886}

◆ GenerateNgramInfo()

LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo ( const char *  unichar,
float  certainty,
float  denom,
int  curr_col,
int  curr_row,
float  outline_length,
const ViterbiStateEntry *  parent_vse 
)
protected

Definition at line 888 of file language_model.cpp.

891 {
892 // Initialize parent context.
893 const char *pcontext_ptr = "";
894 int pcontext_unichar_step_len = 0;
895 if (parent_vse == nullptr) {
896 pcontext_ptr = prev_word_str_.c_str();
897 pcontext_unichar_step_len = prev_word_unichar_step_len_;
898 } else {
899 pcontext_ptr = parent_vse->ngram_info->context.c_str();
900 pcontext_unichar_step_len = parent_vse->ngram_info->context_unichar_step_len;
901 }
902 // Compute p(unichar | parent context).
903 int unichar_step_len = 0;
904 bool pruned = false;
905 float ngram_cost;
906 float ngram_and_classifier_cost = ComputeNgramCost(unichar, certainty, denom, pcontext_ptr,
907 &unichar_step_len, &pruned, &ngram_cost);
908 // Normalize just the ngram_and_classifier_cost by outline_length.
909 // The ngram_cost is used by the params_model, so it needs to be left as-is,
910 // and the params model cost will be normalized by outline_length.
911 ngram_and_classifier_cost *= outline_length / language_model_ngram_rating_factor;
912 // Add the ngram_cost of the parent.
913 if (parent_vse != nullptr) {
914 ngram_and_classifier_cost += parent_vse->ngram_info->ngram_and_classifier_cost;
915 ngram_cost += parent_vse->ngram_info->ngram_cost;
916 }
917
918 // Shorten parent context string by unichar_step_len unichars.
919 int num_remove = (unichar_step_len + pcontext_unichar_step_len - language_model_ngram_order);
920 if (num_remove > 0) {
921 pcontext_unichar_step_len -= num_remove;
922 }
923 while (num_remove > 0 && *pcontext_ptr != '\0') {
924 pcontext_ptr += UNICHAR::utf8_step(pcontext_ptr);
925 --num_remove;
926 }
927
928 // Decide whether to prune this ngram path and update pruned accordingly.
929 if (parent_vse != nullptr && parent_vse->ngram_info->pruned) {
930 pruned = true;
931 }
932
933 // Construct and return the new LanguageModelNgramInfo.
934 auto *ngram_info = new LanguageModelNgramInfo(pcontext_ptr, pcontext_unichar_step_len, pruned,
935 ngram_cost, ngram_and_classifier_cost);
936 ngram_info->context += unichar;
937 ngram_info->context_unichar_step_len += unichar_step_len;
938 assert(ngram_info->context_unichar_step_len <= language_model_ngram_order);
939 return ngram_info;
940}
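The context-shortening step above maintains a sliding window of at most language_model_ngram_order unichar steps:

$$
\text{num\_remove}=\text{unichar\_step\_len}+\text{pcontext\_unichar\_step\_len}-\texttt{language\_model\_ngram\_order}
$$

For example, with the default order of 8, a 7-step parent context plus a 2-step unichar gives num_remove = 2 + 7 - 8 = 1, so one leading unichar is dropped from the context before the new unichar is appended.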

◆ GenerateTopChoiceInfo()

void tesseract::LanguageModel::GenerateTopChoiceInfo ( ViterbiStateEntry *  new_vse,
const ViterbiStateEntry *  parent_vse,
LanguageModelState *  lms 
)
protected

Definition at line 776 of file language_model.cpp.

778 {
779 ViterbiStateEntry_IT vit(&(lms->viterbi_state_entries));
780 for (vit.mark_cycle_pt();
781 !vit.cycled_list() && new_vse->top_choice_flags && new_vse->cost >= vit.data()->cost;
782 vit.forward()) {
783 // Clear the appropriate flags if the list already contains
784 // a top choice entry with a lower cost.
785 new_vse->top_choice_flags &= ~(vit.data()->top_choice_flags);
786 }
787 if (language_model_debug_level > 2) {
788 tprintf("GenerateTopChoiceInfo: top_choice_flags=0x%x\n", new_vse->top_choice_flags);
789 }
790}

◆ GetNextParentVSE()

ViterbiStateEntry * tesseract::LanguageModel::GetNextParentVSE ( bool  just_classified,
bool  mixed_alnum,
const BLOB_CHOICE *  bc,
LanguageModelFlagsType  blob_choice_flags,
const UNICHARSET &  unicharset,
WERD_RES *  word_res,
ViterbiStateEntry_IT *  vse_it,
LanguageModelFlagsType *  top_choice_flags 
) const
protected

Finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed alnum/mixed case situation, and whether this combination has been inspected before.

Definition at line 514 of file language_model.cpp.

519 {
520 for (; !vse_it->cycled_list(); vse_it->forward()) {
521 ViterbiStateEntry *parent_vse = vse_it->data();
522 // Only consider the parent if it has been updated or
523 // if the current ratings cell has just been classified.
524 if (!just_classified && !parent_vse->updated) {
525 continue;
526 }
527 if (language_model_debug_level > 2) {
528 parent_vse->Print("Considering");
529 }
530 // If the parent is non-alnum, then upper counts as lower.
531 *top_choice_flags = blob_choice_flags;
532 if ((blob_choice_flags & kUpperCaseFlag) && !parent_vse->HasAlnumChoice(unicharset)) {
533 *top_choice_flags |= kLowerCaseFlag;
534 }
535 *top_choice_flags &= parent_vse->top_choice_flags;
536 UNICHAR_ID unichar_id = bc->unichar_id();
537 const BLOB_CHOICE *parent_b = parent_vse->curr_b;
538 UNICHAR_ID parent_id = parent_b->unichar_id();
539 // Digits do not bind to alphas if there is a mix in both parent and current
540 // or if the alpha is not the top choice.
541 if (unicharset.get_isdigit(unichar_id) && unicharset.get_isalpha(parent_id) &&
542 (mixed_alnum || *top_choice_flags == 0)) {
543 continue; // Digits don't bind to alphas.
544 }
545 // Likewise alphas do not bind to digits if there is a mix in both or if
546 // the digit is not the top choice.
547 if (unicharset.get_isalpha(unichar_id) && unicharset.get_isdigit(parent_id) &&
548 (mixed_alnum || *top_choice_flags == 0)) {
549 continue; // Alphas don't bind to digits.
550 }
551 // If there is a case mix of the same alpha in the parent list, then
552 // competing_vse is non-null and will be used to determine whether
553 // or not to bind the current blob choice.
554 if (parent_vse->competing_vse != nullptr) {
555 const BLOB_CHOICE *competing_b = parent_vse->competing_vse->curr_b;
556 UNICHAR_ID other_id = competing_b->unichar_id();
557 if (language_model_debug_level >= 5) {
558 tprintf("Parent %s has competition %s\n", unicharset.id_to_unichar(parent_id),
559 unicharset.id_to_unichar(other_id));
560 }
561 if (unicharset.SizesDistinct(parent_id, other_id)) {
562 // If other_id matches bc wrt position and size, and parent_id doesn't,
563 // don't bind to the current parent.
564 if (bc->PosAndSizeAgree(*competing_b, word_res->x_height,
565 language_model_debug_level >= 5) &&
566 !bc->PosAndSizeAgree(*parent_b, word_res->x_height, language_model_debug_level >= 5)) {
567 continue; // Competing blobchoice has a better vertical match.
568 }
569 }
570 }
571 vse_it->forward();
572 return parent_vse; // This one is good!
573 }
574 return nullptr; // Ran out of possibilities.
575}
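The iterator is advanced internally past the returned entry, so the caller simply loops until nullptr. This fragment (lifted from the UpdateState() listing later on this page, so not standalone) shows the intended call pattern:

ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
vit.mark_cycle_pt();
ViterbiStateEntry *parent_vse = nullptr;
LanguageModelFlagsType top_choice_flags;
while ((parent_vse = GetNextParentVSE(just_classified, has_alnum_mix, choice,
                                      blob_choice_flags, unicharset, word_res,
                                      &vit, &top_choice_flags)) != nullptr) {
  // ... decide whether to bind choice to parent_vse via AddViterbiStateEntry ...
}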

◆ getParamsModel()

ParamsModel & tesseract::LanguageModel::getParamsModel ( )
inline

Definition at line 103 of file language_model.h.

103 {
104 return params_model_;
105 }

◆ GetTopLowerUpperDigit()

bool tesseract::LanguageModel::GetTopLowerUpperDigit ( BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE **  first_lower,
BLOB_CHOICE **  first_upper,
BLOB_CHOICE **  first_digit 
) const
protected

Finds the first lower case letter, the first upper case letter, and the first digit in curr_list. For languages without an upper/lower case distinction, any alpha counts as upper. Empty results fall back to the first character in the list. Returns true if both alphas and digits are found.

Definition at line 384 of file language_model.cpp.

386 {
387 BLOB_CHOICE_IT c_it(curr_list);
388 const UNICHARSET &unicharset = dict_->getUnicharset();
389 BLOB_CHOICE *first_unichar = nullptr;
390 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
391 UNICHAR_ID unichar_id = c_it.data()->unichar_id();
392 if (unicharset.get_fragment(unichar_id)) {
393 continue; // skip fragments
394 }
395 if (first_unichar == nullptr) {
396 first_unichar = c_it.data();
397 }
398 if (*first_lower == nullptr && unicharset.get_islower(unichar_id)) {
399 *first_lower = c_it.data();
400 }
401 if (*first_upper == nullptr && unicharset.get_isalpha(unichar_id) &&
402 !unicharset.get_islower(unichar_id)) {
403 *first_upper = c_it.data();
404 }
405 if (*first_digit == nullptr && unicharset.get_isdigit(unichar_id)) {
406 *first_digit = c_it.data();
407 }
408 }
409 ASSERT_HOST(first_unichar != nullptr);
410 bool mixed = (*first_lower != nullptr || *first_upper != nullptr) && *first_digit != nullptr;
411 if (*first_lower == nullptr) {
412 *first_lower = first_unichar;
413 }
414 if (*first_upper == nullptr) {
415 *first_upper = first_unichar;
416 }
417 if (*first_digit == nullptr) {
418 *first_digit = first_unichar;
419 }
420 return mixed;
421}
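Typical call pattern, as in the UpdateState() listing later on this page (a fragment, not standalone code): pass three null slots, let the fallback fill them, and use the return value to detect an alnum mix:

BLOB_CHOICE *first_lower = nullptr;
BLOB_CHOICE *first_upper = nullptr;
BLOB_CHOICE *first_digit = nullptr;
bool has_alnum_mix =
    GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper, &first_digit);
// All three pointers are now non-null (worst case, the first choice in the
// list), so later comparisons against each blob choice need no null checks.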

◆ InitForWord()

void tesseract::LanguageModel::InitForWord ( const WERD_CHOICE *  prev_word,
bool  fixed_pitch,
float  max_char_wh_ratio,
float  rating_cert_scale 
)

Definition at line 127 of file language_model.cpp.

128 {
129 fixed_pitch_ = fixed_pitch;
130 max_char_wh_ratio_ = max_char_wh_ratio;
 131 rating_cert_scale_ = rating_cert_scale;
 132 acceptable_choice_found_ = false;
 133 correct_segmentation_explored_ = false;
 134
 135 // Initialize vectors with beginning DawgInfos.
 136 very_beginning_active_dawgs_.clear();
 137 dict_->init_active_dawgs(&very_beginning_active_dawgs_, false);
 138 beginning_active_dawgs_.clear();
 139 dict_->default_dawgs(&beginning_active_dawgs_, false);
 140
141 // Fill prev_word_str_ with the last language_model_ngram_order
142 // unichars from prev_word.
143 if (language_model_ngram_on) {
144 if (prev_word != nullptr && !prev_word->unichar_string().empty()) {
145 prev_word_str_ = prev_word->unichar_string();
146 if (language_model_ngram_space_delimited_language) {
147 prev_word_str_ += ' ';
148 }
149 } else {
150 prev_word_str_ = " ";
151 }
152 const char *str_ptr = prev_word_str_.c_str();
153 const char *str_end = str_ptr + prev_word_str_.length();
 154 int step;
 155 prev_word_unichar_step_len_ = 0;
 156 while (str_ptr != str_end && (step = UNICHAR::utf8_step(str_ptr))) {
 157 str_ptr += step;
 158 ++prev_word_unichar_step_len_;
 159 }
160 ASSERT_HOST(str_ptr == str_end);
161 }
162}
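The loop at the end walks prev_word_str_ one unichar at a time, counting the steps into prev_word_unichar_step_len_. A standalone sketch of the same walk (not tesseract code: utf8_step below is a simplified stand-in for UNICHAR::utf8_step that skips continuation-byte validation):

#include <cstdio>
#include <cstring>

// Byte length of the UTF-8 sequence starting at p, or 0 for an invalid lead
// byte (simplified stand-in; the real UNICHAR::utf8_step also validates).
static int utf8_step(const char *p) {
  unsigned char c = static_cast<unsigned char>(*p);
  if (c < 0x80) return 1;
  if ((c & 0xE0) == 0xC0) return 2;
  if ((c & 0xF0) == 0xE0) return 3;
  if ((c & 0xF8) == 0xF0) return 4;
  return 0;  // Continuation or invalid lead byte.
}

int main() {
  const char *str = "na\xC3\xAFve ";  // "naive " with i-diaeresis: 7 bytes, 6 unichars.
  const char *end = str + std::strlen(str);
  int step, steps = 0;
  while (str != end && (step = utf8_step(str))) {
    str += step;
    ++steps;
  }
  std::printf("unichar steps: %d\n", steps);  // prints 6
  return 0;
}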

◆ INT_VAR_H() [1/6]

tesseract::LanguageModel::INT_VAR_H ( language_model_debug_level  )

◆ INT_VAR_H() [2/6]

tesseract::LanguageModel::INT_VAR_H ( language_model_min_compound_length  )

◆ INT_VAR_H() [3/6]

tesseract::LanguageModel::INT_VAR_H ( language_model_ngram_order  )

◆ INT_VAR_H() [4/6]

tesseract::LanguageModel::INT_VAR_H ( language_model_viterbi_list_max_num_prunable  )

◆ INT_VAR_H() [5/6]

tesseract::LanguageModel::INT_VAR_H ( language_model_viterbi_list_max_size  )

◆ INT_VAR_H() [6/6]

tesseract::LanguageModel::INT_VAR_H ( wordrec_display_segmentations  )

◆ PrunablePath()

bool tesseract::LanguageModel::PrunablePath ( const ViterbiStateEntry &  vse)
inline protected

Definition at line 272 of file language_model.h.

272 {
273 if (vse.top_choice_flags) {
274 return false;
275 }
276 if (vse.dawg_info != nullptr &&
277 (vse.dawg_info->permuter == SYSTEM_DAWG_PERM || vse.dawg_info->permuter == USER_DAWG_PERM ||
278 vse.dawg_info->permuter == FREQ_DAWG_PERM)) {
279 return false;
280 }
281 return true;
282 }
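As used in the UpdateState() listing later on this page, a fragment (not standalone code) showing the pruning budget: prunable parents stop being considered once language_model_viterbi_list_max_num_prunable of them have been examined, while top-choice and dictionary-backed parents always get through:

if (PrunablePath(*parent_vse) &&
    (++vit_counter > language_model_viterbi_list_max_num_prunable ||
     (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
  continue;  // Budget of prunable parents exhausted, or pruned by the ngram model.
}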

◆ SetAcceptableChoiceFound()

void tesseract::LanguageModel::SetAcceptableChoiceFound ( bool  val)
inline

Definition at line 99 of file language_model.h.

 99 {
 100 acceptable_choice_found_ = val;
 101 }

◆ SetTopParentLowerUpperDigit()

int tesseract::LanguageModel::SetTopParentLowerUpperDigit ( LanguageModelState *  parent_node) const
protected

Forces at least one entry among the viterbi_state_entries of parent_node to carry the top_choice_flag for each of lower, upper, and digit, using the same rules as GetTopLowerUpperDigit: the flag is set on the first suitable candidate found, whether or not it is already set on some other parent. Returns 1 if both alphas and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.

Definition at line 432 of file language_model.cpp.

432 {
433 if (parent_node == nullptr) {
434 return -1;
435 }
436 UNICHAR_ID top_id = INVALID_UNICHAR_ID;
437 ViterbiStateEntry *top_lower = nullptr;
438 ViterbiStateEntry *top_upper = nullptr;
439 ViterbiStateEntry *top_digit = nullptr;
440 ViterbiStateEntry *top_choice = nullptr;
441 float lower_rating = 0.0f;
442 float upper_rating = 0.0f;
443 float digit_rating = 0.0f;
444 float top_rating = 0.0f;
445 const UNICHARSET &unicharset = dict_->getUnicharset();
446 ViterbiStateEntry_IT vit(&parent_node->viterbi_state_entries);
447 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
448 ViterbiStateEntry *vse = vit.data();
449 // INVALID_UNICHAR_ID should be treated like a zero-width joiner, so scan
450 // back to the real character if needed.
451 ViterbiStateEntry *unichar_vse = vse;
452 UNICHAR_ID unichar_id = unichar_vse->curr_b->unichar_id();
453 float rating = unichar_vse->curr_b->rating();
454 while (unichar_id == INVALID_UNICHAR_ID && unichar_vse->parent_vse != nullptr) {
455 unichar_vse = unichar_vse->parent_vse;
456 unichar_id = unichar_vse->curr_b->unichar_id();
457 rating = unichar_vse->curr_b->rating();
458 }
459 if (unichar_id != INVALID_UNICHAR_ID) {
460 if (unicharset.get_islower(unichar_id)) {
461 if (top_lower == nullptr || lower_rating > rating) {
462 top_lower = vse;
463 lower_rating = rating;
464 }
465 } else if (unicharset.get_isalpha(unichar_id)) {
466 if (top_upper == nullptr || upper_rating > rating) {
467 top_upper = vse;
468 upper_rating = rating;
469 }
470 } else if (unicharset.get_isdigit(unichar_id)) {
471 if (top_digit == nullptr || digit_rating > rating) {
472 top_digit = vse;
473 digit_rating = rating;
474 }
475 }
476 }
477 if (top_choice == nullptr || top_rating > rating) {
478 top_choice = vse;
479 top_rating = rating;
480 top_id = unichar_id;
481 }
482 }
483 if (top_choice == nullptr) {
484 return -1;
485 }
486 bool mixed = (top_lower != nullptr || top_upper != nullptr) && top_digit != nullptr;
487 if (top_lower == nullptr) {
488 top_lower = top_choice;
489 }
490 top_lower->top_choice_flags |= kLowerCaseFlag;
491 if (top_upper == nullptr) {
492 top_upper = top_choice;
493 }
494 top_upper->top_choice_flags |= kUpperCaseFlag;
495 if (top_digit == nullptr) {
496 top_digit = top_choice;
497 }
498 top_digit->top_choice_flags |= kDigitFlag;
499 top_choice->top_choice_flags |= kSmallestRatingFlag;
500 if (top_id != INVALID_UNICHAR_ID && dict_->compound_marker(top_id) &&
501 (top_choice->top_choice_flags & (kLowerCaseFlag | kUpperCaseFlag | kDigitFlag))) {
502 // If the compound marker top choice carries any of the top alnum flags,
503 // then give it all of them, allowing words like I-295 to be chosen.
504 top_choice->top_choice_flags |= kLowerCaseFlag | kUpperCaseFlag | kDigitFlag;
505 }
506 return mixed ? 1 : 0;
507}
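The return value is consumed as in the UpdateState() listing later on this page (a fragment, not standalone code): -1 aborts the column, since there are no parents to combine with, and 1 flags an alnum mix among the parents:

int result = SetTopParentLowerUpperDigit(parent_node);
if (result < 0) {
  return false;  // No parents found to process.
}
bool has_alnum_mix = (result > 0);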

◆ UpdateBestChoice()

void tesseract::LanguageModel::UpdateBestChoice ( ViterbiStateEntry *  vse,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 1236 of file language_model.cpp.

1238 {
1239 bool truth_path;
1240 WERD_CHOICE *word =
1241 ConstructWord(vse, word_res, &best_choice_bundle->fixpt, blamer_bundle, &truth_path);
1242 ASSERT_HOST(word != nullptr);
1243 if (dict_->stopper_debug_level >= 1) {
1244 std::string word_str;
1245 word->string_and_lengths(&word_str, nullptr);
1246 vse->Print(word_str.c_str());
1247 }
1248 if (language_model_debug_level > 0) {
1249 word->print("UpdateBestChoice() constructed word");
1250 }
1251 // Record features from the current path if necessary.
1252 ParamsTrainingHypothesis curr_hyp;
1253 if (blamer_bundle != nullptr) {
1254 if (vse->dawg_info != nullptr) {
1255 vse->dawg_info->permuter = static_cast<PermuterType>(word->permuter());
1256 }
1257 ExtractFeaturesFromPath(*vse, curr_hyp.features);
1258 word->string_and_lengths(&(curr_hyp.str), nullptr);
1259 curr_hyp.cost = vse->cost; // record cost for error rate computations
1260 if (language_model_debug_level > 0) {
1261 tprintf("Raw features extracted from %s (cost=%g) [ ", curr_hyp.str.c_str(), curr_hyp.cost);
1262 for (float feature : curr_hyp.features) {
1263 tprintf("%g ", feature);
1264 }
1265 tprintf("]\n");
1266 }
1267 // Record the current hypothesis in params_training_bundle.
1268 blamer_bundle->AddHypothesis(curr_hyp);
1269 if (truth_path) {
1270 blamer_bundle->UpdateBestRating(word->rating());
1271 }
1272 }
1273 if (blamer_bundle != nullptr && blamer_bundle->GuidedSegsearchStillGoing()) {
1274 // The word was constructed solely for blamer_bundle->AddHypothesis, so
1275 // we no longer need it.
1276 delete word;
1277 return;
1278 }
1279 if (word_res->chopped_word != nullptr && !word_res->chopped_word->blobs.empty()) {
1280 word->SetScriptPositions(false, word_res->chopped_word, language_model_debug_level);
1281 }
1282 // Update and log new raw_choice if needed.
1283 if (word_res->raw_choice == nullptr || word->rating() < word_res->raw_choice->rating()) {
1284 if (word_res->LogNewRawChoice(word) && language_model_debug_level > 0) {
1285 tprintf("Updated raw choice\n");
1286 }
1287 }
1288 // Set the modified rating for best choice to vse->cost and log best choice.
1289 word->set_rating(vse->cost);
1290 // Call LogNewChoice() for best choice from Dict::adjust_word() since it
1291 // computes adjust_factor that is used by the adaption code (e.g. by
1292 // ClassifyAdaptableWord() to compute adaption acceptance thresholds).
1293 // Note: the rating of the word is not adjusted.
1294 dict_->adjust_word(word, vse->dawg_info == nullptr, vse->consistency_info.xht_decision, 0.0,
1295 false, language_model_debug_level > 0);
1296 // Hand ownership of the word over to the word_res.
1297 if (!word_res->LogNewCookedChoice(dict_->tessedit_truncate_wordchoice_log,
1298 dict_->stopper_debug_level >= 1, word)) {
1299 // The word was so bad that it was deleted.
1300 return;
1301 }
1302 if (word_res->best_choice == word) {
1303 // Word was the new best.
1304 if (dict_->AcceptableChoice(*word, vse->consistency_info.xht_decision) &&
 1305 AcceptablePath(*vse)) {
 1306 acceptable_choice_found_ = true;
 1307 }
1308 // Update best_choice_bundle.
1309 best_choice_bundle->updated = true;
1310 best_choice_bundle->best_vse = vse;
1311 if (language_model_debug_level > 0) {
1312 tprintf("Updated best choice\n");
1313 word->print_state("New state ");
1314 }
1315 // Update hyphen state if we are dealing with a dictionary word.
1316 if (vse->dawg_info != nullptr) {
 1317 if (dict_->has_hyphen_end(*word)) {
 1318 dict_->set_hyphen_word(*word, *(dawg_args_.active_dawgs));
 1319 } else {
1320 dict_->reset_hyphen_vars(true);
1321 }
1322 }
1323
1324 if (blamer_bundle != nullptr) {
1325 blamer_bundle->set_best_choice_is_dict_and_top_choice(vse->dawg_info != nullptr &&
1326 vse->top_choice_flags);
1327 }
1328 }
1329#ifndef GRAPHICS_DISABLED
1330 if (wordrec_display_segmentations && word_res->chopped_word != nullptr) {
1331 word->DisplaySegmentation(word_res->chopped_word);
1332 }
1333#endif
1334}

◆ UpdateState()

bool tesseract::LanguageModel::UpdateState ( bool  just_classified,
int  curr_col,
int  curr_row,
BLOB_CHOICE_LIST *  curr_list,
LanguageModelState *  parent_node,
LMPainPoints *  pain_points,
WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)

UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices in parent_node with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.

This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:

  • paths that are liked by the language model: either a DAWG or the n-gram model, where active.
  • paths that represent some kind of top choice. The old permuter permuted the top raw classifier score, the top upper case word and the top lower-case word. UpdateState now concentrates its top-choice paths on top lower-case, top upper-case (or caseless alpha), and top digit sequence, with allowance for continuation of these paths through blobs where such a character does not appear in the choices list.

GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.
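A quick back-of-the-envelope check of that growth claim (a standalone sketch, not tesseract code), assuming each of the n blobs offers exactly the three choices {l, 1, I}:

#include <cmath>
#include <cstdio>

int main() {
  for (int n = 1; n <= 6; ++n) {
    double full = std::pow(3.0, n);  // Every parent path times every child choice.
    int pruned = 3 * n;              // One surviving parent per top-choice class.
    std::printf("n=%d: 3^n = %3.0f paths vs ~3n = %2d AddViterbiStateEntry calls\n",
                n, full, pruned);
  }
  return 0;
}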

Of course it isn't quite that simple: Title Case is handled by allowing lower case to continue an upper case initial, but the combiner has to detect the transition so that it knows which upper case letters are initial alphas.

Definition at line 249 of file language_model.cpp.

252 {
253 if (language_model_debug_level > 0) {
254 tprintf("\nUpdateState: col=%d row=%d %s", curr_col, curr_row,
255 just_classified ? "just_classified" : "");
256 if (language_model_debug_level > 5) {
257 tprintf("(parent=%p)\n", static_cast<void *>(parent_node));
258 } else {
259 tprintf("\n");
260 }
261 }
262 // Initialize helper variables.
263 bool word_end = (curr_row + 1 >= word_res->ratings->dimension());
264 bool new_changed = false;
265 float denom = (language_model_ngram_on) ? ComputeDenom(curr_list) : 1.0f;
266 const UNICHARSET &unicharset = dict_->getUnicharset();
267 BLOB_CHOICE *first_lower = nullptr;
268 BLOB_CHOICE *first_upper = nullptr;
269 BLOB_CHOICE *first_digit = nullptr;
270 bool has_alnum_mix = false;
271 if (parent_node != nullptr) {
272 int result = SetTopParentLowerUpperDigit(parent_node);
273 if (result < 0) {
274 if (language_model_debug_level > 0) {
275 tprintf("No parents found to process\n");
276 }
277 return false;
278 }
279 if (result > 0) {
280 has_alnum_mix = true;
281 }
282 }
283 if (!GetTopLowerUpperDigit(curr_list, &first_lower, &first_upper, &first_digit)) {
284 has_alnum_mix = false;
285 };
286 ScanParentsForCaseMix(unicharset, parent_node);
287 if (language_model_debug_level > 3 && parent_node != nullptr) {
288 parent_node->Print("Parent viterbi list");
289 }
290 LanguageModelState *curr_state = best_choice_bundle->beam[curr_row];
291
292 // Call AddViterbiStateEntry() for each parent+child ViterbiStateEntry.
293 ViterbiStateEntry_IT vit;
294 BLOB_CHOICE_IT c_it(curr_list);
295 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
296 BLOB_CHOICE *choice = c_it.data();
 297 // TODO(antonova): make sure commenting this out is ok for ngram
 298 // model scoring (I think this was introduced to fix ngram model quirks).
299 // Skip nullptr unichars unless it is the only choice.
300 // if (!curr_list->singleton() && c_it.data()->unichar_id() == 0) continue;
301 UNICHAR_ID unichar_id = choice->unichar_id();
302 if (unicharset.get_fragment(unichar_id)) {
303 continue; // Skip fragments.
304 }
305 // Set top choice flags.
306 LanguageModelFlagsType blob_choice_flags = kXhtConsistentFlag;
307 if (c_it.at_first() || !new_changed) {
308 blob_choice_flags |= kSmallestRatingFlag;
309 }
310 if (first_lower == choice) {
311 blob_choice_flags |= kLowerCaseFlag;
312 }
313 if (first_upper == choice) {
314 blob_choice_flags |= kUpperCaseFlag;
315 }
316 if (first_digit == choice) {
317 blob_choice_flags |= kDigitFlag;
318 }
319
320 if (parent_node == nullptr) {
321 // Process the beginning of a word.
322 // If there is a better case variant that is not distinguished by size,
323 // skip this blob choice, as we have no choice but to accept the result
324 // of the character classifier to distinguish between them, even if
325 // followed by an upper case.
326 // With words like iPoc, and other CamelBackWords, the lower-upper
327 // transition can only be achieved if the classifier has the correct case
328 // as the top choice, and leaving an initial I lower down the list
329 // increases the chances of choosing IPoc simply because it doesn't
330 // include such a transition. iPoc will beat iPOC and ipoc because
331 // the other words are baseline/x-height inconsistent.
332 if (HasBetterCaseVariant(unicharset, choice, curr_list)) {
333 continue;
334 }
335 // Upper counts as lower at the beginning of a word.
336 if (blob_choice_flags & kUpperCaseFlag) {
337 blob_choice_flags |= kLowerCaseFlag;
338 }
339 new_changed |= AddViterbiStateEntry(blob_choice_flags, denom, word_end, curr_col, curr_row,
340 choice, curr_state, nullptr, pain_points, word_res,
341 best_choice_bundle, blamer_bundle);
342 } else {
343 // Get viterbi entries from each parent ViterbiStateEntry.
344 vit.set_to_list(&parent_node->viterbi_state_entries);
345 int vit_counter = 0;
346 vit.mark_cycle_pt();
347 ViterbiStateEntry *parent_vse = nullptr;
348 LanguageModelFlagsType top_choice_flags;
349 while ((parent_vse =
350 GetNextParentVSE(just_classified, has_alnum_mix, c_it.data(), blob_choice_flags,
351 unicharset, word_res, &vit, &top_choice_flags)) != nullptr) {
352 // Skip pruned entries and do not look at prunable entries if already
353 // examined language_model_viterbi_list_max_num_prunable of those.
354 if (PrunablePath(*parent_vse) &&
355 (++vit_counter > language_model_viterbi_list_max_num_prunable ||
356 (language_model_ngram_on && parent_vse->ngram_info->pruned))) {
357 continue;
358 }
359 // If the parent has no alnum choice, (ie choice is the first in a
360 // string of alnum), and there is a better case variant that is not
361 // distinguished by size, skip this blob choice/parent, as with the
362 // initial blob treatment above.
363 if (!parent_vse->HasAlnumChoice(unicharset) &&
364 HasBetterCaseVariant(unicharset, choice, curr_list)) {
365 continue;
366 }
367 // Create a new ViterbiStateEntry if BLOB_CHOICE in c_it.data()
368 // looks good according to the Dawgs or character ngram model.
369 new_changed |= AddViterbiStateEntry(top_choice_flags, denom, word_end, curr_col, curr_row,
370 c_it.data(), curr_state, parent_vse, pain_points,
371 word_res, best_choice_bundle, blamer_bundle);
372 }
373 }
374 }
375 return new_changed;
376}

Member Data Documentation

◆ acceptable_choice_found_

bool tesseract::LanguageModel::acceptable_choice_found_ = false
protected

Definition at line 366 of file language_model.h.

◆ beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::beginning_active_dawgs_
protected

Definition at line 354 of file language_model.h.

◆ correct_segmentation_explored_

bool tesseract::LanguageModel::correct_segmentation_explored_ = false
protected

Definition at line 368 of file language_model.h.

◆ dawg_args_

DawgArgs tesseract::LanguageModel::dawg_args_
protected

Definition at line 322 of file language_model.h.

◆ dict_

Dict* tesseract::LanguageModel::dict_ = nullptr
protected

Definition at line 333 of file language_model.h.

◆ fixed_pitch_

bool tesseract::LanguageModel::fixed_pitch_ = false
protected

Definition at line 340 of file language_model.h.

◆ fontinfo_table_

const UnicityTable<FontInfo>* tesseract::LanguageModel::fontinfo_table_ = nullptr
protected

Definition at line 329 of file language_model.h.

◆ kDigitFlag

const LanguageModelFlagsType tesseract::LanguageModel::kDigitFlag = 0x8
static

Definition at line 57 of file language_model.h.

◆ kLowerCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kLowerCaseFlag = 0x2
static

Definition at line 55 of file language_model.h.

◆ kMaxAvgNgramCost

const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f
static

Definition at line 62 of file language_model.h.

◆ kSmallestRatingFlag

const LanguageModelFlagsType tesseract::LanguageModel::kSmallestRatingFlag = 0x1
static

Definition at line 54 of file language_model.h.

◆ kUpperCaseFlag

const LanguageModelFlagsType tesseract::LanguageModel::kUpperCaseFlag = 0x4
static

Definition at line 56 of file language_model.h.

◆ kXhtConsistentFlag

const LanguageModelFlagsType tesseract::LanguageModel::kXhtConsistentFlag = 0x10
static

Definition at line 58 of file language_model.h.

◆ max_char_wh_ratio_

float tesseract::LanguageModel::max_char_wh_ratio_ = 0.0f
protected

Definition at line 343 of file language_model.h.

◆ params_model_

ParamsModel tesseract::LanguageModel::params_model_
protected

Definition at line 371 of file language_model.h.

◆ prev_word_str_

std::string tesseract::LanguageModel::prev_word_str_
protected

Definition at line 350 of file language_model.h.

◆ prev_word_unichar_step_len_

int tesseract::LanguageModel::prev_word_unichar_step_len_ = 0
protected

Definition at line 351 of file language_model.h.

◆ rating_cert_scale_

float tesseract::LanguageModel::rating_cert_scale_ = 0.0f
protected

Definition at line 324 of file language_model.h.

◆ very_beginning_active_dawgs_

DawgPositionVector tesseract::LanguageModel::very_beginning_active_dawgs_
protected

Definition at line 353 of file language_model.h.


The documentation for this class was generated from the following files:

language_model.h
language_model.cpp