tesseract v5.3.3.20231005
|
#include <language_model.h>
Public Member Functions | |
LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict) | |
~LanguageModel () | |
void | InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale) |
bool | UpdateState (bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
bool | AcceptableChoiceFound () |
void | SetAcceptableChoiceFound (bool val) |
ParamsModel & | getParamsModel () |
INT_VAR_H (language_model_debug_level) | |
BOOL_VAR_H (language_model_ngram_on) | |
INT_VAR_H (language_model_ngram_order) | |
INT_VAR_H (language_model_viterbi_list_max_num_prunable) | |
INT_VAR_H (language_model_viterbi_list_max_size) | |
double_VAR_H (language_model_ngram_small_prob) | |
double_VAR_H (language_model_ngram_nonmatch_score) | |
BOOL_VAR_H (language_model_ngram_use_only_first_uft8_step) | |
double_VAR_H (language_model_ngram_scale_factor) | |
double_VAR_H (language_model_ngram_rating_factor) | |
BOOL_VAR_H (language_model_ngram_space_delimited_language) | |
INT_VAR_H (language_model_min_compound_length) | |
double_VAR_H (language_model_penalty_non_freq_dict_word) | |
double_VAR_H (language_model_penalty_non_dict_word) | |
double_VAR_H (language_model_penalty_punc) | |
double_VAR_H (language_model_penalty_case) | |
double_VAR_H (language_model_penalty_script) | |
double_VAR_H (language_model_penalty_chartype) | |
double_VAR_H (language_model_penalty_font) | |
double_VAR_H (language_model_penalty_spacing) | |
double_VAR_H (language_model_penalty_increment) | |
INT_VAR_H (wordrec_display_segmentations) | |
BOOL_VAR_H (language_model_use_sigmoidal_certainty) | |
Static Public Member Functions | |
static void | ExtractFeaturesFromPath (const ViterbiStateEntry &vse, float features[]) |
Static Public Attributes | |
static const LanguageModelFlagsType | kSmallestRatingFlag = 0x1 |
static const LanguageModelFlagsType | kLowerCaseFlag = 0x2 |
static const LanguageModelFlagsType | kUpperCaseFlag = 0x4 |
static const LanguageModelFlagsType | kDigitFlag = 0x8 |
static const LanguageModelFlagsType | kXhtConsistentFlag = 0x10 |
static const float | kMaxAvgNgramCost = 25.0f |
Protected Member Functions | |
float | CertaintyScore (float cert) |
float | ComputeAdjustment (int num_problems, float penalty) |
float | ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info) |
float | ComputeAdjustedPathCost (ViterbiStateEntry *vse) |
bool | GetTopLowerUpperDigit (BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const |
int | SetTopParentLowerUpperDigit (LanguageModelState *parent_node) const |
ViterbiStateEntry * | GetNextParentVSE (bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const |
bool | AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
void | GenerateTopChoiceInfo (ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms) |
LanguageModelDawgInfo * | GenerateDawgInfo (bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse) |
LanguageModelNgramInfo * | GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse) |
float | ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob) |
float | ComputeDenom (BLOB_CHOICE_LIST *curr_list) |
void | FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info) |
void | UpdateBestChoice (ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
WERD_CHOICE * | ConstructWord (ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path) |
void | ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats) |
bool | PrunablePath (const ViterbiStateEntry &vse) |
bool | AcceptablePath (const ViterbiStateEntry &vse) |
Protected Attributes | |
DawgArgs | dawg_args_ |
float | rating_cert_scale_ = 0.0f |
const UnicityTable< FontInfo > * | fontinfo_table_ = nullptr |
Dict * | dict_ = nullptr |
bool | fixed_pitch_ = false |
float | max_char_wh_ratio_ = 0.0f |
std::string | prev_word_str_ |
int | prev_word_unichar_step_len_ = 0 |
DawgPositionVector | very_beginning_active_dawgs_ |
DawgPositionVector | beginning_active_dawgs_ |
bool | acceptable_choice_found_ = false |
bool | correct_segmentation_explored_ = false |
ParamsModel | params_model_ |
Definition at line 51 of file language_model.h.
tesseract::LanguageModel::LanguageModel | ( | const UnicityTable< FontInfo > * | fontinfo_table, |
Dict * | dict | ||
) |
Definition at line 53 of file language_model.cpp.
tesseract::LanguageModel::~LanguageModel | ( | ) |
Definition at line 123 of file language_model.cpp.
|
inline |
Definition at line 96 of file language_model.h.
|
inlineprotected |
Definition at line 285 of file language_model.h.
|
protected |
Definition at line 577 of file language_model.cpp.
tesseract::LanguageModel::BOOL_VAR_H | ( | language_model_ngram_on | ) |
tesseract::LanguageModel::BOOL_VAR_H | ( | language_model_ngram_space_delimited_language | ) |
tesseract::LanguageModel::BOOL_VAR_H | ( | language_model_ngram_use_only_first_uft8_step | ) |
tesseract::LanguageModel::BOOL_VAR_H | ( | language_model_use_sigmoidal_certainty | ) |
|
inlineprotected |
Definition at line 108 of file language_model.h.
|
protected |
Definition at line 1196 of file language_model.cpp.
|
inlineprotected |
Definition at line 120 of file language_model.h.
|
inlineprotected |
Definition at line 257 of file language_model.h.
|
inlineprotected |
Definition at line 134 of file language_model.h.
|
protected |
Definition at line 998 of file language_model.cpp.
|
protected |
Definition at line 942 of file language_model.cpp.
|
protected |
Definition at line 1386 of file language_model.cpp.
tesseract::LanguageModel::double_VAR_H | ( | language_model_ngram_nonmatch_score | ) |
tesseract::LanguageModel::double_VAR_H | ( | language_model_ngram_rating_factor | ) |
tesseract::LanguageModel::double_VAR_H | ( | language_model_ngram_scale_factor | ) |
tesseract::LanguageModel::double_VAR_H | ( | language_model_ngram_small_prob | ) |
tesseract::LanguageModel::double_VAR_H | ( | language_model_penalty_case | ) |
tesseract::LanguageModel::double_VAR_H | ( | language_model_penalty_chartype | ) |
tesseract::LanguageModel::double_VAR_H | ( | language_model_penalty_font | ) |
tesseract::LanguageModel::double_VAR_H | ( | language_model_penalty_increment | ) |
tesseract::LanguageModel::double_VAR_H | ( | language_model_penalty_non_dict_word | ) |
tesseract::LanguageModel::double_VAR_H | ( | language_model_penalty_non_freq_dict_word | ) |
tesseract::LanguageModel::double_VAR_H | ( | language_model_penalty_punc | ) |
tesseract::LanguageModel::double_VAR_H | ( | language_model_penalty_script | ) |
tesseract::LanguageModel::double_VAR_H | ( | language_model_penalty_spacing | ) |
|
static |
Definition at line 1336 of file language_model.cpp.
|
protected |
Definition at line 1021 of file language_model.cpp.
|
protected |
Definition at line 792 of file language_model.cpp.
|
protected |
Definition at line 888 of file language_model.cpp.
|
protected |
Definition at line 776 of file language_model.cpp.
|
protected |
GetNextParentVSE finds the next ViterbiStateEntry with which the given unichar_id can combine sensibly, taking into account any mixed-alnum/mixed-case situation and whether this combination has been inspected before.
Definition at line 514 of file language_model.cpp.
|
inline |
Definition at line 103 of file language_model.h.
|
protected |
GetTopLowerUpperDigit finds the first lower-case letter, the first upper-case letter, and the first digit in curr_list. For non-upper/lower languages, alpha counts as upper. The first character in the list is used in place of any empty result. Returns true if both alpha and digits are found.
Definition at line 384 of file language_model.cpp.
void tesseract::LanguageModel::InitForWord | ( | const WERD_CHOICE * | prev_word, |
bool | fixed_pitch, | ||
float | max_char_wh_ratio, | ||
float | rating_cert_scale | ||
) |
Definition at line 127 of file language_model.cpp.
tesseract::LanguageModel::INT_VAR_H | ( | language_model_debug_level | ) |
tesseract::LanguageModel::INT_VAR_H | ( | language_model_min_compound_length | ) |
tesseract::LanguageModel::INT_VAR_H | ( | language_model_ngram_order | ) |
tesseract::LanguageModel::INT_VAR_H | ( | language_model_viterbi_list_max_num_prunable | ) |
tesseract::LanguageModel::INT_VAR_H | ( | language_model_viterbi_list_max_size | ) |
tesseract::LanguageModel::INT_VAR_H | ( | wordrec_display_segmentations | ) |
|
inlineprotected |
Definition at line 272 of file language_model.h.
|
inline |
Definition at line 99 of file language_model.h.
|
protected |
SetTopParentLowerUpperDigit forces there to be at least one entry, in the overall set of the viterbi_state_entries of each element of parent_node, that has the top_choice_flag set for each of lower, upper and digit. It uses the same rules as GetTopLowerUpperDigit and sets the flag on the first suitable candidate found, whether or not the flag is already set on some other parent. Returns 1 if both alpha and digits are found among the parents, -1 if no parents are found at all (a legitimate case), and 0 otherwise.
Definition at line 432 of file language_model.cpp.
|
protected |
Definition at line 1236 of file language_model.cpp.
bool tesseract::LanguageModel::UpdateState | ( | bool | just_classified, |
int | curr_col, | ||
int | curr_row, | ||
BLOB_CHOICE_LIST * | curr_list, | ||
LanguageModelState * | parent_node, | ||
LMPainPoints * | pain_points, | ||
WERD_RES * | word_res, | ||
BestChoiceBundle * | best_choice_bundle, | ||
BlamerBundle * | blamer_bundle | ||
) |
UpdateState has the job of combining the ViterbiStateEntry lists on each of the choices on parent_list with each of the blob choices in curr_list, making a new ViterbiStateEntry for each sensible path.
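This could be a huge set of combinations, creating a lot of work only to be truncated by some beam limit, but only certain kinds of paths will continue at the next step:
- paths that are liked by the language model: either a DAWG or the n-gram model, where active;
- paths that represent some kind of top choice: UpdateState concentrates its top-choice paths on the top lower-case, top upper-case (or caseless alpha), and top digit sequences, with allowance for continuing these paths through blobs where such a character does not appear in the choices list.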
GetNextParentVSE enforces some of these models to minimize the number of calls to AddViterbiStateEntry, even prior to looking at the language model. Thus an n-blob sequence of [l1I] will produce 3n calls to AddViterbiStateEntry instead of 3^n.
Of course, it isn't quite that simple: Title Case is handled by allowing lower case to continue an upper-case initial, but it has to be detected in the combiner so that it knows which upper-case letters are initial alphas.
Definition at line 249 of file language_model.cpp.
|
protected |
Definition at line 366 of file language_model.h.
|
protected |
Definition at line 354 of file language_model.h.
|
protected |
Definition at line 368 of file language_model.h.
|
protected |
Definition at line 322 of file language_model.h.
|
protected |
Definition at line 333 of file language_model.h.
|
protected |
Definition at line 340 of file language_model.h.
|
protected |
Definition at line 329 of file language_model.h.
|
static |
Definition at line 57 of file language_model.h.
|
static |
Definition at line 55 of file language_model.h.
|
static |
Definition at line 62 of file language_model.h.
|
static |
Definition at line 54 of file language_model.h.
|
static |
Definition at line 56 of file language_model.h.
|
static |
Definition at line 58 of file language_model.h.
|
protected |
Definition at line 343 of file language_model.h.
|
protected |
Definition at line 371 of file language_model.h.
|
protected |
Definition at line 350 of file language_model.h.
|
protected |
Definition at line 351 of file language_model.h.
|
protected |
Definition at line 324 of file language_model.h.
|
protected |
Definition at line 353 of file language_model.h.