19#ifndef TESSERACT_DICT_DICT_H_
20#define TESSERACT_DICT_DICT_H_
23# include "config_auto.h"
26#ifndef DISABLED_LEGACY_ENGINE
35#ifndef DISABLED_LEGACY_ENGINE
// Line-buffer and word-length limits used by the dictionary code.
// NOTE(review): units are presumably characters per text line and unichars
// per word -- confirm against the call sites.
#define CHARS_PER_LINE 500
// The expansion is fully parenthesised so it stays a single operand in any
// context (e.g. `sizeof MAX_WERD_LENGTH`); the value and type (int64_t) are
// the same as the previous `(int64_t)128` spelling.
#define MAX_WERD_LENGTH (static_cast<int64_t>(128))
// Tuning constants and symbol strings shared by the dictionary code.
// All of them have internal linkage (static), so every translation unit that
// includes this header gets its own copy.

// NOTE(review): purpose not visible in this header -- confirm at the use site.
static constexpr int kRatingPad = 4;
// Presumably the maximum number of wildcard characters permitted in one
// word -- TODO confirm.
static constexpr int kDictMaxWildcards = 2;

// Single-character symbol strings (NUL-terminated, sizeof == 2).
static constexpr char kHyphenSymbol[] = "-";
static constexpr char kSlashSymbol[] = "/";
static constexpr char kQuestionSymbol[] = "?";
static constexpr char kApostropheSymbol[] = "'";

// Float-typed parameters; the literals carry an `f` suffix so no
// double-to-float conversion is involved in the initialisation.
// NOTE(review): names suggest similarity-matcher certainty scaling/offset and
// a stopping floor -- confirm semantics against the callers.
static constexpr float kSimCertaintyScale = -10.0f;
static constexpr float kSimCertaintyOffset = -10.0f;
static constexpr float kSimilarityFloor = 100.0f;

// Presumably the maximum run of repeated characters accepted for a word that
// goes into the document dictionary -- TODO confirm.
static constexpr int kDocDictMaxRepChars = 4;
105 return getCCUtil()->unicharset;
108 return getCCUtil()->unicharset;
110#ifndef DISABLED_LEGACY_ENGINE
112 return getCCUtil()->unichar_ambigs;
117 const UNICHARSET &unicharset = getUnicharset();
119 const auto &normed_ids = unicharset.
normed_ids(unichar_id);
120 return normed_ids.size() == 1 &&
121 (normed_ids[0] == hyphen_unichar_id_ || normed_ids[0] == slash_unichar_id_);
126 const UNICHARSET &unicharset = getUnicharset();
128 const auto &normed_ids = unicharset.
normed_ids(unichar_id);
129 return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
136 return !last_word_on_line_ && hyphen_word_;
140 return this->hyphenated() ? hyphen_word_->length() : 0;
146 if (this->hyphenated()) {
147 *word = *hyphen_word_;
148 if (hyphen_debug_level) {
149 word->
print(
"copy_hyphen_info: ");
155 bool first_pos)
const {
156 if (!last_word_on_line_ || first_pos) {
160 const auto &normed_ids = unicharset->
normed_ids(unichar_id);
161 return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
165 int word_index = word.
length() - 1;
171 void reset_hyphen_vars(
bool last_word_on_line);
192 void default_dawgs(
DawgPositionVector *anylength_dawgs,
bool suppress_patterns)
const;
206 bool word_ending,
WERD_CHOICE *word,
float certainties[],
float *limit,
207 WERD_CHOICE *best_choice,
int *attempts_left,
void *void_more_args);
212 bool word_ending,
WERD_CHOICE *word,
float certainties[],
213 float *limit,
WERD_CHOICE *best_choice,
int *attempts_left,
214 void *void_more_args);
220 WERD_CHOICE *word,
float certainties[],
float *limit,
221 WERD_CHOICE *best_choice,
int *attempts_left,
void *more_args);
224 const BLOB_CHOICE &blob_choice,
int char_choice_index,
226 float certainties[],
float *limit,
WERD_CHOICE *best_choice,
227 int *attempts_left,
void *more_args);
229 bool fragment_state_okay(
UNICHAR_ID curr_unichar_id,
float curr_rating,
float curr_certainty,
234#if !defined(DISABLED_LEGACY_ENGINE)
245 void ReplaceAmbig(
int wrong_ngram_begin_index,
int wrong_ngram_size,
UNICHAR_ID correct_ngram_id,
249 int LengthOfShortestAlphaRun(
const WERD_CHOICE &WordChoice)
const;
263 bool AcceptableResult(
WERD_RES *word)
const;
264#if !defined(DISABLED_LEGACY_ENGINE)
265 void EndDangerousAmbigs();
268 void DebugWordChoices();
270 void SettupStopperPass1();
272 void SettupStopperPass2();
286 void SetupForLoad(
DawgCache *dawg_cache);
298 if (pending_words_ !=
nullptr) {
299 pending_words_->clear();
301 if (document_words_ !=
nullptr) {
302 document_words_->clear();
342 int def_letter_is_okay(
void *void_dawg_args,
const UNICHARSET &unicharset,
UNICHAR_ID unichar_id,
343 bool word_end)
const;
349 bool word_end)
const {
350 return (this->*letter_is_okay_)(void_dawg_args, unicharset, unichar_id, word_end);
354 double (
Dict::*probability_in_context_)(
const char *lang,
const char *context,
int context_bytes,
355 const char *
character,
int character_bytes);
358 int character_bytes) {
359 return (this->*probability_in_context_)(getCCUtil()->lang.c_str(), context, context_bytes,
365 const char *
character,
int character_bytes) {
370 (void)character_bytes;
375 wildcard_unichar_id_ = id;
378 return wildcard_unichar_id_;
382 return dawgs_.size();
386 return dawgs_[index];
394 return unambig_dawg_;
398 if (edge_ref == NO_EDGE) {
415 switch (dawg->
type()) {
442 int valid_word(
const WERD_CHOICE &word,
bool numbers_ok)
const;
444 return valid_word(word,
false);
447 return valid_word(word,
true);
452 return valid_word(word);
464 void add_document_word(
const WERD_CHOICE &best_choice);
467 float additional_adjust,
bool modify_rating,
bool debug);
470 wordseg_rating_adjust_factor_ = f;
473 bool IsSpaceDelimitedLang()
const;
484#ifndef DISABLED_LEGACY_ENGINE
490 float reject_offset_;
500 bool last_word_on_line_;
505 std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_;
508 bool dawg_cache_is_ours_;
512 Trie *pending_words_;
526 Trie *document_words_;
529 float wordseg_rating_adjust_factor_;
531 FILE *output_ambig_words_file_;
std::vector< SuccessorList * > SuccessorListsVector
std::vector< Dawg * > DawgVector
std::vector< DANGERR_INFO > DANGERR
std::vector< BLOB_CHOICE_LIST * > BLOB_CHOICE_LIST_VECTOR
UNICHAR_ID unichar_id(unsigned index) const
const UNICHARSET * unicharset() const
const std::vector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
bool contains_unichar_id(UNICHAR_ID unichar_id) const
bool get_isdigit(UNICHAR_ID unichar_id) const
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
static const UNICHAR_ID kPatternUnicharID
const CHAR_FRAGMENT * fragment
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
DawgPositionVector * updated_dawgs
DawgPositionVector * active_dawgs
BOOL_VAR_H(use_only_first_uft8_step)
void copy_hyphen_info(WERD_CHOICE *word) const
double_VAR_H(segment_penalty_dict_case_ok)
UNICHAR_ID WildcardID() const
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
STRING_VAR_H(user_words_file)
double_VAR_H(doc_dict_certainty_threshold)
BOOL_VAR_H(load_punc_dawg)
double_VAR_H(stopper_nondict_certainty_base)
const CCUtil * getCCUtil() const
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
int valid_word(const WERD_CHOICE &word) const
BOOL_VAR_H(load_bigram_dawg)
STRING_VAR_H(word_to_debug)
INT_VAR_H(dawg_debug_level)
STRING_VAR_H(user_patterns_suffix)
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
double_VAR_H(segment_penalty_dict_case_bad)
STRING_VAR_H(user_words_suffix)
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
INT_VAR_H(hyphen_debug_level)
UNICHARSET & getUnicharset()
double_VAR_H(certainty_scale)
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
double_VAR_H(xheight_penalty_subscripts)
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
BOOL_VAR_H(load_unambig_dawg)
INT_VAR_H(tessedit_truncate_wordchoice_log)
bool compound_marker(UNICHAR_ID unichar_id)
double_VAR_H(stopper_phase2_certainty_rejection_offset)
STRING_VAR_H(output_ambig_words_file)
double_VAR_H(doc_dict_pending_threshold)
double_VAR_H(segment_penalty_dict_nonword)
const UnicharAmbigs & getUnicharAmbigs() const
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
bool is_apostrophe(UNICHAR_ID unichar_id)
void ResetDocumentDictionary()
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
BOOL_VAR_H(load_system_dawg)
BOOL_VAR_H(load_number_dawg)
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
int good_choice(const WERD_CHOICE &choice)
Returns true if a good answer is found for the unknown blob rating.
BOOL_VAR_H(segment_nonalphabetic_script)
double_VAR_H(segment_penalty_garbage)
INT_VAR_H(stopper_debug_level)
double_VAR_H(segment_penalty_dict_frequent_word)
STRING_VAR_H(user_patterns_file)
BOOL_VAR_H(stopper_no_acceptable_choices)
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as above, but check the unichar at the end of the word.
INT_VAR_H(stopper_smallword_size)
double_VAR_H(stopper_certainty_per_char)
BOOL_VAR_H(save_doc_words)
double_VAR_H(stopper_allowable_character_badness)
BOOL_VAR_H(load_freq_dawg)
const UNICHARSET & getUnicharset() const
INT_VAR_H(max_permuter_attempts)
void SetWildcardID(UNICHAR_ID id)
double_VAR_H(xheight_penalty_inconsistent)
int valid_word_or_number(const WERD_CHOICE &word) const