All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract::Dict Class Reference

#include <dict.h>

Public Member Functions

 Dict (CCUtil *image_ptr)
 
 ~Dict ()
 
const CCUtil * getCCUtil () const
 
CCUtil * getCCUtil ()
 
const UNICHARSET & getUnicharset () const
 
UNICHARSET & getUnicharset ()
 
const UnicharAmbigs & getUnicharAmbigs () const
 
bool compound_marker (UNICHAR_ID unichar_id)
 
bool is_apostrophe (UNICHAR_ID unichar_id)
 
bool hyphenated () const
 Returns true if we've recorded the beginning of a hyphenated word. More...
 
int hyphen_base_size () const
 Size of the base word (the part on the line before) of a hyphenated word. More...
 
void copy_hyphen_info (WERD_CHOICE *word) const
 
bool has_hyphen_end (UNICHAR_ID unichar_id, bool first_pos) const
 Check whether the word has a hyphen at the end. More...
 
bool has_hyphen_end (const WERD_CHOICE &word) const
 Same as above, but check the unichar at the end of the word. More...
 
void reset_hyphen_vars (bool last_word_on_line)
 
void set_hyphen_word (const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
 
void update_best_choice (const WERD_CHOICE &word, WERD_CHOICE *best_choice)
 
void init_active_dawgs (DawgPositionVector *active_dawgs, bool ambigs_mode) const
 
void default_dawgs (DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
 
bool NoDangerousAmbig (WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
 
void ReplaceAmbig (int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
 
int LengthOfShortestAlphaRun (const WERD_CHOICE &WordChoice)
 Returns the length of the shortest alpha run in WordChoice. More...
 
int UniformCertainties (const WERD_CHOICE &word)
 
bool AcceptableChoice (const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
 Returns true if the given best_choice is good enough to stop. More...
 
bool AcceptableResult (WERD_RES *word)
 
void EndDangerousAmbigs ()
 
void DebugWordChoices ()
 Prints the current choices for this word to stdout. More...
 
void SettupStopperPass1 ()
 Sets up stopper variables in preparation for the first pass. More...
 
void SettupStopperPass2 ()
 Sets up stopper variables in preparation for the second pass. More...
 
int case_ok (const WERD_CHOICE &word, const UNICHARSET &unicharset)
 Check a string to see if it matches a set of lexical rules. More...
 
bool absolute_garbage (const WERD_CHOICE &word, const UNICHARSET &unicharset)
 
void Load (DawgCache *dawg_cache)
 
void End ()
 
void ResetDocumentDictionary ()
 
int def_letter_is_okay (void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
 
int LetterIsOkay (void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
 Calls letter_is_okay_ member function. More...
 
double ProbabilityInContext (const char *context, int context_bytes, const char *character, int character_bytes)
 Calls probability_in_context_ member function. More...
 
double def_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Default (no-op) implementation of probability in context function. More...
 
double ngram_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 
float ParamsModelClassify (const char *lang, void *path)
 
float CallParamsModelClassify (void *path)
 
void SetWildcardID (UNICHAR_ID id)
 
const UNICHAR_ID WildcardID () const
 
const int NumDawgs () const
 Return the number of dawgs in the dawgs_ vector. More...
 
const Dawg * GetDawg (int index) const
 Return i-th dawg pointer recorded in the dawgs_ vector. More...
 
const Dawg * GetPuncDawg () const
 Return the pointer to the punctuation dawg. More...
 
const Dawg * GetUnambigDawg () const
 Return the pointer to the unambiguous words dawg. More...
 
UNICHAR_ID char_for_dawg (UNICHAR_ID ch, const Dawg *dawg) const
 
void ProcessPatternEdges (const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgPositionVector *updated_dawgs, PermuterType *current_permuter) const
 
int valid_word (const WERD_CHOICE &word, bool numbers_ok) const
 
int valid_word (const WERD_CHOICE &word) const
 
int valid_word_or_number (const WERD_CHOICE &word) const
 
int valid_word (const char *string) const
 This function is used by api/tesseract_cube_combiner.cpp. More...
 
bool valid_bigram (const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
 
bool valid_punctuation (const WERD_CHOICE &word)
 
int good_choice (const WERD_CHOICE &choice)
 Returns true if a good answer is found for the unknown blob rating. More...
 
void add_document_word (const WERD_CHOICE &best_choice)
 Adds a word found on this document to the document specific dictionary. More...
 
void adjust_word (WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
 Adjusts the rating of the given word. More...
 
void SetWordsegRatingAdjustFactor (float f)
 Set wordseg_rating_adjust_factor_ to the given value. More...
 
go_deeper_dawg_fxn

If the choice being composed so far could be a dictionary word keep exploring choices.

WERD_CHOICE * dawg_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
 
void go_deeper_dawg_fxn (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 
void permute_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
void append_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
 
fragment_state

Given the current char choice and information about previously seen fragments, determines whether adjacent character fragments are present and whether they can be concatenated.

The given prev_char_frag_info contains:

  • fragment: if not NULL, contains information about the immediately preceding fragmented character choice
  • num_fragments: number of fragments that have been used so far to construct a character
  • certainty: certainty of the current choice or minimum certainty of all fragments concatenated so far
  • rating: rating of the current choice or sum of fragment ratings concatenated so far

The output char_frag_info is filled in as follows:

  • character: is set to be NULL if the choice is a non-matching or non-ending fragment piece; is set to unichar of the given choice if it represents a regular character or a matching ending fragment
  • fragment,num_fragments,certainty,rating are set as described above
Returns
false if a non-matching fragment is discovered, true otherwise.
bool fragment_state_okay (UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
 

Static Public Member Functions

static DawgCache * GlobalDawgCache ()
 
static NODE_REF GetStartingNode (const Dawg *dawg, EDGE_REF edge_ref)
 Returns the appropriate next node given the EDGE_REF. More...
 
static bool valid_word_permuter (uinT8 perm, bool numbers_ok)
 Check all the DAWGs to see if this word is in any of them. More...
 

Public Attributes

void(Dict::* go_deeper_fxn_ )(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
 Pointer to go_deeper function. More...
 
int(Dict::* letter_is_okay_ )(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
 
double(Dict::* probability_in_context_ )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
 Probability in context function used by the ngram permuter. More...
 
float(Dict::* params_model_classify_ )(const char *lang, void *path)
 
char * user_words_file = ""
 
char * user_words_suffix = ""
 
char * user_patterns_file = ""
 
char * user_patterns_suffix = ""
 
bool load_system_dawg = true
 
bool load_freq_dawg = true
 
bool load_unambig_dawg = true
 
bool load_punc_dawg = true
 
bool load_number_dawg = true
 
bool load_bigram_dawg = true
 
double xheight_penalty_subscripts = 0.125
 
double xheight_penalty_inconsistent = 0.25
 
double segment_penalty_dict_frequent_word = 1.0
 
double segment_penalty_dict_case_ok = 1.1
 
double segment_penalty_dict_case_bad = 1.3125
 
double segment_penalty_ngram_best_choice = 1.24
 
double segment_penalty_dict_nonword = 1.25
 
double segment_penalty_garbage = 1.50
 
char * output_ambig_words_file = ""
 
int dawg_debug_level = 0
 
int hyphen_debug_level = 0
 
int max_viterbi_list_size = 10
 
bool use_only_first_uft8_step = false
 
double certainty_scale = 20.0
 
double stopper_nondict_certainty_base = -2.50
 
double stopper_phase2_certainty_rejection_offset = 1.0
 
int stopper_smallword_size = 2
 
double stopper_certainty_per_char = -0.50
 
double stopper_allowable_character_badness = 3.0
 
int stopper_debug_level = 0
 
bool stopper_no_acceptable_choices = false
 
bool save_raw_choices = false
 
int tessedit_truncate_wordchoice_log = 10
 
char * word_to_debug = ""
 
char * word_to_debug_lengths = ""
 
int fragments_debug = 0
 
bool segment_nonalphabetic_script = false
 
bool save_doc_words = 0
 
double doc_dict_pending_threshold = 0.0
 
double doc_dict_certainty_threshold = -2.25
 
int max_permuter_attempts = 10000
 

Detailed Description

Definition at line 86 of file dict.h.

Constructor & Destructor Documentation

tesseract::Dict::Dict ( CCUtil * image_ptr)

Definition at line 33 of file dict.cpp.

37  ccutil_(ccutil),
39  "A filename of user-provided words.",
40  getCCUtil()->params()),
42  "A suffix of user-provided words located in tessdata.",
43  getCCUtil()->params()),
45  "A filename of user-provided patterns.",
46  getCCUtil()->params()),
48  "A suffix of user-provided patterns located in "
49  "tessdata.",
50  getCCUtil()->params()),
51  BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
52  getCCUtil()->params()),
53  BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
54  getCCUtil()->params()),
55  BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
56  getCCUtil()->params()),
57  BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation"
58  " patterns.", getCCUtil()->params()),
59  BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number"
60  " patterns.", getCCUtil()->params()),
61  BOOL_INIT_MEMBER(load_bigram_dawg, true, "Load dawg with special word "
62  "bigrams.", getCCUtil()->params()),
64  "Score penalty (0.1 = 10%) added if there are subscripts "
65  "or superscripts in a word, but it is otherwise OK.",
66  getCCUtil()->params()),
68  "Score penalty (0.1 = 10%) added if an xheight is "
69  "inconsistent.", getCCUtil()->params()),
71  "Score multiplier for word matches which have good case and"
72  "are frequent in the given language (lower is better).",
73  getCCUtil()->params()),
75  "Score multiplier for word matches that have good case "
76  "(lower is better).", getCCUtil()->params()),
78  "Default score multiplier for word matches, which may have "
79  "case issues (lower is better).",
80  getCCUtil()->params()),
82  "Multipler to for the best choice from the ngram model.",
83  getCCUtil()->params()),
85  "Score multiplier for glyph fragment segmentations which "
86  "do not match a dictionary word (lower is better).",
87  getCCUtil()->params()),
89  "Score multiplier for poorly cased strings that are not in"
90  " the dictionary and generally look like garbage (lower is"
91  " better).", getCCUtil()->params()),
93  "Output file for ambiguities found in the dictionary",
94  getCCUtil()->params()),
95  INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info"
96  ", to 2 for more details, to 3 to see all the debug messages",
97  getCCUtil()->params()),
98  INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
99  getCCUtil()->params()),
100  INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.",
101  getCCUtil()->params()),
103  "Use only the first UTF8 step of the given string"
104  " when computing log probabilities.",
105  getCCUtil()->params()),
106  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
107  getCCUtil()->params()),
109  "Certainty threshold for non-dict words",
110  getCCUtil()->params()),
112  "Reject certainty offset",
113  getCCUtil()->params()),
115  "Size of dict word to be treated as non-dict word",
116  getCCUtil()->params()),
117  double_MEMBER(stopper_certainty_per_char, -0.50, "Certainty to add"
118  " for each dict char above small word size.",
119  getCCUtil()->params()),
121  "Max certaintly variation allowed in a word (in sigma)",
122  getCCUtil()->params()),
123  INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
124  getCCUtil()->params()),
126  "Make AcceptableChoice() always return false. Useful"
127  " when there is a need to explore all segmentations",
128  getCCUtil()->params()),
130  "Deprecated- backward compatablity only",
131  getCCUtil()->params()),
133  "Max words to keep in list",
134  getCCUtil()->params()),
135  STRING_MEMBER(word_to_debug, "", "Word for which stopper debug"
136  " information should be printed to stdout",
137  getCCUtil()->params()),
139  "Lengths of unichars in word_to_debug",
140  getCCUtil()->params()),
141  INT_MEMBER(fragments_debug, 0, "Debug character fragments",
142  getCCUtil()->params()),
144  "Don't use any alphabetic-specific tricks."
145  "Set to true in the traineddata config file for"
146  " scripts that are cursive or inherently fixed-pitch",
147  getCCUtil()->params()),
148  BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
149  getCCUtil()->params()),
151  "Worst certainty for using pending dictionary",
152  getCCUtil()->params()),
154  "Worst certainty for words that can be inserted into the"
155  "document dictionary", getCCUtil()->params()),
156  INT_MEMBER(max_permuter_attempts, 10000, "Maximum number of different"
157  " character choices to consider during permutation."
158  " This limit is especially useful when user patterns"
159  " are specified, since overly generic patterns can result in"
160  " dawg search exploring an overly large number of options.",
161  getCCUtil()->params()) {
162  dang_ambigs_table_ = NULL;
163  replace_ambigs_table_ = NULL;
164  reject_offset_ = 0.0;
166  hyphen_word_ = NULL;
167  last_word_on_line_ = false;
168  hyphen_unichar_id_ = INVALID_UNICHAR_ID;
169  document_words_ = NULL;
170  dawg_cache_ = NULL;
171  dawg_cache_is_ours_ = false;
172  pending_words_ = NULL;
173  bigram_dawg_ = NULL;
174  freq_dawg_ = NULL;
175  punc_dawg_ = NULL;
176  unambig_dawg_ = NULL;
177  wordseg_rating_adjust_factor_ = -1.0f;
178  output_ambig_words_file_ = NULL;
179 }
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:605
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:307
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:357
int hyphen_debug_level
Definition: dict.h:596
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:374
int max_permuter_attempts
Definition: dict.h:637
char * word_to_debug_lengths
Definition: dict.h:622
const CCUtil * getCCUtil() const
Definition: dict.h:90
char * word_to_debug
Definition: dict.h:620
bool load_bigram_dawg
Definition: dict.h:561
double stopper_allowable_character_badness
Definition: dict.h:611
double segment_penalty_dict_case_ok
Definition: dict.h:574
bool segment_nonalphabetic_script
Definition: dict.h:627
int def_letter_is_okay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:336
double xheight_penalty_inconsistent
Definition: dict.h:567
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:304
double segment_penalty_ngram_best_choice
Definition: dict.h:582
double segment_penalty_dict_case_bad
Definition: dict.h:578
bool save_doc_words
Definition: dict.h:628
int stopper_smallword_size
Definition: dict.h:607
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:203
int fragments_debug
Definition: dict.h:623
char * user_patterns_file
Definition: dict.h:551
double stopper_certainty_per_char
Definition: dict.h:609
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:390
double doc_dict_pending_threshold
Definition: dict.h:630
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:316
double xheight_penalty_subscripts
Definition: dict.h:564
int max_viterbi_list_size
Definition: dict.h:597
bool use_only_first_uft8_step
Definition: dict.h:600
double stopper_nondict_certainty_base
Definition: dict.h:603
double segment_penalty_dict_frequent_word
Definition: dict.h:570
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:301
int stopper_debug_level
Definition: dict.h:612
char * user_patterns_suffix
Definition: dict.h:553
bool load_system_dawg
Definition: dict.h:554
bool stopper_no_acceptable_choices
Definition: dict.h:615
double doc_dict_certainty_threshold
Definition: dict.h:632
int dawg_debug_level
Definition: dict.h:595
bool load_unambig_dawg
Definition: dict.h:556
bool load_freq_dawg
Definition: dict.h:555
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:347
char * user_words_suffix
Definition: dict.h:549
char * output_ambig_words_file
Definition: dict.h:593
double segment_penalty_dict_nonword
Definition: dict.h:586
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:319
double segment_penalty_garbage
Definition: dict.h:591
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:310
char * user_words_file
Definition: dict.h:547
bool save_raw_choices
Definition: dict.h:617
#define NULL
Definition: host.h:144
int tessedit_truncate_wordchoice_log
Definition: dict.h:618
CCUtil ccutil
bool load_number_dawg
Definition: dict.h:559
bool load_punc_dawg
Definition: dict.h:558
double certainty_scale
Definition: dict.h:601
tesseract::Dict::~Dict ( )

Definition at line 181 of file dict.cpp.

181  {
182  if (hyphen_word_ != NULL) delete hyphen_word_;
183  if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_);
184 }
#define NULL
Definition: host.h:144

Member Function Documentation

bool tesseract::Dict::absolute_garbage ( const WERD_CHOICE & word,
const UNICHARSET & unicharset 
)

Returns true if the word looks like absolute garbage (e.g. an image mistakenly recognized as text).

Definition at line 76 of file context.cpp.

77  {
78  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
79  int num_alphanum = 0;
80  for (int x = 0; x < word.length(); ++x) {
81  num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
82  unicharset.get_isdigit(word.unichar_id(x)));
83  }
84  return (static_cast<float>(num_alphanum) /
85  static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
86 }
int length() const
Definition: ratngs.h:300
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
bool tesseract::Dict::AcceptableChoice ( const WERD_CHOICE & best_choice,
XHeightConsistencyEnum  xheight_consistency 
)

Returns true if the given best_choice is good enough to stop.

Definition at line 51 of file stopper.cpp.

52  {
53  float CertaintyThreshold = stopper_nondict_certainty_base;
54  int WordSize;
55 
56  if (stopper_no_acceptable_choices) return false;
57 
58  if (best_choice.length() == 0) return false;
59 
60  bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
61  bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
62  bool is_case_ok = case_ok(best_choice, getUnicharset());
63 
64  if (stopper_debug_level >= 1) {
65  const char *xht = "UNKNOWN";
66  switch (xheight_consistency) {
67  case XH_GOOD: xht = "NORMAL"; break;
68  case XH_SUBNORMAL: xht = "SUBNORMAL"; break;
69  case XH_INCONSISTENT: xht = "INCONSISTENT"; break;
70  default: xht = "UNKNOWN";
71  }
72  tprintf("\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
73  best_choice.unichar_string().string(),
74  (is_valid_word ? 'y' : 'n'),
75  (is_case_ok ? 'y' : 'n'),
76  xht,
77  best_choice.min_x_height(),
78  best_choice.max_x_height());
79  }
80  // Do not accept invalid words in PASS1.
81  if (reject_offset_ <= 0.0f && !is_valid_word) return false;
82  if (is_valid_word && is_case_ok) {
83  WordSize = LengthOfShortestAlphaRun(best_choice);
84  WordSize -= stopper_smallword_size;
85  if (WordSize < 0)
86  WordSize = 0;
87  CertaintyThreshold += WordSize * stopper_certainty_per_char;
88  }
89 
90  if (stopper_debug_level >= 1)
91  tprintf("Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
92  best_choice.rating(), best_choice.certainty(), CertaintyThreshold);
93 
94  if (no_dang_ambigs &&
95  best_choice.certainty() > CertaintyThreshold &&
96  xheight_consistency < XH_INCONSISTENT &&
97  UniformCertainties(best_choice)) {
98  return true;
99  } else {
100  if (stopper_debug_level >= 1) {
101  tprintf("AcceptableChoice() returned false"
102  " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
103  no_dang_ambigs, best_choice.certainty(),
104  CertaintyThreshold,
105  UniformCertainties(best_choice));
106  }
107  return false;
108  }
109 }
float rating() const
Definition: ratngs.h:324
int length() const
Definition: ratngs.h:300
#define tprintf(...)
Definition: tprintf.h:31
int stopper_smallword_size
Definition: dict.h:607
float min_x_height() const
Definition: ratngs.h:333
bool dangerous_ambig_found() const
Definition: ratngs.h:360
const STRING & unichar_string() const
Definition: ratngs.h:524
double stopper_certainty_per_char
Definition: dict.h:609
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice)
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:452
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:447
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:58
float certainty() const
Definition: ratngs.h:327
double stopper_nondict_certainty_base
Definition: dict.h:603
uinT8 permuter() const
Definition: ratngs.h:343
int stopper_debug_level
Definition: dict.h:612
bool stopper_no_acceptable_choices
Definition: dict.h:615
float max_x_height() const
Definition: ratngs.h:336
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
const char * string() const
Definition: strngs.cpp:193
int UniformCertainties(const WERD_CHOICE &word)
Definition: stopper.cpp:471
bool tesseract::Dict::AcceptableResult ( WERD_RES * word)

Returns false if the best choice for the current word is questionable and should be tried again on the second pass or should be flagged to the user.

Definition at line 111 of file stopper.cpp.

111  {
112  if (word->best_choice == NULL) return false;
113  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
114  int WordSize;
115 
116  if (stopper_debug_level >= 1) {
117  tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
118  word->best_choice->debug_string().string(),
119  (valid_word(*word->best_choice) ? 'y' : 'n'),
120  (case_ok(*word->best_choice, getUnicharset()) ? 'y' : 'n'),
121  word->best_choice->dangerous_ambig_found() ? 'n' : 'y',
122  word->best_choices.singleton() ? 'n' : 'y');
123  }
124 
125  if (word->best_choice->length() == 0 || !word->best_choices.singleton())
126  return false;
127  if (valid_word(*word->best_choice) &&
128  case_ok(*word->best_choice, getUnicharset())) {
129  WordSize = LengthOfShortestAlphaRun(*word->best_choice);
130  WordSize -= stopper_smallword_size;
131  if (WordSize < 0)
132  WordSize = 0;
133  CertaintyThreshold += WordSize * stopper_certainty_per_char;
134  }
135 
136  if (stopper_debug_level >= 1)
137  tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
138  word->best_choice->certainty(), CertaintyThreshold);
139 
140  if (word->best_choice->certainty() > CertaintyThreshold &&
142  if (stopper_debug_level >= 1)
143  tprintf("ACCEPTED\n");
144  return true;
145  } else {
146  if (stopper_debug_level >= 1)
147  tprintf("REJECTED\n");
148  return false;
149  }
150 }
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:705
#define tprintf(...)
Definition: tprintf.h:31
int stopper_smallword_size
Definition: dict.h:607
bool dangerous_ambig_found() const
Definition: ratngs.h:360
double stopper_certainty_per_char
Definition: dict.h:609
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice)
Returns the length of the shortest alpha run in WordChoice.
Definition: stopper.cpp:452
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:58
float certainty() const
Definition: ratngs.h:327
double stopper_nondict_certainty_base
Definition: dict.h:603
const STRING debug_string() const
Definition: ratngs.h:502
int stopper_debug_level
Definition: dict.h:612
bool stopper_no_acceptable_choices
Definition: dict.h:615
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void tesseract::Dict::add_document_word ( const WERD_CHOICE & best_choice)

Adds a word found on this document to the document specific dictionary.

Definition at line 567 of file dict.cpp.

567  {
568  // Do not add hyphenated word parts to the document dawg.
569  // hyphen_word_ will be non-NULL after the set_hyphen_word() is
570  // called when the first part of the hyphenated word is
571  // discovered and while the second part of the word is recognized.
572  // hyphen_word_ is cleared in cc_recg() before the next word on
573  // the line is recognized.
574  if (hyphen_word_) return;
575 
576  char filename[CHARS_PER_LINE];
577  FILE *doc_word_file;
578  int stringlen = best_choice.length();
579 
580  if (valid_word(best_choice) || stringlen < 2)
581  return;
582 
583  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
584  if (best_choice.length() >= kDocDictMaxRepChars) {
585  int num_rep_chars = 1;
586  UNICHAR_ID uch_id = best_choice.unichar_id(0);
587  for (int i = 1; i < best_choice.length(); ++i) {
588  if (best_choice.unichar_id(i) != uch_id) {
589  num_rep_chars = 1;
590  uch_id = best_choice.unichar_id(i);
591  } else {
592  ++num_rep_chars;
593  if (num_rep_chars == kDocDictMaxRepChars) return;
594  }
595  }
596  }
597 
598  if (best_choice.certainty() < doc_dict_certainty_threshold ||
599  stringlen == 2) {
600  if (best_choice.certainty() < doc_dict_pending_threshold)
601  return;
602 
603  if (!pending_words_->word_in_dawg(best_choice)) {
604  if (stringlen > 2 ||
605  (stringlen == 2 &&
606  getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
607  getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
608  pending_words_->add_word_to_dawg(best_choice);
609  }
610  return;
611  }
612  }
613 
614  if (save_doc_words) {
615  strcpy(filename, getCCUtil()->imagefile.string());
616  strcat(filename, ".doc");
617  doc_word_file = open_file (filename, "a");
618  fprintf(doc_word_file, "%s\n",
619  best_choice.debug_string().string());
620  fclose(doc_word_file);
621  }
622  document_words_->add_word_to_dawg(best_choice);
623 }
const CCUtil * getCCUtil() const
Definition: dict.h:90
int length() const
Definition: ratngs.h:300
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:705
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
bool save_doc_words
Definition: dict.h:628
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:70
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:178
double doc_dict_pending_threshold
Definition: dict.h:630
float certainty() const
Definition: ratngs.h:327
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const STRING debug_string() const
Definition: ratngs.h:502
int UNICHAR_ID
Definition: unichar.h:33
double doc_dict_certainty_threshold
Definition: dict.h:632
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82
#define CHARS_PER_LINE
Definition: cutil.h:57
const char * string() const
Definition: strngs.cpp:193
void tesseract::Dict::adjust_word ( WERD_CHOICE * word,
bool  nonword,
XHeightConsistencyEnum  xheight_consistency,
float  additional_adjust,
bool  modify_rating,
bool  debug 
)

Adjusts the rating of the given word.

Definition at line 625 of file dict.cpp.

630  {
631  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
632  word->GetTopScriptID() == getUnicharset().han_sid());
633  bool case_is_ok = (is_han || case_ok(*word, getUnicharset()));
634  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
635 
636  float adjust_factor = additional_adjust;
637  float new_rating = word->rating();
638  new_rating += kRatingPad;
639  const char *xheight_triggered = "";
640  if (word->length() > 1) {
641  // Calculate x-height and y-offset consistency penalties.
642  switch (xheight_consistency) {
643  case XH_INCONSISTENT:
644  adjust_factor += xheight_penalty_inconsistent;
645  xheight_triggered = ", xhtBAD";
646  break;
647  case XH_SUBNORMAL:
648  adjust_factor += xheight_penalty_subscripts;
649  xheight_triggered = ", xhtSUB";
650  break;
651  case XH_GOOD:
652  // leave the factor alone - all good!
653  break;
654  }
655  // TODO(eger): if nonword is true, but there is a "core" thats' a dict
656  // word, negate nonword status.
657  } else {
658  if (debug) {
659  tprintf("Consistency could not be calculated.\n");
660  }
661  }
662  if (debug) {
663  tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
664  word->unichar_string().string(), word->rating(),
665  xheight_triggered);
666  }
667 
668  if (nonword) { // non-dictionary word
669  if (case_is_ok && punc_is_ok) {
670  adjust_factor += segment_penalty_dict_nonword;
671  new_rating *= adjust_factor;
672  if (debug) tprintf(", W");
673  } else {
674  adjust_factor += segment_penalty_garbage;
675  new_rating *= adjust_factor;
676  if (debug) {
677  if (!case_is_ok) tprintf(", C");
678  if (!punc_is_ok) tprintf(", P");
679  }
680  }
681  } else { // dictionary word
682  if (case_is_ok) {
683  if (!is_han && freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
685  adjust_factor += segment_penalty_dict_frequent_word;
686  new_rating *= adjust_factor;
687  if (debug) tprintf(", F");
688  } else {
689  adjust_factor += segment_penalty_dict_case_ok;
690  new_rating *= adjust_factor;
691  if (debug) tprintf(", ");
692  }
693  } else {
694  adjust_factor += segment_penalty_dict_case_bad;
695  new_rating *= adjust_factor;
696  if (debug) tprintf(", C");
697  }
698  }
699  new_rating -= kRatingPad;
700  if (modify_rating) word->set_rating(new_rating);
701  if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
702  word->set_adjust_factor(adjust_factor);
703 }
float rating() const
Definition: ratngs.h:324
int length() const
Definition: ratngs.h:300
int null_sid() const
Definition: unicharset.h:831
#define tprintf(...)
Definition: tprintf.h:31
int han_sid() const
Definition: unicharset.h:836
double segment_penalty_dict_case_ok
Definition: dict.h:574
void set_permuter(uinT8 perm)
Definition: ratngs.h:372
double xheight_penalty_inconsistent
Definition: dict.h:567
double segment_penalty_dict_case_bad
Definition: dict.h:578
const STRING & unichar_string() const
Definition: ratngs.h:524
int GetTopScriptID() const
Definition: ratngs.cpp:653
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:70
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:781
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:58
double xheight_penalty_subscripts
Definition: dict.h:564
double segment_penalty_dict_frequent_word
Definition: dict.h:570
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
double segment_penalty_dict_nonword
Definition: dict.h:586
double segment_penalty_garbage
Definition: dict.h:591
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void set_adjust_factor(float factor)
Definition: ratngs.h:306
void set_rating(float new_val)
Definition: ratngs.h:366
void tesseract::Dict::append_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR char_choices,
const BLOB_CHOICE blob_choice,
int  char_choice_index,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
WERD_CHOICE word,
float  certainties[],
float *  limit,
WERD_CHOICE best_choice,
int *  attempts_left,
void *  more_args 
)

append_choices

Checks whether the next choice is worth appending to the word being generated. If so, it keeps going deeper into the word.

This function assumes that Dict::go_deeper_fxn_ is set.

Definition at line 246 of file permdawg.cpp.

257  {
258  int word_ending =
259  (char_choice_index == char_choices.length() - 1) ? true : false;
260 
261  // Deal with fragments.
262  CHAR_FRAGMENT_INFO char_frag_info;
263  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
264  blob_choice.certainty(), prev_char_frag_info, debug,
265  word_ending, &char_frag_info)) {
266  return; // blob_choice must be an invalid fragment
267  }
268  // Search the next letter if this character is a fragment.
269  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
270  permute_choices(debug, char_choices, char_choice_index + 1,
271  &char_frag_info, word, certainties, limit,
272  best_choice, attempts_left, more_args);
273  return;
274  }
275 
276  // Add the next unichar.
277  float old_rating = word->rating();
278  float old_certainty = word->certainty();
279  uinT8 old_permuter = word->permuter();
280  certainties[word->length()] = char_frag_info.certainty;
282  char_frag_info.unichar_id, char_frag_info.num_fragments,
283  char_frag_info.rating, char_frag_info.certainty);
284 
285  // Explore the next unichar.
286  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
287  &char_frag_info, word_ending, word, certainties,
288  limit, best_choice, attempts_left, more_args);
289 
290  // Remove the unichar we added to explore other choices in it's place.
291  word->remove_last_unichar_id();
292  word->set_rating(old_rating);
293  word->set_certainty(old_certainty);
294  word->set_permuter(old_permuter);
295 }
float rating() const
Definition: ratngs.h:324
int length() const
Definition: genericvector.h:79
void set_certainty(float new_val)
Definition: ratngs.h:369
int length() const
Definition: ratngs.h:300
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:449
void set_permuter(uinT8 perm)
Definition: ratngs.h:372
UNICHAR_ID unichar_id
Definition: dict.h:41
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:204
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:203
float rating() const
Definition: ratngs.h:79
void remove_last_unichar_id()
Definition: ratngs.h:480
float certainty() const
Definition: ratngs.h:327
uinT8 permuter() const
Definition: ratngs.h:343
int num_fragments
Definition: dict.h:43
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:322
float rating
Definition: dict.h:44
float certainty
Definition: dict.h:45
float certainty() const
Definition: ratngs.h:82
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
void set_rating(float new_val)
Definition: ratngs.h:366
unsigned char uinT8
Definition: host.h:99
float tesseract::Dict::CallParamsModelClassify ( void *  path)
inline

Definition at line 393 of file dict.h.

393  {
394  ASSERT_HOST(params_model_classify_ != NULL); // ASSERT_HOST -> assert
395  return (this->*params_model_classify_)(
396  getCCUtil()->lang.string(), path);
397  }
const CCUtil * getCCUtil() const
Definition: dict.h:90
#define ASSERT_HOST(x)
Definition: errcode.h:84
float(Dict::* params_model_classify_)(const char *lang, void *path)
Definition: dict.h:390
STRING lang
Definition: ccutil.h:69
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
int tesseract::Dict::case_ok ( const WERD_CHOICE word,
const UNICHARSET unicharset 
)

Check a string to see if it matches a set of lexical rules.

Definition at line 58 of file context.cpp.

58  {
59  int state = 0;
60  int x;
61  for (x = 0; x < word.length(); ++x) {
62  UNICHAR_ID ch_id = word.unichar_id(x);
63  if (unicharset.get_isupper(ch_id))
64  state = case_state_table[state][1];
65  else if (unicharset.get_islower(ch_id))
66  state = case_state_table[state][2];
67  else if (unicharset.get_isdigit(ch_id))
68  state = case_state_table[state][3];
69  else
70  state = case_state_table[state][0];
71  if (state == -1) return false;
72  }
73  return state != 5; // single lower is bad
74 }
int length() const
Definition: ratngs.h:300
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
const int case_state_table[6][4]
Definition: context.cpp:35
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int UNICHAR_ID
Definition: unichar.h:33
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
UNICHAR_ID tesseract::Dict::char_for_dawg ( UNICHAR_ID  ch,
const Dawg dawg 
) const
inline

Definition at line 422 of file dict.h.

422  {
423  if (!dawg) return ch;
424  switch (dawg->type()) {
425  case DAWG_TYPE_NUMBER:
427  default:
428  return ch;
429  }
430  }
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:125
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
bool tesseract::Dict::compound_marker ( UNICHAR_ID  unichar_id)
inline

Definition at line 107 of file dict.h.

107  {
108  const GenericVector<UNICHAR_ID>& normed_ids =
109  getUnicharset().normed_ids(unichar_id);
110  return normed_ids.size() == 1 &&
111  (normed_ids[0] == hyphen_unichar_id_ ||
112  normed_ids[0] == slash_unichar_id_);
113  }
int size() const
Definition: genericvector.h:72
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
void tesseract::Dict::copy_hyphen_info ( WERD_CHOICE word) const
inline

If this word is hyphenated, copy its base word (the part on the line before) into the given word. This function assumes that word is not NULL.

Definition at line 135 of file dict.h.

135  {
136  if (this->hyphenated()) {
137  *word = *hyphen_word_;
138  if (hyphen_debug_level) word->print("copy_hyphen_info: ");
139  }
140  }
int hyphen_debug_level
Definition: dict.h:596
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125
void print() const
Definition: ratngs.h:563
WERD_CHOICE * tesseract::Dict::dawg_permute_and_select ( const BLOB_CHOICE_LIST_VECTOR char_choices,
float  rating_limit 
)

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to explore all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

dawg_permute_and_select

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to search all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

Definition at line 175 of file permdawg.cpp.

176  {
177  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
178  best_choice->make_bad();
179  best_choice->set_rating(rating_limit);
180  if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH)
181  return best_choice;
182  DawgPositionVector *active_dawgs =
183  new DawgPositionVector[char_choices.length() + 1];
184  init_active_dawgs(&(active_dawgs[0]), true);
185  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
187 
188  float certainties[MAX_WERD_LENGTH];
190  int attempts_left = max_permuter_attempts;
191  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : NULL,
192  char_choices, 0, NULL, &word, certainties, &rating_limit, best_choice,
193  &attempts_left, &dawg_args);
194  delete[] active_dawgs;
195  return best_choice;
196 }
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:52
int length() const
Definition: genericvector.h:79
int max_permuter_attempts
Definition: dict.h:637
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:523
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:204
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:203
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:440
int dawg_debug_level
Definition: dict.h:595
#define MAX_WERD_LENGTH
Definition: dict.h:36
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
#define NULL
Definition: host.h:144
void set_rating(float new_val)
Definition: ratngs.h:366
void tesseract::Dict::DebugWordChoices ( )

Prints the current choices for this word to stdout.

int tesseract::Dict::def_letter_is_okay ( void *  void_dawg_args,
UNICHAR_ID  unichar_id,
bool  word_end 
) const

Returns the maximal permuter code (from ccstruct/ratngs.h) if in light of the current state the letter at word_index in the given word is allowed according to at least one of the dawgs in dawgs_, otherwise returns NO_PERM.

The state is described by void_dawg_args, which are interpreted as DawgArgs and contain relevant active dawg positions. Each entry in the active_dawgs vector contains an index into the dawgs_ vector and an EDGE_REF that indicates the last edge followed in the dawg. It also may contain a position in the punctuation dawg which describes surrounding punctuation (see struct DawgPosition).

Input: At word_index 0 dawg_args->active_dawgs should contain an entry for each dawg that may start at the beginning of a word, with punc_ref and edge_ref initialized to NO_EDGE. Since the punctuation dawg includes the empty pattern " " (meaning anything without surrounding punctuation), having a single entry for the punctuation dawg will cover all dawgs reachable therefrom – that includes all number and word dawgs. The only dawg non-reachable from the punctuation_dawg is the pattern dawg. If hyphen state needs to be applied, the initial dawg_args->active_dawgs can be copied from the saved hyphen state (maintained by Dict). For word_index > 0 the corresponding state (active_dawgs and punc position) can be obtained from dawg_args->updated_dawgs passed to def_letter_is_okay for word_index-1. Note: the function assumes that the active_dawgs and updated_dawgs member variables of dawg_args are not NULL.

Output: The function fills in dawg_args->updated_dawgs vector with the entries for dawgs that contain the word up to the letter at word_index.

Definition at line 336 of file dict.cpp.

338  {
339  DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);
340 
341  if (dawg_debug_level >= 3) {
342  tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
343  " num active dawgs=%d\n",
344  getUnicharset().debug_str(unichar_id).string(), word_end,
345  dawg_args->active_dawgs->length());
346  }
347 
348  // Do not accept words that contain kPatternUnicharID.
349  // (otherwise pattern dawgs would not function correctly).
350  // Do not accept words containing INVALID_UNICHAR_IDs.
351  if (unichar_id == Dawg::kPatternUnicharID ||
352  unichar_id == INVALID_UNICHAR_ID) {
353  dawg_args->permuter = NO_PERM;
354  return NO_PERM;
355  }
356 
357  // Initialization.
358  PermuterType curr_perm = NO_PERM;
359  dawg_args->updated_dawgs->clear();
360 
361  // Go over the active_dawgs vector and insert DawgPosition records
362  // with the updated ref (an edge with the corresponding unichar id) into
363  // dawg_args->updated_pos.
364  for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
365  const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
366  const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : NULL;
367  const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : NULL;
368 
369  if (!dawg && !punc_dawg) {
370  // shouldn't happen.
371  tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
372  continue;
373  }
374  if (!dawg) {
375  // We're in the punctuation dawg. A core dawg has not been chosen.
376  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
377  EDGE_REF punc_transition_edge = punc_dawg->edge_char_of(
378  punc_node, Dawg::kPatternUnicharID, word_end);
379  if (punc_transition_edge != NO_EDGE) {
380  // Find all successors, and see which can transition.
381  const SuccessorList &slist = *(successors_[pos.punc_index]);
382  for (int s = 0; s < slist.length(); ++s) {
383  int sdawg_index = slist[s];
384  const Dawg *sdawg = dawgs_[sdawg_index];
385  UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg);
386  EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
387  if (dawg_edge != NO_EDGE) {
388  if (dawg_debug_level >=3) {
389  tprintf("Letter found in dawg %d\n", sdawg_index);
390  }
391  dawg_args->updated_dawgs->add_unique(
392  DawgPosition(sdawg_index, dawg_edge,
393  pos.punc_index, punc_transition_edge, false),
394  dawg_debug_level > 0,
395  "Append transition from punc dawg to current dawgs: ");
396  if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
397  }
398  }
399  }
400  EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id,
401  word_end);
402  if (punc_edge != NO_EDGE) {
403  if (dawg_debug_level >=3) {
404  tprintf("Letter found in punctuation dawg\n");
405  }
406  dawg_args->updated_dawgs->add_unique(
407  DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
408  dawg_debug_level > 0,
409  "Extend punctuation dawg: ");
410  if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
411  }
412  continue;
413  }
414 
415  if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
416  // We can end the main word here.
417  // If we can continue on the punc ref, add that possibility.
418  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
419  EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE
420  : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
421  if (punc_edge != NO_EDGE) {
422  dawg_args->updated_dawgs->add_unique(
423  DawgPosition(pos.dawg_index, pos.dawg_ref,
424  pos.punc_index, punc_edge, true),
425  dawg_debug_level > 0,
426  "Return to punctuation dawg: ");
427  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
428  }
429  }
430 
431  if (pos.back_to_punc) continue;
432 
433  // If we are dealing with the pattern dawg, look up all the
434  // possible edges, not only for the exact unichar_id, but also
435  // for all its character classes (alpha, digit, etc).
436  if (dawg->type() == DAWG_TYPE_PATTERN) {
437  ProcessPatternEdges(dawg, pos, unichar_id, word_end,
438  dawg_args->updated_dawgs, &curr_perm);
439  // There can't be any successors to dawg that is of type
440  // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
441  continue;
442  }
443 
444  // Find the edge out of the node for the unichar_id.
445  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
446  EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
447  : dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end);
448 
449  if (dawg_debug_level >= 3) {
450  tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
451  pos.dawg_index, node, edge);
452  }
453 
454  if (edge != NO_EDGE) { // the unichar was found in the current dawg
455  if (dawg_debug_level >=3) {
456  tprintf("Letter found in dawg %d\n", pos.dawg_index);
457  }
458  if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
459  if (dawg_debug_level >= 3) {
460  tprintf("Punctuation constraint not satisfied at end of word.\n");
461  }
462  continue;
463  }
464  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
465  dawg_args->updated_dawgs->add_unique(
466  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
467  false),
468  dawg_debug_level > 0,
469  "Append current dawg to updated active dawgs: ");
470  }
471  } // end for
472  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
473  // or if we found the current letter in a non-punctuation dawg. This
474  // allows preserving information on which dawg the "core" word came from.
475  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
476  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
477  (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
478  dawg_args->permuter = curr_perm;
479  }
480  if (dawg_debug_level >= 2) {
481  tprintf("Returning %d for permuter code for this character.\n");
482  }
483  return dawg_args->permuter;
484 }
#define tprintf(...)
Definition: tprintf.h:31
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgPositionVector *updated_dawgs, PermuterType *current_permuter) const
Definition: dict.cpp:486
PermuterType
Definition: ratngs.h:240
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:125
UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:422
int UNICHAR_ID
Definition: unichar.h:33
int dawg_debug_level
Definition: dict.h:595
#define REFFORMAT
Definition: dawg.h:92
GenericVector< int > SuccessorList
Definition: dawg.h:68
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
inT64 EDGE_REF
Definition: dawg.h:54
#define NULL
Definition: host.h:144
inT64 NODE_REF
Definition: dawg.h:55
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:412
double tesseract::Dict::def_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Default (no-op) implementation of probability in context function.

Definition at line 374 of file dict.h.

376  {
377  (void) context;
378  (void) context_bytes;
379  (void) character;
380  (void) character_bytes;
381  return 0.0;
382  }
void tesseract::Dict::default_dawgs ( DawgPositionVector anylength_dawgs,
bool  suppress_patterns 
) const

Definition at line 540 of file dict.cpp.

541  {
542  bool punc_dawg_available =
543  (punc_dawg_ != NULL) &&
544  punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
545 
546  for (int i = 0; i < dawgs_.length(); i++) {
547  if (dawgs_[i] != NULL &&
548  !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
549  int dawg_ty = dawgs_[i]->type();
550  bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
551  if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
552  *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
553  if (dawg_debug_level >= 3) {
554  tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
555  NO_EDGE);
556  }
557  } else if (!punc_dawg_available || !subsumed_by_punc) {
558  *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
559  if (dawg_debug_level >= 3) {
560  tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
561  }
562  }
563  }
564  }
565 }
int length() const
Definition: genericvector.h:79
#define tprintf(...)
Definition: tprintf.h:31
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:125
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
int dawg_debug_level
Definition: dict.h:595
#define REFFORMAT
Definition: dawg.h:92
#define NULL
Definition: host.h:144
void tesseract::Dict::End ( )

Definition at line 310 of file dict.cpp.

310  {
311  if (dawgs_.length() == 0)
312  return; // Not safe to call twice.
313  for (int i = 0; i < dawgs_.size(); i++) {
314  if (!dawg_cache_->FreeDawg(dawgs_[i])) {
315  delete dawgs_[i];
316  }
317  }
318  dawg_cache_->FreeDawg(bigram_dawg_);
319  if (dawg_cache_is_ours_) {
320  delete dawg_cache_;
321  dawg_cache_ = NULL;
322  }
323  successors_.delete_data_pointers();
324  dawgs_.clear();
325  successors_.clear();
326  document_words_ = NULL;
327  if (pending_words_ != NULL) {
328  delete pending_words_;
329  pending_words_ = NULL;
330  }
331 }
int size() const
Definition: genericvector.h:72
int length() const
Definition: genericvector.h:79
void delete_data_pointers()
#define NULL
Definition: host.h:144
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:41
void tesseract::Dict::EndDangerousAmbigs ( )

Definition at line 368 of file stopper.cpp.

368 {}
bool tesseract::Dict::fragment_state_okay ( UNICHAR_ID  curr_unichar_id,
float  curr_rating,
float  curr_certainty,
const CHAR_FRAGMENT_INFO prev_char_frag_info,
const char *  debug,
int  word_ending,
CHAR_FRAGMENT_INFO char_frag_info 
)

Definition at line 322 of file permdawg.cpp.

326  {
327  const CHAR_FRAGMENT *this_fragment =
328  getUnicharset().get_fragment(curr_unichar_id);
329  const CHAR_FRAGMENT *prev_fragment =
330  prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL;
331 
332  // Print debug info for fragments.
333  if (debug && (prev_fragment || this_fragment)) {
334  tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
335  getUnicharset().debug_str(curr_unichar_id).string(),
336  word_ending);
337  if (prev_fragment) {
338  tprintf("prev_fragment %s\n", prev_fragment->to_string().string());
339  }
340  if (this_fragment) {
341  tprintf("this_fragment %s\n", this_fragment->to_string().string());
342  }
343  }
344 
345  char_frag_info->unichar_id = curr_unichar_id;
346  char_frag_info->fragment = this_fragment;
347  char_frag_info->rating = curr_rating;
348  char_frag_info->certainty = curr_certainty;
349  char_frag_info->num_fragments = 1;
350  if (prev_fragment && !this_fragment) {
351  if (debug) tprintf("Skip choice with incomplete fragment\n");
352  return false;
353  }
354  if (this_fragment) {
355  // We are dealing with a fragment.
356  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
357  if (prev_fragment) {
358  if (!this_fragment->is_continuation_of(prev_fragment)) {
359  if (debug) tprintf("Non-matching fragment piece\n");
360  return false;
361  }
362  if (this_fragment->is_ending()) {
363  char_frag_info->unichar_id =
364  getUnicharset().unichar_to_id(this_fragment->get_unichar());
365  char_frag_info->fragment = NULL;
366  if (debug) {
367  tprintf("Built character %s from fragments\n",
368  getUnicharset().debug_str(
369  char_frag_info->unichar_id).string());
370  }
371  } else {
372  if (debug) tprintf("Record fragment continuation\n");
373  char_frag_info->fragment = this_fragment;
374  }
375  // Update certainty and rating.
376  char_frag_info->rating =
377  prev_char_frag_info->rating + curr_rating;
378  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
379  char_frag_info->certainty =
380  MIN(curr_certainty, prev_char_frag_info->certainty);
381  } else {
382  if (this_fragment->is_beginning()) {
383  if (debug) tprintf("Record fragment beginning\n");
384  } else {
385  if (debug) {
386  tprintf("Non-starting fragment piece with no prev_fragment\n");
387  }
388  return false;
389  }
390  }
391  }
392  if (word_ending && char_frag_info->fragment) {
393  if (debug) tprintf("Word can not end with a fragment\n");
394  return false;
395  }
396  return true;
397 }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
const CHAR_FRAGMENT * fragment
Definition: dict.h:42
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:92
#define tprintf(...)
Definition: tprintf.h:31
#define MIN(x, y)
Definition: ndminx.h:28
UNICHAR_ID unichar_id
Definition: dict.h:41
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
bool is_ending() const
Definition: unicharset.h:102
static STRING to_string(const char *unichar, int pos, int total, bool natural)
const char * get_unichar() const
Definition: unicharset.h:64
int num_fragments
Definition: dict.h:43
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
float rating
Definition: dict.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
float certainty
Definition: dict.h:45
bool is_beginning() const
Definition: unicharset.h:99
const CCUtil* tesseract::Dict::getCCUtil ( ) const
inline

Definition at line 90 of file dict.h.

90  {
91  return ccutil_;
92  }
CCUtil* tesseract::Dict::getCCUtil ( )
inline

Definition at line 93 of file dict.h.

93  {
94  return ccutil_;
95  }
const Dawg* tesseract::Dict::GetDawg ( int  index) const
inline

Return i-th dawg pointer recorded in the dawgs_ vector.

Definition at line 406 of file dict.h.

406 { return dawgs_[index]; }
const Dawg* tesseract::Dict::GetPuncDawg ( ) const
inline

Return the pointer to the punctuation dawg.

Definition at line 408 of file dict.h.

408 { return punc_dawg_; }
static NODE_REF tesseract::Dict::GetStartingNode ( const Dawg dawg,
EDGE_REF  edge_ref 
)
inlinestatic

Returns the appropriate next node given the EDGE_REF.

Definition at line 412 of file dict.h.

412  {
413  if (edge_ref == NO_EDGE) return 0; // beginning to explore the dawg
414  NODE_REF node = dawg->next_node(edge_ref);
415  if (node == 0) node = NO_EDGE; // end of word
416  return node;
417  }
inT64 NODE_REF
Definition: dawg.h:55
const Dawg* tesseract::Dict::GetUnambigDawg ( ) const
inline

Return the pointer to the unambiguous words dawg.

Definition at line 410 of file dict.h.

410 { return unambig_dawg_; }
const UnicharAmbigs& tesseract::Dict::getUnicharAmbigs ( ) const
inline

Definition at line 102 of file dict.h.

102  {
103  return getCCUtil()->unichar_ambigs;
104  }
const CCUtil * getCCUtil() const
Definition: dict.h:90
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:73
const UNICHARSET& tesseract::Dict::getUnicharset ( ) const
inline

Definition at line 96 of file dict.h.

96  {
97  return getCCUtil()->unicharset;
98  }
const CCUtil * getCCUtil() const
Definition: dict.h:90
UNICHARSET unicharset
Definition: ccutil.h:72
UNICHARSET& tesseract::Dict::getUnicharset ( )
inline

Definition at line 99 of file dict.h.

99  {
100  return getCCUtil()->unicharset;
101  }
const CCUtil * getCCUtil() const
Definition: dict.h:90
UNICHARSET unicharset
Definition: ccutil.h:72
DawgCache * tesseract::Dict::GlobalDawgCache ( )
static

Initialize Dict class - load dawgs from [lang].traineddata and user-specified wordlist and pattern list.

Definition at line 186 of file dict.cpp.

186  {
187  // We dynamically allocate this global cache (a singleton) so it will outlive
188  // every Tesseract instance (even those that someone else might declare as
189  // global statics).
190  static DawgCache *cache = new DawgCache(); // evil global singleton
191  return cache;
192 }
void tesseract::Dict::go_deeper_dawg_fxn ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR &  char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO *  prev_char_frag_info,
bool  word_ending,
WERD_CHOICE *  word,
float  certainties[],
float *  limit,
WERD_CHOICE *  best_choice,
int *  attempts_left,
void *  void_more_args 
)

If the choice being composed so far could be a dictionary word and we have not reached the end of the word, keep exploring the char_choices further.

Definition at line 52 of file permdawg.cpp.

56  {
57  DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args);
58  word_ending = (char_choice_index == char_choices.size()-1);
59  int word_index = word->length() - 1;
60  if (best_choice->rating() < *limit) return;
61  // Look up char in DAWG
62 
63  // If the current unichar is an ngram first try calling
64  // letter_is_okay() for each unigram it contains separately.
65  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
66  bool checked_unigrams = false;
67  if (getUnicharset().get_isngram(orig_uch_id)) {
68  if (dawg_debug_level) {
69  tprintf("checking unigrams in an ngram %s\n",
70  getUnicharset().debug_str(orig_uch_id).string());
71  }
72  int num_unigrams = 0;
73  word->remove_last_unichar_id();
75  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
76  // Since the string came out of the unicharset, failure is impossible.
77  ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, NULL,
78  NULL));
79  bool unigrams_ok = true;
80  // Construct DawgArgs that reflect the current state.
81  DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
82  DawgPositionVector unigram_updated_dawgs;
83  DawgArgs unigram_dawg_args(&unigram_active_dawgs,
84  &unigram_updated_dawgs,
85  more_args->permuter);
86  // Check unigrams in the ngram with letter_is_okay().
87  for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
88  UNICHAR_ID uch_id = encoding[i];
89  ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
90  ++num_unigrams;
91  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
92  unigrams_ok = (this->*letter_is_okay_)(
93  &unigram_dawg_args,
94  word->unichar_id(word_index+num_unigrams-1),
95  word_ending && i == encoding.size() - 1);
96  (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
97  if (dawg_debug_level) {
98  tprintf("unigram %s is %s\n",
99  getUnicharset().debug_str(uch_id).string(),
100  unigrams_ok ? "OK" : "not OK");
101  }
102  }
103  // Restore the word and copy the updated dawg state if needed.
104  while (num_unigrams-- > 0) word->remove_last_unichar_id();
105  word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
106  if (unigrams_ok) {
107  checked_unigrams = true;
108  more_args->permuter = unigram_dawg_args.permuter;
109  *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
110  }
111  }
112 
113  // Check which dawgs from the dawgs_ vector contain the word
114  // up to and including the current unichar.
115  if (checked_unigrams || (this->*letter_is_okay_)(
116  more_args, word->unichar_id(word_index), word_ending)) {
117  // Add a new word choice
118  if (word_ending) {
119  if (dawg_debug_level) {
120  tprintf("found word = %s\n", word->debug_string().string());
121  }
122  if (strcmp(output_ambig_words_file.string(), "") != 0) {
123  if (output_ambig_words_file_ == NULL) {
124  output_ambig_words_file_ =
125  fopen(output_ambig_words_file.string(), "wb+");
126  if (output_ambig_words_file_ == NULL) {
127  tprintf("Failed to open output_ambig_words_file %s\n",
128  output_ambig_words_file.string());
129  exit(1);
130  }
131  STRING word_str;
132  word->string_and_lengths(&word_str, NULL);
133  word_str += " ";
134  fprintf(output_ambig_words_file_, "%s", word_str.string());
135  }
136  STRING word_str;
137  word->string_and_lengths(&word_str, NULL);
138  word_str += " ";
139  fprintf(output_ambig_words_file_, "%s", word_str.string());
140  }
141  WERD_CHOICE *adjusted_word = word;
142  adjusted_word->set_permuter(more_args->permuter);
143  update_best_choice(*adjusted_word, best_choice);
144  } else { // search the next letter
145  // Make updated_* point to the next entries in the DawgPositionVector
146  // arrays (that were originally created in dawg_permute_and_select)
147  ++(more_args->updated_dawgs);
148  // Make active_dawgs and constraints point to the updated ones.
149  ++(more_args->active_dawgs);
150  permute_choices(debug, char_choices, char_choice_index + 1,
151  prev_char_frag_info, word, certainties, limit,
152  best_choice, attempts_left, more_args);
153  // Restore previous state to explore another letter in this position.
154  --(more_args->updated_dawgs);
155  --(more_args->active_dawgs);
156  }
157  } else {
158  if (dawg_debug_level) {
159  tprintf("last unichar not OK at index %d in %s\n",
160  word_index, word->debug_string().string());
161  }
162  }
163 }
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:446
int size() const
Definition: genericvector.h:72
float rating() const
Definition: ratngs.h:324
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:169
int length() const
Definition: ratngs.h:300
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:449
#define tprintf(...)
Definition: tprintf.h:31
void set_permuter(uinT8 perm)
Definition: ratngs.h:372
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:204
#define ASSERT_HOST(x)
Definition: errcode.h:84
void remove_last_unichar_id()
Definition: ratngs.h:480
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
const STRING debug_string() const
Definition: ratngs.h:502
int UNICHAR_ID
Definition: unichar.h:33
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
int dawg_debug_level
Definition: dict.h:595
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:347
char * output_ambig_words_file
Definition: dict.h:593
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
int tesseract::Dict::good_choice ( const WERD_CHOICE choice)

Returns true if a good answer is found for the unknown blob rating.

bool tesseract::Dict::has_hyphen_end ( UNICHAR_ID  unichar_id,
bool  first_pos 
) const
inline

Check whether the word has a hyphen at the end.

Definition at line 142 of file dict.h.

142  {
143  if (!last_word_on_line_ || first_pos)
144  return false;
145  const GenericVector<UNICHAR_ID>& normed_ids =
146  getUnicharset().normed_ids(unichar_id);
147  return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
148  }
int size() const
Definition: genericvector.h:72
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
bool tesseract::Dict::has_hyphen_end ( const WERD_CHOICE & word) const
inline

Same as above, but check the unichar at the end of the word.

Definition at line 150 of file dict.h.

150  {
151  int word_index = word.length() - 1;
152  return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
153  }
int length() const
Definition: ratngs.h:300
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:142
int tesseract::Dict::hyphen_base_size ( ) const
inline

Size of the base word (the part on the line before) of a hyphenated word.

Definition at line 129 of file dict.h.

129  {
130  return this->hyphenated() ? hyphen_word_->length() : 0;
131  }
int length() const
Definition: ratngs.h:300
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125
bool tesseract::Dict::hyphenated ( ) const
inline

Returns true if we've recorded the beginning of a hyphenated word.

Definition at line 125 of file dict.h.

125  { return
126  !last_word_on_line_ && hyphen_word_;
127  }
void tesseract::Dict::init_active_dawgs ( DawgPositionVector *  active_dawgs,
bool  ambigs_mode 
) const

Fill the given active_dawgs vector with dawgs that could contain the beginning of the word. If hyphenated() returns true, copy the entries from hyphen_active_dawgs_ instead.

Definition at line 523 of file dict.cpp.

524  {
525  int i;
526  if (hyphenated()) {
527  *active_dawgs = hyphen_active_dawgs_;
528  if (dawg_debug_level >= 3) {
529  for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
530  tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
531  hyphen_active_dawgs_[i].dawg_index,
532  hyphen_active_dawgs_[i].dawg_ref);
533  }
534  }
535  } else {
536  default_dawgs(active_dawgs, ambigs_mode);
537  }
538 }
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:540
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125
int dawg_debug_level
Definition: dict.h:595
#define REFFORMAT
Definition: dawg.h:92
bool tesseract::Dict::is_apostrophe ( UNICHAR_ID  unichar_id)
inline

Definition at line 116 of file dict.h.

116  {
117  const GenericVector<UNICHAR_ID>& normed_ids =
118  getUnicharset().normed_ids(unichar_id);
119  return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
120  }
int size() const
Definition: genericvector.h:72
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
int tesseract::Dict::LengthOfShortestAlphaRun ( const WERD_CHOICE & WordChoice)

Returns the length of the shortest alpha run in WordChoice.

Definition at line 452 of file stopper.cpp.

452  {
453  int shortest = MAX_INT32;
454  int curr_len = 0;
455  for (int w = 0; w < WordChoice.length(); ++w) {
456  if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
457  curr_len++;
458  } else if (curr_len > 0) {
459  if (curr_len < shortest) shortest = curr_len;
460  curr_len = 0;
461  }
462  }
463  if (curr_len > 0 && curr_len < shortest) {
464  shortest = curr_len;
465  } else if (shortest == MAX_INT32) {
466  shortest = 0;
467  }
468  return shortest;
469 }
int length() const
Definition: ratngs.h:300
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
#define MAX_INT32
Definition: host.h:120
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
int tesseract::Dict::LetterIsOkay ( void *  void_dawg_args,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inline

Calls letter_is_okay_ member function.

Definition at line 350 of file dict.h.

351  {
352  return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
353  }
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:347
void tesseract::Dict::Load ( DawgCache * dawg_cache)

Definition at line 194 of file dict.cpp.

194  {
195  STRING name;
196  STRING &lang = getCCUtil()->lang;
197 
198  if (dawgs_.length() != 0) this->End();
199 
200  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
201  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
202  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
203  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
204 
205  if (dawg_cache != NULL) {
206  dawg_cache_ = dawg_cache;
207  dawg_cache_is_ours_ = false;
208  } else {
209  dawg_cache_ = new DawgCache();
210  dawg_cache_is_ours_ = true;
211  }
212 
213  TessdataManager &tessdata_manager = getCCUtil()->tessdata_manager;
214  const char *data_file_name = tessdata_manager.GetDataFileName().string();
215 
216  // Load dawgs_.
217  if (load_punc_dawg) {
218  punc_dawg_ = dawg_cache_->GetSquishedDawg(
219  lang, data_file_name, TESSDATA_PUNC_DAWG, dawg_debug_level);
220  if (punc_dawg_) dawgs_ += punc_dawg_;
221  }
222  if (load_system_dawg) {
223  Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
224  lang, data_file_name, TESSDATA_SYSTEM_DAWG, dawg_debug_level);
225  if (system_dawg) dawgs_ += system_dawg;
226  }
227  if (load_number_dawg) {
228  Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
229  lang, data_file_name, TESSDATA_NUMBER_DAWG, dawg_debug_level);
230  if (number_dawg) dawgs_ += number_dawg;
231  }
232  if (load_bigram_dawg) {
233  bigram_dawg_ = dawg_cache_->GetSquishedDawg(
234  lang, data_file_name, TESSDATA_BIGRAM_DAWG, dawg_debug_level);
235  }
236  if (load_freq_dawg) {
237  freq_dawg_ = dawg_cache_->GetSquishedDawg(
238  lang, data_file_name, TESSDATA_FREQ_DAWG, dawg_debug_level);
239  if (freq_dawg_) { dawgs_ += freq_dawg_; }
240  }
241  if (load_unambig_dawg) {
242  unambig_dawg_ = dawg_cache_->GetSquishedDawg(
243  lang, data_file_name, TESSDATA_UNAMBIG_DAWG, dawg_debug_level);
244  if (unambig_dawg_) dawgs_ += unambig_dawg_;
245  }
246 
247  if (((STRING &)user_words_suffix).length() > 0 ||
248  ((STRING &)user_words_file).length() > 0) {
249  Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
250  getUnicharset().size(), dawg_debug_level);
251  if (((STRING &)user_words_file).length() > 0) {
252  name = user_words_file;
253  } else {
255  name += user_words_suffix;
256  }
257  if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
259  tprintf("Error: failed to load %s\n", name.string());
260  delete trie_ptr;
261  } else {
262  dawgs_ += trie_ptr;
263  }
264  }
265 
266  if (((STRING &)user_patterns_suffix).length() > 0 ||
267  ((STRING &)user_patterns_file).length() > 0) {
268  Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
269  getUnicharset().size(), dawg_debug_level);
270  trie_ptr->initialize_patterns(&(getUnicharset()));
271  if (((STRING &)user_patterns_file).length() > 0) {
272  name = user_patterns_file;
273  } else {
275  name += user_patterns_suffix;
276  }
277  if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
278  tprintf("Error: failed to load %s\n", name.string());
279  delete trie_ptr;
280  } else {
281  dawgs_ += trie_ptr;
282  }
283  }
284 
285  document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
286  getUnicharset().size(), dawg_debug_level);
287  dawgs_ += document_words_;
288 
289  // This dawg is temporary and should not be searched by letter_is_ok.
290  pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
291  getUnicharset().size(), dawg_debug_level);
292 
293  // Construct a list of corresponding successors for each dawg. Each entry i
294  // in the successors_ vector is a vector of integers that represent the
295  // indices into the dawgs_ vector of the successors for dawg i.
296  successors_.reserve(dawgs_.length());
297  for (int i = 0; i < dawgs_.length(); ++i) {
298  const Dawg *dawg = dawgs_[i];
299  SuccessorList *lst = new SuccessorList();
300  for (int j = 0; j < dawgs_.length(); ++j) {
301  const Dawg *other = dawgs_[j];
302  if (dawg != NULL && other != NULL &&
303  (dawg->lang() == other->lang()) &&
304  kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
305  }
306  successors_ += lst;
307  }
308 }
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
int length() const
Definition: genericvector.h:79
const CCUtil * getCCUtil() const
Definition: dict.h:90
bool load_bigram_dawg
Definition: dict.h:561
#define tprintf(...)
Definition: tprintf.h:31
TessdataManager tessdata_manager
Definition: ccutil.h:71
char * user_patterns_file
Definition: dict.h:551
void End()
Definition: dict.cpp:310
Dawg * GetSquishedDawg(const STRING &lang, const char *data_file_name, TessdataType tessdata_dawg_type, int debug_level)
Definition: dawg_cache.cpp:47
name_table name
const STRING & GetDataFileName() const
char * user_patterns_suffix
Definition: dict.h:553
bool load_system_dawg
Definition: dict.h:554
STRING language_data_path_prefix
Definition: ccutil.h:70
int dawg_debug_level
Definition: dict.h:595
bool load_unambig_dawg
Definition: dict.h:556
bool load_freq_dawg
Definition: dict.h:555
GenericVector< int > SuccessorList
Definition: dawg.h:68
char * user_words_suffix
Definition: dict.h:549
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
void reserve(int size)
STRING lang
Definition: ccutil.h:69
char * user_words_file
Definition: dict.h:547
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
bool load_number_dawg
Definition: dict.h:559
bool load_punc_dawg
Definition: dict.h:558
double tesseract::Dict::ngram_probability_in_context ( const char *  lang,
const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
bool tesseract::Dict::NoDangerousAmbig ( WERD_CHOICE *  BestChoice,
DANGERR *  fixpt,
bool  fix_replaceable,
MATRIX *  ratings 
)

Definition at line 152 of file stopper.cpp.

155  {
156  if (stopper_debug_level > 2) {
157  tprintf("\nRunning NoDangerousAmbig() for %s\n",
158  best_choice->debug_string().string());
159  }
160 
161  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
162  // for each unichar id in BestChoice.
163  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
164  int i;
165  bool ambigs_found = false;
166  // For each position in best_choice:
167  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
168  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
169  // -- look for ambiguities corresponding to wrong_ngram in the list while
170  // adding the following unichar_ids from best_choice to wrong_ngram
171  //
172  // Repeat the above procedure twice: first time look through
173  // ambigs to be replaced and replace all the ambiguities found;
174  // second time look through dangerous ambiguities and construct
175  // ambig_blob_choices with fake a blob choice for each ambiguity
176  // and pass them to dawg_permute_and_select() to search for
177  // ambiguous words in the dictionaries.
178  //
179  // Note that during the execution of the for loop (on the first pass)
180  // if replacements are made the length of best_choice might change.
181  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
182  bool replace = (fix_replaceable && pass == 0);
183  const UnicharAmbigsVector &table = replace ?
185  if (!replace) {
186  // Initialize ambig_blob_choices with lists containing a single
187  // unichar id for the correspoding position in best_choice.
188  // best_choice consisting from only the original letters will
189  // have a rating of 0.0.
190  for (i = 0; i < best_choice->length(); ++i) {
191  BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
192  BLOB_CHOICE_IT lst_it(lst);
193  // TODO(rays/antonova) Put real xheights and y shifts here.
194  lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
195  0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
196  ambig_blob_choices.push_back(lst);
197  }
198  }
199  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
200  int wrong_ngram_index;
201  int next_index;
202  int blob_index = 0;
203  for (i = 0; i < best_choice->length(); blob_index += best_choice->state(i),
204  ++i) {
205  UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
206  if (stopper_debug_level > 2) {
207  tprintf("Looking for %s ngrams starting with %s:\n",
208  replace ? "replaceable" : "ambiguous",
209  getUnicharset().debug_str(curr_unichar_id).string());
210  }
211  int num_wrong_blobs = best_choice->state(i);
212  wrong_ngram_index = 0;
213  wrong_ngram[wrong_ngram_index] = curr_unichar_id;
214  if (curr_unichar_id == INVALID_UNICHAR_ID ||
215  curr_unichar_id >= table.size() ||
216  table[curr_unichar_id] == NULL) {
217  continue; // there is no ambig spec for this unichar id
218  }
219  AmbigSpec_IT spec_it(table[curr_unichar_id]);
220  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
221  const AmbigSpec *ambig_spec = spec_it.data();
222  wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
223  int compare = UnicharIdArrayUtils::compare(wrong_ngram,
224  ambig_spec->wrong_ngram);
225  if (stopper_debug_level > 2) {
226  tprintf("candidate ngram: ");
228  tprintf("current ngram from spec: ");
229  UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
230  tprintf("comparison result: %d\n", compare);
231  }
232  if (compare == 0) {
233  // Record the place where we found an ambiguity.
234  if (fixpt != NULL) {
235  UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
236  fixpt->push_back(DANGERR_INFO(
237  blob_index, blob_index + num_wrong_blobs, replace,
238  getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
239  leftmost_id));
240  if (stopper_debug_level > 1) {
241  tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index,
242  blob_index + num_wrong_blobs, false,
243  getUnicharset().get_isngram(
244  ambig_spec->correct_ngram_id),
245  getUnicharset().id_to_unichar(leftmost_id));
246  }
247  }
248 
249  if (replace) {
250  if (stopper_debug_level > 2) {
251  tprintf("replace ambiguity with %s : ",
252  getUnicharset().id_to_unichar(
253  ambig_spec->correct_ngram_id));
255  ambig_spec->correct_fragments, getUnicharset());
256  }
257  ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
258  ambig_spec->correct_ngram_id,
259  best_choice, ratings);
260  } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
261  // We found dang ambig - update ambig_blob_choices.
262  if (stopper_debug_level > 2) {
263  tprintf("found ambiguity: ");
265  ambig_spec->correct_fragments, getUnicharset());
266  }
267  ambigs_found = true;
268  for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
269  ++tmp_index) {
270  // Add a blob choice for the corresponding fragment of the
271  // ambiguity. These fake blob choices are initialized with
272  // negative ratings (which are not possible for real blob
273  // choices), so that dawg_permute_and_select() considers any
274  // word not consisting of only the original letters a better
275  // choice and stops searching for alternatives once such a
276  // choice is found.
277  BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
278  bc_it.add_to_end(new BLOB_CHOICE(
279  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
280  -1, 0, 1, 0, BCC_AMBIG));
281  }
282  }
283  spec_it.forward();
284  } else if (compare == -1) {
285  if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
286  ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
287  // Add the next unichar id to wrong_ngram and keep looking for
288  // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
289  wrong_ngram[++wrong_ngram_index] =
290  best_choice->unichar_id(next_index);
291  num_wrong_blobs += best_choice->state(next_index);
292  } else {
293  break; // no more matching ambigs in this AMBIG_SPEC_LIST
294  }
295  } else {
296  spec_it.forward();
297  }
298  } // end searching AmbigSpec_LIST
299  } // end searching best_choice
300  } // end searching replace and dangerous ambigs
301 
302  // If any ambiguities were found permute the constructed ambig_blob_choices
303  // to see if an alternative dictionary word can be found.
304  if (ambigs_found) {
305  if (stopper_debug_level > 2) {
306  tprintf("\nResulting ambig_blob_choices:\n");
307  for (i = 0; i < ambig_blob_choices.length(); ++i) {
308  print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
309  tprintf("\n");
310  }
311  }
312  WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
313  ambigs_found = (alt_word->rating() < 0.0);
314  if (ambigs_found) {
315  if (stopper_debug_level >= 1) {
316  tprintf ("Stopper: Possible ambiguous word = %s\n",
317  alt_word->debug_string().string());
318  }
319  if (fixpt != NULL) {
320  // Note: Currently character choices combined from fragments can only
321  // be generated by NoDangrousAmbigs(). This code should be updated if
322  // the capability to produce classifications combined from character
323  // fragments is added to other functions.
324  int orig_i = 0;
325  for (i = 0; i < alt_word->length(); ++i) {
326  const UNICHARSET &uchset = getUnicharset();
327  bool replacement_is_ngram =
328  uchset.get_isngram(alt_word->unichar_id(i));
329  UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
330  if (replacement_is_ngram) {
331  // we have to extract the leftmost unichar from the ngram.
332  const char *str = uchset.id_to_unichar(leftmost_id);
333  int step = uchset.step(str);
334  if (step) leftmost_id = uchset.unichar_to_id(str, step);
335  }
336  int end_i = orig_i + alt_word->state(i);
337  if (alt_word->state(i) > 1 ||
338  (orig_i + 1 == end_i && replacement_is_ngram)) {
339  // Compute proper blob indices.
340  int blob_start = 0;
341  for (int j = 0; j < orig_i; ++j)
342  blob_start += best_choice->state(j);
343  int blob_end = blob_start;
344  for (int j = orig_i; j < end_i; ++j)
345  blob_end += best_choice->state(j);
346  fixpt->push_back(DANGERR_INFO(blob_start, blob_end, true,
347  replacement_is_ngram, leftmost_id));
348  if (stopper_debug_level > 1) {
349  tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
350  true, replacement_is_ngram,
351  uchset.id_to_unichar(leftmost_id));
352  }
353  }
354  orig_i += alt_word->state(i);
355  }
356  }
357  }
358  delete alt_word;
359  }
360  if (output_ambig_words_file_ != NULL) {
361  fprintf(output_ambig_words_file_, "\n");
362  }
363 
364  ambig_blob_choices.delete_data_pointers();
365  return !ambigs_found;
366 }
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
Definition: stopper.cpp:378
float rating() const
Definition: ratngs.h:324
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
int length() const
Definition: genericvector.h:79
static int compare(const UNICHAR_ID array1[], const UNICHAR_ID array2[])
Definition: ambigs.h:62
#define MAX_AMBIG_SIZE
Definition: ambigs.h:30
int length() const
Definition: ratngs.h:300
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:154
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:142
int state(int index) const
Definition: ratngs.h:316
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:102
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:484
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
void delete_data_pointers()
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
const STRING debug_string() const
Definition: ratngs.h:502
int stopper_debug_level
Definition: dict.h:612
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:175
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:153
int UNICHAR_ID
Definition: unichar.h:33
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:97
int step(const char *str) const
Definition: unicharset.cpp:211
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
T & get(int index) const
const int tesseract::Dict::NumDawgs ( ) const
inline

Return the number of dawgs in the dawgs_ vector.

Definition at line 404 of file dict.h.

404 { return dawgs_.size(); }
int size() const
Definition: genericvector.h:72
float tesseract::Dict::ParamsModelClassify ( const char *  lang,
void *  path 
)
void tesseract::Dict::permute_choices ( const char *  debug,
const BLOB_CHOICE_LIST_VECTOR &  char_choices,
int  char_choice_index,
const CHAR_FRAGMENT_INFO *  prev_char_frag_info,
WERD_CHOICE *  word,
float  certainties[],
float *  limit,
WERD_CHOICE *  best_choice,
int *  attempts_left,
void *  more_args 
)

permute_choices

Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST with the given char_choice_index in char_choices.

Definition at line 204 of file permdawg.cpp.

214  {
215  if (debug) {
216  tprintf("%s permute_choices: char_choice_index=%d"
217  " limit=%g rating=%g, certainty=%g word=%s\n",
218  debug, char_choice_index, *limit, word->rating(),
219  word->certainty(), word->debug_string().string());
220  }
221  if (char_choice_index < char_choices.length()) {
222  BLOB_CHOICE_IT blob_choice_it;
223  blob_choice_it.set_to_list(char_choices.get(char_choice_index));
224  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
225  blob_choice_it.forward()) {
226  (*attempts_left)--;
227  append_choices(debug, char_choices, *(blob_choice_it.data()),
228  char_choice_index, prev_char_frag_info, word,
229  certainties, limit, best_choice, attempts_left, more_args);
230  if (*attempts_left <= 0) {
231  if (debug) tprintf("permute_choices(): attempts_left is 0\n");
232  break;
233  }
234  }
235  }
236 }
float rating() const
Definition: ratngs.h:324
int length() const
Definition: genericvector.h:79
#define tprintf(...)
Definition: tprintf.h:31
float certainty() const
Definition: ratngs.h:327
const STRING debug_string() const
Definition: ratngs.h:502
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:246
const char * string() const
Definition: strngs.cpp:193
T & get(int index) const
double tesseract::Dict::ProbabilityInContext ( const char *  context,
int  context_bytes,
const char *  character,
int  character_bytes 
)
inline

Calls probability_in_context_ member function.

Definition at line 363 of file dict.h.

366  {
367  return (this->*probability_in_context_)(
368  getCCUtil()->lang.string(),
369  context, context_bytes,
370  character, character_bytes);
371  }
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:357
const CCUtil * getCCUtil() const
Definition: dict.h:90
STRING lang
Definition: ccutil.h:69
const char * string() const
Definition: strngs.cpp:193
void tesseract::Dict::ProcessPatternEdges ( const Dawg *  dawg,
const DawgPosition &  info,
UNICHAR_ID  unichar_id,
bool  word_end,
DawgPositionVector *  updated_dawgs,
PermuterType *  current_permuter 
) const

For each of the character classes of the given unichar_id (and the unichar_id itself) finds the corresponding outgoing node or self-loop in the given dawg and (after checking that it is valid) records it in updated_dawgs. Updates current_permuter if any valid edges were found.

Definition at line 486 of file dict.cpp.

489  {
490  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
491  // Try to find the edge corresponding to the exact unichar_id and to all the
492  // edges corresponding to the character class of unichar_id.
493  GenericVector<UNICHAR_ID> unichar_id_patterns;
494  unichar_id_patterns.push_back(unichar_id);
495  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
496  &unichar_id_patterns);
497  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
498  // On the first iteration check all the outgoing edges.
499  // On the second iteration check all self-loops.
500  for (int k = 0; k < 2; ++k) {
501  EDGE_REF edge = (k == 0)
502  ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
503  : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_patterns[i], word_end);
504  if (edge == NO_EDGE) continue;
505  if (dawg_debug_level >= 3) {
506  tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
507  pos.dawg_index, node, edge);
508  tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
509  }
510  if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
511  updated_dawgs->add_unique(
512  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
513  pos.back_to_punc),
514  dawg_debug_level > 0,
515  "Append current dawg to updated active dawgs: ");
516  }
517  }
518 }
int size() const
Definition: genericvector.h:72
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
int dawg_debug_level
Definition: dict.h:595
#define REFFORMAT
Definition: dawg.h:92
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
inT64 EDGE_REF
Definition: dawg.h:54
inT64 NODE_REF
Definition: dawg.h:55
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:412
void tesseract::Dict::ReplaceAmbig ( int  wrong_ngram_begin_index,
int  wrong_ngram_size,
UNICHAR_ID  correct_ngram_id,
WERD_CHOICE werd_choice,
MATRIX ratings 
)

Definition at line 378 of file stopper.cpp.

380  {
381  int num_blobs_to_replace = 0;
382  int begin_blob_index = 0;
383  int i;
384  // Rating and certainty for the new BLOB_CHOICE are derived from the
385  // replaced choices.
386  float new_rating = 0.0f;
387  float new_certainty = 0.0f;
388  BLOB_CHOICE* old_choice = NULL;
389  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
390  if (i >= wrong_ngram_begin_index) {
391  int num_blobs = werd_choice->state(i);
392  int col = begin_blob_index + num_blobs_to_replace;
393  int row = col + num_blobs - 1;
394  BLOB_CHOICE_LIST* choices = ratings->get(col, row);
395  ASSERT_HOST(choices != NULL);
396  old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
397  ASSERT_HOST(old_choice != NULL);
398  new_rating += old_choice->rating();
399  new_certainty += old_choice->certainty();
400  num_blobs_to_replace += num_blobs;
401  } else {
402  begin_blob_index += werd_choice->state(i);
403  }
404  }
405  new_certainty /= wrong_ngram_size;
406  // If there is no entry in the ratings matrix, add it.
407  MATRIX_COORD coord(begin_blob_index,
408  begin_blob_index + num_blobs_to_replace - 1);
409  if (!coord.Valid(*ratings)) {
410  ratings->IncreaseBandSize(coord.row - coord.col + 1);
411  }
412  if (ratings->get(coord.col, coord.row) == NULL)
413  ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
414  BLOB_CHOICE_LIST* new_choices = ratings->get(coord.col, coord.row);
415  BLOB_CHOICE* choice = FindMatchingChoice(correct_ngram_id, new_choices);
416  if (choice != NULL) {
417  // Already there. Upgrade if new rating better.
418  if (new_rating < choice->rating())
419  choice->set_rating(new_rating);
420  if (new_certainty < choice->certainty())
421  choice->set_certainty(new_certainty);
422  // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
423  } else {
424  // Need a new choice with the correct_ngram_id.
425  choice = new BLOB_CHOICE(*old_choice);
426  choice->set_unichar_id(correct_ngram_id);
427  choice->set_rating(new_rating);
428  choice->set_certainty(new_certainty);
429  choice->set_classifier(BCC_AMBIG);
430  choice->set_matrix_cell(coord.col, coord.row);
431  BLOB_CHOICE_IT it (new_choices);
432  it.add_to_end(choice);
433  }
434  // Remove current unichar from werd_choice. On the last iteration
435  // set the correct replacement unichar instead of removing a unichar.
436  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
437  ++replaced_count) {
438  if (replaced_count + 1 == wrong_ngram_size) {
439  werd_choice->set_blob_choice(wrong_ngram_begin_index,
440  num_blobs_to_replace, choice);
441  } else {
442  werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
443  }
444  }
445  if (stopper_debug_level >= 1) {
446  werd_choice->print("ReplaceAmbig() ");
447  tprintf("Modified blob_choices: ");
448  print_ratings_list("\n", new_choices, getUnicharset());
449  }
450 }
void remove_unichar_id(int index)
Definition: ratngs.h:481
T get(int column, int row) const
Definition: matrix.h:171
#define tprintf(...)
Definition: tprintf.h:31
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:160
void put(int column, int row, const T &thing)
Definition: matrix.h:166
#define ASSERT_HOST(x)
Definition: errcode.h:84
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
float rating() const
Definition: ratngs.h:79
int state(int index) const
Definition: ratngs.h:316
void set_classifier(BlobChoiceClassifier classifier)
Definition: ratngs.h:166
void set_certainty(float newrat)
Definition: ratngs.h:150
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int stopper_debug_level
Definition: dict.h:612
void set_matrix_cell(int col, int row)
Definition: ratngs.h:156
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:290
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
void print() const
Definition: ratngs.h:563
#define NULL
Definition: host.h:144
void set_rating(float newrat)
Definition: ratngs.h:147
float certainty() const
Definition: ratngs.h:82
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:144
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
void tesseract::Dict::reset_hyphen_vars ( bool  last_word_on_line)

Unless the previous word was the last one on its line and the current word is not (i.e. the current word is the first one on a new line), erases hyphen_word_ and clears hyphen_active_dawgs_. In all cases, updates last_word_on_line_ to the given value.

Definition at line 32 of file hyphen.cpp.

32  {
33  if (!(last_word_on_line_ == true && last_word_on_line == false)) {
34  if (hyphen_word_ != NULL) {
35  delete hyphen_word_;
36  hyphen_word_ = NULL;
37  hyphen_active_dawgs_.clear();
38  }
39  }
40  if (hyphen_debug_level) {
41  tprintf("reset_hyphen_vars: last_word_on_line %d -> %d\n",
42  last_word_on_line_, last_word_on_line);
43  }
44  last_word_on_line_ = last_word_on_line;
45 }
int hyphen_debug_level
Definition: dict.h:596
#define tprintf(...)
Definition: tprintf.h:31
#define NULL
Definition: host.h:144
void tesseract::Dict::ResetDocumentDictionary ( )
inline

Definition at line 301 of file dict.h.

301  {
302  if (pending_words_ != NULL)
303  pending_words_->clear();
304  if (document_words_ != NULL)
305  document_words_->clear();
306  }
void clear()
Definition: trie.cpp:66
#define NULL
Definition: host.h:144
void tesseract::Dict::set_hyphen_word ( const WERD_CHOICE word,
const DawgPositionVector active_dawgs 
)

Update hyphen_word_, and copy the given DawgPositionVectors into hyphen_active_dawgs_ .

Definition at line 49 of file hyphen.cpp.

50  {
51  if (hyphen_word_ == NULL) {
52  hyphen_word_ = new WERD_CHOICE(word.unicharset());
53  hyphen_word_->make_bad();
54  }
55  if (hyphen_word_->rating() > word.rating()) {
56  *hyphen_word_ = word;
57  // Remove the last unichar id as it is a hyphen, and remove
58  // any unichar_string/lengths that are present.
59  hyphen_word_->remove_last_unichar_id();
60  hyphen_active_dawgs_ = active_dawgs;
61  }
62  if (hyphen_debug_level) {
63  hyphen_word_->print("set_hyphen_word: ");
64  }
65 }
int hyphen_debug_level
Definition: dict.h:596
float rating() const
Definition: ratngs.h:324
const UNICHARSET * unicharset() const
Definition: ratngs.h:297
void remove_last_unichar_id()
Definition: ratngs.h:480
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:440
void print() const
Definition: ratngs.h:563
#define NULL
Definition: host.h:144
void tesseract::Dict::SettupStopperPass1 ( )

Sets up stopper variables in preparation for the first pass.

Definition at line 370 of file stopper.cpp.

370  {
371  reject_offset_ = 0.0;
372 }
void tesseract::Dict::SettupStopperPass2 ( )

Sets up stopper variables in preparation for the second pass.

Definition at line 374 of file stopper.cpp.

374  {
376 }
double stopper_phase2_certainty_rejection_offset
Definition: dict.h:605
void tesseract::Dict::SetWildcardID ( UNICHAR_ID  id)
inline

Definition at line 399 of file dict.h.

399 { wildcard_unichar_id_ = id; }
void tesseract::Dict::SetWordsegRatingAdjustFactor ( float  f)
inline

Set wordseg_rating_adjust_factor_ to the given value.

Definition at line 483 of file dict.h.

483  {
484  wordseg_rating_adjust_factor_ = f;
485  }
int tesseract::Dict::UniformCertainties ( const WERD_CHOICE word)

Returns true if the certainty of the BestChoice word is within a reasonable range of the average certainties for the best choices for each character in the segmentation. This test is used to catch words in which one character is much worse than the other characters in the word (i.e. false will be returned in that case). The algorithm computes the mean and std deviation of the certainties in the word with the worst certainty thrown out.

Definition at line 471 of file stopper.cpp.

471  {
472  float Certainty;
473  float WorstCertainty = MAX_FLOAT32;
474  float CertaintyThreshold;
475  FLOAT64 TotalCertainty;
476  FLOAT64 TotalCertaintySquared;
477  FLOAT64 Variance;
478  FLOAT32 Mean, StdDev;
479  int word_length = word.length();
480 
481  if (word_length < 3)
482  return true;
483 
484  TotalCertainty = TotalCertaintySquared = 0.0;
485  for (int i = 0; i < word_length; ++i) {
486  Certainty = word.certainty(i);
487  TotalCertainty += Certainty;
488  TotalCertaintySquared += Certainty * Certainty;
489  if (Certainty < WorstCertainty)
490  WorstCertainty = Certainty;
491  }
492 
493  // Subtract off worst certainty from statistics.
494  word_length--;
495  TotalCertainty -= WorstCertainty;
496  TotalCertaintySquared -= WorstCertainty * WorstCertainty;
497 
498  Mean = TotalCertainty / word_length;
499  Variance = ((word_length * TotalCertaintySquared -
500  TotalCertainty * TotalCertainty) /
501  (word_length * (word_length - 1)));
502  if (Variance < 0.0)
503  Variance = 0.0;
504  StdDev = sqrt(Variance);
505 
506  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
507  if (CertaintyThreshold > stopper_nondict_certainty_base)
508  CertaintyThreshold = stopper_nondict_certainty_base;
509 
510  if (word.certainty() < CertaintyThreshold) {
511  if (stopper_debug_level >= 1)
512  tprintf("Stopper: Non-uniform certainty = %4.1f"
513  " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
514  word.certainty(), Mean, StdDev, CertaintyThreshold);
515  return false;
516  } else {
517  return true;
518  }
519 }
float FLOAT32
Definition: host.h:111
int length() const
Definition: ratngs.h:300
#define tprintf(...)
Definition: tprintf.h:31
double stopper_allowable_character_badness
Definition: dict.h:611
float certainty() const
Definition: ratngs.h:327
double stopper_nondict_certainty_base
Definition: dict.h:603
int stopper_debug_level
Definition: dict.h:612
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:650
#define MAX_FLOAT32
Definition: host.h:124
double FLOAT64
Definition: host.h:112
void tesseract::Dict::update_best_choice ( const WERD_CHOICE word,
WERD_CHOICE best_choice 
)
inline

Copies word into best_choice if its rating is smaller than that of best_choice.

Definition at line 169 of file dict.h.

170  {
171  if (word.rating() < best_choice->rating()) {
172  *best_choice = word;
173  }
174  }
float rating() const
Definition: ratngs.h:324
bool tesseract::Dict::valid_bigram ( const WERD_CHOICE word1,
const WERD_CHOICE word2 
) const

Definition at line 738 of file dict.cpp.

739  {
740  if (bigram_dawg_ == NULL) return false;
741 
742  // Extract the core word from the middle of each word with any digits
743  // replaced with question marks.
744  int w1start, w1end, w2start, w2end;
745  word1.punct_stripped(&w1start, &w1end);
746  word2.punct_stripped(&w2start, &w2end);
747 
748  // We don't want to penalize a single guillemet, hyphen, etc.
749  // But our bigram list doesn't have any information about punctuation.
750  if (w1start >= w1end) return word1.length() < 3;
751  if (w2start >= w2end) return word2.length() < 3;
752 
753  const UNICHARSET& uchset = getUnicharset();
754  GenericVector<UNICHAR_ID> bigram_string;
755  bigram_string.reserve(w1end + w2end + 1);
756  for (int i = w1start; i < w1end; i++) {
757  const GenericVector<UNICHAR_ID>& normed_ids =
758  getUnicharset().normed_ids(word1.unichar_id(i));
759  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
760  bigram_string.push_back(question_unichar_id_);
761  else
762  bigram_string += normed_ids;
763  }
764  bigram_string.push_back(UNICHAR_SPACE);
765  for (int i = w2start; i < w2end; i++) {
766  const GenericVector<UNICHAR_ID>& normed_ids =
767  getUnicharset().normed_ids(word2.unichar_id(i));
768  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
769  bigram_string.push_back(question_unichar_id_);
770  else
771  bigram_string += normed_ids;
772  }
773  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
774  for (int i = 0; i < bigram_string.size(); ++i) {
775  normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1,
776  0.0f, 0.0f);
777  }
778  return bigram_dawg_->word_in_dawg(normalized_word);
779 }
int size() const
Definition: genericvector.h:72
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:361
int length() const
Definition: ratngs.h:300
int push_back(T object)
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:449
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:70
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
void reserve(int size)
#define NULL
Definition: host.h:144
bool tesseract::Dict::valid_punctuation ( const WERD_CHOICE word)

Returns true if the word contains a valid punctuation pattern. Note: Since the domains of punctuation symbols and symbols used in numbers are not disjoint, a valid number might contain an invalid punctuation pattern (e.g. .99).

Definition at line 781 of file dict.cpp.

781  {
782  if (word.length() == 0) return NO_PERM;
783  int i;
784  WERD_CHOICE new_word(word.unicharset());
785  int last_index = word.length() - 1;
786  int new_len = 0;
787  for (i = 0; i <= last_index; ++i) {
788  UNICHAR_ID unichar_id = (word.unichar_id(i));
789  if (getUnicharset().get_ispunctuation(unichar_id)) {
790  new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
791  } else if (!getUnicharset().get_isalpha(unichar_id) &&
792  !getUnicharset().get_isdigit(unichar_id)) {
793  return false; // neither punc, nor alpha, nor digit
794  } else if ((new_len = new_word.length()) == 0 ||
795  new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
796  new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
797  }
798  }
799  for (i = 0; i < dawgs_.size(); ++i) {
800  if (dawgs_[i] != NULL &&
801  dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
802  dawgs_[i]->word_in_dawg(new_word)) return true;
803  }
804  return false;
805 }
int size() const
Definition: genericvector.h:72
int length() const
Definition: ratngs.h:300
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:125
const UNICHARSET * unicharset() const
Definition: ratngs.h:297
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int UNICHAR_ID
Definition: unichar.h:33
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
#define NULL
Definition: host.h:144
int tesseract::Dict::valid_word ( const WERD_CHOICE word,
bool  numbers_ok 
) const

Definition at line 705 of file dict.cpp.

705  {
706  const WERD_CHOICE *word_ptr = &word;
707  WERD_CHOICE temp_word(word.unicharset());
708  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
709  copy_hyphen_info(&temp_word);
710  temp_word += word;
711  word_ptr = &temp_word;
712  }
713  if (word_ptr->length() == 0) return NO_PERM;
714  // Allocate vectors for holding current and updated
715  // active_dawgs and initialize them.
716  DawgPositionVector *active_dawgs = new DawgPositionVector[2];
717  init_active_dawgs(&(active_dawgs[0]), false);
718  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
719  int last_index = word_ptr->length() - 1;
720  // Call leter_is_okay for each letter in the word.
721  for (int i = hyphen_base_size(); i <= last_index; ++i) {
722  if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i),
723  i == last_index))) break;
724  // Swap active_dawgs, constraints with the corresponding updated vector.
725  if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
726  dawg_args.updated_dawgs = &(active_dawgs[0]);
727  ++(dawg_args.active_dawgs);
728  } else {
729  ++(dawg_args.updated_dawgs);
730  dawg_args.active_dawgs = &(active_dawgs[0]);
731  }
732  }
733  delete[] active_dawgs;
734  return valid_word_permuter(dawg_args.permuter, numbers_ok) ?
735  dawg_args.permuter : NO_PERM;
736 }
int length() const
Definition: ratngs.h:300
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:523
const UNICHARSET * unicharset() const
Definition: ratngs.h:297
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:135
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:447
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:129
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:347
int tesseract::Dict::valid_word ( const WERD_CHOICE word) const
inline

Definition at line 454 of file dict.h.

454  {
455  return valid_word(word, false); // return NO_PERM for words with digits
456  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:705
int tesseract::Dict::valid_word ( const char *  string) const
inline

This function is used by api/tesseract_cube_combiner.cpp.

Definition at line 461 of file dict.h.

461  {
462  WERD_CHOICE word(string, getUnicharset());
463  return valid_word(word);
464  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:705
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
int tesseract::Dict::valid_word_or_number ( const WERD_CHOICE word) const
inline

Definition at line 457 of file dict.h.

457  {
458  return valid_word(word, true); // return NUMBER_PERM for valid numbers
459  }
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:705
static bool tesseract::Dict::valid_word_permuter ( uinT8  perm,
bool  numbers_ok 
)
inlinestatic

Check all the DAWGs to see if this word is in any of them.

Read/Write/Access special purpose dawgs which contain words only of a certain length (used for phrase search for non-space-delimited languages).

Definition at line 447 of file dict.h.

447  {
448  return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
449  perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
450  perm == USER_PATTERN_PERM || perm == COMPOUND_PERM ||
451  (numbers_ok && perm == NUMBER_PERM));
452  }
const UNICHAR_ID tesseract::Dict::WildcardID ( ) const
inline

Definition at line 400 of file dict.h.

400  {
401  return wildcard_unichar_id_;
402  }

Member Data Documentation

double tesseract::Dict::certainty_scale = 20.0

"Certainty scaling factor"

Definition at line 601 of file dict.h.

int tesseract::Dict::dawg_debug_level = 0

"Set to 1 for general debug info" ", to 2 for more details, to 3 to see all the debug messages"

Definition at line 595 of file dict.h.

double tesseract::Dict::doc_dict_certainty_threshold = -2.25

"Worst certainty" " for words that can be inserted into the document dictionary"

Definition at line 632 of file dict.h.

double tesseract::Dict::doc_dict_pending_threshold = 0.0

"Worst certainty for using pending dictionary"

Definition at line 630 of file dict.h.

int tesseract::Dict::fragments_debug = 0

"Debug character fragments"

Definition at line 623 of file dict.h.

void(Dict::* tesseract::Dict::go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)

Pointer to go_deeper function.

Definition at line 203 of file dict.h.

int tesseract::Dict::hyphen_debug_level = 0

"Debug level for hyphenated words."

Definition at line 596 of file dict.h.

int(Dict::* tesseract::Dict::letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const

Definition at line 347 of file dict.h.

bool tesseract::Dict::load_bigram_dawg = true

"Load dawg with special word bigrams."

Definition at line 561 of file dict.h.

bool tesseract::Dict::load_freq_dawg = true

"Load frequent word dawg."

Definition at line 555 of file dict.h.

bool tesseract::Dict::load_number_dawg = true

"Load dawg with number patterns."

Definition at line 559 of file dict.h.

bool tesseract::Dict::load_punc_dawg = true

"Load dawg with punctuation patterns."

Definition at line 558 of file dict.h.

bool tesseract::Dict::load_system_dawg = true

"Load system word dawg."

Definition at line 554 of file dict.h.

bool tesseract::Dict::load_unambig_dawg = true

"Load unambiguous word dawg."

Definition at line 556 of file dict.h.

int tesseract::Dict::max_permuter_attempts = 10000

"Maximum number of different" " character choices to consider during permutation." " This limit is especially useful when user patterns" " are specified, since overly generic patterns can result in" " dawg search exploring an overly large number of options."

Definition at line 637 of file dict.h.

int tesseract::Dict::max_viterbi_list_size = 10

"Maximum size of viterbi list."

Definition at line 597 of file dict.h.

char* tesseract::Dict::output_ambig_words_file = ""

"Output file for ambiguities found in the dictionary"

Definition at line 593 of file dict.h.

float(Dict::* tesseract::Dict::params_model_classify_)(const char *lang, void *path)

Definition at line 390 of file dict.h.

double(Dict::* tesseract::Dict::probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Probability in context function used by the ngram permuter.

Definition at line 357 of file dict.h.

bool tesseract::Dict::save_doc_words = 0

"Save Document Words"

Definition at line 628 of file dict.h.

bool tesseract::Dict::save_raw_choices = false

"Deprecated - backward compatibility only"

Definition at line 617 of file dict.h.

bool tesseract::Dict::segment_nonalphabetic_script = false

"Don't use any alphabetic-specific tricks. Set to true in the traineddata config file for scripts that are cursive or inherently fixed-pitch"

Definition at line 627 of file dict.h.

double tesseract::Dict::segment_penalty_dict_case_bad = 1.3125

"Default score multiplier for word matches, which may have " "case issues (lower is better)."

Definition at line 578 of file dict.h.

double tesseract::Dict::segment_penalty_dict_case_ok = 1.1

"Score multiplier for word matches that have good case " "(lower is better)."

Definition at line 574 of file dict.h.

double tesseract::Dict::segment_penalty_dict_frequent_word = 1.0

"Score multiplier for word matches which have good case and are frequent in the given language (lower is better)."

Definition at line 570 of file dict.h.

double tesseract::Dict::segment_penalty_dict_nonword = 1.25

"Score multiplier for glyph fragment segmentations which " "do not match a dictionary word (lower is better)."

Definition at line 586 of file dict.h.

double tesseract::Dict::segment_penalty_garbage = 1.50

"Score multiplier for poorly cased strings that are not in" " the dictionary and generally look like garbage (lower is" " better)."

Definition at line 591 of file dict.h.

double tesseract::Dict::segment_penalty_ngram_best_choice = 1.24

"Multiplier for the best choice from the ngram model."

Definition at line 582 of file dict.h.

double tesseract::Dict::stopper_allowable_character_badness = 3.0

"Max certainty variation allowed in a word (in sigma)"

Definition at line 611 of file dict.h.

double tesseract::Dict::stopper_certainty_per_char = -0.50

"Certainty to add for each dict char above small word size."

Definition at line 609 of file dict.h.

int tesseract::Dict::stopper_debug_level = 0

"Stopper debug level"

Definition at line 612 of file dict.h.

bool tesseract::Dict::stopper_no_acceptable_choices = false

"Make AcceptableChoice() always return false. Useful" " when there is a need to explore all segmentations"

Definition at line 615 of file dict.h.

double tesseract::Dict::stopper_nondict_certainty_base = -2.50

"Certainty threshold for non-dict words"

Definition at line 603 of file dict.h.

double tesseract::Dict::stopper_phase2_certainty_rejection_offset = 1.0

"Reject certainty offset"

Definition at line 605 of file dict.h.

int tesseract::Dict::stopper_smallword_size = 2

"Size of dict word to be treated as non-dict word"

Definition at line 607 of file dict.h.

int tesseract::Dict::tessedit_truncate_wordchoice_log = 10

"Max words to keep in list"

Definition at line 618 of file dict.h.

bool tesseract::Dict::use_only_first_uft8_step = false

"Use only the first UTF8 step of the given string" " when computing log probabilities."

Definition at line 600 of file dict.h.

char* tesseract::Dict::user_patterns_file = ""

"A filename of user-provided patterns."

Definition at line 551 of file dict.h.

char* tesseract::Dict::user_patterns_suffix = ""

"A suffix of user-provided patterns located in tessdata."

Definition at line 553 of file dict.h.

char* tesseract::Dict::user_words_file = ""

Variable members. These have to be declared and initialized after image_ptr_, which contains the pointer to the params vector - the member of its base CCUtil class. "A filename of user-provided words."

Definition at line 547 of file dict.h.

char* tesseract::Dict::user_words_suffix = ""

"A suffix of user-provided words located in tessdata."

Definition at line 549 of file dict.h.

char* tesseract::Dict::word_to_debug = ""

"Word for which stopper debug information" " should be printed to stdout"

Definition at line 620 of file dict.h.

char* tesseract::Dict::word_to_debug_lengths = ""

"Lengths of unichars in word_to_debug"

Definition at line 622 of file dict.h.

double tesseract::Dict::xheight_penalty_inconsistent = 0.25

"Score penalty (0.1 = 10%) added if an xheight is " "inconsistent."

Definition at line 567 of file dict.h.

double tesseract::Dict::xheight_penalty_subscripts = 0.125

"Score penalty (0.1 = 10%) added if there are subscripts " "or superscripts in a word, but it is otherwise OK."

Definition at line 564 of file dict.h.


The documentation for this class was generated from the following files: