tesseract  4.0.0-beta.1-59-g2cc4
dict.cpp
Go to the documentation of this file.
1 // File: dict.cpp
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include <stdio.h>
20 
21 #include "dict.h"
22 #include "unicodes.h"
23 
24 #ifdef _MSC_VER
25 #pragma warning(disable:4244) // Conversion warnings
26 #endif
27 #include "tprintf.h"
28 
29 namespace tesseract {
30 
31 class Image;
32 
34  : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
35  probability_in_context_(&tesseract::Dict::def_probability_in_context),
36  params_model_classify_(NULL),
37  ccutil_(ccutil),
38  STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
39  getCCUtil()->params()),
40  STRING_INIT_MEMBER(user_words_suffix, "",
41  "A suffix of user-provided words located in tessdata.",
42  getCCUtil()->params()),
43  STRING_MEMBER(user_patterns_file, "",
44  "A filename of user-provided patterns.",
45  getCCUtil()->params()),
46  STRING_INIT_MEMBER(user_patterns_suffix, "",
47  "A suffix of user-provided patterns located in "
48  "tessdata.",
49  getCCUtil()->params()),
50  BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
51  getCCUtil()->params()),
52  BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
53  getCCUtil()->params()),
54  BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
55  getCCUtil()->params()),
56  BOOL_INIT_MEMBER(load_punc_dawg, true,
57  "Load dawg with punctuation"
58  " patterns.",
59  getCCUtil()->params()),
60  BOOL_INIT_MEMBER(load_number_dawg, true,
61  "Load dawg with number"
62  " patterns.",
63  getCCUtil()->params()),
64  BOOL_INIT_MEMBER(load_bigram_dawg, true,
65  "Load dawg with special word "
66  "bigrams.",
67  getCCUtil()->params()),
68  double_MEMBER(xheight_penalty_subscripts, 0.125,
69  "Score penalty (0.1 = 10%) added if there are subscripts "
70  "or superscripts in a word, but it is otherwise OK.",
71  getCCUtil()->params()),
72  double_MEMBER(xheight_penalty_inconsistent, 0.25,
73  "Score penalty (0.1 = 10%) added if an xheight is "
74  "inconsistent.",
75  getCCUtil()->params()),
76  double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
77  "Score multiplier for word matches which have good case and"
78  " are frequent in the given language (lower is better).",
79  getCCUtil()->params()),
80  double_MEMBER(segment_penalty_dict_case_ok, 1.1,
81  "Score multiplier for word matches that have good case "
82  "(lower is better).",
83  getCCUtil()->params()),
84  double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
85  "Default score multiplier for word matches, which may have "
86  "case issues (lower is better).",
87  getCCUtil()->params()),
88  double_MEMBER(segment_penalty_dict_nonword, 1.25,
89  "Score multiplier for glyph fragment segmentations which "
90  "do not match a dictionary word (lower is better).",
91  getCCUtil()->params()),
92  double_MEMBER(segment_penalty_garbage, 1.50,
93  "Score multiplier for poorly cased strings that are not in"
94  " the dictionary and generally look like garbage (lower is"
95  " better).",
96  getCCUtil()->params()),
97  STRING_MEMBER(output_ambig_words_file, "",
98  "Output file for ambiguities found in the dictionary",
99  getCCUtil()->params()),
100  INT_MEMBER(dawg_debug_level, 0,
101  "Set to 1 for general debug info"
102  ", to 2 for more details, to 3 to see all the debug messages",
103  getCCUtil()->params()),
104  INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
105  getCCUtil()->params()),
106  INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.",
107  getCCUtil()->params()),
108  BOOL_MEMBER(use_only_first_uft8_step, false,
109  "Use only the first UTF8 step of the given string"
110  " when computing log probabilities.",
111  getCCUtil()->params()),
112  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
113  getCCUtil()->params()),
114  double_MEMBER(stopper_nondict_certainty_base, -2.50,
115  "Certainty threshold for non-dict words",
116  getCCUtil()->params()),
117  double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
118  "Reject certainty offset", getCCUtil()->params()),
119  INT_MEMBER(stopper_smallword_size, 2,
120  "Size of dict word to be treated as non-dict word",
121  getCCUtil()->params()),
122  double_MEMBER(stopper_certainty_per_char, -0.50,
123  "Certainty to add"
124  " for each dict char above small word size.",
125  getCCUtil()->params()),
126  double_MEMBER(stopper_allowable_character_badness, 3.0,
127  "Max certaintly variation allowed in a word (in sigma)",
128  getCCUtil()->params()),
129  INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
130  getCCUtil()->params()),
131  BOOL_MEMBER(stopper_no_acceptable_choices, false,
132  "Make AcceptableChoice() always return false. Useful"
133  " when there is a need to explore all segmentations",
134  getCCUtil()->params()),
135  INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
136  "Max words to keep in list", getCCUtil()->params()),
137  STRING_MEMBER(word_to_debug, "",
138  "Word for which stopper debug"
139  " information should be printed to stdout",
140  getCCUtil()->params()),
141  STRING_MEMBER(word_to_debug_lengths, "",
142  "Lengths of unichars in word_to_debug",
143  getCCUtil()->params()),
144  INT_MEMBER(fragments_debug, 0, "Debug character fragments",
145  getCCUtil()->params()),
146  BOOL_MEMBER(segment_nonalphabetic_script, false,
147  "Don't use any alphabetic-specific tricks."
148  " Set to true in the traineddata config file for"
149  " scripts that are cursive or inherently fixed-pitch",
150  getCCUtil()->params()),
151  BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
152  getCCUtil()->params()),
153  double_MEMBER(doc_dict_pending_threshold, 0.0,
154  "Worst certainty for using pending dictionary",
155  getCCUtil()->params()),
156  double_MEMBER(doc_dict_certainty_threshold, -2.25,
157  "Worst certainty for words that can be inserted into the"
158  " document dictionary",
159  getCCUtil()->params()),
160  INT_MEMBER(max_permuter_attempts, 10000,
161  "Maximum number of different"
162  " character choices to consider during permutation."
163  " This limit is especially useful when user patterns"
164  " are specified, since overly generic patterns can result in"
165  " dawg search exploring an overly large number of options.",
166  getCCUtil()->params()) {
167  dang_ambigs_table_ = NULL;
168  replace_ambigs_table_ = NULL;
169  reject_offset_ = 0.0;
170  go_deeper_fxn_ = NULL;
171  hyphen_word_ = NULL;
172  last_word_on_line_ = false;
173  hyphen_unichar_id_ = INVALID_UNICHAR_ID;
174  document_words_ = NULL;
175  dawg_cache_ = NULL;
176  dawg_cache_is_ours_ = false;
177  pending_words_ = NULL;
178  bigram_dawg_ = NULL;
179  freq_dawg_ = NULL;
180  punc_dawg_ = NULL;
181  unambig_dawg_ = NULL;
182  wordseg_rating_adjust_factor_ = -1.0f;
183  output_ambig_words_file_ = NULL;
184 }
185 
187  End();
188  delete hyphen_word_;
189  if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_);
190 }
191 
193  // This global cache (a singleton) will outlive every Tesseract instance
194  // (even those that someone else might declare as global statics).
195  static DawgCache cache;
196  return &cache;
197 }
198 
199 // Sets up ready for a Load or LoadLSTM.
200 void Dict::SetupForLoad(DawgCache *dawg_cache) {
201  if (dawgs_.length() != 0) this->End();
202 
203  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
204  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
205  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
206  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
207 
208  if (dawg_cache != NULL) {
209  dawg_cache_ = dawg_cache;
210  dawg_cache_is_ours_ = false;
211  } else {
212  dawg_cache_ = new DawgCache();
213  dawg_cache_is_ours_ = true;
214  }
215 }
216 
217 // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
218 void Dict::Load(const STRING &lang, TessdataManager *data_file) {
219  // Load dawgs_.
220  if (load_punc_dawg) {
221  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG,
222  dawg_debug_level, data_file);
223  if (punc_dawg_) dawgs_ += punc_dawg_;
224  }
225  if (load_system_dawg) {
226  Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
227  lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
228  if (system_dawg) dawgs_ += system_dawg;
229  }
230  if (load_number_dawg) {
231  Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
232  lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
233  if (number_dawg) dawgs_ += number_dawg;
234  }
235  if (load_bigram_dawg) {
236  bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG,
237  dawg_debug_level, data_file);
238  // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
239  // dawgs_!!
240  }
241  if (load_freq_dawg) {
242  freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG,
243  dawg_debug_level, data_file);
244  if (freq_dawg_) dawgs_ += freq_dawg_;
245  }
246  if (load_unambig_dawg) {
247  unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG,
248  dawg_debug_level, data_file);
249  if (unambig_dawg_) dawgs_ += unambig_dawg_;
250  }
251 
252  STRING name;
253  if (((STRING &)user_words_suffix).length() > 0 ||
254  ((STRING &)user_words_file).length() > 0) {
255  Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
256  getUnicharset().size(), dawg_debug_level);
257  if (((STRING &)user_words_file).length() > 0) {
258  name = user_words_file;
259  } else {
261  name += user_words_suffix;
262  }
263  if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
265  tprintf("Error: failed to load %s\n", name.string());
266  delete trie_ptr;
267  } else {
268  dawgs_ += trie_ptr;
269  }
270  }
271 
272  if (((STRING &)user_patterns_suffix).length() > 0 ||
273  ((STRING &)user_patterns_file).length() > 0) {
274  Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
275  getUnicharset().size(), dawg_debug_level);
276  trie_ptr->initialize_patterns(&(getUnicharset()));
277  if (((STRING &)user_patterns_file).length() > 0) {
278  name = user_patterns_file;
279  } else {
281  name += user_patterns_suffix;
282  }
283  if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
284  tprintf("Error: failed to load %s\n", name.string());
285  delete trie_ptr;
286  } else {
287  dawgs_ += trie_ptr;
288  }
289  }
290 
291  document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
292  getUnicharset().size(), dawg_debug_level);
293  dawgs_ += document_words_;
294 
295  // This dawg is temporary and should not be searched by letter_is_ok.
296  pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
297  getUnicharset().size(), dawg_debug_level);
298 }
299 
300 // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
301 void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) {
302  // Load dawgs_.
303  if (load_punc_dawg) {
304  punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG,
305  dawg_debug_level, data_file);
306  if (punc_dawg_) dawgs_ += punc_dawg_;
307  }
308  if (load_system_dawg) {
309  Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
310  lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
311  if (system_dawg) dawgs_ += system_dawg;
312  }
313  if (load_number_dawg) {
314  Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
315  lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
316  if (number_dawg) dawgs_ += number_dawg;
317  }
318 }
319 
320 // Completes the loading process after Load() and/or LoadLSTM().
321 // Returns false if no dictionaries were loaded.
323  if (dawgs_.empty()) return false;
324  // Construct a list of corresponding successors for each dawg. Each entry, i,
325  // in the successors_ vector is a vector of integers that represent the
326  // indices into the dawgs_ vector of the successors for dawg i.
327  successors_.reserve(dawgs_.length());
328  for (int i = 0; i < dawgs_.length(); ++i) {
329  const Dawg *dawg = dawgs_[i];
330  SuccessorList *lst = new SuccessorList();
331  for (int j = 0; j < dawgs_.length(); ++j) {
332  const Dawg *other = dawgs_[j];
333  if (dawg != NULL && other != NULL &&
334  (dawg->lang() == other->lang()) &&
335  kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
336  }
337  successors_ += lst;
338  }
339  return true;
340 }
341 
342 void Dict::End() {
343  if (dawgs_.length() == 0)
344  return; // Not safe to call twice.
345  for (int i = 0; i < dawgs_.size(); i++) {
346  if (!dawg_cache_->FreeDawg(dawgs_[i])) {
347  delete dawgs_[i];
348  }
349  }
350  dawg_cache_->FreeDawg(bigram_dawg_);
351  if (dawg_cache_is_ours_) {
352  delete dawg_cache_;
353  dawg_cache_ = NULL;
354  }
355  successors_.delete_data_pointers();
356  dawgs_.clear();
357  successors_.clear();
358  document_words_ = NULL;
359  delete pending_words_;
360  pending_words_ = NULL;
361 }
362 
363 // Returns true if in light of the current state unichar_id is allowed
364 // according to at least one of the dawgs in the dawgs_ vector.
365 // See more extensive comments in dict.h where this function is declared.
366 int Dict::def_letter_is_okay(void* void_dawg_args,
367  UNICHAR_ID unichar_id,
368  bool word_end) const {
369  DawgArgs *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
370 
371  if (dawg_debug_level >= 3) {
372  tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
373  " num active dawgs=%d\n",
374  getUnicharset().debug_str(unichar_id).string(), word_end,
375  dawg_args->active_dawgs->length());
376  }
377 
378  // Do not accept words that contain kPatternUnicharID.
379  // (otherwise pattern dawgs would not function correctly).
380  // Do not accept words containing INVALID_UNICHAR_IDs.
381  if (unichar_id == Dawg::kPatternUnicharID ||
382  unichar_id == INVALID_UNICHAR_ID) {
383  dawg_args->permuter = NO_PERM;
384  return NO_PERM;
385  }
386 
387  // Initialization.
388  PermuterType curr_perm = NO_PERM;
389  dawg_args->updated_dawgs->clear();
390  dawg_args->valid_end = false;
391 
392  // Go over the active_dawgs vector and insert DawgPosition records
393  // with the updated ref (an edge with the corresponding unichar id) into
394  // dawg_args->updated_pos.
395  for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
396  const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
397  const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : NULL;
398  const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : NULL;
399 
400  if (!dawg && !punc_dawg) {
401  // shouldn't happen.
402  tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
403  continue;
404  }
405  if (!dawg) {
406  // We're in the punctuation dawg. A core dawg has not been chosen.
407  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
408  EDGE_REF punc_transition_edge = punc_dawg->edge_char_of(
409  punc_node, Dawg::kPatternUnicharID, word_end);
410  if (punc_transition_edge != NO_EDGE) {
411  // Find all successors, and see which can transition.
412  const SuccessorList &slist = *(successors_[pos.punc_index]);
413  for (int s = 0; s < slist.length(); ++s) {
414  int sdawg_index = slist[s];
415  const Dawg *sdawg = dawgs_[sdawg_index];
416  UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg);
417  EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
418  if (dawg_edge != NO_EDGE) {
419  if (dawg_debug_level >=3) {
420  tprintf("Letter found in dawg %d\n", sdawg_index);
421  }
422  dawg_args->updated_dawgs->add_unique(
423  DawgPosition(sdawg_index, dawg_edge,
424  pos.punc_index, punc_transition_edge, false),
425  dawg_debug_level > 0,
426  "Append transition from punc dawg to current dawgs: ");
427  if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
428  if (sdawg->end_of_word(dawg_edge) &&
429  punc_dawg->end_of_word(punc_transition_edge))
430  dawg_args->valid_end = true;
431  }
432  }
433  }
434  EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id,
435  word_end);
436  if (punc_edge != NO_EDGE) {
437  if (dawg_debug_level >=3) {
438  tprintf("Letter found in punctuation dawg\n");
439  }
440  dawg_args->updated_dawgs->add_unique(
441  DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
442  dawg_debug_level > 0,
443  "Extend punctuation dawg: ");
444  if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
445  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
446  }
447  continue;
448  }
449 
450  if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
451  // We can end the main word here.
452  // If we can continue on the punc ref, add that possibility.
453  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
454  EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE
455  : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
456  if (punc_edge != NO_EDGE) {
457  dawg_args->updated_dawgs->add_unique(
459  pos.punc_index, punc_edge, true),
460  dawg_debug_level > 0,
461  "Return to punctuation dawg: ");
462  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
463  if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true;
464  }
465  }
466 
467  if (pos.back_to_punc) continue;
468 
469  // If we are dealing with the pattern dawg, look up all the
470  // possible edges, not only for the exact unichar_id, but also
471  // for all its character classes (alpha, digit, etc).
472  if (dawg->type() == DAWG_TYPE_PATTERN) {
473  ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args,
474  &curr_perm);
475  // There can't be any successors to dawg that is of type
476  // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
477  continue;
478  }
479 
480  // Find the edge out of the node for the unichar_id.
481  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
482  EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
483  : dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end);
484 
485  if (dawg_debug_level >= 3) {
486  tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
487  pos.dawg_index, node, edge);
488  }
489 
490  if (edge != NO_EDGE) { // the unichar was found in the current dawg
491  if (dawg_debug_level >=3) {
492  tprintf("Letter found in dawg %d\n", pos.dawg_index);
493  }
494  if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
495  if (dawg_debug_level >= 3) {
496  tprintf("Punctuation constraint not satisfied at end of word.\n");
497  }
498  continue;
499  }
500  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
501  if (dawg->end_of_word(edge) &&
502  (punc_dawg == NULL || punc_dawg->end_of_word(pos.punc_ref)))
503  dawg_args->valid_end = true;
504  dawg_args->updated_dawgs->add_unique(
505  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
506  false),
507  dawg_debug_level > 0,
508  "Append current dawg to updated active dawgs: ");
509  }
510  } // end for
511  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
512  // or if we found the current letter in a non-punctuation dawg. This
513  // allows preserving information on which dawg the "core" word came from.
514  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
515  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
516  (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
517  dawg_args->permuter = curr_perm;
518  }
519  if (dawg_debug_level >= 2) {
520  tprintf("Returning %d for permuter code for this character.\n",
521  dawg_args->permuter);
522  }
523  return dawg_args->permuter;
524 }
525 
526 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos,
527  UNICHAR_ID unichar_id, bool word_end,
528  DawgArgs *dawg_args,
529  PermuterType *curr_perm) const {
530  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
531  // Try to find the edge corresponding to the exact unichar_id and to all the
532  // edges corresponding to the character class of unichar_id.
533  GenericVector<UNICHAR_ID> unichar_id_patterns;
534  unichar_id_patterns.push_back(unichar_id);
535  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
536  &unichar_id_patterns);
537  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
538  // On the first iteration check all the outgoing edges.
539  // On the second iteration check all self-loops.
540  for (int k = 0; k < 2; ++k) {
541  EDGE_REF edge = (k == 0)
542  ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
543  : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_patterns[i], word_end);
544  if (edge == NO_EDGE) continue;
545  if (dawg_debug_level >= 3) {
546  tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
547  pos.dawg_index, node, edge);
548  tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
549  }
550  if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
551  if (dawg->end_of_word(edge)) dawg_args->valid_end = true;
552  dawg_args->updated_dawgs->add_unique(
553  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
554  pos.back_to_punc),
555  dawg_debug_level > 0,
556  "Append current dawg to updated active dawgs: ");
557  }
558  }
559 }
560 
561 // Fill the given active_dawgs vector with dawgs that could contain the
562 // beginning of the word. If hyphenated() returns true, copy the entries
563 // from hyphen_active_dawgs_ instead.
565  bool ambigs_mode) const {
566  int i;
567  if (hyphenated()) {
568  *active_dawgs = hyphen_active_dawgs_;
569  if (dawg_debug_level >= 3) {
570  for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
571  tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
572  hyphen_active_dawgs_[i].dawg_index,
573  hyphen_active_dawgs_[i].dawg_ref);
574  }
575  }
576  } else {
577  default_dawgs(active_dawgs, ambigs_mode);
578  }
579 }
580 
582  bool suppress_patterns) const {
583  bool punc_dawg_available =
584  (punc_dawg_ != NULL) &&
585  punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
586 
587  for (int i = 0; i < dawgs_.length(); i++) {
588  if (dawgs_[i] != NULL &&
589  !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
590  int dawg_ty = dawgs_[i]->type();
591  bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
592  if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
593  *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
594  if (dawg_debug_level >= 3) {
595  tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
596  NO_EDGE);
597  }
598  } else if (!punc_dawg_available || !subsumed_by_punc) {
599  *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
600  if (dawg_debug_level >= 3) {
601  tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
602  }
603  }
604  }
605  }
606 }
607 
608 void Dict::add_document_word(const WERD_CHOICE &best_choice) {
609  // Do not add hyphenated word parts to the document dawg.
610  // hyphen_word_ will be non-NULL after the set_hyphen_word() is
611  // called when the first part of the hyphenated word is
612  // discovered and while the second part of the word is recognized.
613  // hyphen_word_ is cleared in cc_recg() before the next word on
614  // the line is recognized.
615  if (hyphen_word_) return;
616 
617  char filename[CHARS_PER_LINE];
618  FILE *doc_word_file;
619  int stringlen = best_choice.length();
620 
621  if (valid_word(best_choice) || stringlen < 2)
622  return;
623 
624  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
625  if (best_choice.length() >= kDocDictMaxRepChars) {
626  int num_rep_chars = 1;
627  UNICHAR_ID uch_id = best_choice.unichar_id(0);
628  for (int i = 1; i < best_choice.length(); ++i) {
629  if (best_choice.unichar_id(i) != uch_id) {
630  num_rep_chars = 1;
631  uch_id = best_choice.unichar_id(i);
632  } else {
633  ++num_rep_chars;
634  if (num_rep_chars == kDocDictMaxRepChars) return;
635  }
636  }
637  }
638 
639  if (best_choice.certainty() < doc_dict_certainty_threshold ||
640  stringlen == 2) {
641  if (best_choice.certainty() < doc_dict_pending_threshold)
642  return;
643 
644  if (!pending_words_->word_in_dawg(best_choice)) {
645  if (stringlen > 2 ||
646  (stringlen == 2 &&
647  getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
648  getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
649  pending_words_->add_word_to_dawg(best_choice);
650  }
651  return;
652  }
653  }
654 
655  if (save_doc_words) {
656  strcpy(filename, getCCUtil()->imagefile.string());
657  strcat(filename, ".doc");
658  doc_word_file = open_file (filename, "a");
659  fprintf(doc_word_file, "%s\n",
660  best_choice.debug_string().string());
661  fclose(doc_word_file);
662  }
663  document_words_->add_word_to_dawg(best_choice);
664 }
665 
667  bool nonword,
668  XHeightConsistencyEnum xheight_consistency,
669  float additional_adjust,
670  bool modify_rating,
671  bool debug) {
672  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
673  word->GetTopScriptID() == getUnicharset().han_sid());
674  bool case_is_ok = (is_han || case_ok(*word, getUnicharset()));
675  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
676 
677  float adjust_factor = additional_adjust;
678  float new_rating = word->rating();
679  new_rating += kRatingPad;
680  const char *xheight_triggered = "";
681  if (word->length() > 1) {
682  // Calculate x-height and y-offset consistency penalties.
683  switch (xheight_consistency) {
684  case XH_INCONSISTENT:
685  adjust_factor += xheight_penalty_inconsistent;
686  xheight_triggered = ", xhtBAD";
687  break;
688  case XH_SUBNORMAL:
689  adjust_factor += xheight_penalty_subscripts;
690  xheight_triggered = ", xhtSUB";
691  break;
692  case XH_GOOD:
693  // leave the factor alone - all good!
694  break;
695  }
696 // TODO(eger): if nonword is true, but there is a "core" that's a dict
697 // word, negate nonword status.
698  } else {
699  if (debug) {
700  tprintf("Consistency could not be calculated.\n");
701  }
702  }
703  if (debug) {
704  tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
705  word->unichar_string().string(), word->rating(),
706  xheight_triggered);
707  }
708 
709  if (nonword) { // non-dictionary word
710  if (case_is_ok && punc_is_ok) {
711  adjust_factor += segment_penalty_dict_nonword;
712  new_rating *= adjust_factor;
713  if (debug) tprintf(", W");
714  } else {
715  adjust_factor += segment_penalty_garbage;
716  new_rating *= adjust_factor;
717  if (debug) {
718  if (!case_is_ok) tprintf(", C");
719  if (!punc_is_ok) tprintf(", P");
720  }
721  }
722  } else { // dictionary word
723  if (case_is_ok) {
724  if (!is_han && freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
726  adjust_factor += segment_penalty_dict_frequent_word;
727  new_rating *= adjust_factor;
728  if (debug) tprintf(", F");
729  } else {
730  adjust_factor += segment_penalty_dict_case_ok;
731  new_rating *= adjust_factor;
732  if (debug) tprintf(", ");
733  }
734  } else {
735  adjust_factor += segment_penalty_dict_case_bad;
736  new_rating *= adjust_factor;
737  if (debug) tprintf(", C");
738  }
739  }
740  new_rating -= kRatingPad;
741  if (modify_rating) word->set_rating(new_rating);
742  if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
743  word->set_adjust_factor(adjust_factor);
744 }
745 
746 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
747  const WERD_CHOICE *word_ptr = &word;
748  WERD_CHOICE temp_word(word.unicharset());
749  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
750  copy_hyphen_info(&temp_word);
751  temp_word += word;
752  word_ptr = &temp_word;
753  }
754  if (word_ptr->length() == 0) return NO_PERM;
755  // Allocate vectors for holding current and updated
756  // active_dawgs and initialize them.
757  DawgPositionVector *active_dawgs = new DawgPositionVector[2];
758  init_active_dawgs(&(active_dawgs[0]), false);
759  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
760  int last_index = word_ptr->length() - 1;
761  // Call leter_is_okay for each letter in the word.
762  for (int i = hyphen_base_size(); i <= last_index; ++i) {
763  if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i),
764  i == last_index))) break;
765  // Swap active_dawgs, constraints with the corresponding updated vector.
766  if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
767  dawg_args.updated_dawgs = &(active_dawgs[0]);
768  ++(dawg_args.active_dawgs);
769  } else {
770  ++(dawg_args.updated_dawgs);
771  dawg_args.active_dawgs = &(active_dawgs[0]);
772  }
773  }
774  delete[] active_dawgs;
775  return valid_word_permuter(dawg_args.permuter, numbers_ok) ?
776  dawg_args.permuter : NO_PERM;
777 }
778 
779 bool Dict::valid_bigram(const WERD_CHOICE &word1,
780  const WERD_CHOICE &word2) const {
781  if (bigram_dawg_ == NULL) return false;
782 
783  // Extract the core word from the middle of each word with any digits
784  // replaced with question marks.
785  int w1start, w1end, w2start, w2end;
786  word1.punct_stripped(&w1start, &w1end);
787  word2.punct_stripped(&w2start, &w2end);
788 
789  // We don't want to penalize a single guillemet, hyphen, etc.
790  // But our bigram list doesn't have any information about punctuation.
791  if (w1start >= w1end) return word1.length() < 3;
792  if (w2start >= w2end) return word2.length() < 3;
793 
794  const UNICHARSET& uchset = getUnicharset();
795  GenericVector<UNICHAR_ID> bigram_string;
796  bigram_string.reserve(w1end + w2end + 1);
797  for (int i = w1start; i < w1end; i++) {
798  const GenericVector<UNICHAR_ID>& normed_ids =
799  getUnicharset().normed_ids(word1.unichar_id(i));
800  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
801  bigram_string.push_back(question_unichar_id_);
802  else
803  bigram_string += normed_ids;
804  }
805  bigram_string.push_back(UNICHAR_SPACE);
806  for (int i = w2start; i < w2end; i++) {
807  const GenericVector<UNICHAR_ID>& normed_ids =
808  getUnicharset().normed_ids(word2.unichar_id(i));
809  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
810  bigram_string.push_back(question_unichar_id_);
811  else
812  bigram_string += normed_ids;
813  }
814  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
815  for (int i = 0; i < bigram_string.size(); ++i) {
816  normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1,
817  0.0f, 0.0f);
818  }
819  return bigram_dawg_->word_in_dawg(normalized_word);
820 }
821 
823  if (word.length() == 0) return NO_PERM;
824  int i;
825  WERD_CHOICE new_word(word.unicharset());
826  int last_index = word.length() - 1;
827  int new_len = 0;
828  for (i = 0; i <= last_index; ++i) {
829  UNICHAR_ID unichar_id = (word.unichar_id(i));
830  if (getUnicharset().get_ispunctuation(unichar_id)) {
831  new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
832  } else if (!getUnicharset().get_isalpha(unichar_id) &&
833  !getUnicharset().get_isdigit(unichar_id)) {
834  return false; // neither punc, nor alpha, nor digit
835  } else if ((new_len = new_word.length()) == 0 ||
836  new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
837  new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
838  }
839  }
840  for (i = 0; i < dawgs_.size(); ++i) {
841  if (dawgs_[i] != NULL &&
842  dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
843  dawgs_[i]->word_in_dawg(new_word)) return true;
844  }
845  return false;
846 }
847 
850  const UNICHARSET &u_set = getUnicharset();
851  if (u_set.han_sid() > 0) return false;
852  if (u_set.katakana_sid() > 0) return false;
853  if (u_set.thai_sid() > 0) return false;
854  return true;
855 }
856 
857 } // namespace tesseract
DawgType type() const
Definition: dawg.h:128
PermuterType permuter() const
Definition: dawg.h:130
int katakana_sid() const
Definition: unicharset.h:889
int han_sid() const
Definition: unicharset.h:887
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:345
int thai_sid() const
Definition: unicharset.h:890
bool empty() const
Definition: genericvector.h:91
const STRING debug_string() const
Definition: ratngs.h:501
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:303
const CCUtil * getCCUtil() const
Definition: dict.h:91
char * user_words_file
Definition: dict.h:557
double segment_penalty_dict_nonword
Definition: dict.h:592
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
Definition: dawg.h:185
void delete_data_pointers()
virtual bool end_of_word(EDGE_REF edge_ref) const =0
const STRING & lang() const
Definition: dawg.h:129
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:455
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:38
double xheight_penalty_inconsistent
Definition: dict.h:577
int64_t EDGE_REF
Definition: dawg.h:55
EDGE_REF punc_ref
Definition: dawg.h:376
PermuterType
Definition: ratngs.h:238
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:218
Definition: strngs.h:45
GenericVector< int > SuccessorList
Definition: dawg.h:69
double segment_penalty_garbage
Definition: dict.h:597
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:136
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:822
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:356
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:420
int length() const
Definition: ratngs.h:299
int size() const
Definition: genericvector.h:72
PermuterType permuter
Definition: dict.h:82
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:52
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:300
double doc_dict_certainty_threshold
Definition: dict.h:636
int64_t NODE_REF
Definition: dawg.h:56
DawgPositionVector * active_dawgs
Definition: dict.h:80
char * user_words_suffix
Definition: dict.h:559
bool load_punc_dawg
Definition: dict.h:568
int push_back(T object)
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:402
#define REFFORMAT
Definition: dawg.h:93
void End()
Definition: dict.cpp:342
int null_sid() const
Definition: unicharset.h:882
char * user_patterns_file
Definition: dict.h:561
int dawg_debug_level
Definition: dict.h:601
const UNICHARSET * unicharset() const
Definition: ratngs.h:296
DawgPositionVector * updated_dawgs
Definition: dict.h:81
STRING language_data_path_prefix
Definition: ccutil.h:67
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82
const STRING & unichar_string() const
Definition: ratngs.h:537
void set_permuter(uint8_t perm)
Definition: ratngs.h:371
void set_rating(float new_val)
Definition: ratngs.h:365
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:200
#define tprintf(...)
Definition: tprintf.h:31
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:608
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:564
int UNICHAR_ID
Definition: unichar.h:35
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:192
bool FinishLoad()
Definition: dict.cpp:322
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:504
double xheight_penalty_subscripts
Definition: dict.h:574
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:318
int length() const
Definition: genericvector.h:86
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:126
bool load_number_dawg
Definition: dict.h:569
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:177
XHeightConsistencyEnum
Definition: dict.h:74
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:779
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:204
const char * string() const
Definition: strngs.cpp:198
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const
Definition: dict.cpp:526
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
Definition: dawg.h:389
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:69
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:309
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:834
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
bool IsSpaceDelimitedLang() const
Returns true if the language is space-delimited (not Chinese/Japanese, or Thai).
Definition: dict.cpp:849
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:289
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:666
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: dawg.h:196
void LoadLSTM(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:301
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:130
char * user_patterns_suffix
Definition: dict.h:563
void reserve(int size)
bool load_unambig_dawg
Definition: dict.h:566
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
float rating() const
Definition: ratngs.h:323
bool valid_end
Definition: dict.h:84
float certainty() const
Definition: ratngs.h:326
bool load_freq_dawg
Definition: dict.h:565
Dict(CCUtil *image_ptr)
Definition: dict.cpp:33
void set_adjust_factor(float factor)
Definition: ratngs.h:305
EDGE_REF dawg_ref
Definition: dawg.h:374
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:746
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:306
UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:430
#define CHARS_PER_LINE
Definition: cutil.h:57
Dawg * GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level, TessdataManager *data_file)
Definition: dawg_cache.cpp:45
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:315
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:126
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:364
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:448
bool load_bigram_dawg
Definition: dict.h:571
double segment_penalty_dict_case_bad
Definition: dict.h:588
double segment_penalty_dict_case_ok
Definition: dict.h:584
CCUtil ccutil
bool save_doc_words
Definition: dict.h:632
double segment_penalty_dict_frequent_word
Definition: dict.h:580
int GetTopScriptID() const
Definition: ratngs.cpp:656
double doc_dict_pending_threshold
Definition: dict.h:634
bool load_system_dawg
Definition: dict.h:564
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:581
int def_letter_is_okay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:366