All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
dict.cpp
Go to the documentation of this file.
1 // File: dict.cpp
3 // Description: dict class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include <stdio.h>
20 
21 #include "dict.h"
22 #include "unicodes.h"
23 
24 #ifdef _MSC_VER
25 #pragma warning(disable:4244) // Conversion warnings
26 #endif
27 #include "tprintf.h"
28 
29 namespace tesseract {
30 
31 class Image;
32 
// Member-initializer list and body of a Dict constructor. The line carrying
// the constructor signature is not present in this listing; the initializers
// below read an argument named `ccutil`.
//
// The initializer list:
//  - points letter_is_okay_ and probability_in_context_ at the def_* member
//    functions,
//  - stores the ccutil argument in ccutil_,
//  - declares every tunable parameter through a *_MEMBER / *_INIT_MEMBER
//    macro; each invocation passes (name, value, description string,
//    getCCUtil()->params()).
// The body then puts the remaining pointer and state members into a
// "nothing loaded yet" state.
34  : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
35  probability_in_context_(&tesseract::Dict::def_probability_in_context),
36  params_model_classify_(NULL),
37  ccutil_(ccutil),
// --- User-supplied word and pattern lists (read by Load()). ---
38  STRING_MEMBER(user_words_file, "",
39  "A filename of user-provided words.",
40  getCCUtil()->params()),
41  STRING_INIT_MEMBER(user_words_suffix, "",
42  "A suffix of user-provided words located in tessdata.",
43  getCCUtil()->params()),
44  STRING_MEMBER(user_patterns_file, "",
45  "A filename of user-provided patterns.",
46  getCCUtil()->params()),
47  STRING_INIT_MEMBER(user_patterns_suffix, "",
48  "A suffix of user-provided patterns located in "
49  "tessdata.",
50  getCCUtil()->params()),
// --- Switches that Load() consults before fetching each dawg. ---
51  BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
52  getCCUtil()->params()),
53  BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
54  getCCUtil()->params()),
55  BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
56  getCCUtil()->params()),
57  BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation"
58  " patterns.", getCCUtil()->params()),
59  BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number"
60  " patterns.", getCCUtil()->params()),
61  BOOL_INIT_MEMBER(load_bigram_dawg, true, "Load dawg with special word "
62  "bigrams.", getCCUtil()->params()),
// --- Penalties that adjust_word() adds into its adjust factor. ---
63  double_MEMBER(xheight_penalty_subscripts, 0.125,
64  "Score penalty (0.1 = 10%) added if there are subscripts "
65  "or superscripts in a word, but it is otherwise OK.",
66  getCCUtil()->params()),
67  double_MEMBER(xheight_penalty_inconsistent, 0.25,
68  "Score penalty (0.1 = 10%) added if an xheight is "
69  "inconsistent.", getCCUtil()->params()),
70  double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
71  "Score multiplier for word matches which have good case and"
72  "are frequent in the given language (lower is better).",
73  getCCUtil()->params()),
74  double_MEMBER(segment_penalty_dict_case_ok, 1.1,
75  "Score multiplier for word matches that have good case "
76  "(lower is better).", getCCUtil()->params()),
77  double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
78  "Default score multiplier for word matches, which may have "
79  "case issues (lower is better).",
80  getCCUtil()->params()),
81  double_MEMBER(segment_penalty_ngram_best_choice, 1.24,
82  "Multipler to for the best choice from the ngram model.",
83  getCCUtil()->params()),
84  double_MEMBER(segment_penalty_dict_nonword, 1.25,
85  "Score multiplier for glyph fragment segmentations which "
86  "do not match a dictionary word (lower is better).",
87  getCCUtil()->params()),
88  double_MEMBER(segment_penalty_garbage, 1.50,
89  "Score multiplier for poorly cased strings that are not in"
90  " the dictionary and generally look like garbage (lower is"
91  " better).", getCCUtil()->params()),
// --- Debugging, stopper and miscellaneous parameters. ---
92  STRING_MEMBER(output_ambig_words_file, "",
93  "Output file for ambiguities found in the dictionary",
94  getCCUtil()->params()),
95  INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info"
96  ", to 2 for more details, to 3 to see all the debug messages",
97  getCCUtil()->params()),
98  INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
99  getCCUtil()->params()),
100  INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.",
101  getCCUtil()->params()),
102  BOOL_MEMBER(use_only_first_uft8_step, false,
103  "Use only the first UTF8 step of the given string"
104  " when computing log probabilities.",
105  getCCUtil()->params()),
106  double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
107  getCCUtil()->params()),
108  double_MEMBER(stopper_nondict_certainty_base, -2.50,
109  "Certainty threshold for non-dict words",
110  getCCUtil()->params()),
111  double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
112  "Reject certainty offset",
113  getCCUtil()->params()),
114  INT_MEMBER(stopper_smallword_size, 2,
115  "Size of dict word to be treated as non-dict word",
116  getCCUtil()->params()),
117  double_MEMBER(stopper_certainty_per_char, -0.50, "Certainty to add"
118  " for each dict char above small word size.",
119  getCCUtil()->params()),
120  double_MEMBER(stopper_allowable_character_badness, 3.0,
121  "Max certaintly variation allowed in a word (in sigma)",
122  getCCUtil()->params()),
123  INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
124  getCCUtil()->params()),
125  BOOL_MEMBER(stopper_no_acceptable_choices, false,
126  "Make AcceptableChoice() always return false. Useful"
127  " when there is a need to explore all segmentations",
128  getCCUtil()->params()),
129  BOOL_MEMBER(save_raw_choices, false,
130  "Deprecated- backward compatablity only",
131  getCCUtil()->params()),
132  INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
133  "Max words to keep in list",
134  getCCUtil()->params()),
135  STRING_MEMBER(word_to_debug, "", "Word for which stopper debug"
136  " information should be printed to stdout",
137  getCCUtil()->params()),
138  STRING_MEMBER(word_to_debug_lengths, "",
139  "Lengths of unichars in word_to_debug",
140  getCCUtil()->params()),
141  INT_MEMBER(fragments_debug, 0, "Debug character fragments",
142  getCCUtil()->params()),
143  BOOL_MEMBER(segment_nonalphabetic_script, false,
144  "Don't use any alphabetic-specific tricks."
145  "Set to true in the traineddata config file for"
146  " scripts that are cursive or inherently fixed-pitch",
147  getCCUtil()->params()),
148  BOOL_MEMBER(save_doc_words, 0, "Save Document Words",
149  getCCUtil()->params()),
150  double_MEMBER(doc_dict_pending_threshold, 0.0,
151  "Worst certainty for using pending dictionary",
152  getCCUtil()->params()),
153  double_MEMBER(doc_dict_certainty_threshold, -2.25,
154  "Worst certainty for words that can be inserted into the"
155  "document dictionary", getCCUtil()->params()),
156  INT_MEMBER(max_permuter_attempts, 10000, "Maximum number of different"
157  " character choices to consider during permutation."
158  " This limit is especially useful when user patterns"
159  " are specified, since overly generic patterns can result in"
160  " dawg search exploring an overly large number of options.",
161  getCCUtil()->params()) {
// Nothing is loaded at construction time: the dawg pointers and the dawg
// cache stay NULL until Load() assigns them.
162  dang_ambigs_table_ = NULL;
163  replace_ambigs_table_ = NULL;
164  reject_offset_ = 0.0;
166  hyphen_word_ = NULL;
167  last_word_on_line_ = false;
168  hyphen_unichar_id_ = INVALID_UNICHAR_ID;
169  document_words_ = NULL;
170  dawg_cache_ = NULL;
171  dawg_cache_is_ours_ = false;
172  pending_words_ = NULL;
173  bigram_dawg_ = NULL;
174  freq_dawg_ = NULL;
175  punc_dawg_ = NULL;
176  unambig_dawg_ = NULL;
177  wordseg_rating_adjust_factor_ = -1.0f;
178  output_ambig_words_file_ = NULL;
179 }
180 
// Cleanup body. The signature line is not present in this listing;
// presumably this is Dict::~Dict() -- TODO confirm against dict.h.
// Deletes the stored hyphen word and closes the ambiguity output file when
// either is non-NULL.
182  if (hyphen_word_ != NULL) delete hyphen_word_;
183  if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_);
184 }
185 
// Body of the accessor for the process-wide DawgCache (its signature line is
// not present in this listing). Returns a pointer to a function-local static
// DawgCache that is allocated on the first call and never deleted here.
187  // We dynamically allocate this global cache (a singleton) so it will outlive
188  // every Tesseract instance (even those that someone else might declare as
189  // global statics).
190  static DawgCache *cache = new DawgCache(); // evil global singleton
191  return cache;
192 }
193 
// Loads every dictionary (dawg) this Dict searches and builds the successor
// table used by def_letter_is_okay().
//
// dawg_cache: cache to fetch the squished dawgs from. When NULL, a private
//   DawgCache is allocated and dawg_cache_is_ours_ is set so that End()
//   deletes it.
//
// Any previously loaded state is released first via End().
// NOTE(review): this listing is missing a few lines inside the user
// words/patterns sections (marked below); the comments there describe only
// what is visible.
194 void Dict::Load(DawgCache *dawg_cache) {
195  STRING name;
196  STRING &lang = getCCUtil()->lang;
197 
// Reloading: drop whatever a previous Load() left behind.
198  if (dawgs_.length() != 0) this->End();
199 
// Cache the unichar ids of the symbols that get special treatment.
200  apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
201  question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
202  slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
203  hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);
204 
// Use the caller's cache when given, otherwise own a private one.
205  if (dawg_cache != NULL) {
206  dawg_cache_ = dawg_cache;
207  dawg_cache_is_ours_ = false;
208  } else {
209  dawg_cache_ = new DawgCache();
210  dawg_cache_is_ours_ = true;
211  }
212 
213  TessdataManager &tessdata_manager = getCCUtil()->tessdata_manager;
214  const char *data_file_name = tessdata_manager.GetDataFileName().string();
215 
216  // Load dawgs_.
// Each dawg is fetched only when its load_* parameter is set, and is added
// to dawgs_ only when the cache actually returned one.
217  if (load_punc_dawg) {
218  punc_dawg_ = dawg_cache_->GetSquishedDawg(
219  lang, data_file_name, TESSDATA_PUNC_DAWG, dawg_debug_level);
220  if (punc_dawg_) dawgs_ += punc_dawg_;
221  }
222  if (load_system_dawg) {
223  Dawg *system_dawg = dawg_cache_->GetSquishedDawg(
224  lang, data_file_name, TESSDATA_SYSTEM_DAWG, dawg_debug_level);
225  if (system_dawg) dawgs_ += system_dawg;
226  }
227  if (load_number_dawg) {
228  Dawg *number_dawg = dawg_cache_->GetSquishedDawg(
229  lang, data_file_name, TESSDATA_NUMBER_DAWG, dawg_debug_level);
230  if (number_dawg) dawgs_ += number_dawg;
231  }
// The bigram dawg is kept only in bigram_dawg_ and is deliberately not added
// to dawgs_; End() releases it separately.
232  if (load_bigram_dawg) {
233  bigram_dawg_ = dawg_cache_->GetSquishedDawg(
234  lang, data_file_name, TESSDATA_BIGRAM_DAWG, dawg_debug_level);
235  }
236  if (load_freq_dawg) {
237  freq_dawg_ = dawg_cache_->GetSquishedDawg(
238  lang, data_file_name, TESSDATA_FREQ_DAWG, dawg_debug_level);
239  if (freq_dawg_) { dawgs_ += freq_dawg_; }
240  }
241  if (load_unambig_dawg) {
242  unambig_dawg_ = dawg_cache_->GetSquishedDawg(
243  lang, data_file_name, TESSDATA_UNAMBIG_DAWG, dawg_debug_level);
244  if (unambig_dawg_) dawgs_ += unambig_dawg_;
245  }
246 
// Optional user word list: an explicit file name takes precedence over the
// suffix form. The trie is added to dawgs_ only if the list loads.
247  if (((STRING &)user_words_suffix).length() > 0 ||
248  ((STRING &)user_words_file).length() > 0) {
249  Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
250  getUnicharset().size(), dawg_debug_level);
251  if (((STRING &)user_words_file).length() > 0) {
252  name = user_words_file;
253  } else {
// (dict.cpp line 254 is absent from this listing; presumably it seeds `name`
// before the suffix is appended -- TODO confirm.)
255  name += user_words_suffix;
256  }
// (dict.cpp line 258, the remaining argument(s) of this call, is absent from
// this listing.)
257  if (!trie_ptr->read_and_add_word_list(name.string(), getUnicharset(),
259  tprintf("Error: failed to load %s\n", name.string());
260  delete trie_ptr;
261  } else {
262  dawgs_ += trie_ptr;
263  }
264  }
265 
// Optional user pattern list, handled the same way as the user word list.
266  if (((STRING &)user_patterns_suffix).length() > 0 ||
267  ((STRING &)user_patterns_file).length() > 0) {
268  Trie *trie_ptr = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
269  getUnicharset().size(), dawg_debug_level);
270  trie_ptr->initialize_patterns(&(getUnicharset()));
271  if (((STRING &)user_patterns_file).length() > 0) {
272  name = user_patterns_file;
273  } else {
// (dict.cpp line 274 is absent from this listing; presumably it seeds `name`
// before the suffix is appended -- TODO confirm.)
275  name += user_patterns_suffix;
276  }
277  if (!trie_ptr->read_pattern_list(name.string(), getUnicharset())) {
278  tprintf("Error: failed to load %s\n", name.string());
279  delete trie_ptr;
280  } else {
281  dawgs_ += trie_ptr;
282  }
283  }
284 
// The document dictionary starts empty, is always present in dawgs_, and is
// grown by add_document_word().
285  document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
286  getUnicharset().size(), dawg_debug_level);
287  dawgs_ += document_words_;
288 
289  // This dawg is temporary and should not be searched by letter_is_ok.
290  pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
291  getUnicharset().size(), dawg_debug_level);
292 
293  // Construct a list of corresponding successors for each dawg. Each entry i
294  // in the successors_ vector is a vector of integers that represent the
295  // indices into the dawgs_ vector of the successors for dawg i.
296  successors_.reserve(dawgs_.length());
297  for (int i = 0; i < dawgs_.length(); ++i) {
298  const Dawg *dawg = dawgs_[i];
299  SuccessorList *lst = new SuccessorList();
300  for (int j = 0; j < dawgs_.length(); ++j) {
301  const Dawg *other = dawgs_[j];
// j is a successor of i when both dawgs exist, share a language, and the
// kDawgSuccessors table allows the type transition.
302  if (dawg != NULL && other != NULL &&
303  (dawg->lang() == other->lang()) &&
304  kDawgSuccessors[dawg->type()][other->type()]) *lst += j;
305  }
306  successors_ += lst;
307  }
308 }
309 
310 void Dict::End() {
311  if (dawgs_.length() == 0)
312  return; // Not safe to call twice.
313  for (int i = 0; i < dawgs_.size(); i++) {
314  if (!dawg_cache_->FreeDawg(dawgs_[i])) {
315  delete dawgs_[i];
316  }
317  }
318  dawg_cache_->FreeDawg(bigram_dawg_);
319  if (dawg_cache_is_ours_) {
320  delete dawg_cache_;
321  dawg_cache_ = NULL;
322  }
323  successors_.delete_data_pointers();
324  dawgs_.clear();
325  successors_.clear();
326  document_words_ = NULL;
327  if (pending_words_ != NULL) {
328  delete pending_words_;
329  pending_words_ = NULL;
330  }
331 }
332 
333 // Returns true if in light of the current state unichar_id is allowed
334 // according to at least one of the dawgs in the dawgs_ vector.
335 // See more extensive comments in dict.h where this function is declared.
336 int Dict::def_letter_is_okay(void* void_dawg_args,
337  UNICHAR_ID unichar_id,
338  bool word_end) const {
339  DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);
340 
341  if (dawg_debug_level >= 3) {
342  tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
343  " num active dawgs=%d\n",
344  getUnicharset().debug_str(unichar_id).string(), word_end,
345  dawg_args->active_dawgs->length());
346  }
347 
348  // Do not accept words that contain kPatternUnicharID.
349  // (otherwise pattern dawgs would not function correctly).
350  // Do not accept words containing INVALID_UNICHAR_IDs.
351  if (unichar_id == Dawg::kPatternUnicharID ||
352  unichar_id == INVALID_UNICHAR_ID) {
353  dawg_args->permuter = NO_PERM;
354  return NO_PERM;
355  }
356 
357  // Initialization.
358  PermuterType curr_perm = NO_PERM;
359  dawg_args->updated_dawgs->clear();
360 
361  // Go over the active_dawgs vector and insert DawgPosition records
362  // with the updated ref (an edge with the corresponding unichar id) into
363  // dawg_args->updated_pos.
364  for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
365  const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
366  const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : NULL;
367  const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : NULL;
368 
369  if (!dawg && !punc_dawg) {
370  // shouldn't happen.
371  tprintf("Received DawgPosition with no dawg or punc_dawg. wth?\n");
372  continue;
373  }
374  if (!dawg) {
375  // We're in the punctuation dawg. A core dawg has not been chosen.
376  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
377  EDGE_REF punc_transition_edge = punc_dawg->edge_char_of(
378  punc_node, Dawg::kPatternUnicharID, word_end);
379  if (punc_transition_edge != NO_EDGE) {
380  // Find all successors, and see which can transition.
381  const SuccessorList &slist = *(successors_[pos.punc_index]);
382  for (int s = 0; s < slist.length(); ++s) {
383  int sdawg_index = slist[s];
384  const Dawg *sdawg = dawgs_[sdawg_index];
385  UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg);
386  EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
387  if (dawg_edge != NO_EDGE) {
388  if (dawg_debug_level >=3) {
389  tprintf("Letter found in dawg %d\n", sdawg_index);
390  }
391  dawg_args->updated_dawgs->add_unique(
392  DawgPosition(sdawg_index, dawg_edge,
393  pos.punc_index, punc_transition_edge, false),
394  dawg_debug_level > 0,
395  "Append transition from punc dawg to current dawgs: ");
396  if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
397  }
398  }
399  }
400  EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id,
401  word_end);
402  if (punc_edge != NO_EDGE) {
403  if (dawg_debug_level >=3) {
404  tprintf("Letter found in punctuation dawg\n");
405  }
406  dawg_args->updated_dawgs->add_unique(
407  DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
408  dawg_debug_level > 0,
409  "Extend punctuation dawg: ");
410  if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
411  }
412  continue;
413  }
414 
415  if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
416  // We can end the main word here.
417  // If we can continue on the punc ref, add that possibility.
418  NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
419  EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE
420  : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
421  if (punc_edge != NO_EDGE) {
422  dawg_args->updated_dawgs->add_unique(
424  pos.punc_index, punc_edge, true),
425  dawg_debug_level > 0,
426  "Return to punctuation dawg: ");
427  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
428  }
429  }
430 
431  if (pos.back_to_punc) continue;
432 
433  // If we are dealing with the pattern dawg, look up all the
434  // possible edges, not only for the exact unichar_id, but also
435  // for all its character classes (alpha, digit, etc).
436  if (dawg->type() == DAWG_TYPE_PATTERN) {
437  ProcessPatternEdges(dawg, pos, unichar_id, word_end,
438  dawg_args->updated_dawgs, &curr_perm);
439  // There can't be any successors to dawg that is of type
440  // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
441  continue;
442  }
443 
444  // Find the edge out of the node for the unichar_id.
445  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
446  EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
447  : dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end);
448 
449  if (dawg_debug_level >= 3) {
450  tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
451  pos.dawg_index, node, edge);
452  }
453 
454  if (edge != NO_EDGE) { // the unichar was found in the current dawg
455  if (dawg_debug_level >=3) {
456  tprintf("Letter found in dawg %d\n", pos.dawg_index);
457  }
458  if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
459  if (dawg_debug_level >= 3) {
460  tprintf("Punctuation constraint not satisfied at end of word.\n");
461  }
462  continue;
463  }
464  if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
465  dawg_args->updated_dawgs->add_unique(
466  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
467  false),
468  dawg_debug_level > 0,
469  "Append current dawg to updated active dawgs: ");
470  }
471  } // end for
472  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
473  // or if we found the current letter in a non-punctuation dawg. This
474  // allows preserving information on which dawg the "core" word came from.
475  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
476  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
477  (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
478  dawg_args->permuter = curr_perm;
479  }
480  if (dawg_debug_level >= 2) {
481  tprintf("Returning %d for permuter code for this character.\n");
482  }
483  return dawg_args->permuter;
484 }
485 
486 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos,
487  UNICHAR_ID unichar_id, bool word_end,
488  DawgPositionVector *updated_dawgs,
489  PermuterType *curr_perm) const {
490  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
491  // Try to find the edge corresponding to the exact unichar_id and to all the
492  // edges corresponding to the character class of unichar_id.
493  GenericVector<UNICHAR_ID> unichar_id_patterns;
494  unichar_id_patterns.push_back(unichar_id);
495  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),
496  &unichar_id_patterns);
497  for (int i = 0; i < unichar_id_patterns.size(); ++i) {
498  // On the first iteration check all the outgoing edges.
499  // On the second iteration check all self-loops.
500  for (int k = 0; k < 2; ++k) {
501  EDGE_REF edge = (k == 0)
502  ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)
503  : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_patterns[i], word_end);
504  if (edge == NO_EDGE) continue;
505  if (dawg_debug_level >= 3) {
506  tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
507  pos.dawg_index, node, edge);
508  tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
509  }
510  if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();
511  updated_dawgs->add_unique(
512  DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
513  pos.back_to_punc),
514  dawg_debug_level > 0,
515  "Append current dawg to updated active dawgs: ");
516  }
517  }
518 }
519 
520 // Fill the given active_dawgs vector with dawgs that could contain the
521 // beginning of the word. If hyphenated() returns true, copy the entries
522 // from hyphen_active_dawgs_ instead.
524  bool ambigs_mode) const {
525  int i;
526  if (hyphenated()) {
527  *active_dawgs = hyphen_active_dawgs_;
528  if (dawg_debug_level >= 3) {
529  for (i = 0; i < hyphen_active_dawgs_.size(); ++i) {
530  tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
531  hyphen_active_dawgs_[i].dawg_index,
532  hyphen_active_dawgs_[i].dawg_ref);
533  }
534  }
535  } else {
536  default_dawgs(active_dawgs, ambigs_mode);
537  }
538 }
539 
541  bool suppress_patterns) const {
542  bool punc_dawg_available =
543  (punc_dawg_ != NULL) &&
544  punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
545 
546  for (int i = 0; i < dawgs_.length(); i++) {
547  if (dawgs_[i] != NULL &&
548  !(suppress_patterns && (dawgs_[i])->type() == DAWG_TYPE_PATTERN)) {
549  int dawg_ty = dawgs_[i]->type();
550  bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
551  if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
552  *dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
553  if (dawg_debug_level >= 3) {
554  tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i,
555  NO_EDGE);
556  }
557  } else if (!punc_dawg_available || !subsumed_by_punc) {
558  *dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
559  if (dawg_debug_level >= 3) {
560  tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
561  }
562  }
563  }
564  }
565 }
566 
567 void Dict::add_document_word(const WERD_CHOICE &best_choice) {
568  // Do not add hyphenated word parts to the document dawg.
569  // hyphen_word_ will be non-NULL after the set_hyphen_word() is
570  // called when the first part of the hyphenated word is
571  // discovered and while the second part of the word is recognized.
572  // hyphen_word_ is cleared in cc_recg() before the next word on
573  // the line is recognized.
574  if (hyphen_word_) return;
575 
576  char filename[CHARS_PER_LINE];
577  FILE *doc_word_file;
578  int stringlen = best_choice.length();
579 
580  if (valid_word(best_choice) || stringlen < 2)
581  return;
582 
583  // Discard words that contain >= kDocDictMaxRepChars repeating unichars.
584  if (best_choice.length() >= kDocDictMaxRepChars) {
585  int num_rep_chars = 1;
586  UNICHAR_ID uch_id = best_choice.unichar_id(0);
587  for (int i = 1; i < best_choice.length(); ++i) {
588  if (best_choice.unichar_id(i) != uch_id) {
589  num_rep_chars = 1;
590  uch_id = best_choice.unichar_id(i);
591  } else {
592  ++num_rep_chars;
593  if (num_rep_chars == kDocDictMaxRepChars) return;
594  }
595  }
596  }
597 
598  if (best_choice.certainty() < doc_dict_certainty_threshold ||
599  stringlen == 2) {
600  if (best_choice.certainty() < doc_dict_pending_threshold)
601  return;
602 
603  if (!pending_words_->word_in_dawg(best_choice)) {
604  if (stringlen > 2 ||
605  (stringlen == 2 &&
606  getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
607  getUnicharset().get_isupper(best_choice.unichar_id(1)))) {
608  pending_words_->add_word_to_dawg(best_choice);
609  }
610  return;
611  }
612  }
613 
614  if (save_doc_words) {
615  strcpy(filename, getCCUtil()->imagefile.string());
616  strcat(filename, ".doc");
617  doc_word_file = open_file (filename, "a");
618  fprintf(doc_word_file, "%s\n",
619  best_choice.debug_string().string());
620  fclose(doc_word_file);
621  }
622  document_words_->add_word_to_dawg(best_choice);
623 }
624 
// Parameter tail and body of the rating-adjustment routine (called
// adjust_word elsewhere in this project; the first signature line, which
// declares `word`, is not present in this listing).
//
// Builds adjust_factor from additional_adjust, an optional x-height penalty
// and exactly one segment_penalty_* parameter chosen by whether the word is a
// dictionary word, whether its case is acceptable and, for non-dictionary
// words, whether its punctuation is valid. The padded rating is multiplied by
// that factor. The factor is always stored on the word; the rating is stored
// only when modify_rating is true. With debug set, a one-line trace of the
// decision is printed.
626  bool nonword,
627  XHeightConsistencyEnum xheight_consistency,
628  float additional_adjust,
629  bool modify_rating,
630  bool debug) {
// Han-script words bypass the case and punctuation checks.
631  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
632  word->GetTopScriptID() == getUnicharset().han_sid());
633  bool case_is_ok = (is_han || case_ok(*word, getUnicharset()));
634  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
635 
636  float adjust_factor = additional_adjust;
637  float new_rating = word->rating();
// kRatingPad is added before the multiplication and removed again afterwards.
638  new_rating += kRatingPad;
639  const char *xheight_triggered = "";
640  if (word->length() > 1) {
641  // Calculate x-height and y-offset consistency penalties.
642  switch (xheight_consistency) {
643  case XH_INCONSISTENT:
644  adjust_factor += xheight_penalty_inconsistent;
645  xheight_triggered = ", xhtBAD";
646  break;
647  case XH_SUBNORMAL:
648  adjust_factor += xheight_penalty_subscripts;
649  xheight_triggered = ", xhtSUB";
650  break;
651  case XH_GOOD:
652  // leave the factor alone - all good!
653  break;
654  }
655  // TODO(eger): if nonword is true, but there is a "core" that's a dict
656  // word, negate nonword status.
657  } else {
658  if (debug) {
659  tprintf("Consistency could not be calculated.\n");
660  }
661  }
662  if (debug) {
663  tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
664  word->unichar_string().string(), word->rating(),
665  xheight_triggered);
666  }
667 
668  if (nonword) { // non-dictionary word
669  if (case_is_ok && punc_is_ok) {
670  adjust_factor += segment_penalty_dict_nonword;
671  new_rating *= adjust_factor;
672  if (debug) tprintf(", W");
673  } else {
674  adjust_factor += segment_penalty_garbage;
675  new_rating *= adjust_factor;
676  if (debug) {
677  if (!case_is_ok) tprintf(", C");
678  if (!punc_is_ok) tprintf(", P");
679  }
680  }
681  } else { // dictionary word
682  if (case_is_ok) {
// Frequent-word branch: requires a loaded freq_dawg_ that contains the word.
683  if (!is_han && freq_dawg_ != NULL && freq_dawg_->word_in_dawg(*word)) {
685  adjust_factor += segment_penalty_dict_frequent_word;
686  new_rating *= adjust_factor;
687  if (debug) tprintf(", F");
688  } else {
689  adjust_factor += segment_penalty_dict_case_ok;
690  new_rating *= adjust_factor;
691  if (debug) tprintf(", ");
692  }
693  } else {
694  adjust_factor += segment_penalty_dict_case_bad;
695  new_rating *= adjust_factor;
696  if (debug) tprintf(", C");
697  }
698  }
699  new_rating -= kRatingPad;
700  if (modify_rating) word->set_rating(new_rating);
701  if (debug) tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
702  word->set_adjust_factor(adjust_factor);
703 }
704 
705 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {
706  const WERD_CHOICE *word_ptr = &word;
707  WERD_CHOICE temp_word(word.unicharset());
708  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
709  copy_hyphen_info(&temp_word);
710  temp_word += word;
711  word_ptr = &temp_word;
712  }
713  if (word_ptr->length() == 0) return NO_PERM;
714  // Allocate vectors for holding current and updated
715  // active_dawgs and initialize them.
716  DawgPositionVector *active_dawgs = new DawgPositionVector[2];
717  init_active_dawgs(&(active_dawgs[0]), false);
718  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
719  int last_index = word_ptr->length() - 1;
720  // Call leter_is_okay for each letter in the word.
721  for (int i = hyphen_base_size(); i <= last_index; ++i) {
722  if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i),
723  i == last_index))) break;
724  // Swap active_dawgs, constraints with the corresponding updated vector.
725  if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
726  dawg_args.updated_dawgs = &(active_dawgs[0]);
727  ++(dawg_args.active_dawgs);
728  } else {
729  ++(dawg_args.updated_dawgs);
730  dawg_args.active_dawgs = &(active_dawgs[0]);
731  }
732  }
733  delete[] active_dawgs;
734  return valid_word_permuter(dawg_args.permuter, numbers_ok) ?
735  dawg_args.permuter : NO_PERM;
736 }
737 
738 bool Dict::valid_bigram(const WERD_CHOICE &word1,
739  const WERD_CHOICE &word2) const {
740  if (bigram_dawg_ == NULL) return false;
741 
742  // Extract the core word from the middle of each word with any digits
743  // replaced with question marks.
744  int w1start, w1end, w2start, w2end;
745  word1.punct_stripped(&w1start, &w1end);
746  word2.punct_stripped(&w2start, &w2end);
747 
748  // We don't want to penalize a single guillemet, hyphen, etc.
749  // But our bigram list doesn't have any information about punctuation.
750  if (w1start >= w1end) return word1.length() < 3;
751  if (w2start >= w2end) return word2.length() < 3;
752 
753  const UNICHARSET& uchset = getUnicharset();
754  GenericVector<UNICHAR_ID> bigram_string;
755  bigram_string.reserve(w1end + w2end + 1);
756  for (int i = w1start; i < w1end; i++) {
757  const GenericVector<UNICHAR_ID>& normed_ids =
758  getUnicharset().normed_ids(word1.unichar_id(i));
759  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
760  bigram_string.push_back(question_unichar_id_);
761  else
762  bigram_string += normed_ids;
763  }
764  bigram_string.push_back(UNICHAR_SPACE);
765  for (int i = w2start; i < w2end; i++) {
766  const GenericVector<UNICHAR_ID>& normed_ids =
767  getUnicharset().normed_ids(word2.unichar_id(i));
768  if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
769  bigram_string.push_back(question_unichar_id_);
770  else
771  bigram_string += normed_ids;
772  }
773  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
774  for (int i = 0; i < bigram_string.size(); ++i) {
775  normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1,
776  0.0f, 0.0f);
777  }
778  return bigram_dawg_->word_in_dawg(normalized_word);
779 }
780 
// Body of the punctuation check used by adjust_word() (called
// valid_punctuation there; its signature line is not present in this
// listing).
//
// Reduces `word` to a punctuation pattern: punctuation unichars are kept, and
// each run of alphabetic/digit unichars collapses to a single
// Dawg::kPatternUnicharID. Returns true if any loaded punctuation dawg
// contains that pattern. A unichar that is neither punctuation, alpha nor
// digit makes the result false immediately.
// NOTE(review): the empty-word exit returns NO_PERM while every other exit
// returns true/false -- confirm the declared return type and whether a plain
// `false` is intended.
782  if (word.length() == 0) return NO_PERM;
783  int i;
784  WERD_CHOICE new_word(word.unicharset());
785  int last_index = word.length() - 1;
786  int new_len = 0;
787  for (i = 0; i <= last_index; ++i) {
788  UNICHAR_ID unichar_id = (word.unichar_id(i));
789  if (getUnicharset().get_ispunctuation(unichar_id)) {
790  new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
791  } else if (!getUnicharset().get_isalpha(unichar_id) &&
792  !getUnicharset().get_isdigit(unichar_id)) {
793  return false; // neither punc, nor alpha, nor digit
// Alpha/digit: append the pattern id unless the previous appended unichar
// already is the pattern id, so a run contributes only one.
794  } else if ((new_len = new_word.length()) == 0 ||
795  new_word.unichar_id(new_len-1) != Dawg::kPatternUnicharID) {
796  new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
797  }
798  }
// Accept if any loaded punctuation dawg contains the reduced pattern.
799  for (i = 0; i < dawgs_.size(); ++i) {
800  if (dawgs_[i] != NULL &&
801  dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
802  dawgs_[i]->word_in_dawg(new_word)) return true;
803  }
804  return false;
805 }
806 
807 
808 } // namespace tesseract
Dict(CCUtil *image_ptr)
Definition: dict.cpp:33
const STRING & lang() const
Definition: dawg.h:128
DawgPositionVector * active_dawgs
Definition: dict.h:81
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:307
int size() const
Definition: genericvector.h:72
float rating() const
Definition: ratngs.h:324
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
int length() const
Definition: genericvector.h:79
virtual bool end_of_word(EDGE_REF edge_ref) const =0
const CCUtil * getCCUtil() const
Definition: dict.h:90
EDGE_REF dawg_ref
Definition: dawg.h:362
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:361
int length() const
Definition: ratngs.h:300
int push_back(T object)
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:449
bool load_bigram_dawg
Definition: dict.h:561
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:705
int null_sid() const
Definition: unicharset.h:831
#define tprintf(...)
Definition: tprintf.h:31
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgPositionVector *updated_dawgs, PermuterType *current_permuter) const
Definition: dict.cpp:486
int han_sid() const
Definition: unicharset.h:836
PermuterType
Definition: ratngs.h:240
double segment_penalty_dict_case_ok
Definition: dict.h:574
void set_permuter(uinT8 perm)
Definition: ratngs.h:372
XHeightConsistencyEnum
Definition: dict.h:75
int def_letter_is_okay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:336
double xheight_penalty_inconsistent
Definition: dict.h:567
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:523
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:304
double segment_penalty_dict_case_bad
Definition: dict.h:578
TessdataManager tessdata_manager
Definition: ccutil.h:71
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:540
bool save_doc_words
Definition: dict.h:628
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:203
char * user_patterns_file
Definition: dict.h:551
EDGE_REF punc_ref
Definition: dawg.h:364
const STRING & unichar_string() const
Definition: ratngs.h:524
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:125
int GetTopScriptID() const
Definition: ratngs.cpp:653
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:70
void End()
Definition: dict.cpp:310
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:178
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
const UNICHARSET * unicharset() const
Definition: ratngs.h:297
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
double doc_dict_pending_threshold
Definition: dict.h:630
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:316
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:186
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:135
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:447
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:781
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:58
double xheight_penalty_subscripts
Definition: dict.h:564
Dawg * GetSquishedDawg(const STRING &lang, const char *data_file_name, TessdataType tessdata_dawg_type, int debug_level)
Definition: dawg_cache.cpp:47
float certainty() const
Definition: ratngs.h:327
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:129
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:352
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
void delete_data_pointers()
name_table name
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
Definition: dawg.h:184
UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:422
double segment_penalty_dict_frequent_word
Definition: dict.h:570
const STRING debug_string() const
Definition: ratngs.h:502
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:301
const STRING & GetDataFileName() const
char * user_patterns_suffix
Definition: dict.h:553
bool load_system_dawg
Definition: dict.h:554
int UNICHAR_ID
Definition: unichar.h:33
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:738
double doc_dict_certainty_threshold
Definition: dict.h:632
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
STRING language_data_path_prefix
Definition: ccutil.h:70
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: dawg.h:191
int dawg_debug_level
Definition: dict.h:595
bool load_unambig_dawg
Definition: dict.h:556
bool load_freq_dawg
Definition: dict.h:555
#define REFFORMAT
Definition: dawg.h:92
void Load(DawgCache *dawg_cache)
Definition: dict.cpp:194
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:291
GenericVector< int > SuccessorList
Definition: dawg.h:68
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:347
char * user_words_suffix
Definition: dict.h:549
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
void reserve(int size)
DawgType type() const
Definition: dawg.h:127
double segment_penalty_dict_nonword
Definition: dict.h:586
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:319
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82
double segment_penalty_garbage
Definition: dict.h:591
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
Definition: dawg.h:385
STRING lang
Definition: ccutil.h:69
#define CHARS_PER_LINE
Definition: cutil.h:57
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:310
DawgPositionVector * updated_dawgs
Definition: dict.h:82
inT64 EDGE_REF
Definition: dawg.h:54
char * user_words_file
Definition: dict.h:547
Definition: strngs.h:44
#define NULL
Definition: host.h:144
inT64 NODE_REF
Definition: dawg.h:55
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:567
const char * string() const
Definition: strngs.cpp:193
PermuterType permuter() const
Definition: dawg.h:129
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:41
void set_adjust_factor(float factor)
Definition: ratngs.h:306
CCUtil ccutil
bool load_number_dawg
Definition: dict.h:559
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:412
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:625
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:409
void set_rating(float new_val)
Definition: ratngs.h:366
PermuterType permuter
Definition: dict.h:83
bool load_punc_dawg
Definition: dict.h:558