All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
permdawg.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: permdawg.c (Formerly permdawg.c)
5  * Description: Scale word choices by a dictionary
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Tue Jul 9 15:43:18 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 
29 #include "cutil.h"
30 #include "dawg.h"
31 #include "freelist.h"
32 #include "globals.h"
33 #include "ndminx.h"
34 #include "stopper.h"
35 #include "tprintf.h"
36 #include "params.h"
37 
38 #include <ctype.h>
39 #include "dict.h"
40 
41 /*----------------------------------------------------------------------
42  F u n c t i o n s
43 ----------------------------------------------------------------------*/
44 namespace tesseract {
45 
53  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
54  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
55  bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
56  WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) {
57  DawgArgs *more_args = reinterpret_cast<DawgArgs*>(void_more_args);
58  word_ending = (char_choice_index == char_choices.size()-1);
59  int word_index = word->length() - 1;
60  if (best_choice->rating() < *limit) return;
61  // Look up char in DAWG
62 
63  // If the current unichar is an ngram first try calling
64  // letter_is_okay() for each unigram it contains separately.
65  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
66  bool checked_unigrams = false;
67  if (getUnicharset().get_isngram(orig_uch_id)) {
68  if (dawg_debug_level) {
69  tprintf("checking unigrams in an ngram %s\n",
70  getUnicharset().debug_str(orig_uch_id).string());
71  }
72  int num_unigrams = 0;
73  word->remove_last_unichar_id();
75  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
76  // Since the string came out of the unicharset, failure is impossible.
77  ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, NULL,
78  NULL));
79  bool unigrams_ok = true;
80  // Construct DawgArgs that reflect the current state.
81  DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
82  DawgPositionVector unigram_updated_dawgs;
83  DawgArgs unigram_dawg_args(&unigram_active_dawgs,
84  &unigram_updated_dawgs,
85  more_args->permuter);
86  // Check unigrams in the ngram with letter_is_okay().
87  for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
88  UNICHAR_ID uch_id = encoding[i];
89  ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
90  ++num_unigrams;
91  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
92  unigrams_ok = (this->*letter_is_okay_)(
93  &unigram_dawg_args,
94  word->unichar_id(word_index+num_unigrams-1),
95  word_ending && i == encoding.size() - 1);
96  (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
97  if (dawg_debug_level) {
98  tprintf("unigram %s is %s\n",
99  getUnicharset().debug_str(uch_id).string(),
100  unigrams_ok ? "OK" : "not OK");
101  }
102  }
103  // Restore the word and copy the updated dawg state if needed.
104  while (num_unigrams-- > 0) word->remove_last_unichar_id();
105  word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
106  if (unigrams_ok) {
107  checked_unigrams = true;
108  more_args->permuter = unigram_dawg_args.permuter;
109  *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
110  }
111  }
112 
113  // Check which dawgs from the dawgs_ vector contain the word
114  // up to and including the current unichar.
115  if (checked_unigrams || (this->*letter_is_okay_)(
116  more_args, word->unichar_id(word_index), word_ending)) {
117  // Add a new word choice
118  if (word_ending) {
119  if (dawg_debug_level) {
120  tprintf("found word = %s\n", word->debug_string().string());
121  }
122  if (strcmp(output_ambig_words_file.string(), "") != 0) {
123  if (output_ambig_words_file_ == NULL) {
124  output_ambig_words_file_ =
125  fopen(output_ambig_words_file.string(), "wb+");
126  if (output_ambig_words_file_ == NULL) {
127  tprintf("Failed to open output_ambig_words_file %s\n",
128  output_ambig_words_file.string());
129  exit(1);
130  }
131  STRING word_str;
132  word->string_and_lengths(&word_str, NULL);
133  word_str += " ";
134  fprintf(output_ambig_words_file_, "%s", word_str.string());
135  }
136  STRING word_str;
137  word->string_and_lengths(&word_str, NULL);
138  word_str += " ";
139  fprintf(output_ambig_words_file_, "%s", word_str.string());
140  }
141  WERD_CHOICE *adjusted_word = word;
142  adjusted_word->set_permuter(more_args->permuter);
143  update_best_choice(*adjusted_word, best_choice);
144  } else { // search the next letter
145  // Make updated_* point to the next entries in the DawgPositionVector
146  // arrays (that were originally created in dawg_permute_and_select)
147  ++(more_args->updated_dawgs);
148  // Make active_dawgs and constraints point to the updated ones.
149  ++(more_args->active_dawgs);
150  permute_choices(debug, char_choices, char_choice_index + 1,
151  prev_char_frag_info, word, certainties, limit,
152  best_choice, attempts_left, more_args);
153  // Restore previous state to explore another letter in this position.
154  --(more_args->updated_dawgs);
155  --(more_args->active_dawgs);
156  }
157  } else {
158  if (dawg_debug_level) {
159  tprintf("last unichar not OK at index %d in %s\n",
160  word_index, word->debug_string().string());
161  }
162  }
163 }
164 
165 
176  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
177  WERD_CHOICE *best_choice = new WERD_CHOICE(&getUnicharset());
178  best_choice->make_bad();
179  best_choice->set_rating(rating_limit);
180  if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH)
181  return best_choice;
182  DawgPositionVector *active_dawgs =
183  new DawgPositionVector[char_choices.length() + 1];
184  init_active_dawgs(&(active_dawgs[0]), true);
185  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
187 
188  float certainties[MAX_WERD_LENGTH];
190  int attempts_left = max_permuter_attempts;
191  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : NULL,
192  char_choices, 0, NULL, &word, certainties, &rating_limit, best_choice,
193  &attempts_left, &dawg_args);
194  delete[] active_dawgs;
195  return best_choice;
196 }
197 
205  const char *debug,
206  const BLOB_CHOICE_LIST_VECTOR &char_choices,
207  int char_choice_index,
208  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
209  WERD_CHOICE *word,
210  float certainties[],
211  float *limit,
212  WERD_CHOICE *best_choice,
213  int *attempts_left,
214  void *more_args) {
215  if (debug) {
216  tprintf("%s permute_choices: char_choice_index=%d"
217  " limit=%g rating=%g, certainty=%g word=%s\n",
218  debug, char_choice_index, *limit, word->rating(),
219  word->certainty(), word->debug_string().string());
220  }
221  if (char_choice_index < char_choices.length()) {
222  BLOB_CHOICE_IT blob_choice_it;
223  blob_choice_it.set_to_list(char_choices.get(char_choice_index));
224  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
225  blob_choice_it.forward()) {
226  (*attempts_left)--;
227  append_choices(debug, char_choices, *(blob_choice_it.data()),
228  char_choice_index, prev_char_frag_info, word,
229  certainties, limit, best_choice, attempts_left, more_args);
230  if (*attempts_left <= 0) {
231  if (debug) tprintf("permute_choices(): attempts_left is 0\n");
232  break;
233  }
234  }
235  }
236 }
237 
247  const char *debug,
248  const BLOB_CHOICE_LIST_VECTOR &char_choices,
249  const BLOB_CHOICE &blob_choice,
250  int char_choice_index,
251  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
252  WERD_CHOICE *word,
253  float certainties[],
254  float *limit,
255  WERD_CHOICE *best_choice,
256  int *attempts_left,
257  void *more_args) {
258  int word_ending =
259  (char_choice_index == char_choices.length() - 1) ? true : false;
260 
261  // Deal with fragments.
262  CHAR_FRAGMENT_INFO char_frag_info;
263  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
264  blob_choice.certainty(), prev_char_frag_info, debug,
265  word_ending, &char_frag_info)) {
266  return; // blob_choice must be an invalid fragment
267  }
268  // Search the next letter if this character is a fragment.
269  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
270  permute_choices(debug, char_choices, char_choice_index + 1,
271  &char_frag_info, word, certainties, limit,
272  best_choice, attempts_left, more_args);
273  return;
274  }
275 
276  // Add the next unichar.
277  float old_rating = word->rating();
278  float old_certainty = word->certainty();
279  uinT8 old_permuter = word->permuter();
280  certainties[word->length()] = char_frag_info.certainty;
282  char_frag_info.unichar_id, char_frag_info.num_fragments,
283  char_frag_info.rating, char_frag_info.certainty);
284 
285  // Explore the next unichar.
286  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
287  &char_frag_info, word_ending, word, certainties,
288  limit, best_choice, attempts_left, more_args);
289 
290  // Remove the unichar we added to explore other choices in it's place.
291  word->remove_last_unichar_id();
292  word->set_rating(old_rating);
293  word->set_certainty(old_certainty);
294  word->set_permuter(old_permuter);
295 }
296 
323  float curr_rating, float curr_certainty,
324  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
325  const char *debug, int word_ending,
326  CHAR_FRAGMENT_INFO *char_frag_info) {
327  const CHAR_FRAGMENT *this_fragment =
328  getUnicharset().get_fragment(curr_unichar_id);
329  const CHAR_FRAGMENT *prev_fragment =
330  prev_char_frag_info != NULL ? prev_char_frag_info->fragment : NULL;
331 
332  // Print debug info for fragments.
333  if (debug && (prev_fragment || this_fragment)) {
334  tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
335  getUnicharset().debug_str(curr_unichar_id).string(),
336  word_ending);
337  if (prev_fragment) {
338  tprintf("prev_fragment %s\n", prev_fragment->to_string().string());
339  }
340  if (this_fragment) {
341  tprintf("this_fragment %s\n", this_fragment->to_string().string());
342  }
343  }
344 
345  char_frag_info->unichar_id = curr_unichar_id;
346  char_frag_info->fragment = this_fragment;
347  char_frag_info->rating = curr_rating;
348  char_frag_info->certainty = curr_certainty;
349  char_frag_info->num_fragments = 1;
350  if (prev_fragment && !this_fragment) {
351  if (debug) tprintf("Skip choice with incomplete fragment\n");
352  return false;
353  }
354  if (this_fragment) {
355  // We are dealing with a fragment.
356  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
357  if (prev_fragment) {
358  if (!this_fragment->is_continuation_of(prev_fragment)) {
359  if (debug) tprintf("Non-matching fragment piece\n");
360  return false;
361  }
362  if (this_fragment->is_ending()) {
363  char_frag_info->unichar_id =
364  getUnicharset().unichar_to_id(this_fragment->get_unichar());
365  char_frag_info->fragment = NULL;
366  if (debug) {
367  tprintf("Built character %s from fragments\n",
368  getUnicharset().debug_str(
369  char_frag_info->unichar_id).string());
370  }
371  } else {
372  if (debug) tprintf("Record fragment continuation\n");
373  char_frag_info->fragment = this_fragment;
374  }
375  // Update certainty and rating.
376  char_frag_info->rating =
377  prev_char_frag_info->rating + curr_rating;
378  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
379  char_frag_info->certainty =
380  MIN(curr_certainty, prev_char_frag_info->certainty);
381  } else {
382  if (this_fragment->is_beginning()) {
383  if (debug) tprintf("Record fragment beginning\n");
384  } else {
385  if (debug) {
386  tprintf("Non-starting fragment piece with no prev_fragment\n");
387  }
388  return false;
389  }
390  }
391  }
392  if (word_ending && char_frag_info->fragment) {
393  if (debug) tprintf("Word can not end with a fragment\n");
394  return false;
395  }
396  return true;
397 }
398 
399 } // namespace tesseract
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:446
DawgPositionVector * active_dawgs
Definition: dict.h:81
int size() const
Definition: genericvector.h:72
float rating() const
Definition: ratngs.h:324
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:52
const CHAR_FRAGMENT * fragment
Definition: dict.h:42
int length() const
Definition: genericvector.h:79
int max_permuter_attempts
Definition: dict.h:637
void set_certainty(float new_val)
Definition: ratngs.h:369
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:169
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:92
int length() const
Definition: ratngs.h:300
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:449
#define tprintf(...)
Definition: tprintf.h:31
#define MIN(x, y)
Definition: ndminx.h:28
void set_permuter(uinT8 perm)
Definition: ratngs.h:372
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:523
UNICHAR_ID unichar_id
Definition: dict.h:41
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:204
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:203
#define ASSERT_HOST(x)
Definition: errcode.h:84
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
float rating() const
Definition: ratngs.h:79
void remove_last_unichar_id()
Definition: ratngs.h:480
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:440
float certainty() const
Definition: ratngs.h:327
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
uinT8 permuter() const
Definition: ratngs.h:343
const STRING debug_string() const
Definition: ratngs.h:502
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:246
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:175
int UNICHAR_ID
Definition: unichar.h:33
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
bool is_ending() const
Definition: unicharset.h:102
int dawg_debug_level
Definition: dict.h:595
static STRING to_string(const char *unichar, int pos, int total, bool natural)
const char * get_unichar() const
Definition: unicharset.h:64
int num_fragments
Definition: dict.h:43
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:347
#define MAX_WERD_LENGTH
Definition: dict.h:36
char * output_ambig_words_file
Definition: dict.h:593
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:322
float rating
Definition: dict.h:44
DawgPositionVector * updated_dawgs
Definition: dict.h:82
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
float certainty
Definition: dict.h:45
float certainty() const
Definition: ratngs.h:82
T & get(int index) const
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
void set_rating(float new_val)
Definition: ratngs.h:366
PermuterType permuter
Definition: dict.h:83
bool is_beginning() const
Definition: unicharset.h:99
unsigned char uinT8
Definition: host.h:99