tesseract v5.3.3.20231005
permdawg.cpp
Go to the documentation of this file.
1/******************************************************************************
2 *
3 * File: permdawg.cpp (Formerly permdawg.c)
4 * Description: Scale word choices by a dictionary
5 * Author: Mark Seaman, OCR Technology
6 *
7 * (c) Copyright 1987, Hewlett-Packard Company.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 *****************************************************************************/
19/*----------------------------------------------------------------------
20 I n c l u d e s
21----------------------------------------------------------------------*/
22
23#include "dawg.h"
24#include "params.h"
25#include "stopper.h"
26#include "tprintf.h"
27
28#include <algorithm>
29#include <cctype>
30#include "dict.h"
31
32/*----------------------------------------------------------------------
33 F u n c t i o n s
34----------------------------------------------------------------------*/
35namespace tesseract {
36
43void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
44 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
45 bool word_ending, WERD_CHOICE *word, float certainties[],
46 float *limit, WERD_CHOICE *best_choice, int *attempts_left,
47 void *void_more_args) {
48 auto *more_args = static_cast<DawgArgs *>(void_more_args);
49 word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
50 int word_index = word->length() - 1;
51 if (best_choice->rating() < *limit) {
52 return;
53 }
54 // Look up char in DAWG
55
56 // If the current unichar is an ngram first try calling
57 // letter_is_okay() for each unigram it contains separately.
58 UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
59 bool checked_unigrams = false;
60 if (getUnicharset().get_isngram(orig_uch_id)) {
61 if (dawg_debug_level) {
62 tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str());
63 }
64 int num_unigrams = 0;
66 std::vector<UNICHAR_ID> encoding;
67 const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
68 // Since the string came out of the unicharset, failure is impossible.
69 ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr));
70 bool unigrams_ok = true;
71 // Construct DawgArgs that reflect the current state.
72 DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
73 DawgPositionVector unigram_updated_dawgs;
74 DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);
75 // Check unigrams in the ngram with letter_is_okay().
76 for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) {
77 UNICHAR_ID uch_id = encoding[i];
78 ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
79 ++num_unigrams;
80 word->append_unichar_id(uch_id, 1, 0.0, 0.0);
81 unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(),
82 word->unichar_id(word_index + num_unigrams - 1),
83 word_ending && i == encoding.size() - 1);
84 (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
85 if (dawg_debug_level) {
86 tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(),
87 unigrams_ok ? "OK" : "not OK");
88 }
89 }
90 // Restore the word and copy the updated dawg state if needed.
91 while (num_unigrams-- > 0) {
93 }
94 word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
95 if (unigrams_ok) {
96 checked_unigrams = true;
97 more_args->permuter = unigram_dawg_args.permuter;
98 *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
99 }
100 }
101
102 // Check which dawgs from the dawgs_ vector contain the word
103 // up to and including the current unichar.
104 if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(),
105 word->unichar_id(word_index), word_ending)) {
106 // Add a new word choice
107 if (word_ending) {
108 if (dawg_debug_level) {
109 tprintf("found word = %s\n", word->debug_string().c_str());
110 }
111 if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
112 if (output_ambig_words_file_ == nullptr) {
113 output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+");
114 if (output_ambig_words_file_ == nullptr) {
115 tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str());
116 exit(1);
117 }
118 std::string word_str;
119 word->string_and_lengths(&word_str, nullptr);
120 word_str += " ";
121 fprintf(output_ambig_words_file_, "%s", word_str.c_str());
122 }
123 std::string word_str;
124 word->string_and_lengths(&word_str, nullptr);
125 word_str += " ";
126 fprintf(output_ambig_words_file_, "%s", word_str.c_str());
127 }
128 WERD_CHOICE *adjusted_word = word;
129 adjusted_word->set_permuter(more_args->permuter);
130 update_best_choice(*adjusted_word, best_choice);
131 } else { // search the next letter
132 // Make updated_* point to the next entries in the DawgPositionVector
133 // arrays (that were originally created in dawg_permute_and_select)
134 ++(more_args->updated_dawgs);
135 // Make active_dawgs and constraints point to the updated ones.
136 ++(more_args->active_dawgs);
137 permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word,
138 certainties, limit, best_choice, attempts_left, more_args);
139 // Restore previous state to explore another letter in this position.
140 --(more_args->updated_dawgs);
141 --(more_args->active_dawgs);
142 }
143 } else {
144 if (dawg_debug_level) {
145 tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str());
146 }
147 }
148}
149
160 float rating_limit) {
161 auto *best_choice = new WERD_CHOICE(&getUnicharset());
162 best_choice->make_bad();
163 best_choice->set_rating(rating_limit);
164 if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) {
165 return best_choice;
166 }
167 auto *active_dawgs = new DawgPositionVector[char_choices.size() + 1];
168 init_active_dawgs(&(active_dawgs[0]), true);
169 DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
171
172 float certainties[MAX_WERD_LENGTH];
174 int attempts_left = max_permuter_attempts;
175 permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr,
176 &word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args);
177 delete[] active_dawgs;
178 return best_choice;
179}
180
187void Dict::permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
188 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
189 WERD_CHOICE *word, float certainties[], float *limit,
190 WERD_CHOICE *best_choice, int *attempts_left, void *more_args) {
191 if (debug) {
192 tprintf(
193 "%s permute_choices: char_choice_index=%d"
194 " limit=%g rating=%g, certainty=%g word=%s\n",
195 debug, char_choice_index, *limit, word->rating(), word->certainty(),
196 word->debug_string().c_str());
197 }
198 if (static_cast<unsigned>(char_choice_index) < char_choices.size()) {
199 BLOB_CHOICE_IT blob_choice_it;
200 blob_choice_it.set_to_list(char_choices.at(char_choice_index));
201 for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list(); blob_choice_it.forward()) {
202 (*attempts_left)--;
203 append_choices(debug, char_choices, *(blob_choice_it.data()), char_choice_index,
204 prev_char_frag_info, word, certainties, limit, best_choice, attempts_left,
205 more_args);
206 if (*attempts_left <= 0) {
207 if (debug) {
208 tprintf("permute_choices(): attempts_left is 0\n");
209 }
210 break;
211 }
212 }
213 }
214}
215
224void Dict::append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
225 const BLOB_CHOICE &blob_choice, int char_choice_index,
226 const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
227 float certainties[], float *limit, WERD_CHOICE *best_choice,
228 int *attempts_left, void *more_args) {
229 auto word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
230
231 // Deal with fragments.
232 CHAR_FRAGMENT_INFO char_frag_info;
233 if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(),
234 prev_char_frag_info, debug, word_ending, &char_frag_info)) {
235 return; // blob_choice must be an invalid fragment
236 }
237 // Search the next letter if this character is a fragment.
238 if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
239 permute_choices(debug, char_choices, char_choice_index + 1, &char_frag_info, word, certainties,
240 limit, best_choice, attempts_left, more_args);
241 return;
242 }
243
244 // Add the next unichar.
245 float old_rating = word->rating();
246 float old_certainty = word->certainty();
247 uint8_t old_permuter = word->permuter();
248 certainties[word->length()] = char_frag_info.certainty;
249 word->append_unichar_id_space_allocated(char_frag_info.unichar_id, char_frag_info.num_fragments,
250 char_frag_info.rating, char_frag_info.certainty);
251
252 // Explore the next unichar.
253 (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &char_frag_info, word_ending,
254 word, certainties, limit, best_choice, attempts_left, more_args);
255
256 // Remove the unichar we added to explore other choices in it's place.
258 word->set_rating(old_rating);
259 word->set_certainty(old_certainty);
260 word->set_permuter(old_permuter);
261}
262
288bool Dict::fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
289 const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,
290 int word_ending, CHAR_FRAGMENT_INFO *char_frag_info) {
291 const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id);
292 const CHAR_FRAGMENT *prev_fragment =
293 prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
294
295 // Print debug info for fragments.
296 if (debug && (prev_fragment || this_fragment)) {
297 tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
298 getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending);
299 if (prev_fragment) {
300 tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
301 }
302 if (this_fragment) {
303 tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
304 }
305 }
306
307 char_frag_info->unichar_id = curr_unichar_id;
308 char_frag_info->fragment = this_fragment;
309 char_frag_info->rating = curr_rating;
310 char_frag_info->certainty = curr_certainty;
311 char_frag_info->num_fragments = 1;
312 if (prev_fragment && !this_fragment) {
313 if (debug) {
314 tprintf("Skip choice with incomplete fragment\n");
315 }
316 return false;
317 }
318 if (this_fragment) {
319 // We are dealing with a fragment.
320 char_frag_info->unichar_id = INVALID_UNICHAR_ID;
321 if (prev_fragment) {
322 if (!this_fragment->is_continuation_of(prev_fragment)) {
323 if (debug) {
324 tprintf("Non-matching fragment piece\n");
325 }
326 return false;
327 }
328 if (this_fragment->is_ending()) {
329 char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar());
330 char_frag_info->fragment = nullptr;
331 if (debug) {
332 tprintf("Built character %s from fragments\n",
333 getUnicharset().debug_str(char_frag_info->unichar_id).c_str());
334 }
335 } else {
336 if (debug) {
337 tprintf("Record fragment continuation\n");
338 }
339 char_frag_info->fragment = this_fragment;
340 }
341 // Update certainty and rating.
342 char_frag_info->rating = prev_char_frag_info->rating + curr_rating;
343 char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
344 char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty);
345 } else {
346 if (this_fragment->is_beginning()) {
347 if (debug) {
348 tprintf("Record fragment beginning\n");
349 }
350 } else {
351 if (debug) {
352 tprintf("Non-starting fragment piece with no prev_fragment\n");
353 }
354 return false;
355 }
356 }
357 }
358 if (word_ending && char_frag_info->fragment) {
359 if (debug) {
360 tprintf("Word cannot end with a fragment\n");
361 }
362 return false;
363 }
364 return true;
365}
366
367} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:54
#define MAX_WERD_LENGTH
Definition: dict.h:45
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int UNICHAR_ID
Definition: unichar.h:34
@ NO_PERM
Definition: ratngs.h:236
std::vector< BLOB_CHOICE_LIST * > BLOB_CHOICE_LIST_VECTOR
Definition: ratngs.h:627
float certainty() const
Definition: ratngs.h:87
UNICHAR_ID unichar_id() const
Definition: ratngs.h:81
float rating() const
Definition: ratngs.h:84
std::string debug_string() const
Definition: ratngs.h:479
float certainty() const
Definition: ratngs.h:315
void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const
Definition: ratngs.cpp:427
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:299
uint8_t permuter() const
Definition: ratngs.h:331
void set_certainty(float new_val)
Definition: ratngs.h:357
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:428
void set_permuter(uint8_t perm)
Definition: ratngs.h:360
const UNICHARSET * unicharset() const
Definition: ratngs.h:281
unsigned length() const
Definition: ratngs.h:287
void remove_last_unichar_id()
Definition: ratngs.h:455
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:447
float rating() const
Definition: ratngs.h:312
void set_rating(float new_val)
Definition: ratngs.h:354
bool is_ending() const
Definition: unicharset.h:121
static std::string to_string(const char *unichar, int pos, int total, bool natural)
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:109
const char * get_unichar() const
Definition: unicharset.h:76
bool is_beginning() const
Definition: unicharset.h:116
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:768
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool get_isngram(UNICHAR_ID unichar_id) const
Definition: unicharset.h:542
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
const CHAR_FRAGMENT * fragment
Definition: dict.h:51
DawgPositionVector * updated_dawgs
Definition: dict.h:88
DawgPositionVector * active_dawgs
Definition: dict.h:87
PermuterType permuter
Definition: dict.h:89
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:187
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:224
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:345
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:210
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:182
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:159
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:43
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:610
const UNICHARSET & getUnicharset() const
Definition: dict.h:104
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:288