tesseract v5.3.3.20231005
control.cpp
Go to the documentation of this file.
1/******************************************************************
2 * File: control.cpp (Formerly control.c)
3 * Description: Module-independent matcher controller.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19// Include automatically generated configuration file if running autoconf.
20#ifdef HAVE_CONFIG_H
21# include "config_auto.h"
22#endif
23
24#include <cctype>
25#include <cmath>
26#include <cstdint> // for int16_t, int32_t
27#include <cstdio> // for fclose, fopen, FILE
28#include <ctime> // for clock
29#include "control.h"
30#ifndef DISABLED_LEGACY_ENGINE
31# include "docqual.h"
32# include "drawfx.h"
33# include "fixspace.h"
34#endif
35#include <tesseract/ocrclass.h>
36#include "lstmrecognizer.h"
37#include "output.h"
38#include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO...
39#ifndef DISABLED_LEGACY_ENGINE
40# include "reject.h"
41#endif
42#include "sorthelper.h"
43#include "tesseractclass.h"
44#include "tessvars.h"
45#include "werdit.h"
46
// Name of the fixed temporary file that Tesseract::ProcessTargetWord writes
// the current parameter values to before a word-specific config is applied.
47const char *const kBackUpConfigFile = "tempconfigdata.config";
48#ifndef DISABLED_LEGACY_ENGINE
49// Min believable x-height for any text when refitting as a fraction of
50// original x-height
51const double kMinRefitXHeightFraction = 0.5;
52#endif // ! DISABLED_LEGACY_ENGINE
53
60namespace tesseract {
61
// Creates a pseudo-word iterator for the area given by selection_box (via
// make_pseudo_word) and, if one was created, frees it again before returning.
// NOTE(review): the embedded listing numbers jump from 64 to 67 inside the
// if-body, so the statements that actually use `it` appear to be missing
// from this copy of the file - restore them from the original source before
// relying on this function.
62void Tesseract::recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box) {
63 PAGE_RES_IT *it = make_pseudo_word(page_res, selection_box);
64 if (it != nullptr) {
// The iterator is owned by this function and must be released here.
67 delete it;
68 }
69}
70
// NOTE(review): the signature line of this function is not present in this
// listing (the embedded numbers jump from 70 to 77); only the body is
// visible. The body recognizes the single word that pr_it points at and
// always returns true.
77 WordData word_data(*pr_it);
// The word is always set up as for pass 2, regardless of which pass number
// is handed to the classifier below.
78 SetupWordPassN(2, &word_data);
79 // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
80 if (lstm_recognizer_ == nullptr) {
81#ifndef DISABLED_LEGACY_ENGINE
82 classify_word_and_language(2, pr_it, &word_data);
83#endif // ndef DISABLED_LEGACY_ENGINE
84 } else {
85 classify_word_and_language(1, pr_it, &word_data);
86 }
87#ifndef DISABLED_LEGACY_ENGINE
// Optional debug dump of the per-word quality metrics.
88 if (tessedit_debug_quality_metrics) {
89 int16_t char_qual;
90 int16_t good_char_qual;
91 WERD_RES *word_res = pr_it->word();
92 word_char_quality(word_res, &char_qual, &good_char_qual);
93 tprintf(
94 "\n%d chars; word_blob_quality: %d; outline_errs: %d; "
95 "char_quality: %d; good_char_quality: %d\n",
96 word_res->reject_map.length(), word_blob_quality(word_res), word_outline_errs(word_res),
97 char_qual, good_char_qual);
98 }
99#endif // ndef DISABLED_LEGACY_ENGINE
100 return true;
101}
102
103// Helper function to check for a target word and handle it appropriately.
104// Inspired by Jetsoft's requirement to process only single words on pass2
105// and beyond.
106// If word_config is not null:
107// If the word_box and target_word_box overlap, read the word_config file
108// else reset to previous config data.
109// return true.
110// else
111// If the word_box and target_word_box overlap or pass <= 1, return true.
112// Note that this function uses a fixed temporary file for storing the previous
113// configs, so it is neither thread-safe, nor process-safe, but the assumption
114// is that it will only be used for one debug window at a time.
115//
116// Since this function is used for debugging (and not to change OCR results)
117// set only debug params from the word config file.
//
// NOTE(review): the embedded listing numbers skip 131 and 135, so the
// statements that read word_config and that restore the backed-up params
// are not visible in this copy; the comments below describe only what is
// shown.
118bool Tesseract::ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box,
119 const char *word_config, int pass) {
120 if (word_config != nullptr) {
121 if (word_box.major_overlap(target_word_box)) {
// backup_config_file_ doubles as the "backup already taken" flag: the
// current params are dumped only on the first overlapping word.
122 if (backup_config_file_ == nullptr) {
123 backup_config_file_ = kBackUpConfigFile;
124 FILE *config_fp = fopen(backup_config_file_, "wb");
125 if (config_fp == nullptr) {
// The failure is only reported; backup_config_file_ stays set.
126 tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
127 } else {
128 ParamUtils::PrintParams(config_fp, params());
129 fclose(config_fp);
130 }
132 }
133 } else {
// No overlap: forget the backup so the next overlapping word takes a new one.
134 if (backup_config_file_ != nullptr) {
136 backup_config_file_ = nullptr;
137 }
138 }
139 } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
// Without a word config, non-target words are rejected on passes after the
// first.
140 return false;
141 }
142 return true;
143}
144
146void Tesseract::SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config,
147 PAGE_RES *page_res, std::vector<WordData> *words) {
148 // Prepare all the words.
149 PAGE_RES_IT page_res_it(page_res);
150 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
151 if (target_word_box == nullptr || ProcessTargetWord(page_res_it.word()->word->bounding_box(),
152 *target_word_box, word_config, 1)) {
153 words->push_back(WordData(page_res_it));
154 }
155 }
156 // Setup all the words for recognition with polygonal approximation.
157 for (unsigned w = 0; w < words->size(); ++w) {
158 SetupWordPassN(pass_n, &(*words)[w]);
159 if (w > 0) {
160 (*words)[w].prev_word = &(*words)[w - 1];
161 }
162 }
163}
164
165// Sets up the single word ready for whichever engine is to be run.
166void Tesseract::SetupWordPassN(int pass_n, WordData *word) {
167 if (pass_n == 1 || !word->word->done) {
168 if (pass_n == 1) {
169 word->word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode,
170 nullptr, classify_bln_numeric_mode, textord_use_cjk_fp_model,
171 poly_allow_detailed_fx, word->row, word->block);
172 } else if (pass_n == 2) {
173 // TODO(rays) Should we do this on pass1 too?
174 word->word->caps_height = 0.0;
175 if (word->word->x_height == 0.0f) {
176 word->word->x_height = word->row->x_height();
177 }
178 }
179 word->lang_words.truncate(0);
180 for (unsigned s = 0; s <= sub_langs_.size(); ++s) {
181 // The sub_langs_.size() entry is for the master language.
182 Tesseract *lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
183 auto *word_res = new WERD_RES;
184 word_res->InitForRetryRecognition(*word->word);
185 word->lang_words.push_back(word_res);
186 // LSTM doesn't get setup for pass2.
187 if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
188 word_res->SetupForRecognition(
189 lang_t->unicharset, lang_t, BestPix(), lang_t->tessedit_ocr_engine_mode, nullptr,
190 lang_t->classify_bln_numeric_mode, lang_t->textord_use_cjk_fp_model,
191 lang_t->poly_allow_detailed_fx, word->row, word->block);
192 }
193 }
194 }
195}
196
197// Runs word recognition on all the words.
198bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it,
199 std::vector<WordData> *words) {
200 // TODO(rays) Before this loop can be parallelized (it would yield a massive
201 // speed-up) all remaining member globals need to be converted to local/heap
202 // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
203 // added. The results will be significantly different with adaption on, and
204 // deterioration will need investigation.
205 pr_it->restart_page();
206 for (unsigned w = 0; w < words->size(); ++w) {
207 WordData *word = &(*words)[w];
208 if (w > 0) {
209 word->prev_word = &(*words)[w - 1];
210 }
211 if (monitor != nullptr) {
212 monitor->ocr_alive = true;
213 if (pass_n == 1) {
214 monitor->progress = 70 * w / words->size();
215 } else {
216 monitor->progress = 70 + 30 * w / words->size();
217 }
218 if (monitor->progress_callback2 != nullptr) {
219 TBOX box = pr_it->word()->word->bounding_box();
220 (*monitor->progress_callback2)(monitor, box.left(), box.right(), box.top(), box.bottom());
221 }
222 if (monitor->deadline_exceeded() ||
223 (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this, words->size()))) {
224 // Timeout. Fake out the rest of the words.
225 for (; w < words->size(); ++w) {
226 (*words)[w].word->SetupFake(unicharset);
227 }
228 return false;
229 }
230 }
231 if (word->word->tess_failed) {
232 unsigned s;
233 for (s = 0; s < word->lang_words.size() && word->lang_words[s]->tess_failed; ++s) {
234 }
235 // If all are failed, skip it. Image words are skipped by this test.
236 if (s > word->lang_words.size()) {
237 continue;
238 }
239 }
240 // Sync pr_it with the WordData.
241 while (pr_it->word() != nullptr && pr_it->word() != word->word) {
242 pr_it->forward();
243 }
244 ASSERT_HOST(pr_it->word() != nullptr);
245 bool make_next_word_fuzzy = false;
246#ifndef DISABLED_LEGACY_ENGINE
247 if (!AnyLSTMLang() && ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
248 // Needs to be setup again to see the new outlines in the chopped_word.
249 SetupWordPassN(pass_n, word);
250 }
251#endif // ndef DISABLED_LEGACY_ENGINE
252
253 classify_word_and_language(pass_n, pr_it, word);
254 if (tessedit_dump_choices || debug_noise_removal) {
255 tprintf("Pass%d: %s [%s]\n", pass_n, word->word->best_choice->unichar_string().c_str(),
256 word->word->best_choice->debug_string().c_str());
257 }
258 pr_it->forward();
259 if (make_next_word_fuzzy && pr_it->word() != nullptr) {
260 pr_it->MakeCurrentWordFuzzy();
261 }
262 }
263 return true;
264}
265
// NOTE(review): the first line of this function's signature is not present
// in this listing, and the embedded numbers also skip 306-307, 309 and 400,
// so some statements inside the body are missing from this copy. Comments
// below describe only what is visible.
//
// Visible behavior: runs pass 1 when dopasses is 0 or 1 (returning right
// after it when dopasses == 1), then the later passes, then removes empty
// words and sets the monitor progress to 100. Returns false as soon as
// RecogAllWordsPassN returns false, true otherwise.
288 const TBOX *target_word_box, const char *word_config,
289 int dopasses) {
290 PAGE_RES_IT page_res_it(page_res);
291
292 if (tessedit_minimal_rej_pass1) {
293 tessedit_test_adaption.set_value(true);
294 tessedit_minimal_rejection.set_value(true);
295 }
296
297 if (dopasses == 0 || dopasses == 1) {
298 page_res_it.restart_page();
299 // ****************** Pass 1 *******************
300
301#ifndef DISABLED_LEGACY_ENGINE
302 // If the adaptive classifier is full switch to one we prepared earlier,
303 // ie on the previous page. If the current adaptive classifier is non-empty,
304 // prepare a backup starting at this page, in case it fills up. Do all this
305 // independently for each language.
// NOTE(review): the opening `if` of this chain and both branch bodies are
// among the missing listing lines; presumably they mirror the sub-language
// loop below - confirm against the original source.
308 } else if (!AdaptiveClassifierIsEmpty()) {
310 }
311 // Now check the sub-langs as well.
312 for (auto &lang : sub_langs_) {
313 if (lang->AdaptiveClassifierIsFull()) {
314 lang->SwitchAdaptiveClassifier();
315 } else if (!lang->AdaptiveClassifierIsEmpty()) {
316 lang->StartBackupAdaptiveClassifier();
317 }
318 }
319
320#endif // ndef DISABLED_LEGACY_ENGINE
321
322 // Set up all words ready for recognition, so that if parallelism is on
323 // all the input and output classes are ready to run the classifier.
324 std::vector<WordData> words;
325 SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
326#ifndef DISABLED_LEGACY_ENGINE
327 if (tessedit_parallelize) {
328 PrerecAllWordsPar(words);
329 }
330#endif // ndef DISABLED_LEGACY_ENGINE
331
// Reset the per-page statistics; word_count is the number of words selected
// for pass 1.
332 stats_.word_count = words.size();
333
334 stats_.dict_words = 0;
335 stats_.doc_blob_quality = 0;
336 stats_.doc_outline_errs = 0;
337 stats_.doc_char_quality = 0;
338 stats_.good_char_count = 0;
339 stats_.doc_good_char_quality = 0;
340
341 most_recently_used_ = this;
342 // Run pass 1 word recognition.
343 if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) {
344 return false;
345 }
346 // Pass 1 post-processing.
347 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
348 if (page_res_it.word()->word->flag(W_REP_CHAR)) {
349 fix_rep_char(&page_res_it);
350 continue;
351 }
352
353 // Count dict words.
// NOTE(review): best_choice is dereferenced without a null check here,
// while the empty-word cleanup at the end of this function does test it
// for nullptr - confirm it is always set after pass 1.
354 if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) {
355 ++(stats_.dict_words);
356 }
357
358 // Update misadaption log (we only need to do it on pass 1, since
359 // adaption only happens on this pass).
360 if (page_res_it.word()->blamer_bundle != nullptr &&
361 page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
362 page_res->misadaption_log.push_back(page_res_it.word()->blamer_bundle->misadaption_debug());
363 }
364 }
365 }
366
// A request for pass 1 only stops here, before the later passes and the
// empty-word cleanup.
367 if (dopasses == 1) {
368 return true;
369 }
370
371#ifndef DISABLED_LEGACY_ENGINE
372
373 // ****************** Pass 2 *******************
374 if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption && AnyTessLang()) {
375 page_res_it.restart_page();
// A fresh word list is built for pass 2; the pass 1 list is out of scope.
376 std::vector<WordData> words;
377 SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
378 if (tessedit_parallelize) {
379 PrerecAllWordsPar(words);
380 }
381 most_recently_used_ = this;
382 // Run pass 2 word recognition.
383 if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) {
384 return false;
385 }
386 }
387
388 // The next passes are only required for Tess-only.
389 if (AnyTessLang() && !AnyLSTMLang()) {
390 // ****************** Pass 3 *******************
391 // Fix fuzzy spaces.
392
393 if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word &&
394 !right_to_left()) {
395 fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
396 }
397
398 // ****************** Pass 4 *******************
// NOTE(review): the body of this if is one of the missing listing lines.
399 if (tessedit_enable_dict_correction) {
401 }
402 if (tessedit_enable_bigram_correction) {
403 bigram_correction_pass(page_res);
404 }
405
406 // ****************** Pass 5,6 *******************
407 rejection_passes(page_res, monitor, target_word_box, word_config);
408
409 // ****************** Pass 8 *******************
410 font_recognition_pass(page_res);
411
412 // ****************** Pass 9 *******************
413 // Check the correctness of the final results.
414 blamer_pass(page_res);
415 script_pos_pass(page_res);
416 }
417
418#endif // ndef DISABLED_LEGACY_ENGINE
419
420 // Write results pass.
421 // This is now redundant, but retained commented to show how to obtain
422 // bounding boxes and style information.
423
424#ifndef DISABLED_LEGACY_ENGINE
425 // changed by jetsoft
426 // needed for dll to output memory structure
427 if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) {
428 output_pass(page_res_it, target_word_box);
429 }
430// end jetsoft
431#endif // ndef DISABLED_LEGACY_ENGINE
432
433 const auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
434 textord_.CleanupSingleRowResult(pageseg_mode, page_res);
435
436 // Remove empty words, as these mess up the result iterators.
// A word is dropped when it has no best choice, an empty best choice, or an
// all-space best choice in a block that has no poly block or whose poly
// block is text.
437 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
438 const WERD_RES *word = page_res_it.word();
439 const POLY_BLOCK *pb = page_res_it.block()->block != nullptr
440 ? page_res_it.block()->block->pdblk.poly_block()
441 : nullptr;
442 if (word->best_choice == nullptr || word->best_choice->empty() ||
443 (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
444 page_res_it.DeleteCurrentWord();
445 }
446 }
447
448 if (monitor != nullptr) {
449 monitor->progress = 100;
450 }
451 return true;
452}
453
454#ifndef DISABLED_LEGACY_ENGINE
455
// NOTE(review): the signature line of this function is not present in this
// listing (the embedded numbers jump from 455 to 457); only the body is
// visible.
//
// Visible behavior: walks consecutive word pairs (skipping parts of combos),
// and when the top choices of a pair are not a valid bigram, searches all
// combinations of the two words' alternative choices for valid bigrams and
// replaces the best choices with the lowest combined-rating pair.
457 PAGE_RES_IT word_it(page_res);
458
459 WERD_RES *w_prev = nullptr;
460 WERD_RES *w = word_it.word();
461 while (true) {
462 w_prev = w;
463 while (word_it.forward() != nullptr && (!word_it.word() || word_it.word()->part_of_combo)) {
464 // advance word_it, skipping over parts of combos
465 }
466 if (!word_it.word()) {
467 break;
468 }
469 w = word_it.word();
// Only pairs that share the same unicharset are compared.
470 if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
471 continue;
472 }
473 if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
474 if (tessedit_bigram_debug) {
475 tprintf("Skipping because one of the words is W_REP_CHAR\n");
476 }
477 continue;
478 }
479 // Two words sharing the same language model, excellent!
// Candidate replacements are stored pairwise: overrides_word1[i] goes with
// overrides_word2[i].
480 std::vector<WERD_CHOICE *> overrides_word1;
481 std::vector<WERD_CHOICE *> overrides_word2;
482
483 const auto orig_w1_str = w_prev->best_choice->unichar_string();
484 const auto orig_w2_str = w->best_choice->unichar_string();
// The bigram check is made on copies restricted to the span reported by
// GetNonSuperscriptSpan.
485 WERD_CHOICE prev_best(w->uch_set);
486 {
487 int w1start, w1end;
488 w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
489 prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
490 }
491 WERD_CHOICE this_best(w->uch_set);
492 {
493 int w2start, w2end;
494 w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
495 this_best = w->best_choice->shallow_copy(w2start, w2end);
496 }
497
498 if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
499 if (tessedit_bigram_debug) {
500 tprintf("Top choice \"%s %s\" verified by bigram model.\n", orig_w1_str.c_str(),
501 orig_w2_str.c_str());
502 }
503 continue;
504 }
505 if (tessedit_bigram_debug > 2) {
506 tprintf("Examining alt choices for \"%s %s\".\n", orig_w1_str.c_str(), orig_w2_str.c_str());
507 }
508 if (tessedit_bigram_debug > 1) {
509 if (!w_prev->best_choices.singleton()) {
510 w_prev->PrintBestChoices();
511 }
512 if (!w->best_choices.singleton()) {
513 w->PrintBestChoices();
514 }
515 }
// best_idx indexes the pair with the lowest summed rating found so far;
// best_rating is only meaningful once at least one pair has been stored.
516 float best_rating = 0.0;
517 int best_idx = 0;
518 WERD_CHOICE_IT prev_it(&w_prev->best_choices);
519 for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
520 WERD_CHOICE *p1 = prev_it.data();
521 WERD_CHOICE strip1(w->uch_set);
522 {
523 int p1start, p1end;
524 p1->GetNonSuperscriptSpan(&p1start, &p1end);
525 strip1 = p1->shallow_copy(p1start, p1end);
526 }
527 WERD_CHOICE_IT w_it(&w->best_choices);
528 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
529 WERD_CHOICE *p2 = w_it.data();
530 WERD_CHOICE strip2(w->uch_set);
531 {
532 int p2start, p2end;
533 p2->GetNonSuperscriptSpan(&p2start, &p2end);
534 strip2 = p2->shallow_copy(p2start, p2end);
535 }
536 if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
537 overrides_word1.push_back(p1);
538 overrides_word2.push_back(p2);
// The first stored pair always becomes the current best.
539 if (overrides_word1.size() == 1 || p1->rating() + p2->rating() < best_rating) {
540 best_rating = p1->rating() + p2->rating();
541 best_idx = overrides_word1.size() - 1;
542 }
543 }
544 }
545 }
546 if (!overrides_word1.empty()) {
547 // Excellent, we have some bigram matches.
548 if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, *overrides_word1[best_idx]) &&
549 EqualIgnoringCaseAndTerminalPunct(*w->best_choice, *overrides_word2[best_idx])) {
550 if (tessedit_bigram_debug > 1) {
551 tprintf(
552 "Top choice \"%s %s\" verified (sans case) by bigram "
553 "model.\n",
554 orig_w1_str.c_str(), orig_w2_str.c_str());
555 }
556 continue;
557 }
558 const auto new_w1_str = overrides_word1[best_idx]->unichar_string();
559 const auto new_w2_str = overrides_word2[best_idx]->unichar_string();
// Each word is only replaced if its text actually changes.
560 if (new_w1_str != orig_w1_str) {
561 w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
562 }
563 if (new_w2_str != orig_w2_str) {
564 w->ReplaceBestChoice(overrides_word2[best_idx]);
565 }
566 if (tessedit_bigram_debug > 0) {
567 std::string choices_description;
// NOTE(review): both vectors always have the same length (they are pushed
// in pairs above), so this product is the square of the number of pairs;
// presumably the pair count itself was intended - confirm. Only the debug
// message is affected.
568 int num_bigram_choices = overrides_word1.size() * overrides_word2.size();
569 if (num_bigram_choices == 1) {
570 choices_description = "This was the unique bigram choice.";
571 } else {
572 if (tessedit_bigram_debug > 1) {
573 std::string bigrams_list;
574 const int kMaxChoicesToPrint = 20;
// NOTE(review): `i < kMaxChoicesToPrint` compares unsigned with int.
575 for (unsigned i = 0; i < overrides_word1.size() && i < kMaxChoicesToPrint; i++) {
576 if (i > 0) {
577 bigrams_list += ", ";
578 }
579 WERD_CHOICE *p1 = overrides_word1[i];
580 WERD_CHOICE *p2 = overrides_word2[i];
581 bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
582 }
583 choices_description = "There were many choices: {";
584 choices_description += bigrams_list;
585 choices_description += "}";
586 } else {
587 choices_description += "There were " + std::to_string(num_bigram_choices);
588 choices_description += " compatible bigrams.";
589 }
590 }
591 tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", orig_w1_str.c_str(),
592 orig_w2_str.c_str(), new_w1_str.c_str(), new_w2_str.c_str(),
593 choices_description.c_str());
594 }
595 }
596 }
597}
598
// NOTE(review): the first line of this function's signature is not present
// in this listing, and the embedded numbers also skip 648 and 663, so one
// statement and one tprintf argument are missing from this copy. Comments
// below describe only what is visible.
//
// Visible behavior: pass 5 accumulates per-word quality statistics into
// stats_ (skipped entirely when tessedit_test_adaption is set), then the
// document is classified as good or bad quality and pass 6 runs
// quality_based_rejection.
600 const TBOX *target_word_box, const char *word_config) {
601 PAGE_RES_IT page_res_it(page_res);
602 // ****************** Pass 5 *******************
603 // Gather statistics on rejects.
604 int word_index = 0;
605 while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
606 WERD_RES *word = page_res_it.word();
607 word_index++;
608 if (monitor != nullptr) {
609 monitor->ocr_alive = true;
// NOTE(review): integer division by stats_.word_count; assumes it is
// non-zero whenever the page has words - verify for callers that skip
// pass 1, where it is assigned.
610 monitor->progress = 95 + 5 * word_index / stats_.word_count;
611 }
612 if (word->rebuild_word == nullptr) {
613 // Word was not processed by tesseract.
614 page_res_it.forward();
615 continue;
616 }
617 check_debug_pt(word, 70);
618
619 // changed by jetsoft
620 // specific to its needs to extract one word when need
621 if (target_word_box &&
622 !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) {
623 page_res_it.forward();
624 continue;
625 }
626 // end jetsoft
627
628 page_res_it.rej_stat_word();
629 const int chars_in_word = word->reject_map.length();
630 const int rejects_in_word = word->reject_map.reject_count();
631
632 const int blob_quality = word_blob_quality(word);
633 stats_.doc_blob_quality += blob_quality;
634 const int outline_errs = word_outline_errs(word);
635 stats_.doc_outline_errs += outline_errs;
636 int16_t all_char_quality;
637 int16_t accepted_all_char_quality;
638 word_char_quality(word, &all_char_quality, &accepted_all_char_quality);
639 stats_.doc_char_quality += all_char_quality;
// Only words whose best choice came from one of the three dawg permuters
// below contribute to the "good char" statistics.
640 const uint8_t permuter_type = word->best_choice->permuter();
641 if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) ||
642 (permuter_type == USER_DAWG_PERM)) {
643 stats_.good_char_count += chars_in_word - rejects_in_word;
644 stats_.doc_good_char_quality += accepted_all_char_quality;
645 }
646 check_debug_pt(word, 80);
// NOTE(review): the body of this if is the missing listing line 648.
647 if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) {
649 }
650 check_debug_pt(word, 90);
651 page_res_it.forward();
652 }
653
// NOTE(review): the format string below has eleven conversions but only ten
// arguments are visible; the argument for "good_ch_qual= %d" is presumably
// on the missing listing line 663.
654 if (tessedit_debug_quality_metrics) {
655 tprintf(
656 "QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
657 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
658 page_res->char_count, page_res->rej_count,
659 page_res->rej_count / static_cast<float>(page_res->char_count), stats_.doc_blob_quality,
660 stats_.doc_blob_quality / static_cast<float>(page_res->char_count), stats_.doc_outline_errs,
661 stats_.doc_outline_errs / static_cast<float>(page_res->char_count), stats_.doc_char_quality,
662 stats_.doc_char_quality / static_cast<float>(page_res->char_count),
664 (stats_.good_char_count > 0)
665 ? (stats_.doc_good_char_quality / static_cast<float>(stats_.good_char_count))
666 : 0.0);
667 }
// The document counts as good quality only if all four per-character ratios
// pass their thresholds. The divisions are done in float.
668 bool good_quality_doc =
669 ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= quality_rej_pc) &&
670 (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= quality_blob_pc) &&
671 (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= quality_outline_pc) &&
672 (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= quality_char_pc);
673
674 // ****************** Pass 6 *******************
675 // Do whole document or whole block rejection pass
676 if (!tessedit_test_adaption) {
677 quality_based_rejection(page_res_it, good_quality_doc);
678 }
679}
680
681#endif // ndef DISABLED_LEGACY_ENGINE
682
// NOTE(review): the signature line of this function is not present in this
// listing, and the embedded numbers also skip 691 and 695, so one statement
// in the word loop and the start of the tprintf call in the reasons loop
// are missing from this copy.
//
// Visible behavior: does nothing unless wordrec_run_blamer is set; otherwise
// gives every word a last-chance blame assignment and prints the per-reason
// blame counts followed by the misadaption log, if any.
684 if (!wordrec_run_blamer) {
685 return;
686 }
687 PAGE_RES_IT page_res_it(page_res);
688 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
689 WERD_RES *word = page_res_it.word();
690 BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);
692 }
693 tprintf("Blame reasons:\n");
// One output entry per blame reason, indexed 0..IRR_NUM_REASONS-1.
694 for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
696 page_res->blame_reasons[bl]);
697 }
698 if (page_res->misadaption_log.size() > 0) {
699 tprintf("Misadaption log:\n");
700 for (auto &log : page_res->misadaption_log) {
701 tprintf("%s\n", log.c_str());
702 }
703 }
704}
705
706// Sets script positions and detects smallcaps on all output words.
// NOTE(review): the signature line of this function is not present in this
// listing (the embedded numbers jump from 706 to 708); only the body is
// visible.
708 PAGE_RES_IT page_res_it(page_res);
709 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
710 WERD_RES *word = page_res_it.word();
711 if (word->word->flag(W_REP_CHAR)) {
// NOTE(review): this forward() is followed by the loop's own forward() on
// `continue`, so the iterator advances twice and the word right after a
// W_REP_CHAR word is never processed here - confirm whether that is
// intended; dropping this call would visit every non-repeated word.
712 page_res_it.forward();
713 continue;
714 }
715 const float x_height = page_res_it.block()->block->x_height();
// Fall back to the midpoint of the best choice's x-height range when the
// word's own x-height lies outside that range.
716 float word_x_height = word->x_height;
717 if (word_x_height < word->best_choice->min_x_height() ||
718 word_x_height > word->best_choice->max_x_height()) {
719 word_x_height =
720 (word->best_choice->min_x_height() + word->best_choice->max_x_height()) / 2.0f;
721 }
722 // Test for small caps. Word capheight must be close to block xheight,
723 // and word must contain no lower case letters, and at least one upper case.
// The accepted band is small_cap_xheight +/- half the distance between the
// block x-height and small_cap_xheight.
724 const double small_cap_xheight = x_height * kXHeightCapRatio;
725 const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
726 if (word->uch_set->script_has_xheight() &&
727 small_cap_xheight - small_cap_delta <= word_x_height &&
728 word_x_height <= small_cap_xheight + small_cap_delta) {
729 // Scan for upper/lower.
730 int num_upper = 0;
731 int num_lower = 0;
732 for (unsigned i = 0; i < word->best_choice->length(); ++i) {
733 if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) {
734 ++num_upper;
735 } else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) {
736 ++num_lower;
737 }
738 }
739 if (num_upper > 0 && num_lower == 0) {
740 word->small_caps = true;
741 }
742 }
743 word->SetScriptPositions();
744 }
745}
746
747// Helper finds the gap between the index word and the next.
748static void WordGap(const PointerVector<WERD_RES> &words, unsigned index, int *right, int *next_left) {
749 *right = -INT32_MAX;
750 *next_left = INT32_MAX;
751 if (index < words.size()) {
752 *right = words[index]->word->bounding_box().right();
753 if (index + 1 < words.size()) {
754 *next_left = words[index + 1]->word->bounding_box().left();
755 }
756 }
757}
758
759// Factored helper computes the rating, certainty, badness and validity of
760// the permuter of the words in [first_index, end_index).
761static void EvaluateWordSpan(const PointerVector<WERD_RES> &words, unsigned first_index, unsigned end_index,
762 float *rating, float *certainty, bool *bad, bool *valid_permuter) {
763 if (end_index <= first_index) {
764 *bad = true;
765 *valid_permuter = false;
766 }
767 for (unsigned index = first_index; index < end_index && index < words.size(); ++index) {
768 WERD_CHOICE *choice = words[index]->best_choice;
769 if (choice == nullptr) {
770 *bad = true;
771 } else {
772 *rating += choice->rating();
773 *certainty = std::min(*certainty, choice->certainty());
774 if (!Dict::valid_word_permuter(choice->permuter(), false)) {
775 *valid_permuter = false;
776 }
777 }
778 }
779}
780
781// Helper chooses the best combination of words, transferring good ones from
782// new_words to best_words. To win, a new word must have (better rating and
783// certainty) or (better permuter status and rating within rating ratio and
784// certainty within certainty margin) than current best.
785// All the new_words are consumed (moved to best_words or deleted.)
786// The return value is the number of new_words used minus the number of
787// best_words that remain in the output.
788static int SelectBestWords(double rating_ratio, double certainty_margin, bool debug,
789 PointerVector<WERD_RES> *new_words,
790 PointerVector<WERD_RES> *best_words) {
791 // Process the smallest groups of words that have an overlapping word
792 // boundary at the end.
793 std::vector<WERD_RES *> out_words;
794 // Index into each word vector (best, new).
795 unsigned b = 0, n = 0;
796 int num_best = 0, num_new = 0;
797 while (b < best_words->size() || n < new_words->size()) {
798 // Start of the current run in each.
799 auto start_b = b, start_n = n;
800 while (b < best_words->size() || n < new_words->size()) {
801 int b_right = -INT32_MAX;
802 int next_b_left = INT32_MAX;
803 WordGap(*best_words, b, &b_right, &next_b_left);
804 int n_right = -INT32_MAX;
805 int next_n_left = INT32_MAX;
806 WordGap(*new_words, n, &n_right, &next_n_left);
807 if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {
808 // The word breaks overlap. [start_b,b] and [start_n, n] match.
809 break;
810 }
811 // Keep searching for the matching word break.
812 if ((b_right < n_right && b < best_words->size()) || n == new_words->size()) {
813 ++b;
814 } else {
815 ++n;
816 }
817 }
818 // Rating of the current run in each.
819 float b_rating = 0.0f, n_rating = 0.0f;
820 // Certainty of the current run in each.
821 float b_certainty = 0.0f, n_certainty = 0.0f;
822 // True if any word is missing its best choice.
823 bool b_bad = false, n_bad = false;
824 // True if all words have a valid permuter.
825 bool b_valid_permuter = true, n_valid_permuter = true;
826 const int end_b = b < best_words->size() ? b + 1 : b;
827 const int end_n = n < new_words->size() ? n + 1 : n;
828 EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty, &b_bad,
829 &b_valid_permuter);
830 EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty, &n_bad,
831 &n_valid_permuter);
832 bool new_better = false;
833 if (!n_bad && (b_bad || (n_certainty > b_certainty && n_rating < b_rating) ||
834 (!b_valid_permuter && n_valid_permuter && n_rating < b_rating * rating_ratio &&
835 n_certainty > b_certainty - certainty_margin))) {
836 // New is better.
837 for (int i = start_n; i < end_n; ++i) {
838 out_words.push_back((*new_words)[i]);
839 (*new_words)[i] = nullptr;
840 ++num_new;
841 }
842 new_better = true;
843 } else if (!b_bad) {
844 // Current best is better.
845 for (int i = start_b; i < end_b; ++i) {
846 out_words.push_back((*best_words)[i]);
847 (*best_words)[i] = nullptr;
848 ++num_best;
849 }
850 }
851 if (debug) {
852 tprintf(
853 "%d new words %s than %d old words: r: %g v %g c: %g v %g"
854 " valid dict: %d v %d\n",
855 end_n - start_n, new_better ? "better" : "worse", end_b - start_b, n_rating, b_rating,
856 n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
857 }
858 // Move on to the next group.
859 b = end_b;
860 n = end_n;
861 }
862 // Transfer from out_words to best_words.
863 best_words->clear();
864 for (auto &out_word : out_words) {
865 best_words->push_back(out_word);
866 }
867 return num_new - num_best;
868}
869
870// Helper to recognize the word using the given (language-specific) tesseract.
871// Returns positive if this recognizer found more new best words than the
872// number kept from best_words.
873int Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug,
874 WERD_RES **in_word, PointerVector<WERD_RES> *best_words) {
875 if (debug) {
876 tprintf("Trying word using lang %s, oem %d\n", lang.c_str(),
877 static_cast<int>(tessedit_ocr_engine_mode));
878 }
879 // Run the recognizer on the word.
880 PointerVector<WERD_RES> new_words;
881 (this->*recognizer)(word_data, in_word, &new_words);
882 if (new_words.empty()) {
883 // Transfer input word to new_words, as the classifier must have put
884 // the result back in the input.
885 new_words.push_back(*in_word);
886 *in_word = nullptr;
887 }
888 if (debug) {
889 for (unsigned i = 0; i < new_words.size(); ++i) {
890 new_words[i]->DebugTopChoice("Lang result");
891 }
892 }
893 // Initial version is a bit of a hack based on better certainty and rating
894 // or a dictionary vs non-dictionary word.
895 return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug,
896 &new_words, best_words);
897}
898
899// Helper returns true if all the words are acceptable.
900static bool WordsAcceptable(const PointerVector<WERD_RES> &words) {
901 for (unsigned w = 0; w < words.size(); ++w) {
902 if (words[w]->tess_failed || !words[w]->tess_accepted) {
903 return false;
904 }
905 }
906 return true;
907}
908
909#ifndef DISABLED_LEGACY_ENGINE
910
911// Moves good-looking "noise"/diacritics from the reject list to the main
912// blob list on the current word. Returns true if anything was done, and
913// sets make_next_word_fuzzy if blob(s) were added to the end of the word.
914bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) {
915 *make_next_word_fuzzy = false;
916 WERD *real_word = pr_it->word()->word;
917 if (real_word->rej_cblob_list()->empty() || real_word->cblob_list()->empty() ||
918 real_word->rej_cblob_list()->length() > noise_maxperword) {
919 return false;
920 }
921 real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
922 // Get the noise outlines into a vector with matching bool map.
923 std::vector<C_OUTLINE *> outlines;
924 real_word->GetNoiseOutlines(&outlines);
925 std::vector<bool> word_wanted;
926 std::vector<bool> overlapped_any_blob;
927 std::vector<C_BLOB *> target_blobs;
928 AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted,
929 &overlapped_any_blob, &target_blobs);
930 // Filter the outlines that overlapped any blob and put them into the word
931 // now. This simplifies the remaining task and also makes it more accurate
932 // as it has more completed blobs to work on.
933 std::vector<bool> wanted;
934 std::vector<C_BLOB *> wanted_blobs;
935 std::vector<C_OUTLINE *> wanted_outlines;
936 int num_overlapped = 0;
937 int num_overlapped_used = 0;
938 for (unsigned i = 0; i < overlapped_any_blob.size(); ++i) {
939 if (overlapped_any_blob[i]) {
940 ++num_overlapped;
941 if (word_wanted[i]) {
942 ++num_overlapped_used;
943 }
944 wanted.push_back(word_wanted[i]);
945 wanted_blobs.push_back(target_blobs[i]);
946 wanted_outlines.push_back(outlines[i]);
947 outlines[i] = nullptr;
948 }
949 }
950 real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
951 AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, &target_blobs);
952 int non_overlapped = 0;
953 int non_overlapped_used = 0;
954 for (unsigned i = 0; i < word_wanted.size(); ++i) {
955 if (word_wanted[i]) {
956 ++non_overlapped_used;
957 }
958 if (outlines[i] != nullptr) {
959 ++non_overlapped_used;
960 }
961 }
962 if (debug_noise_removal) {
963 tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:", num_overlapped_used,
964 num_overlapped, non_overlapped_used, non_overlapped);
965 real_word->bounding_box().print();
966 }
967 // Now we have decided which outlines we want, put them into the real_word.
968 if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, make_next_word_fuzzy)) {
969 pr_it->MakeCurrentWordFuzzy();
970 }
971 // TODO(rays) Parts of combos have a deep copy of the real word, and need
972 // to have their noise outlines moved/assigned in the same way!!
973 return num_overlapped_used != 0 || non_overlapped_used != 0;
974}
975
976// Attempts to put noise/diacritic outlines into the blobs that they overlap.
977// Input: a set of noisy outlines that probably belong to the real_word.
978// Output: word_wanted indicates which outlines are to be assigned to a blob,
979// target_blobs indicates which to assign to, and overlapped_any_blob is
980// true for all outlines that overlapped a blob.
981void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines,
982 int pass, WERD *real_word, PAGE_RES_IT *pr_it,
983 std::vector<bool> *word_wanted,
984 std::vector<bool> *overlapped_any_blob,
985 std::vector<C_BLOB *> *target_blobs) {
986 std::vector<bool> blob_wanted;
987 word_wanted->clear();
988 word_wanted->resize(outlines.size());
989 overlapped_any_blob->clear();
990 overlapped_any_blob->resize(outlines.size());
991 target_blobs->clear();
992 target_blobs->resize(outlines.size());
993 // For each real blob, find the outlines that seriously overlap it.
994 // A single blob could be several merged characters, so there can be quite
995 // a few outlines overlapping, and the full engine needs to be used to chop
996 // and join to get a sensible result.
997 C_BLOB_IT blob_it(real_word->cblob_list());
998 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
999 C_BLOB *blob = blob_it.data();
1000 const TBOX blob_box = blob->bounding_box();
1001 blob_wanted.clear();
1002 blob_wanted.resize(outlines.size());
1003 int num_blob_outlines = 0;
1004 for (unsigned i = 0; i < outlines.size(); ++i) {
1005 if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && !(*word_wanted)[i]) {
1006 blob_wanted[i] = true;
1007 (*overlapped_any_blob)[i] = true;
1008 ++num_blob_outlines;
1009 }
1010 }
1011 if (debug_noise_removal) {
1012 tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1013 blob_box.print();
1014 }
1015 // If any outlines overlap the blob, and not too many, classify the blob
1016 // (using the full engine, languages and all), and choose the maximal
1017 // combination of outlines that doesn't hurt the end-result classification
1018 // by too much. Mark them as wanted.
1019 if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1020 if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, outlines,
1021 num_blob_outlines, &blob_wanted)) {
1022 for (unsigned i = 0; i < blob_wanted.size(); ++i) {
1023 if (blob_wanted[i]) {
1024 // Claim the outline and record where it is going.
1025 (*word_wanted)[i] = true;
1026 (*target_blobs)[i] = blob;
1027 }
1028 }
1029 }
1030 }
1031 }
1032}
1033
1034// Attempts to assign non-overlapping outlines to their nearest blobs or
1035// make new blobs out of them.
1036void Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
1037 WERD *real_word, PAGE_RES_IT *pr_it,
1038 std::vector<bool> *word_wanted,
1039 std::vector<C_BLOB *> *target_blobs) {
1040 std::vector<bool> blob_wanted;
1041 word_wanted->clear();
1042 word_wanted->resize(outlines.size());
1043 target_blobs->clear();
1044 target_blobs->resize(outlines.size());
1045 // Check for outlines that need to be turned into stand-alone blobs.
1046 for (unsigned i = 0; i < outlines.size(); ++i) {
1047 if (outlines[i] == nullptr) {
1048 continue;
1049 }
1050 // Get a set of adjacent outlines that don't overlap any existing blob.
1051 blob_wanted.clear();
1052 blob_wanted.resize(outlines.size());
1053 int num_blob_outlines = 0;
1054 TBOX total_ol_box(outlines[i]->bounding_box());
1055 while (i < outlines.size() && outlines[i] != nullptr) {
1056 blob_wanted[i] = true;
1057 total_ol_box += outlines[i]->bounding_box();
1058 ++i;
1059 ++num_blob_outlines;
1060 }
1061 // Find the insertion point.
1062 C_BLOB_IT blob_it(real_word->cblob_list());
1063 while (!blob_it.at_last() &&
1064 blob_it.data_relative(1)->bounding_box().left() <= total_ol_box.left()) {
1065 blob_it.forward();
1066 }
1067 // Choose which combination of them we actually want and where to put
1068 // them.
1069 if (debug_noise_removal) {
1070 tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1071 }
1072 C_BLOB *left_blob = blob_it.data();
1073 TBOX left_box = left_blob->bounding_box();
1074 C_BLOB *right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1075 if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
1076 !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1077 SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, outlines,
1078 num_blob_outlines, &blob_wanted)) {
1079 if (debug_noise_removal) {
1080 tprintf("Added to left blob\n");
1081 }
1082 for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1083 if (blob_wanted[j]) {
1084 (*word_wanted)[j] = true;
1085 (*target_blobs)[j] = left_blob;
1086 }
1087 }
1088 } else if (right_blob != nullptr &&
1089 (!left_box.x_overlap(total_ol_box) ||
1090 right_blob->bounding_box().x_overlap(total_ol_box)) &&
1091 SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, right_blob, outlines,
1092 num_blob_outlines, &blob_wanted)) {
1093 if (debug_noise_removal) {
1094 tprintf("Added to right blob\n");
1095 }
1096 for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1097 if (blob_wanted[j]) {
1098 (*word_wanted)[j] = true;
1099 (*target_blobs)[j] = right_blob;
1100 }
1101 }
1102 } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr, outlines,
1103 num_blob_outlines, &blob_wanted)) {
1104 if (debug_noise_removal) {
1105 tprintf("Fitted between blobs\n");
1106 }
1107 for (unsigned j = 0; j < blob_wanted.size(); ++j) {
1108 if (blob_wanted[j]) {
1109 (*word_wanted)[j] = true;
1110 (*target_blobs)[j] = nullptr;
1111 }
1112 }
1113 }
1114 }
1115}
1116
1117// Starting with ok_outlines set to indicate which outlines overlap the blob,
1118// chooses the optimal set (approximately) and returns true if any outlines
1119// are desired, in which case ok_outlines indicates which ones.
1120bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
1121 C_BLOB *blob,
1122 const std::vector<C_OUTLINE *> &outlines,
1123 int num_outlines, std::vector<bool> *ok_outlines) {
1124 std::string best_str;
1125 float target_cert = certainty_threshold;
1126 if (blob != nullptr) {
1127 float target_c2;
1128 target_cert = ClassifyBlobAsWord(pass, pr_it, blob, best_str, &target_c2);
1129 if (debug_noise_removal) {
1130 tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(), target_cert,
1131 target_c2);
1132 blob->bounding_box().print();
1133 }
1134 target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1135 }
1136 std::vector<bool> test_outlines = *ok_outlines;
1137 // Start with all the outlines in.
1138 std::string all_str;
1139 std::vector<bool> best_outlines = *ok_outlines;
1140 float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, all_str);
1141 if (debug_noise_removal) {
1142 TBOX ol_box;
1143 for (unsigned i = 0; i < test_outlines.size(); ++i) {
1144 if (test_outlines[i]) {
1145 ol_box += outlines[i]->bounding_box();
1146 }
1147 }
1148 tprintf("All Noise blob classified as %s=%g, delta=%g at:", all_str.c_str(), best_cert,
1149 best_cert - target_cert);
1150 ol_box.print();
1151 }
1152 // Iteratively zero out the bit that improves the certainty the most, until
1153 // we get past the threshold, have zero bits, or fail to improve.
1154 int best_index = 0; // To zero out.
1155 while (num_outlines > 1 && best_index >= 0 &&
1156 (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
1157 // Find the best bit to zero out.
1158 best_index = -1;
1159 for (unsigned i = 0; i < outlines.size(); ++i) {
1160 if (test_outlines[i]) {
1161 test_outlines[i] = false;
1162 std::string str;
1163 float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, str);
1164 if (debug_noise_removal) {
1165 TBOX ol_box;
1166 for (unsigned j = 0; j < outlines.size(); ++j) {
1167 if (test_outlines[j]) {
1168 ol_box += outlines[j]->bounding_box();
1169 }
1170 tprintf("%c", test_outlines[j] ? 'T' : 'F');
1171 }
1172 tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(), cert,
1173 cert - target_cert);
1174 ol_box.print();
1175 }
1176 if (cert > best_cert) {
1177 best_cert = cert;
1178 best_index = i;
1179 best_outlines = test_outlines;
1180 }
1181 test_outlines[i] = true;
1182 }
1183 }
1184 if (best_index >= 0) {
1185 test_outlines[best_index] = false;
1186 --num_outlines;
1187 }
1188 }
1189 if (best_cert >= target_cert) {
1190 // Save the best combination.
1191 *ok_outlines = best_outlines;
1192 if (debug_noise_removal) {
1193 tprintf("%s noise combination ", blob ? "Adding" : "New");
1194 for (auto &&best_outline : best_outlines) {
1195 tprintf("%c", best_outline ? 'T' : 'F');
1196 }
1197 tprintf(" yields certainty %g, beating target of %g\n", best_cert, target_cert);
1198 }
1199 return true;
1200 }
1201
1202 return false;
1203}
1204
1205// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
1206// the inclusion of the outlines, and returns the certainty of the raw choice.
1207float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
1208 const std::vector<C_OUTLINE *> &outlines, int pass_n,
1209 PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) {
1210 C_OUTLINE_IT ol_it;
1211 C_OUTLINE *first_to_keep = nullptr;
1212 C_BLOB *local_blob = nullptr;
1213 if (blob != nullptr) {
1214 // Add the required outlines to the blob.
1215 ol_it.set_to_list(blob->out_list());
1216 first_to_keep = ol_it.data();
1217 }
1218 for (unsigned i = 0; i < ok_outlines.size(); ++i) {
1219 if (ok_outlines[i]) {
1220 // This outline is to be added.
1221 if (blob == nullptr) {
1222 local_blob = new C_BLOB(outlines[i]);
1223 blob = local_blob;
1224 ol_it.set_to_list(blob->out_list());
1225 } else {
1226 ol_it.add_before_stay_put(outlines[i]);
1227 }
1228 }
1229 }
1230 float c2;
1231 float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1232 ol_it.move_to_first();
1233 if (first_to_keep == nullptr) {
1234 // We created blob. Empty its outlines and delete it.
1235 for (; !ol_it.empty(); ol_it.forward()) {
1236 ol_it.extract();
1237 }
1238 delete local_blob;
1239 cert = -c2;
1240 } else {
1241 // Remove the outlines that we put in.
1242 for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1243 ol_it.extract();
1244 }
1245 }
1246 return cert;
1247}
1248
1249// Classifies the given blob (part of word_data->word->word) as an individual
1250// word, using languages, chopper etc, returning only the certainty of the
1251// best raw choice, and undoing all the work done to fake out the word.
1252float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str,
1253 float *c2) {
1254 WERD *real_word = pr_it->word()->word;
1255 WERD *word = real_word->ConstructFromSingleBlob(real_word->flag(W_BOL), real_word->flag(W_EOL),
1256 C_BLOB::deep_copy(blob));
1257 WERD_RES *word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1258 // Get a new iterator that points to the new word.
1259 PAGE_RES_IT it(pr_it->page_res);
1260 while (it.word() != word_res && it.word() != nullptr) {
1261 it.forward();
1262 }
1263 ASSERT_HOST(it.word() == word_res);
1264 WordData wd(it);
1265 // Force full initialization.
1266 SetupWordPassN(1, &wd);
1267 classify_word_and_language(pass_n, &it, &wd);
1268 if (debug_noise_removal) {
1269 if (wd.word->raw_choice != nullptr) {
1270 tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, wd.row->x_height(),
1272 } else {
1273 tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,
1274 wd.row->x_height());
1275 }
1276 }
1277 float cert = 0.0f;
1278 if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...
1279 cert = wd.word->raw_choice->certainty();
1280 float rat = wd.word->raw_choice->rating();
1281 *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1282 best_str = wd.word->raw_choice->unichar_string();
1283 } else {
1284 *c2 = 0.0f;
1285 best_str.clear();
1286 }
1287 it.DeleteCurrentWord();
1288 pr_it->ResetWordIterator();
1289 return cert;
1290}
1291
1292#endif // ndef DISABLED_LEGACY_ENGINE
1293
1294// Generic function for classifying a word. Can be used either for pass1 or
1295// pass2 according to the function passed to recognizer.
1296// word_data holds the word to be recognized, and its block and row, and
1297// pr_it points to the word as well, in case we are running LSTM and it wants
1298// to output multiple words.
1299// Recognizes in the current language, and if successful that is all.
1300// If recognition was not successful, tries all available languages until
1301// it gets a successful result or runs out of languages. Keeps the best result.
1302void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) {
1303#ifdef DISABLED_LEGACY_ENGINE
1305#else
1306 WordRecognizer recognizer =
1308#endif // def DISABLED_LEGACY_ENGINE
1309
1310 // Best result so far.
1311 PointerVector<WERD_RES> best_words;
1312 // Points to the best result. May be word or in lang_words.
1313 const WERD_RES *word = word_data->word;
1314 clock_t start_t = clock();
1315 const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1316 if (debug) {
1317 tprintf("%s word with lang %s at:", word->done ? "Already done" : "Processing",
1318 most_recently_used_->lang.c_str());
1319 word->word->bounding_box().print();
1320 }
1321 if (word->done) {
1322 // If done on pass1, leave it as-is.
1323 if (!word->tess_failed) {
1324 most_recently_used_ = word->tesseract;
1325 }
1326 return;
1327 }
1328 auto sub = sub_langs_.size();
1329 if (most_recently_used_ != this) {
1330 // Get the index of the most_recently_used_.
1331 for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) {
1332 }
1333 }
1334 most_recently_used_->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub],
1335 &best_words);
1336 Tesseract *best_lang_tess = most_recently_used_;
1337 if (!WordsAcceptable(best_words)) {
1338 // Try all the other languages to see if they are any better.
1339 if (most_recently_used_ != this &&
1340 this->RetryWithLanguage(*word_data, recognizer, debug,
1341 &word_data->lang_words[sub_langs_.size()], &best_words) > 0) {
1342 best_lang_tess = this;
1343 }
1344 for (unsigned i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) {
1345 if (most_recently_used_ != sub_langs_[i] &&
1346 sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i],
1347 &best_words) > 0) {
1348 best_lang_tess = sub_langs_[i];
1349 }
1350 }
1351 }
1352 most_recently_used_ = best_lang_tess;
1353 if (!best_words.empty()) {
1354 if (best_words.size() == 1 && !best_words[0]->combination) {
1355 // Move the best single result to the main word.
1356 word_data->word->ConsumeWordResults(best_words[0]);
1357 } else {
1358 // Words came from LSTM, and must be moved to the PAGE_RES properly.
1359 word_data->word = best_words.back();
1360 pr_it->ReplaceCurrentWord(&best_words);
1361 }
1362 ASSERT_HOST(word_data->word->box_word != nullptr);
1363 } else {
1364 tprintf("no best words!!\n");
1365 }
1366 clock_t ocr_t = clock();
1367 if (tessedit_timing_debug) {
1368 tprintf("%s (ocr took %.2f sec)\n", word_data->word->best_choice->unichar_string().c_str(),
1369 static_cast<double>(ocr_t - start_t) / CLOCKS_PER_SEC);
1370 }
1371}
1372
1379void Tesseract::classify_word_pass1(const WordData &word_data, WERD_RES **in_word,
1380 PointerVector<WERD_RES> *out_words) {
1381 ROW *row = word_data.row;
1382 BLOCK *block = word_data.block;
1384 word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
1385#ifdef DISABLED_LEGACY_ENGINE
1386 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1387#else
1388 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
1389 tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
1390#endif // def DISABLED_LEGACY_ENGINE
1391 if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1392 LSTMRecognizeWord(*block, row, *in_word, out_words);
1393 if (!out_words->empty()) {
1394 return; // Successful lstm recognition.
1395 }
1396 }
1397 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1398 // No fallback allowed, so use a fake.
1399 (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1400 return;
1401 }
1402
1403#ifndef DISABLED_LEGACY_ENGINE
1404 // Fall back to tesseract for failed words or odd words.
1405 (*in_word)->SetupForRecognition(unicharset, this, BestPix(), OEM_TESSERACT_ONLY, nullptr,
1406 classify_bln_numeric_mode, textord_use_cjk_fp_model,
1407 poly_allow_detailed_fx, row, block);
1408#endif // ndef DISABLED_LEGACY_ENGINE
1409 }
1410
1411#ifndef DISABLED_LEGACY_ENGINE
1412 WERD_RES *word = *in_word;
1413 match_word_pass_n(1, word, row, block);
1414 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1415 word->tess_would_adapt = AdaptableWord(word);
1416 bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1417
1418 if (adapt_ok) {
1419 // Send word to adaptive classifier for training.
1421 LearnWord(nullptr, word);
1422 // Mark misadaptions if running blamer.
1423 if (word->blamer_bundle != nullptr) {
1424 word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, wordrec_debug_blamer);
1425 }
1426 }
1427
1428 if (tessedit_enable_doc_dict && !word->IsAmbiguous()) {
1430 }
1431 }
1432#endif // ndef DISABLED_LEGACY_ENGINE
1433}
1434
1435// Helper to report the result of the xheight fix.
1436void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word,
1437 WERD_RES *new_word) {
1438 tprintf("New XHT Match:%s = %s ", word->best_choice->unichar_string().c_str(),
1439 word->best_choice->debug_string().c_str());
1440 word->reject_map.print(debug_fp);
1441 tprintf(" -> %s = %s ", new_word->best_choice->unichar_string().c_str(),
1442 new_word->best_choice->debug_string().c_str());
1443 new_word->reject_map.print(debug_fp);
1444 tprintf(" %s->%s %s %s\n", word->guessed_x_ht ? "GUESS" : "CERT",
1445 new_word->guessed_x_ht ? "GUESS" : "CERT", new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1446 accept_new_word ? "ACCEPTED" : "");
1447}
1448
1449#ifndef DISABLED_LEGACY_ENGINE
1450
1451// Run the x-height fix-up, based on min/max top/bottom information in
1452// unicharset.
1453// Returns true if the word was changed.
1454// See the comment in fixxht.cpp for a description of the overall process.
1456 int original_misfits = CountMisfitTops(word);
1457 if (original_misfits == 0) {
1458 return false;
1459 }
1460 float baseline_shift = 0.0f;
1461 float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1462 if (baseline_shift != 0.0f) {
1463 // Try the shift on its own first.
1464 if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height, word, block, row)) {
1465 return false;
1466 }
1467 original_misfits = CountMisfitTops(word);
1468 if (original_misfits > 0) {
1469 float new_baseline_shift;
1470 // Now recompute the new x_height.
1471 new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1472 if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1473 // No test of return value here, as we are definitely making a change
1474 // to the word by shifting the baseline.
1475 TestNewNormalization(original_misfits, baseline_shift, new_x_ht, word, block, row);
1476 }
1477 }
1478 return true;
1479 } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1480 return TestNewNormalization(original_misfits, 0.0f, new_x_ht, word, block, row);
1481 } else {
1482 return false;
1483 }
1484}
1485
1486// Runs recognition with the test baseline shift and x-height and returns true
1487// if there was an improvement in recognition result.
1488bool Tesseract::TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht,
1489 WERD_RES *word, BLOCK *block, ROW *row) {
1490 bool accept_new_x_ht = false;
1491 WERD_RES new_x_ht_word(word->word);
1492 if (word->blamer_bundle != nullptr) {
1493 new_x_ht_word.blamer_bundle = new BlamerBundle();
1494 new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1495 }
1496 new_x_ht_word.x_height = new_x_ht;
1497 new_x_ht_word.baseline_shift = baseline_shift;
1498 new_x_ht_word.caps_height = 0.0;
1499 new_x_ht_word.SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,
1500 classify_bln_numeric_mode, textord_use_cjk_fp_model,
1501 poly_allow_detailed_fx, row, block);
1502 match_word_pass_n(2, &new_x_ht_word, row, block);
1503 if (!new_x_ht_word.tess_failed) {
1504 int new_misfits = CountMisfitTops(&new_x_ht_word);
1505 if (debug_x_ht_level >= 1) {
1506 tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", original_misfits,
1507 word->x_height, new_misfits, new_x_ht);
1508 tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", word->best_choice->rating(),
1509 word->best_choice->certainty(), new_x_ht_word.best_choice->rating(),
1510 new_x_ht_word.best_choice->certainty());
1511 }
1512 // The misfits must improve and either the rating or certainty.
1513 accept_new_x_ht = new_misfits < original_misfits &&
1514 (new_x_ht_word.best_choice->certainty() > word->best_choice->certainty() ||
1515 new_x_ht_word.best_choice->rating() < word->best_choice->rating());
1516 if (debug_x_ht_level >= 1) {
1517 ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1518 }
1519 }
1520 if (accept_new_x_ht) {
1521 word->ConsumeWordResults(&new_x_ht_word);
1522 return true;
1523 }
1524 return false;
1525}
1526
1527#endif // ndef DISABLED_LEGACY_ENGINE
1528
1535void Tesseract::classify_word_pass2(const WordData &word_data, WERD_RES **in_word,
1536 PointerVector<WERD_RES> *out_words) {
1537 // Return if we do not want to run Tesseract.
1538 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1539 return;
1540 }
1541#ifndef DISABLED_LEGACY_ENGINE
1542 ROW *row = word_data.row;
1543 BLOCK *block = word_data.block;
1544 WERD_RES *word = *in_word;
1546 word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
1547
1548 check_debug_pt(word, 30);
1549 if (!word->done) {
1550 word->caps_height = 0.0;
1551 if (word->x_height == 0.0f) {
1552 word->x_height = row->x_height();
1553 }
1554 match_word_pass_n(2, word, row, block);
1555 check_debug_pt(word, 40);
1556 }
1557
1559
1560 if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1562 block->classify_rotation().y() == 0.0f) {
1563 // Use the tops and bottoms since they are available.
1564 TrainedXheightFix(word, block, row);
1565 }
1566 }
1567# ifndef GRAPHICS_DISABLED
1568 if (tessedit_display_outwords) {
1569 if (fx_win == nullptr) {
1570 create_fx_win();
1571 }
1572 clear_fx_win();
1573 word->rebuild_word->plot(fx_win);
1574 TBOX wbox = word->rebuild_word->bounding_box();
1575 fx_win->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());
1577 }
1578# endif
1579 check_debug_pt(word, 50);
1580#endif // ndef DISABLED_LEGACY_ENGINE
1581}
1582
1583#ifndef DISABLED_LEGACY_ENGINE
1589void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block) {
1590 if (word->tess_failed) {
1591 return;
1592 }
1593 tess_segment_pass_n(pass_n, word);
1594
1595 if (!word->tess_failed) {
1596 if (!word->word->flag(W_REP_CHAR)) {
1597 word->fix_quotes();
1598 if (tessedit_fix_hyphens) {
1599 word->fix_hyphens();
1600 }
1601 /* Don't trust fix_quotes! - though I think I've fixed the bug */
1602 if (static_cast<unsigned>(word->best_choice->length()) != word->box_word->length()) {
1603 tprintf(
1604 "POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1605 " #Blobs=%u\n",
1606 word->best_choice->debug_string().c_str(), word->best_choice->length(),
1607 word->box_word->length());
1608 }
1609 word->tess_accepted = tess_acceptable_word(word);
1610
1611 // Also sets word->done flag
1612 make_reject_map(word, row, pass_n);
1613 }
1614 }
1615 set_word_fonts(word);
1616
1617 ASSERT_HOST(word->raw_choice != nullptr);
1618}
1619#endif // ndef DISABLED_LEGACY_ENGINE
1620
1621// Helper to return the best rated BLOB_CHOICE in the whole word that matches
1622// the given char_id, or nullptr if none can be found.
1623static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_res) {
1624 // Find the corresponding best BLOB_CHOICE from any position in the word_res.
1625 BLOB_CHOICE *best_choice = nullptr;
1626 for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
1627 BLOB_CHOICE *choice = FindMatchingChoice(char_id, word_res->GetBlobChoices(i));
1628 if (choice != nullptr) {
1629 if (best_choice == nullptr || choice->rating() < best_choice->rating()) {
1630 best_choice = choice;
1631 }
1632 }
1633 }
1634 return best_choice;
1635}
1636
1637// Helper to insert blob_choice in each location in the leader word if there is
1638// no matching BLOB_CHOICE there already, and correct any incorrect results
1639// in the best_choice.
1640static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) {
1641 WERD_CHOICE *word = word_res->best_choice;
1642 for (unsigned i = 0; i < word_res->best_choice->length(); ++i) {
1643 BLOB_CHOICE *choice =
1644 FindMatchingChoice(blob_choice->unichar_id(), word_res->GetBlobChoices(i));
1645 if (choice == nullptr) {
1646 BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
1647 choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
1648 }
1649 }
1650 // Correct any incorrect results in word.
1651 for (unsigned i = 0; i < word->length(); ++i) {
1652 if (word->unichar_id(i) != blob_choice->unichar_id()) {
1653 word->set_unichar_id(blob_choice->unichar_id(), i);
1654 }
1655 }
1656}
1657
1666 WERD_RES *word_res = page_res_it->word();
1667 const WERD_CHOICE &word = *(word_res->best_choice);
1668
1669 // Find the frequency of each unique character in the word.
1670 SortHelper<UNICHAR_ID> rep_ch(word.length());
1671 for (unsigned i = 0; i < word.length(); ++i) {
1672 rep_ch.Add(word.unichar_id(i), 1);
1673 }
1674
1675 // Find the most frequent result.
1676 UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1677 int max_count = rep_ch.MaxCount(&maxch_id);
1678 // Find the best exemplar of a classifier result for maxch_id.
1679 BLOB_CHOICE *best_choice = FindBestMatchingChoice(maxch_id, word_res);
1680 if (best_choice == nullptr) {
1681 tprintf("Failed to find a choice for %s, occurring %d times\n",
1682 word_res->uch_set->debug_str(maxch_id).c_str(), max_count);
1683 return;
1684 }
1685 word_res->done = true;
1686
1687 // Just correct existing classification.
1688 CorrectRepcharChoices(best_choice, word_res);
1689 word_res->reject_map.initialise(word.length());
1690}
1691
1693 const char *lengths) {
1694 int i = 0;
1695 int offset = 0;
1696 int leading_punct_count;
1697 int upper_count = 0;
1698 int hyphen_pos = -1;
1700
1701 if (strlen(lengths) > 20) {
1702 return word_type;
1703 }
1704
1705 /* Single Leading punctuation char*/
1706
1707 if (s[offset] != '\0' && chs_leading_punct.contains(s[offset])) {
1708 offset += lengths[i++];
1709 }
1710 leading_punct_count = i;
1711
1712 /* Initial cap */
1713 while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1714 offset += lengths[i++];
1715 upper_count++;
1716 }
1717 if (upper_count > 1) {
1718 word_type = AC_UPPER_CASE;
1719 } else {
1720 /* Lower case word, possibly with an initial cap */
1721 while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1722 offset += lengths[i++];
1723 }
1724 if (i - leading_punct_count < quality_min_initial_alphas_reqd) {
1725 goto not_a_word;
1726 }
1727 /*
1728Allow a single hyphen in a lower case word
1729- don't trust upper case - I've seen several cases of "H" -> "I-I"
1730*/
1731 if (lengths[i] == 1 && s[offset] == '-') {
1732 hyphen_pos = i;
1733 offset += lengths[i++];
1734 if (s[offset] != '\0') {
1735 while ((s[offset] != '\0') && char_set.get_islower(s + offset, lengths[i])) {
1736 offset += lengths[i++];
1737 }
1738 if (i < hyphen_pos + 3) {
1739 goto not_a_word;
1740 }
1741 }
1742 } else {
1743 /* Allow "'s" in NON hyphenated lower case words */
1744 if (lengths[i] == 1 && (s[offset] == '\'') && lengths[i + 1] == 1 &&
1745 (s[offset + lengths[i]] == 's')) {
1746 offset += lengths[i++];
1747 offset += lengths[i++];
1748 }
1749 }
1750 if (upper_count > 0) {
1751 word_type = AC_INITIAL_CAP;
1752 } else {
1753 word_type = AC_LOWER_CASE;
1754 }
1755 }
1756
1757 /* Up to two different, constrained trailing punctuation chars */
1758 if (lengths[i] == 1 && s[offset] != '\0' && chs_trailing_punct1.contains(s[offset])) {
1759 offset += lengths[i++];
1760 }
1761 if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] &&
1762 chs_trailing_punct2.contains(s[offset])) {
1763 offset += lengths[i++];
1764 }
1765
1766 if (s[offset] != '\0') {
1767 word_type = AC_UNACCEPTABLE;
1768 }
1769
1770not_a_word:
1771
1772 if (word_type == AC_UNACCEPTABLE) {
1773 /* Look for abbreviation string */
1774 i = 0;
1775 offset = 0;
1776 if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1777 word_type = AC_UC_ABBREV;
1778 while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i]) &&
1779 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1780 offset += lengths[i++];
1781 offset += lengths[i++];
1782 }
1783 } else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1784 word_type = AC_LC_ABBREV;
1785 while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i]) &&
1786 lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1787 offset += lengths[i++];
1788 offset += lengths[i++];
1789 }
1790 }
1791 if (s[offset] != '\0') {
1792 word_type = AC_UNACCEPTABLE;
1793 }
1794 }
1795
1796 return word_type;
1797}
1798
1799bool Tesseract::check_debug_pt(WERD_RES *word, int location) {
1800 bool show_map_detail = false;
1801 int16_t i;
1802
1803 if (!test_pt) {
1804 return false;
1805 }
1806
1807 tessedit_rejection_debug.set_value(false);
1808 debug_x_ht_level.set_value(0);
1809
1810 if (word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y))) {
1811 if (location < 0) {
1812 return true; // For breakpoint use
1813 }
1814 tessedit_rejection_debug.set_value(true);
1815 debug_x_ht_level.set_value(2);
1816 tprintf("\n\nTESTWD::");
1817 switch (location) {
1818 case 0:
1819 tprintf("classify_word_pass1 start\n");
1820 word->word->print();
1821 break;
1822 case 10:
1823 tprintf("make_reject_map: initial map");
1824 break;
1825 case 20:
1826 tprintf("make_reject_map: after NN");
1827 break;
1828 case 30:
1829 tprintf("classify_word_pass2 - START");
1830 break;
1831 case 40:
1832 tprintf("classify_word_pass2 - Pre Xht");
1833 break;
1834 case 50:
1835 tprintf("classify_word_pass2 - END");
1836 show_map_detail = true;
1837 break;
1838 case 60:
1839 tprintf("fixspace");
1840 break;
1841 case 70:
1842 tprintf("MM pass START");
1843 break;
1844 case 80:
1845 tprintf("MM pass END");
1846 break;
1847 case 90:
1848 tprintf("After Poor quality rejection");
1849 break;
1850 case 100:
1851 tprintf("unrej_good_quality_words - START");
1852 break;
1853 case 110:
1854 tprintf("unrej_good_quality_words - END");
1855 break;
1856 case 120:
1857 tprintf("Write results pass");
1858 show_map_detail = true;
1859 break;
1860 }
1861 if (word->best_choice != nullptr) {
1862 tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
1863 word->reject_map.print(debug_fp);
1864 tprintf("\n");
1865 if (show_map_detail) {
1866 tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
1867 for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1868 tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1870 }
1871 }
1872 } else {
1873 tprintf("null best choice\n");
1874 }
1875 tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1876 tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1877 return true;
1878 } else {
1879 return false;
1880 }
1881}
1882
1888#ifndef DISABLED_LEGACY_ENGINE
1889static void find_modal_font( // good chars in word
1890 STATS *fonts, // font stats
1891 int16_t *font_out, // output font
1892 int8_t *font_count // output count
1893) {
1894 int16_t font; // font index
1895 int32_t count; // pile count
1896
1897 if (fonts->get_total() > 0) {
1898 font = static_cast<int16_t>(fonts->mode());
1899 *font_out = font;
1900 count = fonts->pile_count(font);
1901 *font_count = count < INT8_MAX ? count : INT8_MAX;
1902 fonts->add(font, -*font_count);
1903 } else {
1904 *font_out = -1;
1905 *font_count = 0;
1906 }
1907}
1908#endif // ! DISABLED_LEGACY_ENGINE
1909
1916 // Don't try to set the word fonts for an lstm word, as the configs
1917 // will be meaningless.
1918 if (word->chopped_word == nullptr) {
1919 return;
1920 }
1921 ASSERT_HOST(word->best_choice != nullptr);
1922
1923#ifndef DISABLED_LEGACY_ENGINE
1924 const int fontinfo_size = fontinfo_table_.size();
1925 if (fontinfo_size == 0) {
1926 return;
1927 }
1928 if (tessedit_font_id > 0) {
1929 if (tessedit_font_id >= fontinfo_size) {
1930 tprintf("Error, invalid font ID provided: must be below %d.\n"
1931 "Falling back to font auto-detection.\n", fontinfo_size);
1932 } else {
1933 word->fontinfo = &fontinfo_table_.at(tessedit_font_id);
1934 word->fontinfo2 = nullptr;
1935 word->fontinfo_id_count = INT8_MAX;
1936 word->fontinfo_id2_count = 0;
1937 return;
1938 }
1939 }
1940 std::vector<int> font_total_score(fontinfo_size);
1941
1942 // Compute the font scores for the word
1943 if (tessedit_debug_fonts) {
1944 tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str());
1945 }
1946 for (unsigned b = 0; b < word->best_choice->length(); ++b) {
1947 const BLOB_CHOICE *choice = word->GetBlobChoice(b);
1948 if (choice == nullptr) {
1949 continue;
1950 }
1951 auto &fonts = choice->fonts();
1952 for (auto &f : fonts) {
1953 const int fontinfo_id = f.fontinfo_id;
1954 if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1955 font_total_score[fontinfo_id] += f.score;
1956 }
1957 }
1958 }
1959 // Find the top and 2nd choice for the word.
1960 int score1 = 0, score2 = 0;
1961 int16_t font_id1 = -1, font_id2 = -1;
1962 for (int f = 0; f < fontinfo_size; ++f) {
1963 if (tessedit_debug_fonts && font_total_score[f] > 0) {
1964 tprintf("Font %s, total score = %d\n", fontinfo_table_.at(f).name, font_total_score[f]);
1965 }
1966 if (font_total_score[f] > score1) {
1967 score2 = score1;
1968 font_id2 = font_id1;
1969 score1 = font_total_score[f];
1970 font_id1 = f;
1971 } else if (font_total_score[f] > score2) {
1972 score2 = font_total_score[f];
1973 font_id2 = f;
1974 }
1975 }
1976 word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.at(font_id1) : nullptr;
1977 word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.at(font_id2) : nullptr;
1978 // Each score has a limit of UINT16_MAX, so divide by that to get the number
1979 // of "votes" for that font, ie number of perfect scores.
1980 word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
1981 word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);
1982 if (score1 > 0) {
1983 const FontInfo fi = fontinfo_table_.at(font_id1);
1984 if (tessedit_debug_fonts) {
1985 if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
1986 tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", fi.name,
1987 word->fontinfo_id_count, fontinfo_table_.at(font_id2).name,
1988 word->fontinfo_id2_count);
1989 } else {
1990 tprintf("Word modal font=%s, score=%d. No 2nd choice\n", fi.name, word->fontinfo_id_count);
1991 }
1992 }
1993 }
1994#endif // ndef DISABLED_LEGACY_ENGINE
1995}
1996
1997#ifndef DISABLED_LEGACY_ENGINE
2004 PAGE_RES_IT page_res_it(page_res);
2005 WERD_RES *word; // current word
2006 STATS doc_fonts(0, font_table_size_ - 1); // font counters
2007
2008 // Gather font id statistics.
2009 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2010 word = page_res_it.word();
2011 if (word->fontinfo != nullptr) {
2012 doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
2013 }
2014 if (word->fontinfo2 != nullptr) {
2015 doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
2016 }
2017 }
2018 int16_t doc_font; // modal font
2019 int8_t doc_font_count; // modal font
2020 find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2021 if (doc_font_count == 0) {
2022 return;
2023 }
2024 // Get the modal font pointer.
2025 const FontInfo *modal_font = nullptr;
2026 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2027 word = page_res_it.word();
2028 if (word->fontinfo != nullptr && word->fontinfo->universal_id == doc_font) {
2029 modal_font = word->fontinfo;
2030 break;
2031 }
2032 if (word->fontinfo2 != nullptr && word->fontinfo2->universal_id == doc_font) {
2033 modal_font = word->fontinfo2;
2034 break;
2035 }
2036 }
2037 ASSERT_HOST(modal_font != nullptr);
2038
2039 // Assign modal font to weak words.
2040 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2041 word = page_res_it.word();
2042 const int length = word->best_choice->length();
2043
2044 const int count = word->fontinfo_id_count;
2045 if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2046 word->fontinfo = modal_font;
2047 // Counts only get 1 as it came from the doc.
2048 word->fontinfo_id_count = 1;
2049 }
2050 }
2051}
2052#endif // ndef DISABLED_LEGACY_ENGINE
2053
2054// If a word has multiple alternates check if the best choice is in the
2055// dictionary. If not, replace it with an alternate that exists in the
2056// dictionary.
2058 PAGE_RES_IT word_it(page_res);
2059 for (WERD_RES *word = word_it.word(); word != nullptr; word = word_it.forward()) {
2060 if (word->best_choices.singleton()) {
2061 continue; // There are no alternates.
2062 }
2063
2064 const WERD_CHOICE *best = word->best_choice;
2065 if (word->tesseract->getDict().valid_word(*best) != 0) {
2066 continue; // The best choice is in the dictionary.
2067 }
2068
2069 WERD_CHOICE_IT choice_it(&word->best_choices);
2070 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
2071 WERD_CHOICE *alternate = choice_it.data();
2072 if (word->tesseract->getDict().valid_word(*alternate)) {
2073 // The alternate choice is in the dictionary.
2074 if (tessedit_bigram_debug) {
2075 tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2076 best->unichar_string().c_str(), alternate->unichar_string().c_str());
2077 }
2078 // Replace the 'best' choice with a better choice.
2079 word->ReplaceBestChoice(alternate);
2080 break;
2081 }
2082 }
2083 }
2084}
2085
2086} // namespace tesseract
ACCEPTABLE_WERD_TYPE
Definition: control.h:28
@ AC_UC_ABBREV
A.B.C.
Definition: control.h:34
@ AC_INITIAL_CAP
ALL but initial lc.
Definition: control.h:32
@ AC_LC_ABBREV
a.b.c.
Definition: control.h:33
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29
@ AC_UPPER_CASE
ALL upper case.
Definition: control.h:31
@ AC_LOWER_CASE
ALL lower case.
Definition: control.h:30
const char *const kBackUpConfigFile
Definition: control.cpp:47
const double kMinRefitXHeightFraction
Definition: control.cpp:51
FILE * debug_fp
Definition: tessvars.cpp:24
#define ASSERT_HOST(x)
Definition: errcode.h:54
int * count
@ W_BOL
start of line
Definition: werd.h:34
@ W_EOL
end of line
Definition: werd.h:35
@ W_REP_CHAR
repeated character
Definition: werd.h:40
@ OEM_TESSERACT_LSTM_COMBINED
Definition: publictypes.h:266
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:264
@ SET_PARAM_CONSTRAINT_DEBUG_ONLY
Definition: params.h:41
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:38
void clear_fx_win()
Definition: drawfx.cpp:61
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
IncorrectResultReason
Definition: blamer.h:56
@ IRR_NUM_REASONS
Definition: blamer.h:103
void create_fx_win()
Definition: drawfx.cpp:50
int UNICHAR_ID
Definition: unichar.h:34
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:177
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:246
@ FREQ_DAWG_PERM
Definition: ratngs.h:247
ScrollView * fx_win
Definition: drawfx.cpp:42
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:773
volatile int8_t ocr_alive
true if not last
Definition: ocrclass.h:110
bool deadline_exceeded() const
Definition: ocrclass.h:136
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:116
PROGRESS_FUNC2 progress_callback2
called whenever progress increases
Definition: ocrclass.h:115
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:105
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:112
PointerVector< WERD_RES > lang_words
void AssignDiacriticsToNewBlobs(const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< C_BLOB * > *target_blobs)
Definition: control.cpp:1036
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:230
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:81
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:76
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1302
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1589
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:456
int16_t word_blob_quality(WERD_RES *word)
Definition: docqual.cpp:51
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:105
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1665
void AssignDiacriticsToOverlappingBlobs(const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< bool > *overlapped_any_blob, std::vector< C_BLOB * > *target_blobs)
Definition: control.cpp:981
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:166
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:62
void PrerecAllWordsPar(const std::vector< WordData > &words)
Definition: par_control.cpp:38
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:62
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:599
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:32
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1692
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:39
Dict & getDict() override
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1488
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:120
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1379
Image BestPix() const
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, std::vector< WordData > *words)
Definition: control.cpp:198
bool SubAndSuperscriptFix(WERD_RES *word_res)
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:873
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2057
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1799
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:73
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:2003
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:914
bool AnyTessLang() const
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:118
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:72
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1455
bool word_adaptable(WERD_RES *word, uint16_t mode)
Definition: adaptions.cpp:34
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1915
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const std::vector< C_OUTLINE * > &outlines, int num_outlines, std::vector< bool > *ok_outlines)
Definition: control.cpp:1120
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:77
bool AnyLSTMLang() const
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:64
bool right_to_left() const
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:707
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str, float *c2)
Definition: control.cpp:1252
float ClassifyBlobPlusOutlines(const std::vector< bool > &ok_outlines, const std::vector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str)
Definition: control.cpp:1207
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:683
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1535
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
Definition: reject.cpp:96
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:287
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, std::vector< WordData > *words)
Definition: control.cpp:146
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1436
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:56
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:540
const std::string & misadaption_debug() const
Definition: blamer.h:143
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:564
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:214
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:131
TBOX bounding_box() const
Definition: blobs.cpp:863
void plot(ScrollView *window)
Definition: blobs.cpp:907
unsigned length() const
Definition: boxword.h:81
static const double kXHeightCapRatio
Definition: ccstruct.h:35
int32_t universal_id
Definition: fontinfo.h:140
FCOORD classify_rotation() const
Definition: ocrblock.h:135
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
int32_t x_height() const
return xheight
Definition: ocrblock.h:101
float x_height() const
Definition: ocrrow.h:66
int32_t rej_count
Definition: pageres.h:80
int32_t char_count
Definition: pageres.h:79
std::vector< std::string > misadaption_log
Definition: pageres.h:92
std::vector< int > blame_reasons
Definition: pageres.h:87
const FontInfo * fontinfo2
Definition: pageres.h:308
tesseract::Tesseract * tesseract
Definition: pageres.h:278
WERD_CHOICE * best_choice
Definition: pageres.h:239
WERD_CHOICE * raw_choice
Definition: pageres.h:244
int8_t fontinfo_id2_count
Definition: pageres.h:310
TWERD * chopped_word
Definition: pageres.h:210
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:279
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Image pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:304
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:785
void SetScriptPositions()
Definition: pageres.cpp:888
BlamerBundle * blamer_bundle
Definition: pageres.h:250
int8_t fontinfo_id_count
Definition: pageres.h:309
const UNICHARSET * uch_set
Definition: pageres.h:201
const FontInfo * fontinfo
Definition: pageres.h:307
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:779
void BestChoiceToCorrectText()
Definition: pageres.cpp:956
WERD_CHOICE_LIST best_choices
Definition: pageres.h:247
tesseract::BoxWord * box_word
Definition: pageres.h:270
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:824
void PrintBestChoices() const
Definition: pageres.cpp:731
BLOB_CHOICE * GetBlobChoice(unsigned index) const
Definition: pageres.cpp:768
float baseline_shift
Definition: pageres.h:316
TWERD * rebuild_word
Definition: pageres.h:264
BLOCK_RES * block() const
Definition: pageres.h:769
PAGE_RES * page_res
Definition: pageres.h:684
WERD_RES * forward()
Definition: pageres.h:743
WERD_RES * word() const
Definition: pageres.h:763
WERD_RES * restart_page()
Definition: pageres.h:710
void ReplaceCurrentWord(PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1378
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1252
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
float y() const
Definition: points.h:209
bool IsText() const
Definition: polyblk.h:52
const std::vector< ScoredFont > & fonts() const
Definition: ratngs.h:97
float rating() const
Definition: ratngs.h:84
float max_x_height() const
Definition: ratngs.h:324
std::string debug_string() const
Definition: ratngs.h:479
float certainty() const
Definition: ratngs.h:315
WERD_CHOICE shallow_copy(unsigned start, unsigned end) const
Definition: ratngs.cpp:393
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:299
bool empty() const
Definition: ratngs.h:284
uint8_t permuter() const
Definition: ratngs.h:331
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:378
float min_x_height() const
Definition: ratngs.h:321
unsigned length() const
Definition: ratngs.h:287
std::string & unichar_string()
Definition: ratngs.h:519
float rating() const
Definition: ratngs.h:312
bool IsAllSpaces() const
Definition: ratngs.h:497
TDimension left() const
Definition: rect.h:82
TDimension top() const
Definition: rect.h:68
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:419
void print() const
Definition: rect.h:289
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
bool contains(const FCOORD pt) const
Definition: rect.h:344
bool major_overlap(const TBOX &box) const
Definition: rect.h:374
bool x_overlap(const TBOX &box) const
Definition: rect.h:409
void print(FILE *fp) const
Definition: rejctmap.cpp:112
int16_t reject_count() const
Definition: rejctmap.h:339
void rej_word_bad_quality()
Definition: rejctmap.cpp:187
uint16_t length() const
Definition: rejctmap.h:333
void initialise(uint16_t length)
Definition: rejctmap.cpp:67
void full_print(FILE *fp) const
Definition: rejctmap.cpp:120
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
int32_t pile_count(int32_t value) const
Definition: statistc.h:72
int32_t get_total() const
Definition: statistc.h:85
int32_t mode() const
Definition: statistc.cpp:112
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70
TBOX bounding_box() const
Definition: stepblob.cpp:250
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:124
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:118
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128
void GetNoiseOutlines(std::vector< C_OUTLINE * > *outlines)
Definition: werd.cpp:508
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:132
TBOX bounding_box() const
Definition: werd.cpp:155
bool AddSelectedOutlines(const std::vector< bool > &wanted, const std::vector< C_BLOB * > &target_blobs, const std::vector< C_OUTLINE * > &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:526
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:91
void print() const
Definition: werd.cpp:262
C_BLOB_LIST * cblob_list()
Definition: werd.h:96
ParamsVectors * params()
Definition: ccutil.h:53
UNICHARSET unicharset
Definition: ccutil.h:61
std::string lang
Definition: ccutil.h:59
unsigned size() const
Definition: genericvector.h:70
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:164
int MaxCount(T *max_value) const
Definition: sorthelper.h:86
void Add(T value, int count)
Definition: sorthelper.h:71
bool script_has_xheight() const
Definition: unicharset.h:958
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:506
bool top_bottom_useful() const
Definition: unicharset.h:555
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515
std::string debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:331
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:268
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:811
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:262
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:625
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:609
bool AdaptiveClassifierIsFull() const
Definition: classify.h:265
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:434
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:437
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:836
const UNICHARSET & GetUnicharset() const
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:264
void void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:742
static void Update()
Definition: scrollview.cpp:700
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:387