21# include "config_auto.h"
30#ifndef DISABLED_LEGACY_ENGINE
39#ifndef DISABLED_LEGACY_ENGINE
48#ifndef DISABLED_LEGACY_ENGINE
80 if (lstm_recognizer_ ==
nullptr) {
81#ifndef DISABLED_LEGACY_ENGINE
87#ifndef DISABLED_LEGACY_ENGINE
88 if (tessedit_debug_quality_metrics) {
90 int16_t good_char_qual;
94 "\n%d chars; word_blob_quality: %d; outline_errs: %d; "
95 "char_quality: %d; good_char_quality: %d\n",
97 char_qual, good_char_qual);
119 const char *word_config,
int pass) {
120 if (word_config !=
nullptr) {
122 if (backup_config_file_ ==
nullptr) {
124 FILE *config_fp = fopen(backup_config_file_,
"wb");
125 if (config_fp ==
nullptr) {
126 tprintf(
"Error, failed to open file \"%s\"\n", backup_config_file_);
134 if (backup_config_file_ !=
nullptr) {
136 backup_config_file_ =
nullptr;
139 }
else if (pass > 1 && !word_box.
major_overlap(target_word_box)) {
147 PAGE_RES *page_res, std::vector<WordData> *words) {
152 *target_word_box, word_config, 1)) {
153 words->push_back(
WordData(page_res_it));
157 for (
unsigned w = 0; w < words->size(); ++w) {
160 (*words)[w].prev_word = &(*words)[w - 1];
167 if (pass_n == 1 || !word->
word->
done) {
170 nullptr, classify_bln_numeric_mode, textord_use_cjk_fp_model,
171 poly_allow_detailed_fx, word->
row, word->
block);
172 }
else if (pass_n == 2) {
180 for (
unsigned s = 0; s <= sub_langs_.size(); ++s) {
182 Tesseract *lang_t = s < sub_langs_.size() ? sub_langs_[s] :
this;
187 if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode !=
OEM_LSTM_ONLY) {
188 word_res->SetupForRecognition(
189 lang_t->
unicharset, lang_t,
BestPix(), lang_t->tessedit_ocr_engine_mode,
nullptr,
190 lang_t->classify_bln_numeric_mode, lang_t->textord_use_cjk_fp_model,
191 lang_t->poly_allow_detailed_fx, word->
row, word->
block);
199 std::vector<WordData> *words) {
206 for (
unsigned w = 0; w < words->size(); ++w) {
211 if (monitor !=
nullptr) {
214 monitor->
progress = 70 * w / words->size();
216 monitor->
progress = 70 + 30 * w / words->size();
225 for (; w < words->size(); ++w) {
241 while (pr_it->
word() !=
nullptr && pr_it->
word() != word->
word) {
245 bool make_next_word_fuzzy =
false;
246#ifndef DISABLED_LEGACY_ENGINE
254 if (tessedit_dump_choices || debug_noise_removal) {
259 if (make_next_word_fuzzy && pr_it->
word() !=
nullptr) {
288 const TBOX *target_word_box,
const char *word_config,
292 if (tessedit_minimal_rej_pass1) {
293 tessedit_test_adaption.set_value(
true);
294 tessedit_minimal_rejection.set_value(
true);
297 if (dopasses == 0 || dopasses == 1) {
301#ifndef DISABLED_LEGACY_ENGINE
312 for (
auto &
lang : sub_langs_) {
313 if (
lang->AdaptiveClassifierIsFull()) {
314 lang->SwitchAdaptiveClassifier();
315 }
else if (!
lang->AdaptiveClassifierIsEmpty()) {
316 lang->StartBackupAdaptiveClassifier();
324 std::vector<WordData> words;
326#ifndef DISABLED_LEGACY_ENGINE
327 if (tessedit_parallelize) {
341 most_recently_used_ =
this;
371#ifndef DISABLED_LEGACY_ENGINE
374 if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
AnyTessLang()) {
376 std::vector<WordData> words;
378 if (tessedit_parallelize) {
381 most_recently_used_ =
this;
393 if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word &&
399 if (tessedit_enable_dict_correction) {
402 if (tessedit_enable_bigram_correction) {
424#ifndef DISABLED_LEGACY_ENGINE
427 if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) {
433 const auto pageseg_mode =
static_cast<PageSegMode>(
static_cast<int>(tessedit_pageseg_mode));
448 if (monitor !=
nullptr) {
454#ifndef DISABLED_LEGACY_ENGINE
466 if (!word_it.
word()) {
474 if (tessedit_bigram_debug) {
475 tprintf(
"Skipping because one of the words is W_REP_CHAR\n");
480 std::vector<WERD_CHOICE *> overrides_word1;
481 std::vector<WERD_CHOICE *> overrides_word2;
499 if (tessedit_bigram_debug) {
500 tprintf(
"Top choice \"%s %s\" verified by bigram model.\n", orig_w1_str.c_str(),
501 orig_w2_str.c_str());
505 if (tessedit_bigram_debug > 2) {
506 tprintf(
"Examining alt choices for \"%s %s\".\n", orig_w1_str.c_str(), orig_w2_str.c_str());
508 if (tessedit_bigram_debug > 1) {
516 float best_rating = 0.0;
519 for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
528 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
537 overrides_word1.push_back(p1);
538 overrides_word2.push_back(p2);
539 if (overrides_word1.size() == 1 || p1->
rating() + p2->
rating() < best_rating) {
541 best_idx = overrides_word1.size() - 1;
546 if (!overrides_word1.empty()) {
550 if (tessedit_bigram_debug > 1) {
552 "Top choice \"%s %s\" verified (sans case) by bigram "
554 orig_w1_str.c_str(), orig_w2_str.c_str());
558 const auto new_w1_str = overrides_word1[best_idx]->unichar_string();
559 const auto new_w2_str = overrides_word2[best_idx]->unichar_string();
560 if (new_w1_str != orig_w1_str) {
563 if (new_w2_str != orig_w2_str) {
566 if (tessedit_bigram_debug > 0) {
567 std::string choices_description;
568 int num_bigram_choices = overrides_word1.size() * overrides_word2.size();
569 if (num_bigram_choices == 1) {
570 choices_description =
"This was the unique bigram choice.";
572 if (tessedit_bigram_debug > 1) {
573 std::string bigrams_list;
574 const int kMaxChoicesToPrint = 20;
575 for (
unsigned i = 0;
i < overrides_word1.size() &&
i < kMaxChoicesToPrint;
i++) {
577 bigrams_list +=
", ";
583 choices_description =
"There were many choices: {";
584 choices_description += bigrams_list;
585 choices_description +=
"}";
587 choices_description +=
"There were " + std::to_string(num_bigram_choices);
588 choices_description +=
" compatible bigrams.";
591 tprintf(
"Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", orig_w1_str.c_str(),
592 orig_w2_str.c_str(), new_w1_str.c_str(), new_w2_str.c_str(),
593 choices_description.c_str());
600 const TBOX *target_word_box,
const char *word_config) {
605 while (!tessedit_test_adaption && page_res_it.
word() !=
nullptr) {
608 if (monitor !=
nullptr) {
621 if (target_word_box &&
636 int16_t all_char_quality;
637 int16_t accepted_all_char_quality;
647 if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) {
654 if (tessedit_debug_quality_metrics) {
656 "QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
657 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
668 bool good_quality_doc =
676 if (!tessedit_test_adaption) {
684 if (!wordrec_run_blamer) {
716 float word_x_height = word->
x_height;
717 if (word_x_height < word->best_choice->min_x_height() ||
725 const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
727 small_cap_xheight - small_cap_delta <= word_x_height &&
728 word_x_height <= small_cap_xheight + small_cap_delta) {
739 if (num_upper > 0 && num_lower == 0) {
750 *next_left = INT32_MAX;
751 if (index < words.
size()) {
752 *right = words[index]->word->bounding_box().right();
753 if (index + 1 < words.
size()) {
754 *next_left = words[index + 1]->word->bounding_box().left();
761static void EvaluateWordSpan(
const PointerVector<WERD_RES> &words,
unsigned first_index,
unsigned end_index,
762 float *rating,
float *certainty,
bool *bad,
bool *valid_permuter) {
763 if (end_index <= first_index) {
765 *valid_permuter =
false;
767 for (
unsigned index = first_index; index < end_index && index < words.size(); ++index) {
768 WERD_CHOICE *choice = words[index]->best_choice;
769 if (choice ==
nullptr) {
772 *rating += choice->rating();
773 *certainty = std::min(*certainty, choice->certainty());
775 *valid_permuter =
false;
788static int SelectBestWords(
double rating_ratio,
double certainty_margin,
bool debug,
789 PointerVector<WERD_RES> *new_words,
790 PointerVector<WERD_RES> *best_words) {
793 std::vector<WERD_RES *> out_words;
795 unsigned b = 0, n = 0;
796 int num_best = 0, num_new = 0;
797 while (b < best_words->size() || n < new_words->size()) {
799 auto start_b = b, start_n = n;
800 while (b < best_words->size() || n < new_words->size()) {
801 int b_right = -INT32_MAX;
802 int next_b_left = INT32_MAX;
803 WordGap(*best_words, b, &b_right, &next_b_left);
804 int n_right = -INT32_MAX;
805 int next_n_left = INT32_MAX;
806 WordGap(*new_words, n, &n_right, &next_n_left);
807 if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {
812 if ((b_right < n_right && b < best_words->size()) || n == new_words->size()) {
819 float b_rating = 0.0f, n_rating = 0.0f;
821 float b_certainty = 0.0f, n_certainty = 0.0f;
823 bool b_bad =
false, n_bad =
false;
825 bool b_valid_permuter =
true, n_valid_permuter =
true;
826 const int end_b = b < best_words->size() ? b + 1 : b;
827 const int end_n = n < new_words->size() ? n + 1 : n;
828 EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty, &b_bad,
830 EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty, &n_bad,
832 bool new_better =
false;
833 if (!n_bad && (b_bad || (n_certainty > b_certainty && n_rating < b_rating) ||
834 (!b_valid_permuter && n_valid_permuter && n_rating < b_rating * rating_ratio &&
835 n_certainty > b_certainty - certainty_margin))) {
837 for (
int i = start_n;
i < end_n; ++
i) {
838 out_words.push_back((*new_words)[
i]);
839 (*new_words)[
i] =
nullptr;
845 for (
int i = start_b;
i < end_b; ++
i) {
846 out_words.push_back((*best_words)[
i]);
847 (*best_words)[
i] =
nullptr;
853 "%d new words %s than %d old words: r: %g v %g c: %g v %g"
854 " valid dict: %d v %d\n",
855 end_n - start_n, new_better ?
"better" :
"worse", end_b - start_b, n_rating, b_rating,
856 n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
864 for (
auto &out_word : out_words) {
865 best_words->push_back(out_word);
867 return num_new - num_best;
876 tprintf(
"Trying word using lang %s, oem %d\n",
lang.c_str(),
877 static_cast<int>(tessedit_ocr_engine_mode));
881 (this->*recognizer)(word_data, in_word, &new_words);
882 if (new_words.
empty()) {
889 for (
unsigned i = 0;
i < new_words.
size(); ++
i) {
890 new_words[
i]->DebugTopChoice(
"Lang result");
895 return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug,
896 &new_words, best_words);
901 for (
unsigned w = 0; w < words.
size(); ++w) {
902 if (words[w]->tess_failed || !words[w]->tess_accepted) {
909#ifndef DISABLED_LEGACY_ENGINE
915 *make_next_word_fuzzy =
false;
923 std::vector<C_OUTLINE *> outlines;
925 std::vector<bool> word_wanted;
926 std::vector<bool> overlapped_any_blob;
927 std::vector<C_BLOB *> target_blobs;
929 &overlapped_any_blob, &target_blobs);
933 std::vector<bool> wanted;
934 std::vector<C_BLOB *> wanted_blobs;
935 std::vector<C_OUTLINE *> wanted_outlines;
936 int num_overlapped = 0;
937 int num_overlapped_used = 0;
938 for (
unsigned i = 0;
i < overlapped_any_blob.size(); ++
i) {
939 if (overlapped_any_blob[
i]) {
941 if (word_wanted[
i]) {
942 ++num_overlapped_used;
944 wanted.push_back(word_wanted[
i]);
945 wanted_blobs.push_back(target_blobs[
i]);
946 wanted_outlines.push_back(outlines[
i]);
947 outlines[
i] =
nullptr;
952 int non_overlapped = 0;
953 int non_overlapped_used = 0;
954 for (
unsigned i = 0;
i < word_wanted.size(); ++
i) {
955 if (word_wanted[
i]) {
956 ++non_overlapped_used;
958 if (outlines[
i] !=
nullptr) {
959 ++non_overlapped_used;
962 if (debug_noise_removal) {
963 tprintf(
"Used %d/%d overlapped %d/%d non-overlaped diacritics on word:", num_overlapped_used,
964 num_overlapped, non_overlapped_used, non_overlapped);
968 if (real_word->
AddSelectedOutlines(word_wanted, target_blobs, outlines, make_next_word_fuzzy)) {
973 return num_overlapped_used != 0 || non_overlapped_used != 0;
983 std::vector<bool> *word_wanted,
984 std::vector<bool> *overlapped_any_blob,
985 std::vector<C_BLOB *> *target_blobs) {
986 std::vector<bool> blob_wanted;
987 word_wanted->clear();
988 word_wanted->resize(outlines.size());
989 overlapped_any_blob->clear();
990 overlapped_any_blob->resize(outlines.size());
991 target_blobs->clear();
992 target_blobs->resize(outlines.size());
998 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
999 C_BLOB *blob = blob_it.data();
1001 blob_wanted.clear();
1002 blob_wanted.resize(outlines.size());
1003 int num_blob_outlines = 0;
1004 for (
unsigned i = 0;
i < outlines.size(); ++
i) {
1005 if (blob_box.
major_x_overlap(outlines[
i]->bounding_box()) && !(*word_wanted)[
i]) {
1006 blob_wanted[
i] =
true;
1007 (*overlapped_any_blob)[
i] =
true;
1008 ++num_blob_outlines;
1011 if (debug_noise_removal) {
1012 tprintf(
"%d noise outlines overlap blob at:", num_blob_outlines);
1019 if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1021 num_blob_outlines, &blob_wanted)) {
1022 for (
unsigned i = 0;
i < blob_wanted.size(); ++
i) {
1023 if (blob_wanted[
i]) {
1025 (*word_wanted)[
i] =
true;
1026 (*target_blobs)[
i] = blob;
1038 std::vector<bool> *word_wanted,
1039 std::vector<C_BLOB *> *target_blobs) {
1040 std::vector<bool> blob_wanted;
1041 word_wanted->clear();
1042 word_wanted->resize(outlines.size());
1043 target_blobs->clear();
1044 target_blobs->resize(outlines.size());
1046 for (
unsigned i = 0;
i < outlines.size(); ++
i) {
1047 if (outlines[
i] ==
nullptr) {
1051 blob_wanted.clear();
1052 blob_wanted.resize(outlines.size());
1053 int num_blob_outlines = 0;
1054 TBOX total_ol_box(outlines[
i]->bounding_box());
1055 while (
i < outlines.size() && outlines[
i] !=
nullptr) {
1056 blob_wanted[
i] =
true;
1057 total_ol_box += outlines[
i]->bounding_box();
1059 ++num_blob_outlines;
1063 while (!blob_it.at_last() &&
1064 blob_it.data_relative(1)->bounding_box().left() <= total_ol_box.
left()) {
1069 if (debug_noise_removal) {
1070 tprintf(
"Num blobless outlines = %d\n", num_blob_outlines);
1072 C_BLOB *left_blob = blob_it.data();
1074 C_BLOB *right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1075 if ((left_box.
x_overlap(total_ol_box) || right_blob ==
nullptr ||
1078 num_blob_outlines, &blob_wanted)) {
1079 if (debug_noise_removal) {
1080 tprintf(
"Added to left blob\n");
1082 for (
unsigned j = 0; j < blob_wanted.size(); ++j) {
1083 if (blob_wanted[j]) {
1084 (*word_wanted)[j] =
true;
1085 (*target_blobs)[j] = left_blob;
1088 }
else if (right_blob !=
nullptr &&
1092 num_blob_outlines, &blob_wanted)) {
1093 if (debug_noise_removal) {
1094 tprintf(
"Added to right blob\n");
1096 for (
unsigned j = 0; j < blob_wanted.size(); ++j) {
1097 if (blob_wanted[j]) {
1098 (*word_wanted)[j] =
true;
1099 (*target_blobs)[j] = right_blob;
1103 num_blob_outlines, &blob_wanted)) {
1104 if (debug_noise_removal) {
1105 tprintf(
"Fitted between blobs\n");
1107 for (
unsigned j = 0; j < blob_wanted.size(); ++j) {
1108 if (blob_wanted[j]) {
1109 (*word_wanted)[j] =
true;
1110 (*target_blobs)[j] =
nullptr;
1122 const std::vector<C_OUTLINE *> &outlines,
1123 int num_outlines, std::vector<bool> *ok_outlines) {
1124 std::string best_str;
1125 float target_cert = certainty_threshold;
1126 if (blob !=
nullptr) {
1129 if (debug_noise_removal) {
1130 tprintf(
"No Noise blob classified as %s=%g(%g) at:", best_str.c_str(), target_cert,
1134 target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1136 std::vector<bool> test_outlines = *ok_outlines;
1138 std::string all_str;
1139 std::vector<bool> best_outlines = *ok_outlines;
1141 if (debug_noise_removal) {
1143 for (
unsigned i = 0;
i < test_outlines.size(); ++
i) {
1144 if (test_outlines[
i]) {
1145 ol_box += outlines[
i]->bounding_box();
1148 tprintf(
"All Noise blob classified as %s=%g, delta=%g at:", all_str.c_str(), best_cert,
1149 best_cert - target_cert);
1155 while (num_outlines > 1 && best_index >= 0 &&
1156 (blob ==
nullptr || best_cert < target_cert || blob !=
nullptr)) {
1159 for (
unsigned i = 0;
i < outlines.size(); ++
i) {
1160 if (test_outlines[
i]) {
1161 test_outlines[
i] =
false;
1164 if (debug_noise_removal) {
1166 for (
unsigned j = 0; j < outlines.size(); ++j) {
1167 if (test_outlines[j]) {
1168 ol_box += outlines[j]->bounding_box();
1170 tprintf(
"%c", test_outlines[j] ?
'T' :
'F');
1172 tprintf(
" blob classified as %s=%g, delta=%g) at:", str.c_str(), cert,
1173 cert - target_cert);
1176 if (cert > best_cert) {
1179 best_outlines = test_outlines;
1181 test_outlines[
i] =
true;
1184 if (best_index >= 0) {
1185 test_outlines[best_index] =
false;
1189 if (best_cert >= target_cert) {
1191 *ok_outlines = best_outlines;
1192 if (debug_noise_removal) {
1193 tprintf(
"%s noise combination ", blob ?
"Adding" :
"New");
1194 for (
auto &&best_outline : best_outlines) {
1195 tprintf(
"%c", best_outline ?
'T' :
'F');
1197 tprintf(
" yields certainty %g, beating target of %g\n", best_cert, target_cert);
1208 const std::vector<C_OUTLINE *> &outlines,
int pass_n,
1212 C_BLOB *local_blob =
nullptr;
1213 if (blob !=
nullptr) {
1215 ol_it.set_to_list(blob->
out_list());
1216 first_to_keep = ol_it.data();
1218 for (
unsigned i = 0;
i < ok_outlines.size(); ++
i) {
1219 if (ok_outlines[
i]) {
1221 if (blob ==
nullptr) {
1222 local_blob =
new C_BLOB(outlines[
i]);
1224 ol_it.set_to_list(blob->
out_list());
1226 ol_it.add_before_stay_put(outlines[
i]);
1232 ol_it.move_to_first();
1233 if (first_to_keep ==
nullptr) {
1235 for (; !ol_it.empty(); ol_it.forward()) {
1242 for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1260 while (it.
word() != word_res && it.
word() !=
nullptr) {
1268 if (debug_noise_removal) {
1273 tprintf(
"Got word with null raw choice xheight=%g, row=%g\n", word_res->
x_height,
1281 *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1303#ifdef DISABLED_LEGACY_ENGINE
1314 clock_t start_t = clock();
1315 const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1317 tprintf(
"%s word with lang %s at:", word->
done ?
"Already done" :
"Processing",
1318 most_recently_used_->
lang.c_str());
1328 auto sub = sub_langs_.size();
1329 if (most_recently_used_ !=
this) {
1331 for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) {
1336 Tesseract *best_lang_tess = most_recently_used_;
1337 if (!WordsAcceptable(best_words)) {
1339 if (most_recently_used_ !=
this &&
1341 &word_data->
lang_words[sub_langs_.size()], &best_words) > 0) {
1342 best_lang_tess =
this;
1344 for (
unsigned i = 0; !WordsAcceptable(best_words) &&
i < sub_langs_.size(); ++
i) {
1345 if (most_recently_used_ != sub_langs_[
i] &&
1348 best_lang_tess = sub_langs_[
i];
1352 most_recently_used_ = best_lang_tess;
1353 if (!best_words.
empty()) {
1354 if (best_words.
size() == 1 && !best_words[0]->combination) {
1359 word_data->
word = best_words.
back();
1366 clock_t ocr_t = clock();
1367 if (tessedit_timing_debug) {
1369 static_cast<double>(ocr_t - start_t) / CLOCKS_PER_SEC);
1381 ROW *row = word_data.
row;
1385#ifdef DISABLED_LEGACY_ENGINE
1391 if (!(*in_word)->odd_size || tessedit_ocr_engine_mode ==
OEM_LSTM_ONLY) {
1393 if (!out_words->
empty()) {
1403#ifndef DISABLED_LEGACY_ENGINE
1406 classify_bln_numeric_mode, textord_use_cjk_fp_model,
1407 poly_allow_detailed_fx, row, block);
1411#ifndef DISABLED_LEGACY_ENGINE
1416 bool adapt_ok =
word_adaptable(word, tessedit_tess_adaption_mode);
1428 if (tessedit_enable_doc_dict && !word->
IsAmbiguous()) {
1445 new_word->
guessed_x_ht ?
"GUESS" :
"CERT", new_x_ht > 0.1 ?
"STILL DOUBT" :
"OK",
1446 accept_new_word ?
"ACCEPTED" :
"");
1449#ifndef DISABLED_LEGACY_ENGINE
1457 if (original_misfits == 0) {
1460 float baseline_shift = 0.0f;
1462 if (baseline_shift != 0.0f) {
1468 if (original_misfits > 0) {
1469 float new_baseline_shift;
1490 bool accept_new_x_ht =
false;
1500 classify_bln_numeric_mode, textord_use_cjk_fp_model,
1501 poly_allow_detailed_fx, row, block);
1505 if (debug_x_ht_level >= 1) {
1506 tprintf(
"Old misfits=%d with x-height %f, new=%d with x-height %f\n", original_misfits,
1507 word->
x_height, new_misfits, new_x_ht);
1513 accept_new_x_ht = new_misfits < original_misfits &&
1516 if (debug_x_ht_level >= 1) {
1520 if (accept_new_x_ht) {
1541#ifndef DISABLED_LEGACY_ENGINE
1542 ROW *row = word_data.
row;
1567# ifndef GRAPHICS_DISABLED
1568 if (tessedit_display_outwords) {
1583#ifndef DISABLED_LEGACY_ENGINE
1598 if (tessedit_fix_hyphens) {
1604 "POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1628 if (choice !=
nullptr) {
1629 if (best_choice ==
nullptr || choice->
rating() < best_choice->
rating()) {
1630 best_choice = choice;
1640static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) {
1641 WERD_CHOICE *word = word_res->best_choice;
1642 for (
unsigned i = 0;
i < word_res->best_choice->length(); ++
i) {
1643 BLOB_CHOICE *choice =
1645 if (choice ==
nullptr) {
1646 BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(
i));
1647 choice_it.add_before_stay_put(
new BLOB_CHOICE(*blob_choice));
1651 for (
unsigned i = 0;
i < word->length(); ++
i) {
1652 if (word->unichar_id(
i) != blob_choice->unichar_id()) {
1653 word->set_unichar_id(blob_choice->unichar_id(),
i);
1671 for (
unsigned i = 0;
i < word.
length(); ++
i) {
1677 int max_count = rep_ch.
MaxCount(&maxch_id);
1679 BLOB_CHOICE *best_choice = FindBestMatchingChoice(maxch_id, word_res);
1680 if (best_choice ==
nullptr) {
1681 tprintf(
"Failed to find a choice for %s, occurring %d times\n",
1685 word_res->
done =
true;
1688 CorrectRepcharChoices(best_choice, word_res);
1693 const char *lengths) {
1696 int leading_punct_count;
1697 int upper_count = 0;
1698 int hyphen_pos = -1;
1701 if (strlen(lengths) > 20) {
1707 if (s[offset] !=
'\0' && chs_leading_punct.contains(s[offset])) {
1708 offset += lengths[
i++];
1710 leading_punct_count =
i;
1713 while (s[offset] !=
'\0' && char_set.
get_isupper(s + offset, lengths[
i])) {
1714 offset += lengths[
i++];
1717 if (upper_count > 1) {
1721 while (s[offset] !=
'\0' && char_set.
get_islower(s + offset, lengths[
i])) {
1722 offset += lengths[
i++];
1724 if (
i - leading_punct_count < quality_min_initial_alphas_reqd) {
1731 if (lengths[
i] == 1 && s[offset] ==
'-') {
1733 offset += lengths[
i++];
1734 if (s[offset] !=
'\0') {
1735 while ((s[offset] !=
'\0') && char_set.
get_islower(s + offset, lengths[
i])) {
1736 offset += lengths[
i++];
1738 if (
i < hyphen_pos + 3) {
1744 if (lengths[
i] == 1 && (s[offset] ==
'\'') && lengths[
i + 1] == 1 &&
1745 (s[offset + lengths[
i]] ==
's')) {
1746 offset += lengths[
i++];
1747 offset += lengths[
i++];
1750 if (upper_count > 0) {
1758 if (lengths[
i] == 1 && s[offset] !=
'\0' && chs_trailing_punct1.contains(s[offset])) {
1759 offset += lengths[
i++];
1761 if (lengths[
i] == 1 && s[offset] !=
'\0' &&
i > 0 && s[offset - lengths[
i - 1]] != s[offset] &&
1762 chs_trailing_punct2.contains(s[offset])) {
1763 offset += lengths[
i++];
1766 if (s[offset] !=
'\0') {
1776 if (s[0] !=
'\0' && char_set.
get_isupper(s, lengths[0])) {
1778 while (s[offset] !=
'\0' && char_set.
get_isupper(s + offset, lengths[
i]) &&
1779 lengths[
i + 1] == 1 && s[offset + lengths[
i]] ==
'.') {
1780 offset += lengths[
i++];
1781 offset += lengths[
i++];
1783 }
else if (s[0] !=
'\0' && char_set.
get_islower(s, lengths[0])) {
1785 while (s[offset] !=
'\0' && char_set.
get_islower(s + offset, lengths[
i]) &&
1786 lengths[
i + 1] == 1 && s[offset + lengths[
i]] ==
'.') {
1787 offset += lengths[
i++];
1788 offset += lengths[
i++];
1791 if (s[offset] !=
'\0') {
1800 bool show_map_detail =
false;
1807 tessedit_rejection_debug.set_value(
false);
1808 debug_x_ht_level.set_value(0);
1814 tessedit_rejection_debug.set_value(
true);
1815 debug_x_ht_level.set_value(2);
1819 tprintf(
"classify_word_pass1 start\n");
1823 tprintf(
"make_reject_map: initial map");
1826 tprintf(
"make_reject_map: after NN");
1829 tprintf(
"classify_word_pass2 - START");
1832 tprintf(
"classify_word_pass2 - Pre Xht");
1835 tprintf(
"classify_word_pass2 - END");
1836 show_map_detail =
true;
1848 tprintf(
"After Poor quality rejection");
1851 tprintf(
"unrej_good_quality_words - START");
1854 tprintf(
"unrej_good_quality_words - END");
1857 tprintf(
"Write results pass");
1858 show_map_detail =
true;
1865 if (show_map_detail) {
1873 tprintf(
"null best choice\n");
1876 tprintf(
"Done flag: %s\n\n", word->
done ?
"TRUE" :
"FALSE");
1888#ifndef DISABLED_LEGACY_ENGINE
1889static void find_modal_font(
1898 font =
static_cast<int16_t
>(fonts->
mode());
1901 *font_count =
count < INT8_MAX ?
count : INT8_MAX;
1902 fonts->
add(font, -*font_count);
1923#ifndef DISABLED_LEGACY_ENGINE
1925 if (fontinfo_size == 0) {
1928 if (tessedit_font_id > 0) {
1929 if (tessedit_font_id >= fontinfo_size) {
1930 tprintf(
"Error, invalid font ID provided: must be below %d.\n"
1931 "Falling back to font auto-detection.\n", fontinfo_size);
1940 std::vector<int> font_total_score(fontinfo_size);
1943 if (tessedit_debug_fonts) {
1948 if (choice ==
nullptr) {
1951 auto &fonts = choice->
fonts();
1952 for (
auto &f : fonts) {
1953 const int fontinfo_id = f.fontinfo_id;
1954 if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1955 font_total_score[fontinfo_id] += f.score;
1960 int score1 = 0, score2 = 0;
1961 int16_t font_id1 = -1, font_id2 = -1;
1962 for (
int f = 0; f < fontinfo_size; ++f) {
1963 if (tessedit_debug_fonts && font_total_score[f] > 0) {
1966 if (font_total_score[f] > score1) {
1968 font_id2 = font_id1;
1969 score1 = font_total_score[f];
1971 }
else if (font_total_score[f] > score2) {
1972 score2 = font_total_score[f];
1984 if (tessedit_debug_fonts) {
1986 tprintf(
"Word modal font=%s, score=%d, 2nd choice %s/%d\n", fi.
name,
1997#ifndef DISABLED_LEGACY_ENGINE
2006 STATS doc_fonts(0, font_table_size_ - 1);
2010 word = page_res_it.
word();
2019 int8_t doc_font_count;
2020 find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2021 if (doc_font_count == 0) {
2025 const FontInfo *modal_font =
nullptr;
2027 word = page_res_it.
word();
2041 word = page_res_it.
word();
2045 if (!(
count == length || (length > 3 &&
count >= length * 3 / 4))) {
2060 if (word->best_choices.singleton()) {
2065 if (word->tesseract->getDict().valid_word(*best) != 0) {
2069 WERD_CHOICE_IT choice_it(&word->best_choices);
2070 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
2072 if (word->tesseract->getDict().valid_word(*alternate)) {
2074 if (tessedit_bigram_debug) {
2075 tprintf(
"Dictionary correction replaces best choice '%s' with '%s'\n",
2079 word->ReplaceBestChoice(alternate);
@ AC_INITIAL_CAP
ALL but initial lc.
@ AC_UNACCEPTABLE
Unacceptable word.
@ AC_UPPER_CASE
ALL upper case.
@ AC_LOWER_CASE
ALL lower case.
const char *const kBackUpConfigFile
const double kMinRefitXHeightFraction
@ W_REP_CHAR
repeated character
@ OEM_TESSERACT_LSTM_COMBINED
@ SET_PARAM_CONSTRAINT_DEBUG_ONLY
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
void tprintf(const char *format,...)
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
volatile int8_t ocr_alive
true if not last
bool deadline_exceeded() const
void * cancel_this
monitor-aware progress callback
PROGRESS_FUNC2 progress_callback2
called whenever progress increases
int16_t progress
chars in this buffer(0)
CANCEL_FUNC cancel
for errcode use
int16_t doc_good_char_quality
PointerVector< WERD_RES > lang_words
void AssignDiacriticsToNewBlobs(const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< C_BLOB * > *target_blobs)
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
bool recog_interactive(PAGE_RES_IT *pr_it)
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
void bigram_correction_pass(PAGE_RES *page_res)
int16_t word_blob_quality(WERD_RES *word)
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
void fix_rep_char(PAGE_RES_IT *page_res_it)
void AssignDiacriticsToOverlappingBlobs(const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< bool > *overlapped_any_blob, std::vector< C_BLOB * > *target_blobs)
void SetupWordPassN(int pass_n, WordData *word)
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
void PrerecAllWordsPar(const std::vector< WordData > &words)
int16_t word_outline_errs(WERD_RES *word)
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
void tess_segment_pass_n(int pass_n, WERD_RES *word)
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Dict & getDict() override
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, std::vector< WordData > *words)
bool SubAndSuperscriptFix(WERD_RES *word_res)
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
void dictionary_correction_pass(PAGE_RES *page_res)
bool check_debug_pt(WERD_RES *word, int location)
void tess_add_doc_word(WERD_CHOICE *word_choice)
void font_recognition_pass(PAGE_RES *page_res)
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
int CountMisfitTops(WERD_RES *word_res)
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
bool word_adaptable(WERD_RES *word, uint16_t mode)
void set_word_fonts(WERD_RES *word)
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const std::vector< C_OUTLINE * > &outlines, int num_outlines, std::vector< bool > *ok_outlines)
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
bool tess_acceptable_word(WERD_RES *word)
bool right_to_left() const
void script_pos_pass(PAGE_RES *page_res)
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str, float *c2)
float ClassifyBlobPlusOutlines(const std::vector< bool > &ok_outlines, const std::vector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str)
void blamer_pass(PAGE_RES *page_res)
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, std::vector< WordData > *words)
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
static const char * IncorrectReasonName(IncorrectResultReason irr)
static void LastChanceBlame(bool debug, WERD_RES *word)
const std::string & misadaption_debug() const
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
void CopyTruth(const BlamerBundle &other)
IncorrectResultReason incorrect_result_reason() const
TBOX bounding_box() const
void plot(ScrollView *window)
static const double kXHeightCapRatio
FCOORD classify_rotation() const
PDBLK pdblk
Page Description Block.
int32_t x_height() const
return xheight
std::vector< std::string > misadaption_log
std::vector< int > blame_reasons
const FontInfo * fontinfo2
tesseract::Tesseract * tesseract
WERD_CHOICE * best_choice
int8_t fontinfo_id2_count
void InitForRetryRecognition(const WERD_RES &source)
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Image pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
void ConsumeWordResults(WERD_RES *word)
void SetScriptPositions()
BlamerBundle * blamer_bundle
const UNICHARSET * uch_set
const FontInfo * fontinfo
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
void BestChoiceToCorrectText()
WERD_CHOICE_LIST best_choices
tesseract::BoxWord * box_word
void ReplaceBestChoice(WERD_CHOICE *choice)
void PrintBestChoices() const
BLOB_CHOICE * GetBlobChoice(unsigned index) const
BLOCK_RES * block() const
void MakeCurrentWordFuzzy()
WERD_RES * restart_page()
void ReplaceCurrentWord(PointerVector< WERD_RES > *words)
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
POLY_BLOCK * poly_block() const
const std::vector< ScoredFont > & fonts() const
float max_x_height() const
std::string debug_string() const
WERD_CHOICE shallow_copy(unsigned start, unsigned end) const
UNICHAR_ID unichar_id(unsigned index) const
void GetNonSuperscriptSpan(int *start, int *end) const
float min_x_height() const
std::string & unichar_string()
bool major_x_overlap(const TBOX &box) const
TDimension bottom() const
bool contains(const FCOORD pt) const
bool major_overlap(const TBOX &box) const
bool x_overlap(const TBOX &box) const
void print(FILE *fp) const
int16_t reject_count() const
void rej_word_bad_quality()
void initialise(uint16_t length)
void full_print(FILE *fp) const
void add(int32_t value, int32_t count)
int32_t pile_count(int32_t value) const
int32_t get_total() const
C_OUTLINE_LIST * out_list()
TBOX bounding_box() const
static int SortByXMiddle(const void *v1, const void *v2)
static C_BLOB * deep_copy(const C_BLOB *src)
bool flag(WERD_FLAGS mask) const
void GetNoiseOutlines(std::vector< C_OUTLINE * > *outlines)
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
TBOX bounding_box() const
bool AddSelectedOutlines(const std::vector< bool > &wanted, const std::vector< C_BLOB * > &target_blobs, const std::vector< C_OUTLINE * > &outlines, bool *make_next_word_fuzzy)
C_BLOB_LIST * rej_cblob_list()
C_BLOB_LIST * cblob_list()
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
int MaxCount(T *max_value) const
void Add(T value, int count)
bool script_has_xheight() const
bool get_islower(UNICHAR_ID unichar_id) const
bool top_bottom_useful() const
bool get_isupper(UNICHAR_ID unichar_id) const
std::string debug_str(UNICHAR_ID id) const
bool AdaptiveClassifierIsEmpty() const
bool AdaptableWord(WERD_RES *word)
void LearnWord(const char *fontname, WERD_RES *word)
void StartBackupAdaptiveClassifier()
void SwitchAdaptiveClassifier()
bool AdaptiveClassifierIsFull() const
UnicityTable< FontInfo > fontinfo_table_
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
const UNICHARSET & GetUnicharset() const
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
void void ZoomToRectangle(int x1, int y1, int x2, int y2)
WERD_CHOICE * prev_word_best_choice_