19#ifndef DISABLED_LEGACY_ENGINE
20# include <allheaders.h>
31#ifndef DISABLED_LEGACY_ENGINE
76#ifndef DISABLED_LEGACY_ENGINE
77static void clear_any_old_text(BLOCK_LIST *block_list) {
78 BLOCK_IT block_it(block_list);
79 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
80 ROW_IT row_it(block_it.data()->row_list());
81 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
82 WERD_IT word_it(row_it.data()->word_list());
83 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
84 word_it.data()->set_text(
"");
111 BLOCK_LIST *block_list) {
112 std::vector<TBOX> boxes;
113 std::vector<std::string> texts, full_texts;
114 if (!
ReadAllBoxes(applybox_page,
true, filename, &boxes, &texts, &full_texts,
nullptr)) {
118 const int box_count = boxes.size();
119 int box_failures = 0;
124 clear_any_old_text(block_list);
126 for (
int i = 0;
i < box_count;
i++) {
127 bool foundit =
false;
128 if (page_res !=
nullptr) {
131 (
i == box_count - 1) ?
nullptr : &boxes[
i + 1], full_texts[
i].c_str());
134 (
i == box_count - 1) ?
nullptr : &boxes[
i + 1], texts[
i].c_str());
138 ReportFailedBox(
i, boxes[
i], texts[
i].c_str(),
"FAILURE! Couldn't find a matching blob");
142 if (page_res ==
nullptr) {
148 if (applybox_debug > 0) {
150 tprintf(
" Boxes read from boxfile: %6d\n", box_count);
151 if (box_failures > 0) {
152 tprintf(
" Boxes failed resegmentation: %6d\n", box_failures);
160static double MedianXHeight(BLOCK_LIST *block_list) {
161 BLOCK_IT block_it(block_list);
162 STATS xheights(0, block_it.data()->pdblk.bounding_box().height() - 1);
163 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
164 ROW_IT row_it(block_it.data()->row_list());
165 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
169 return xheights.median();
175 const double median_xheight = MedianXHeight(block_list);
178 BLOCK_IT b_it(block_list);
179 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
180 BLOCK *block = b_it.data();
182 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
183 ROW *row = r_it.data();
184 const double diff = fabs(row->
x_height() - median_xheight);
185 if (diff > max_deviation) {
186 if (applybox_debug) {
187 tprintf(
"row xheight=%g, but median xheight = %g\n", row->
x_height(), median_xheight);
200 BLOCK_IT b_it(block_list);
201 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
202 BLOCK *block = b_it.data();
204 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
205 ROW *row = r_it.data();
207 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
208 WERD *word = w_it.data();
210 delete w_it.extract();
218 auto *page_res =
new PAGE_RES(
false, block_list,
nullptr);
221 while ((word_res = pr_it.
word()) !=
nullptr) {
234 classify_bln_numeric_mode, textord_use_cjk_fp_model,
235 poly_allow_detailed_fx, row, block)) {
240 tprintf(
"Maximally chopping word at:");
243 std::vector<BLOB_CHOICE *> blob_choices;
245 auto rating =
static_cast<float>(INT8_MAX);
256 blob_choices.push_back(choice);
259 const double e = exp(1.0);
260 unsigned blob_number;
261 int right_chop_index = 0;
262 if (!assume_fixed_pitch_char_segment) {
264 SEAM *seam =
nullptr;
265 while ((seam =
chop_one_blob(boxes, blob_choices, word_res, &blob_number)) !=
nullptr) {
267 BLOB_CHOICE *left_choice = blob_choices[blob_number];
268 rating = left_choice->
rating() / e;
272 auto *right_choice =
new BLOB_CHOICE(++right_chop_index, rating - 0.125f, -rating, -1, 0.0f,
274 blob_choices.insert(blob_choices.begin() + blob_number + 1, right_choice);
292static double BoxMissMetric(
const TBOX &box1,
const TBOX &box2) {
294 const int a = box1.
area();
295 const int b = box2.
area();
297 return 1.0 * (a - overlap_area) * (b - overlap_area) / a / b;
311 const TBOX *next_box,
const char *correct_text) {
312 if (applybox_debug > 1) {
313 tprintf(
"\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
317 for (word_res = page_res_it.
word(); word_res !=
nullptr; word_res = page_res_it.
forward()) {
321 if (applybox_debug > 1) {
326 for (
int i = 0;
i < word_len; ++
i) {
329 for (blob_count = 0;
i + blob_count < word_len; ++blob_count) {
337 if (next_box !=
nullptr) {
338 const double current_box_miss_metric = BoxMissMetric(blob_box, box);
339 const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
340 if (applybox_debug > 2) {
343 tprintf(
"Current miss metric = %g, next = %g\n", current_box_miss_metric,
344 next_box_miss_metric);
346 if (current_box_miss_metric > next_box_miss_metric) {
350 char_box += blob_box;
352 if (blob_count > 0) {
353 if (applybox_debug > 1) {
354 tprintf(
"Index [%d, %d) seem good.\n",
i,
i + blob_count);
357 ((next_box !=
nullptr && box.
x_gap(*next_box) < -3) ||
358 (prev_box !=
nullptr && prev_box->
x_gap(box) < -3))) {
368 if (applybox_debug > 2) {
369 tprintf(
"%d Blobs match: blob box:", blob_count);
373 if (next_box !=
nullptr) {
380 for (
int j = 1; j < blob_count; ++j) {
386 if (applybox_debug > 1) {
388 for (
auto best_state : word_res->
best_state) {
402 if (applybox_debug > 0) {
415 const char *correct_text) {
416 if (applybox_debug > 1) {
417 tprintf(
"\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
419 WERD *new_word =
nullptr;
420 BLOCK_IT b_it(block_list);
421 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
422 BLOCK *block = b_it.data();
427 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
428 ROW *row = r_it.data();
433 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
434 WERD *word = w_it.data();
435 if (applybox_debug > 2) {
439 if (word->
text() !=
nullptr && word->
text()[0] !=
'\0') {
446 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
447 C_BLOB *blob = blob_it.data();
452 if (next_box !=
nullptr) {
453 const double current_box_miss_metric = BoxMissMetric(blob_box, box);
454 const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
455 if (applybox_debug > 2) {
458 tprintf(
"Current miss metric = %g, next = %g\n", current_box_miss_metric,
459 next_box_miss_metric);
461 if (current_box_miss_metric > next_box_miss_metric) {
465 if (applybox_debug > 2) {
470 if (next_box !=
nullptr) {
475 if (new_word ==
nullptr) {
479 w_it.add_to_end(new_word);
481 C_BLOB_IT new_blob_it(new_word->
cblob_list());
482 new_blob_it.add_to_end(blob_it.extract());
487 if (new_word ==
nullptr && applybox_debug > 0) {
490 return new_word !=
nullptr;
498 for (; (word_res = pr_it.
word()) !=
nullptr; pr_it.
forward()) {
500 if (word->
text() ==
nullptr || word->
text()[0] ==
'\0') {
504 std::vector<UNICHAR_ID> target_text;
506 tprintf(
"APPLY_BOX: FAILURE: can't find class_id for '%s'\n", word->
text());
511 tprintf(
"APPLY_BOX: FAILURE: can't find segmentation for '%s'\n", word->
text());
521 for (
int step = 0; *utf8 !=
'\0'; utf8 += step) {
522 const char *next_space = strchr(utf8,
' ');
523 if (next_space ==
nullptr) {
524 next_space = utf8 + strlen(utf8);
526 step = next_space - utf8;
528 if (class_id == INVALID_UNICHAR_ID) {
531 while (utf8[step] ==
' ') {
534 class_ids->push_back(class_id);
548 auto *choices =
new std::vector<BLOB_CHOICE_LIST *>[word_length];
549 for (
int i = 0;
i < word_length; ++
i) {
550 for (
int j = 1; j <=
kMaxGroupSize &&
i + j <= word_length; ++j) {
551 BLOB_CHOICE_LIST *match_result =
554 if (applybox_debug > 2) {
558 choices[
i].push_back(match_result);
565 std::vector<int> search_segmentation;
566 float best_rating = 0.0f;
567 SearchForText(choices, 0, word_length, target_text, 0, 0.0f, &search_segmentation, &best_rating,
569 for (
int i = 0;
i < word_length; ++
i) {
570 for (
auto choice : choices[
i]) {
589 if (word_res->
best_state.size() != target_text.size()) {
595 for (
auto &text : target_text) {
616 unsigned choices_length,
const std::vector<UNICHAR_ID> &target_text,
617 unsigned text_index,
float rating, std::vector<int> *segmentation,
618 float *best_rating, std::vector<int> *best_segmentation) {
620 for (
unsigned length = 1; length <= choices[choices_pos].size(); ++length) {
622 float choice_rating = 0.0f;
624 BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
625 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
627 choice_rating = choice->
rating();
629 if (class_id == target_text[text_index]) {
633 if (
static_cast<size_t>(class_id) < table.size() && table[class_id] !=
nullptr) {
634 AmbigSpec_IT spec_it(table[class_id]);
635 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list(); spec_it.forward()) {
636 const AmbigSpec *ambig_spec = spec_it.data();
638 if (ambig_spec->
wrong_ngram[1] == INVALID_UNICHAR_ID &&
643 if (!spec_it.cycled_list()) {
648 if (choice_it.cycled_list()) {
651 segmentation->push_back(length);
652 if (choices_pos + length == choices_length && text_index + 1 == target_text.size()) {
654 if (applybox_debug > 2) {
655 tprintf(
"Complete match, rating = %g, best=%g, seglength=%zu, best=%zu\n",
656 rating + choice_rating, *best_rating, segmentation->size(),
657 best_segmentation->size());
659 if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
660 *best_segmentation = *segmentation;
661 *best_rating = rating + choice_rating;
663 }
else if (choices_pos + length < choices_length && text_index + 1 < target_text.size()) {
664 if (applybox_debug > 3) {
665 tprintf(
"Match found for %d=%s:%s, at %d+%d, recursing...\n", target_text[text_index],
667 choice_it.data()->unichar_id() == target_text[text_index] ?
"Match" :
"Ambig",
668 choices_pos, length);
670 SearchForText(choices, choices_pos + length, choices_length, target_text, text_index + 1,
671 rating + choice_rating, segmentation, best_rating, best_segmentation);
672 if (applybox_debug > 3) {
673 tprintf(
"End recursion for %d=%s\n", target_text[text_index],
677 segmentation->resize(segmentation->size() - 1);
686 int ok_blob_count = 0;
687 int bad_blob_count = 0;
688 int ok_word_count = 0;
689 int unlabelled_words = 0;
692 for (; (word_res = pr_it.
word()) !=
nullptr; pr_it.
forward()) {
697 for (
int c = 0; c < blob_count; ++c) {
705 word_choice->append_unichar_id_space_allocated(INVALID_UNICHAR_ID, word_res->
best_state[c],
708 if (ok_in_word > 0) {
709 ok_blob_count += ok_in_word;
710 bad_blob_count += word_res->
correct_text.size() - ok_in_word;
715 if (applybox_debug > 0) {
716 tprintf(
"APPLY_BOXES: Unlabelled word at :");
724 for (; (word_res = pr_it.
word()) !=
nullptr; pr_it.
forward()) {
731 if (applybox_debug > 0) {
732 tprintf(
" Found %d good blobs.\n", ok_blob_count);
733 if (bad_blob_count > 0) {
734 tprintf(
" Leaving %d unlabelled blobs in %d words.\n", bad_blob_count, ok_word_count);
736 if (unlabelled_words > 0) {
737 tprintf(
" %d remaining unlabelled words deleted.\n", unlabelled_words);
744 const char *err_msg) {
745 tprintf(
"APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n", boxfile_lineno + 1, box_ch,
758 tprintf(
"Generated training data for %d words\n", word_count);
767 auto *choice =
new WERD_CHOICE(word_res->uch_set, word_res->correct_text.size());
768 for (
auto &correct_text : word_res->correct_text) {
771 std::vector<std::string> tokens =
split(correct_text,
' ');
773 choice->append_unichar_id_space_allocated(char_id, word_res->best_state[&correct_text - &word_res->correct_text[0]], 0.0f, 0.0f);
775 word_res->ClearWordChoices();
776 word_res->LogNewRawChoice(choice);
777 word_res->LogNewCookedChoice(1,
false, choice);
const double kMaxXHeightDeviationFraction
@ W_FUZZY_NON
fuzzy nonspace
std::vector< AmbigSpec_LIST * > UnicharAmbigsVector
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
void tprintf(const char *format,...)
int IntCastRounded(double x)
bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
const std::vector< std::string > split(const std::string &s, char c)
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
PAGE_RES * ApplyBoxes(const char *filename, bool find_segmentation, BLOCK_LIST *block_list)
void TidyUp(PAGE_RES *page_res)
void ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res)
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
void ReSegmentByClassification(PAGE_RES *page_res)
bool ConvertStringToUnichars(const char *utf8, std::vector< UNICHAR_ID > *class_ids)
Dict & getDict() override
void SearchForText(const std::vector< BLOB_CHOICE_LIST * > *choices, int choices_pos, unsigned choices_length, const std::vector< UNICHAR_ID > &target_text, unsigned text_index, float rating, std::vector< int > *segmentation, float *best_rating, std::vector< int > *best_segmentation)
void PreenXHeights(BLOCK_LIST *block_list)
void CorrectClassifyWords(PAGE_RES *page_res)
void MaximallyChopWord(const std::vector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
bool FindSegmentation(const std::vector< UNICHAR_ID > &target_text, WERD_RES *word_res)
PAGE_RES * SetupApplyBoxes(const std::vector< TBOX > &boxes, BLOCK_LIST *block_list)
std::vector< TBLOB * > blobs
unsigned NumBlobs() const
void MergeBoxes(unsigned start, unsigned end)
const TBOX & bounding_box() const
const TBOX & BlobBox(unsigned index) const
PDBLK pdblk
Page Description Block.
ROW_LIST * row_list()
get rows
TBOX bounding_box() const
void set_x_height(float new_xheight)
void CloneChoppedToRebuild()
std::vector< std::string > correct_text
void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices)
void InsertSeam(int blob_number, SEAM *seam)
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Image pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
BlamerBundle * blamer_bundle
const UNICHARSET * uch_set
std::vector< int > best_state
tesseract::BoxWord * box_word
bool LogNewRawChoice(WERD_CHOICE *word_choice)
std::vector< SEAM * > seam_array
BLOCK_RES * block() const
WERD_RES * restart_page()
ROW_RES * next_row() const
ROW_RES * prev_row() const
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
void set_certainty(float newrat)
UNICHAR_ID unichar_id() const
void set_rating(float newrat)
bool almost_equal(const TBOX &box, int tolerance) const
int x_gap(const TBOX &box) const
TBOX intersection(const TBOX &box) const
TDimension bottom() const
bool major_overlap(const TBOX &box) const
bool HasAnySplits() const
TBOX bounding_box() const
const char * text() const
void set_flag(WERD_FLAGS mask, bool value)
TBOX bounding_box() const
void set_text(const char *new_text)
C_BLOB_LIST * cblob_list()
UNICHAR_ID correct_ngram_id
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
const UnicharAmbigsVector & dang_ambigs() const
const char * id_to_unichar(UNICHAR_ID id) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
void LearnWord(const char *fontname, WERD_RES *word)
const UnicharAmbigs & getUnicharAmbigs() const
SEAM * chop_one_blob(const std::vector< TBOX > &boxes, const std::vector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, unsigned *blob_number)
virtual BLOB_CHOICE_LIST * classify_piece(const std::vector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)