48#define PERFECT_WERDS 999
57static int c_blob_comparator(
61 const C_BLOB *blob1 = *
reinterpret_cast<const C_BLOB *
const *
>(blob1p);
62 const C_BLOB *blob2 = *
reinterpret_cast<const C_BLOB *
const *
>(blob2p);
64 return blob1->bounding_box().left() - blob2->bounding_box().left();
78 BLOCK_RES_IT block_res_it;
79 ROW_RES_IT row_res_it;
80 WERD_RES_IT word_res_it_from;
81 WERD_RES_IT word_res_it_to;
83 WERD_RES_LIST fuzzy_space_words;
85 bool prevent_null_wd_fixsp;
90 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) {
91 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
92 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) {
93 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
94 while (!word_res_it_from.at_last()) {
95 word_res = word_res_it_from.data();
96 while (!word_res_it_from.at_last() &&
98 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_NON) ||
99 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_SP))) {
100 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
101 word_res = word_res_it_from.forward();
103 if (monitor !=
nullptr) {
105 monitor->
progress = 90 + 5 * word_index / word_count;
107 (monitor->
cancel !=
nullptr &&
114 if (!word_res_it_from.at_last()) {
115 word_res_it_to = word_res_it_from;
118 debug_fix_space_level.set_value(10);
120 word_res_it_to.forward();
122 if (monitor !=
nullptr) {
124 monitor->
progress = 90 + 5 * word_index / word_count;
126 (monitor->
cancel !=
nullptr &&
131 while (!word_res_it_to.at_last() &&
132 (word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_NON) ||
133 word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_SP))) {
135 debug_fix_space_level.set_value(10);
138 prevent_null_wd_fixsp =
true;
140 word_res = word_res_it_to.forward();
143 debug_fix_space_level.set_value(10);
146 prevent_null_wd_fixsp =
true;
148 if (prevent_null_wd_fixsp) {
149 word_res_it_from = word_res_it_to;
151 fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to);
153 block_res_it.data()->block);
154 new_length = fuzzy_space_words.length();
155 word_res_it_from.add_list_before(&fuzzy_space_words);
156 for (; !word_res_it_from.at_last() && new_length > 0; new_length--) {
157 word_res_it_from.forward();
161 debug_fix_space_level.set_value(0);
164 fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);
173 WERD_RES_LIST current_perm;
174 int16_t current_score;
175 bool improved =
false;
178 dump_words(best_perm, best_score, 1, improved);
184 while ((best_score !=
PERFECT_WERDS) && !current_perm.empty()) {
187 dump_words(current_perm, current_score, 2, improved);
188 if (current_score > best_score) {
191 best_score = current_score;
198 dump_words(best_perm, best_score, 3, improved);
202 WERD_RES_IT src_it(&src_list);
203 WERD_RES_IT new_it(&new_list);
207 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
208 src_wd = src_it.data();
213 new_it.add_after_then_move(new_wd);
219 WERD_RES_IT word_it(&words);
224 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
225 word = word_it.data();
227 WordData word_data(block, row, word);
261 WERD_RES_IT word_res_it(&word_res_list);
262 int16_t total_score = 0;
263 int16_t word_count = 0;
264 int16_t done_word_count = 0;
267 int16_t prev_word_score = 0;
268 bool prev_word_done =
false;
269 bool prev_char_1 =
false;
270 bool prev_char_digit =
false;
271 const char *punct_chars =
"!\"`',.:;";
272 bool prev_char_punct =
false;
276 WERD_RES *word = word_res_it.data();
280 total_score += prev_word_score;
281 if (prev_word_done) {
286 prev_char_digit =
false;
287 prev_word_done =
false;
295 bool current_word_ok_so_far =
false;
302 total_score += prev_word_score;
303 if (prev_word_done) {
306 current_word_ok_so_far = word_done;
309 if (current_word_ok_so_far) {
310 prev_word_done =
true;
311 prev_word_score = word_len;
313 prev_word_done =
false;
319 for (
i = 0, prev_char_1 =
false;
i < word_len;
i++) {
321 if (prev_char_1 || (current_char_1 && (
i > 0))) {
324 prev_char_1 = current_char_1;
329 if (tessedit_prefer_joined_punct) {
330 for (
i = 0, offset = 0, prev_char_punct =
false;
i < word_len;
332 bool current_char_punct =
334 if (prev_char_punct || (current_char_punct &&
i > 0)) {
337 prev_char_punct = current_char_punct;
341 for (
i = 0, offset = 0;
i < word_len - 1;
352 word_res_it.forward();
353 }
while (word_res_it.data()->part_of_combo);
354 }
while (!word_res_it.at_first());
355 total_score += prev_word_score;
356 if (prev_word_done) {
359 if (done_word_count == word_count) {
392 WERD_RES_IT word_it(&words);
393 WERD_RES_IT prev_word_it(&words);
398 int16_t prev_right = -INT16_MAX;
401 int16_t min_gap = INT16_MAX;
403 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
404 word = word_it.data();
407 if (prev_right > -INT16_MAX) {
408 gap = box.
left() - prev_right;
413 prev_right = box.
right();
416 if (min_gap < INT16_MAX) {
417 prev_right = -INT16_MAX;
418 word_it.set_to_list(&words);
420 for (; (prev_right == -INT16_MAX) || !word_it.at_first(); word_it.forward()) {
421 word = word_it.data();
424 if (prev_right > -INT16_MAX) {
425 gap = box.
left() - prev_right;
426 if (gap <= min_gap) {
427 prev_word = prev_word_it.data();
433 copy_word =
new WERD;
434 *copy_word = *(prev_word->
word);
440 prev_word_it.add_before_then_move(combo);
447 delete word_it.extract();
456 prev_word_it = word_it;
459 prev_right = box.
right();
468 WERD_RES_IT word_res_it(&perm);
470 if (debug_fix_space_level > 0) {
473 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
474 if (!word_res_it.data()->part_of_combo) {
475 stats_.
dump_words_str += word_res_it.data()->best_choice->unichar_string();
481 if (debug_fix_space_level > 1) {
484 tprintf(
"EXTRACTED (%d): \"", score);
487 tprintf(
"TESTED (%d): \"", score);
490 tprintf(
"RETURNED (%d): \"", score);
494 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
495 if (!word_res_it.data()->part_of_combo) {
496 tprintf(
"%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
497 static_cast<int>(word_res_it.data()->best_choice->permuter()));
501 }
else if (improved) {
503 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list(); word_res_it.forward()) {
504 if (!word_res_it.data()->part_of_combo) {
505 tprintf(
"%s/%1d ", word_res_it.data()->best_choice->unichar_string().c_str(),
506 static_cast<int>(word_res_it.data()->best_choice->permuter()));
524 if (fixsp_done_mode > 0 &&
526 fixsp_done_mode == 3) &&
547 WERD_RES_LIST sub_word_list;
548 WERD_RES_IT sub_word_list_it(&sub_word_list);
553 word_res = word_res_it.data();
560 if (blob_index < 0) {
564 if (debug_fix_space_level > 1) {
568 sub_word_list_it.add_after_stay_put(word_res_it.extract());
570 new_length = sub_word_list.length();
571 word_res_it.add_list_before(&sub_word_list);
572 for (; !word_res_it.at_last() && new_length > 1; new_length--) {
573 word_res_it.forward();
579 WERD_RES_IT best_perm_it(&best_perm);
580 WERD_RES_LIST current_perm;
581 WERD_RES_IT current_perm_it(¤t_perm);
583 int16_t current_score;
584 bool improved =
false;
588 dump_words(best_perm, best_score, 1, improved);
590 old_word_res = best_perm_it.data();
599 while (best_score !=
PERFECT_WERDS && !current_perm.empty()) {
602 dump_words(current_perm, current_score, 2, improved);
603 if (current_score > best_score) {
606 best_score = current_score;
613 dump_words(best_perm, best_score, 3, improved);
622 WERD_RES_IT word_it(&words);
623 WERD_RES_IT worst_word_it;
624 float worst_noise_score = 9999;
625 int worst_blob_index = -1;
630 C_BLOB_IT rej_cblob_it;
631 C_BLOB_LIST new_blob_list;
632 C_BLOB_IT new_blob_it;
633 C_BLOB_IT new_rej_cblob_it;
635 int16_t start_of_noise_blob;
638 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
640 if (blob_index > -1 && worst_noise_score > noise_score) {
641 worst_noise_score = noise_score;
642 worst_blob_index = blob_index;
643 worst_word_it = word_it;
646 if (worst_blob_index < 0) {
653 word_res = worst_word_it.data();
657 new_blob_it.set_to_list(&new_blob_list);
659 for (
i = 0;
i < worst_blob_index;
i++, blob_it.forward()) {
660 new_blob_it.add_after_then_move(blob_it.extract());
662 start_of_noise_blob = blob_it.data()->bounding_box().left();
663 delete blob_it.extract();
665 new_word =
new WERD(&new_blob_list, word_res->
word);
672 for (; (!rej_cblob_it.empty() &&
673 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
674 rej_cblob_it.forward()) {
675 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
678 auto *new_word_res =
new WERD_RES(new_word);
679 new_word_res->combination =
true;
680 worst_word_it.add_before_then_move(new_word_res);
686 float noise_score[512];
691 float small_limit =
kBlnXHeight * fixsp_small_outlines_size;
701 if (blob_count < 5) {
708 if (debug_fix_space_level > 5) {
709 tprintf(
"FP fixspace Noise metrics for \"%s\": ",
717 noise_score[
i] = non_noise_limit;
722 if (debug_fix_space_level > 5) {
726 if (debug_fix_space_level > 5) {
734 for (
i = 0;
static_cast<unsigned>(
i) < blob_count && non_noise_count < fixsp_non_noise_limit;
i++) {
735 if (noise_score[
i] >= non_noise_limit) {
739 if (non_noise_count < fixsp_non_noise_limit) {
746 for (
i = blob_count - 1;
i >= 0 && non_noise_count < fixsp_non_noise_limit;
i--) {
747 if (noise_score[
i] >= non_noise_limit) {
751 if (non_noise_count < fixsp_non_noise_limit) {
757 if (min_noise_blob > max_noise_blob) {
761 *worst_noise_score = small_limit;
763 for (
auto i = min_noise_blob;
i <= max_noise_blob;
i++) {
764 if (noise_score[
i] < *worst_noise_score) {
766 *worst_noise_score = noise_score[
i];
774 int16_t outline_count = 0;
775 int16_t max_dimension;
776 int16_t largest_outline_dimension = 0;
780 box = ol->bounding_box();
782 max_dimension = box.
height();
784 max_dimension = box.
width();
787 if (largest_outline_dimension < max_dimension) {
788 largest_outline_dimension = max_dimension;
792 if (outline_count > 5) {
794 largest_outline_dimension *= 2;
800 largest_outline_dimension /= 2;
803 return largest_outline_dimension;
808 const bool show_map_detail =
false;
817 if (show_map_detail) {
826 tprintf(
"Done flag: %s\n\n", word->
done ?
"TRUE" :
"FALSE");
838 WERD_RES_IT word_it(&word_res_list);
841 float small_limit =
kBlnXHeight * fixsp_small_outlines_size;
843 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
844 word = word_it.data();
@ W_DONT_CHOP
fixed pitch chopped
@ W_REP_CHAR
repeated character
@ W_FUZZY_NON
fuzzy nonspace
void tprintf(const char *format,...)
void transform_to_next_perm(WERD_RES_LIST &words)
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
void fixspace_dbg(WERD_RES *word)
const int kBlnBaselineOffset
volatile int8_t ocr_alive
true if not last
bool deadline_exceeded() const
void * cancel_this
monitor-aware progress callback
int16_t progress
chars in this buffer(0)
CANCEL_FUNC cancel
for errcode use
std::string dump_words_str
bool fixspace_thinks_word_done(WERD_RES *word)
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
void SetupWordPassN(int pass_n, WordData *word)
float blob_noise_score(TBLOB *blob)
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
int16_t safe_dict_word(const WERD_RES *werd_res)
void break_noisiest_blob_word(WERD_RES_LIST &words)
bool check_debug_pt(WERD_RES *word, int location)
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
TBOX bounding_box() const
std::vector< TBLOB * > blobs
unsigned NumBlobs() const
BLOCK_RES_LIST block_res_list
void copy_on(WERD_RES *word_res)
WERD_CHOICE * best_choice
const UNICHARSET * uch_set
tesseract::BoxWord * box_word
static WERD_RES * deep_copy(const WERD_RES *src)
UNICHAR_ID unichar_id(unsigned index) const
const std::string & unichar_lengths() const
std::string & unichar_string()
TDimension height() const
TDimension bottom() const
void print(FILE *fp) const
int16_t reject_count() const
void full_print(FILE *fp) const
bool flag(WERD_FLAGS mask) const
void set_flag(WERD_FLAGS mask, bool value)
TBOX bounding_box() const
void join_on(WERD *other)
C_BLOB_LIST * rej_cblob_list()
void set_blanks(uint8_t new_blanks)
C_BLOB_LIST * cblob_list()
bool get_isdigit(UNICHAR_ID unichar_id) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
WERD_CHOICE * prev_word_best_choice_