27static void countMatchingBlobs(int16_t &match_count,
int ) {
31static void countAcceptedBlobs(WERD_RES *word, int16_t &match_count, int16_t &accepted_match_count,
33 if (word->reject_map[index].accepted()) {
34 ++accepted_match_count;
39static void acceptIfGoodQuality(WERD_RES *word,
int index) {
40 if (word->reject_map[index].accept_if_good_quality()) {
41 word->reject_map[index].setrej_quality_accept();
52 int16_t match_count = 0;
55 using namespace std::placeholders;
57 std::bind(countMatchingBlobs, match_count, _1));
64 int16_t err_count = 0;
82 int16_t *accepted_match_count) {
84 *accepted_match_count = 0;
87 using namespace std::placeholders;
90 std::bind(countAcceptedBlobs, word, *match_count, *accepted_match_count, _1));
101 using namespace std::placeholders;
103 std::bind(acceptIfGoodQuality, word, _1));
108 int expected_outline_count;
110 if (outlines_odd.contains(c)) {
112 }
else if (outlines_2.contains(c)) {
113 expected_outline_count = 2;
115 expected_outline_count = 1;
117 return abs(outline_count - expected_outline_count);
121 if ((tessedit_good_quality_unrej && good_quality_doc)) {
125 if (unlv_tilde_crunching) {
150 while (page_res_it.
word() !=
nullptr) {
153 word = page_res_it.
word();
155 if (word->
reject_map[
i].accept_if_good_quality()) {
162 static_cast<float>(page_res_it.
row()->
char_count)) <= quality_rowrej_pc)) {
163 word = page_res_it.
word();
165 (tessedit_unrej_any_wd ||
174 current_row = page_res_it.
row();
175 while ((page_res_it.
word() !=
nullptr) && (page_res_it.
row() == current_row)) {
184 current_block =
nullptr;
185 current_row =
nullptr;
186 while (page_res_it.
word() !=
nullptr) {
187 if (current_block != page_res_it.
block()) {
188 current_block = page_res_it.
block();
192 if (current_row != page_res_it.
row()) {
193 current_row = page_res_it.
row();
212 int16_t block_no = 0;
218 bool prev_word_rejected;
219 int16_t char_quality = 0;
220 int16_t accepted_char_quality;
223 tessedit_reject_doc_percent) {
225 if (tessedit_debug_doc_rejection) {
230 if (tessedit_debug_doc_rejection) {
239 while ((word = page_res_it.
word()) !=
nullptr) {
240 current_block = page_res_it.
block();
244 tessedit_reject_block_percent) {
245 if (tessedit_debug_block_rejection) {
246 tprintf(
"REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no,
249 prev_word_rejected =
false;
250 while ((word = page_res_it.
word()) !=
nullptr && (page_res_it.
block() == current_block)) {
251 if (tessedit_preserve_blk_rej_perfect_wds) {
254 if (rej_word && tessedit_dont_blkrej_good_wds &&
271 if (tessedit_use_reject_spaces && prev_word_rejected &&
277 prev_word_rejected = rej_word;
281 if (tessedit_debug_block_rejection) {
282 tprintf(
"NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", block_no,
288 while (page_res_it.
word() !=
nullptr && page_res_it.
block() == current_block) {
289 current_row = page_res_it.
row();
298 tessedit_reject_row_percent &&
300 tessedit_whole_wd_rej_row_percent) {
301 if (tessedit_debug_block_rejection) {
302 tprintf(
"REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no,
305 prev_word_rejected =
false;
306 while ((word = page_res_it.
word()) !=
nullptr && page_res_it.
row() == current_row) {
308 if (!tessedit_row_rej_good_docs && good_quality_doc) {
311 tessedit_good_doc_still_rowrej_wd;
312 }
else if (tessedit_preserve_row_rej_perfect_wds) {
316 if (rej_word && tessedit_dont_rowrej_good_wds &&
333 if (tessedit_use_reject_spaces && prev_word_rejected &&
339 prev_word_rejected = rej_word;
343 if (tessedit_debug_block_rejection) {
344 tprintf(
"NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", row_no,
347 while (page_res_it.
word() !=
nullptr && page_res_it.
row() == current_row) {
365 while (page_res_it.
word() !=
nullptr) {
377 bool prev_potential_marked =
false;
378 bool found_terrible_word =
false;
382 while (page_res_it.
word() !=
nullptr) {
384 if (pb !=
nullptr && !pb->
IsText()) {
388 word = page_res_it.
word();
390 if (crunch_early_convert_bad_unlv_chs) {
394 if (crunch_early_merge_tess_fails) {
399 found_terrible_word =
false;
401 prev_potential_marked =
false;
407 if (crunch_debug > 0) {
411 if (prev_potential_marked) {
412 while (copy_it.
word() != word) {
413 if (crunch_debug > 0) {
414 tprintf(
"P1 CRUNCHING: \"%s\"\n",
420 prev_potential_marked =
false;
422 found_terrible_word =
true;
425 if (found_terrible_word) {
426 if (crunch_debug > 0) {
430 }
else if (!prev_potential_marked) {
431 copy_it = page_res_it;
432 prev_potential_marked =
true;
433 if (crunch_debug > 1) {
438 found_terrible_word =
false;
440 prev_potential_marked =
false;
441 if (crunch_debug > 2) {
461 if (adjusted_len > crunch_rating_max) {
462 adjusted_len = crunch_rating_max;
466 if (rating_per_ch > crunch_terrible_rating) {
468 }
else if (crunch_terrible_garbage && (garbage_level ==
G_TERRIBLE)) {
471 (garbage_level !=
G_OK)) {
473 }
else if ((rating_per_ch > crunch_poor_garbage_rate) && (garbage_level !=
G_OK)) {
477 if (crunch_mode > 0) {
478 if (crunch_debug > 2) {
479 tprintf(
"Terrible_word_crunch (%d) on \"%s\"\n", crunch_mode,
494 bool word_crunchable;
495 int poor_indicator_count = 0;
502 if (adjusted_len > 10) {
507 if (rating_per_ch > crunch_pot_poor_rate) {
508 if (crunch_debug > 2) {
511 poor_indicator_count++;
515 if (crunch_debug > 2) {
518 poor_indicator_count++;
521 if (garbage_level !=
G_OK) {
522 if (crunch_debug > 2) {
525 poor_indicator_count++;
527 return poor_indicator_count >= crunch_pot_indicators;
533 bool deleting_from_bol =
false;
534 bool marked_delete_point =
false;
535 int16_t debug_delete_mode;
537 int16_t x_debug_delete_mode;
541 while (page_res_it.
word() !=
nullptr) {
542 word = page_res_it.
word();
547 if (crunch_debug > 0) {
548 tprintf(
"BOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
552 deleting_from_bol =
true;
554 if (marked_delete_point) {
555 while (copy_it.
word() != word) {
557 if (crunch_debug > 0) {
558 tprintf(
"EOL CRUNCH DELETING(%d): \"%s\"\n", x_debug_delete_mode,
565 if (crunch_debug > 0) {
566 tprintf(
"EOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
570 deleting_from_bol =
false;
571 marked_delete_point =
false;
573 if (!marked_delete_point) {
574 copy_it = page_res_it;
575 marked_delete_point =
true;
579 deleting_from_bol =
false;
581 marked_delete_point =
false;
587 if (!crunch_early_merge_tess_fails) {
630 int isolated_digits = 0;
631 int isolated_alphas = 0;
632 int bad_char_count = 0;
637 int alpha_repetition_count = 0;
638 int longest_alpha_repetition_count = 0;
639 int longest_lower_run_len = 0;
640 int lower_string_count = 0;
641 int longest_upper_run_len = 0;
642 int upper_string_count = 0;
643 int total_alpha_count = 0;
644 int total_digit_count = 0;
646 for (; *str !=
'\0'; str += *(lengths++)) {
651 case SUBSEQUENT_UPPER:
653 state = SUBSEQUENT_UPPER;
654 upper_string_count++;
655 if (longest_upper_run_len < upper_string_count) {
656 longest_upper_run_len = upper_string_count;
659 alpha_repetition_count++;
660 if (longest_alpha_repetition_count < alpha_repetition_count) {
661 longest_alpha_repetition_count = alpha_repetition_count;
665 alpha_repetition_count = 1;
674 alpha_repetition_count = 1;
675 upper_string_count = 1;
681 case SUBSEQUENT_LOWER:
683 state = SUBSEQUENT_LOWER;
684 lower_string_count++;
685 if (longest_lower_run_len < lower_string_count) {
686 longest_lower_run_len = lower_string_count;
689 alpha_repetition_count++;
690 if (longest_alpha_repetition_count < alpha_repetition_count) {
691 longest_alpha_repetition_count = alpha_repetition_count;
695 alpha_repetition_count = 1;
704 alpha_repetition_count = 1;
705 lower_string_count = 1;
712 state = SUBSEQUENT_NUM;
724 if (*lengths == 1 && *str ==
' ') {
754 if (crunch_include_numerals) {
755 total_alpha_count += total_digit_count - isolated_digits;
758 if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len &&
759 longest_alpha_repetition_count < crunch_long_repetitions) {
760 if ((crunch_accept_ok &&
762 longest_lower_run_len > crunch_leave_lc_strings ||
763 longest_upper_run_len > crunch_leave_uc_strings) {
776 ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs;
778 if (crunch_debug > 3) {
780 tprintf(
"LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", len, bad_char_count,
781 isolated_digits, isolated_alphas, tess_rejs);
783 if (bad_char_count == 0 && tess_rejs == 0 &&
784 (len > isolated_digits + isolated_alphas || len <= 2)) {
788 if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) {
793 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas;
794 if (dodgy_chars > 5 || (dodgy_chars /
static_cast<float>(len)) > 0.5) {
800 dodgy_chars = 2 * tess_rejs + bad_char_count;
801 if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) {
866 if (rating_per_ch > crunch_del_rating) {
899 for (; *str !=
'\0'; str++) {
909 int16_t outline_count = 0;
910 int16_t small_outline_count = 0;
911 int16_t max_dimension;
912 float small_limit =
kBlnXHeight * crunch_small_outlines_size;
914 for (
unsigned b = 0; b < word->
NumBlobs(); ++b) {
918 box = ol->bounding_box();
920 max_dimension = box.
height();
922 max_dimension = box.
width();
924 if (max_dimension < small_limit) {
925 small_outline_count++;
929 return small_outline_count >= outline_count;
@ AC_UNACCEPTABLE
Unacceptable word.
void tprintf(const char *format,...)
void reject_whole_page(PAGE_RES_IT &page_res_it)
const int kBlnBaselineOffset
void tilde_delete(PAGE_RES_IT &page_res_it)
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
int16_t word_blob_quality(WERD_RES *word)
void tilde_crunch(PAGE_RES_IT &page_res_it)
void unrej_good_chs(WERD_RES *word)
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
int16_t word_outline_errs(WERD_RES *word)
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
bool noise_outlines(TWERD *word)
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
int16_t safe_dict_word(const WERD_RES *werd_res)
int16_t failure_count(WERD_RES *word)
void convert_bad_unlv_chs(WERD_RES *word_res)
bool check_debug_pt(WERD_RES *word, int location)
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
int16_t count_outline_errs(char c, int16_t outline_count)
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
TBOX bounding_box() const
std::vector< TBLOB * > blobs
unsigned NumBlobs() const
void ProcessMatchedBlobs(const TWERD &other, const std::function< void(int)> &cb) const
PDBLK pdblk
Page Description Block.
int32_t whole_word_rej_count
WERD_CHOICE * best_choice
CRUNCH_MODE unlv_crunch_mode
tesseract::BoxWord * bln_boxes
const UNICHARSET * uch_set
BLOCK_RES * block() const
WERD_RES * restart_page()
ROW_RES * prev_row() const
POLY_BLOCK * poly_block() const
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
UNICHAR_ID unichar_id(unsigned index) const
const std::string & unichar_lengths() const
std::string & unichar_string()
TDimension height() const
TDimension bottom() const
int16_t reject_count() const
int16_t accept_count() const
void rej_word_block_rej()
bool quality_recoverable_rejects() const
bool flag(WERD_FLAGS mask) const
bool get_islower(UNICHAR_ID unichar_id) const
bool get_isupper(UNICHAR_ID unichar_id) const
bool get_isdigit(UNICHAR_ID unichar_id) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const