tesseract v5.3.3.20231005
docqual.cpp
Go to the documentation of this file.
1/******************************************************************
2 * File: docqual.cpp (Formerly docqual.c)
3 * Description: Document Quality Metrics
4 * Author: Phil Cheatle
5 *
6 * (C) Copyright 1994, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#include "docqual.h"
20#include <cctype>
21#include "reject.h"
22#include "tesseractclass.h"
23#include "tessvars.h"
24
25namespace tesseract {
26
// Matched-blob callback: each call stands for one matched blob, so the running
// total is bumped by one. The blob index itself is not needed.
static void countMatchingBlobs(int16_t &match_count, int /*index*/) {
  match_count += 1;
}
30
31static void countAcceptedBlobs(WERD_RES *word, int16_t &match_count, int16_t &accepted_match_count,
32 int index) {
33 if (word->reject_map[index].accepted()) {
34 ++accepted_match_count;
35 }
36 ++match_count;
37}
38
39static void acceptIfGoodQuality(WERD_RES *word, int index) {
40 if (word->reject_map[index].accept_if_good_quality()) {
41 word->reject_map[index].setrej_quality_accept();
42 }
43}
44
45/*************************************************************************
46 * word_blob_quality()
47 * How many blobs in the box_word are identical to those of the inword?
48 * ASSUME blobs in both initial word and box_word are in ascending order of
49 * left hand blob edge.
50 *************************************************************************/
52 int16_t match_count = 0;
53 if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
54 !word->rebuild_word->blobs.empty()) {
55 using namespace std::placeholders; // for _1
57 std::bind(countMatchingBlobs, match_count, _1));
58 }
59 return match_count;
60}
61
63 int16_t i = 0;
64 int16_t err_count = 0;
65
66 if (word->rebuild_word != nullptr) {
67 for (unsigned b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
68 TBLOB *blob = word->rebuild_word->blobs[b];
69 err_count += count_outline_errs(word->best_choice->unichar_string()[i], blob->NumOutlines());
70 i++;
71 }
72 }
73 return err_count;
74}
75
76/*************************************************************************
77 * word_char_quality()
78 * Combination of blob quality and outline quality - how many good chars are
79 * there? - I.e chars which pass the blob AND outline tests.
80 *************************************************************************/
81void Tesseract::word_char_quality(WERD_RES *word, int16_t *match_count,
82 int16_t *accepted_match_count) {
83 *match_count = 0;
84 *accepted_match_count = 0;
85 if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
86 !word->rebuild_word->blobs.empty()) {
87 using namespace std::placeholders; // for _1
89 *word->rebuild_word,
90 std::bind(countAcceptedBlobs, word, *match_count, *accepted_match_count, _1));
91 }
92}
93
94/*************************************************************************
95 * unrej_good_chs()
96 * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
97 *************************************************************************/
99 if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
100 word->rebuild_word->blobs.empty()) {
101 using namespace std::placeholders; // for _1
103 std::bind(acceptIfGoodQuality, word, _1));
104 }
105}
106
107int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
108 int expected_outline_count;
109
110 if (outlines_odd.contains(c)) {
111 return 0; // Don't use this char
112 } else if (outlines_2.contains(c)) {
113 expected_outline_count = 2;
114 } else {
115 expected_outline_count = 1;
116 }
117 return abs(outline_count - expected_outline_count);
118}
119
120void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc) {
121 if ((tessedit_good_quality_unrej && good_quality_doc)) {
122 unrej_good_quality_words(page_res_it);
123 }
124 doc_and_block_rejection(page_res_it, good_quality_doc);
125 if (unlv_tilde_crunching) {
126 tilde_crunch(page_res_it);
127 tilde_delete(page_res_it);
128 }
129}
130
131/*************************************************************************
132 * unrej_good_quality_words()
133 * Accept potential rejects in words which pass the following checks:
134 * - Contains a potential reject
135 * - Word looks like a sensible alpha word.
136 * - Word segmentation is the same as the original image
137 * - All characters have the expected number of outlines
138 * NOTE - the rejection counts are recalculated after unrejection
139 * - CAN'T do it in a single pass without a bit of fiddling
140 * - keep it simple but inefficient
141 *************************************************************************/
142void Tesseract::unrej_good_quality_words( // unreject potential
143 PAGE_RES_IT &page_res_it) {
144 WERD_RES *word;
145 ROW_RES *current_row;
146 BLOCK_RES *current_block;
147 int i;
148
149 page_res_it.restart_page();
150 while (page_res_it.word() != nullptr) {
151 check_debug_pt(page_res_it.word(), 100);
152 if (bland_unrej) {
153 word = page_res_it.word();
154 for (i = 0; i < word->reject_map.length(); i++) {
155 if (word->reject_map[i].accept_if_good_quality()) {
156 word->reject_map[i].setrej_quality_accept();
157 }
158 }
159 page_res_it.forward();
160 } else if ((page_res_it.row()->char_count > 0) &&
161 ((page_res_it.row()->rej_count /
162 static_cast<float>(page_res_it.row()->char_count)) <= quality_rowrej_pc)) {
163 word = page_res_it.word();
165 (tessedit_unrej_any_wd ||
167 word->best_choice->unichar_lengths().c_str()) !=
169 unrej_good_chs(word);
170 }
171 page_res_it.forward();
172 } else {
173 // Skip to end of dodgy row.
174 current_row = page_res_it.row();
175 while ((page_res_it.word() != nullptr) && (page_res_it.row() == current_row)) {
176 page_res_it.forward();
177 }
178 }
179 check_debug_pt(page_res_it.word(), 110);
180 }
181 page_res_it.restart_page();
182 page_res_it.page_res->char_count = 0;
183 page_res_it.page_res->rej_count = 0;
184 current_block = nullptr;
185 current_row = nullptr;
186 while (page_res_it.word() != nullptr) {
187 if (current_block != page_res_it.block()) {
188 current_block = page_res_it.block();
189 current_block->char_count = 0;
190 current_block->rej_count = 0;
191 }
192 if (current_row != page_res_it.row()) {
193 current_row = page_res_it.row();
194 current_row->char_count = 0;
195 current_row->rej_count = 0;
196 current_row->whole_word_rej_count = 0;
197 }
198 page_res_it.rej_stat_word();
199 page_res_it.forward();
200 }
201}
202
203/*************************************************************************
204 * doc_and_block_rejection()
205 *
206 * If the page has too many rejects - reject all of it.
207 * If any block has too many rejects - reject all words in the block
208 *************************************************************************/
209
210void Tesseract::doc_and_block_rejection( // reject big chunks
211 PAGE_RES_IT &page_res_it, bool good_quality_doc) {
212 int16_t block_no = 0;
213 int16_t row_no = 0;
214 BLOCK_RES *current_block;
215 ROW_RES *current_row;
216
217 bool rej_word;
218 bool prev_word_rejected;
219 int16_t char_quality = 0;
220 int16_t accepted_char_quality;
221
222 if (page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count >
223 tessedit_reject_doc_percent) {
224 reject_whole_page(page_res_it);
225 if (tessedit_debug_doc_rejection) {
226 tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count,
227 page_res_it.page_res->rej_count);
228 }
229 } else {
230 if (tessedit_debug_doc_rejection) {
231 tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", page_res_it.page_res->char_count,
232 page_res_it.page_res->rej_count);
233 }
234
235 /* Walk blocks testing for block rejection */
236
237 page_res_it.restart_page();
238 WERD_RES *word;
239 while ((word = page_res_it.word()) != nullptr) {
240 current_block = page_res_it.block();
241 block_no = current_block->block->pdblk.index();
242 if (current_block->char_count > 0 &&
243 (current_block->rej_count * 100.0 / current_block->char_count) >
244 tessedit_reject_block_percent) {
245 if (tessedit_debug_block_rejection) {
246 tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no,
247 current_block->char_count, current_block->rej_count);
248 }
249 prev_word_rejected = false;
250 while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) {
251 if (tessedit_preserve_blk_rej_perfect_wds) {
252 rej_word = word->reject_map.reject_count() > 0 ||
253 word->reject_map.length() < tessedit_preserve_min_wd_len;
254 if (rej_word && tessedit_dont_blkrej_good_wds &&
255 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
257 word->best_choice->unichar_lengths().c_str()) !=
259 word_char_quality(word, &char_quality, &accepted_char_quality);
260 rej_word = char_quality != word->reject_map.length();
261 }
262 } else {
263 rej_word = true;
264 }
265 if (rej_word) {
266 /*
267 Reject spacing if both current and prev words are rejected.
268 NOTE - this is NOT restricted to FUZZY spaces. - When tried this
269 generated more space errors.
270*/
271 if (tessedit_use_reject_spaces && prev_word_rejected &&
272 page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
273 word->reject_spaces = true;
274 }
276 }
277 prev_word_rejected = rej_word;
278 page_res_it.forward();
279 }
280 } else {
281 if (tessedit_debug_block_rejection) {
282 tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", block_no,
283 page_res_it.block()->char_count, page_res_it.block()->rej_count);
284 }
285
286 /* Walk rows in block testing for row rejection */
287 row_no = 0;
288 while (page_res_it.word() != nullptr && page_res_it.block() == current_block) {
289 current_row = page_res_it.row();
290 row_no++;
291 /* Reject whole row if:
292 fraction of chars on row which are rejected exceed a limit AND
293 fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
294 limit
295*/
296 if (current_row->char_count > 0 &&
297 (current_row->rej_count * 100.0 / current_row->char_count) >
298 tessedit_reject_row_percent &&
299 (current_row->whole_word_rej_count * 100.0 / current_row->rej_count) <
300 tessedit_whole_wd_rej_row_percent) {
301 if (tessedit_debug_block_rejection) {
302 tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no,
303 current_row->char_count, current_row->rej_count);
304 }
305 prev_word_rejected = false;
306 while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) {
307 /* Preserve words on good docs unless they are mostly rejected*/
308 if (!tessedit_row_rej_good_docs && good_quality_doc) {
309 rej_word = word->reject_map.reject_count() /
310 static_cast<float>(word->reject_map.length()) >
311 tessedit_good_doc_still_rowrej_wd;
312 } else if (tessedit_preserve_row_rej_perfect_wds) {
313 /* Preserve perfect words anyway */
314 rej_word = word->reject_map.reject_count() > 0 ||
315 word->reject_map.length() < tessedit_preserve_min_wd_len;
316 if (rej_word && tessedit_dont_rowrej_good_wds &&
317 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
319 *word->uch_set, word->best_choice->unichar_string().c_str(),
320 word->best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE) {
321 word_char_quality(word, &char_quality, &accepted_char_quality);
322 rej_word = char_quality != word->reject_map.length();
323 }
324 } else {
325 rej_word = true;
326 }
327 if (rej_word) {
328 /*
329 Reject spacing if both current and prev words are rejected.
330 NOTE - this is NOT restricted to FUZZY spaces. - When tried
331 this generated more space errors.
332*/
333 if (tessedit_use_reject_spaces && prev_word_rejected &&
334 page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {
335 word->reject_spaces = true;
336 }
338 }
339 prev_word_rejected = rej_word;
340 page_res_it.forward();
341 }
342 } else {
343 if (tessedit_debug_block_rejection) {
344 tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", row_no,
345 current_row->char_count, current_row->rej_count);
346 }
347 while (page_res_it.word() != nullptr && page_res_it.row() == current_row) {
348 page_res_it.forward();
349 }
350 }
351 }
352 }
353 }
354 }
355}
356
357/*************************************************************************
358 * reject_whole_page()
359 * Don't believe any of it - set the reject map to 00..00 in all words
360 *
361 *************************************************************************/
362
363void reject_whole_page(PAGE_RES_IT &page_res_it) {
364 page_res_it.restart_page();
365 while (page_res_it.word() != nullptr) {
366 page_res_it.word()->reject_map.rej_word_doc_rej();
367 page_res_it.forward();
368 }
369 // whole page is rejected
370 page_res_it.page_res->rejected = true;
371}
372
374 WERD_RES *word;
375 GARBAGE_LEVEL garbage_level;
376 PAGE_RES_IT copy_it;
377 bool prev_potential_marked = false;
378 bool found_terrible_word = false;
379 bool ok_dict_word;
380
381 page_res_it.restart_page();
382 while (page_res_it.word() != nullptr) {
383 POLY_BLOCK *pb = page_res_it.block()->block->pdblk.poly_block();
384 if (pb != nullptr && !pb->IsText()) {
385 page_res_it.forward();
386 continue;
387 }
388 word = page_res_it.word();
389
390 if (crunch_early_convert_bad_unlv_chs) {
392 }
393
394 if (crunch_early_merge_tess_fails) {
395 word->merge_tess_fails();
396 }
397
398 if (word->reject_map.accept_count() != 0) {
399 found_terrible_word = false;
400 // Forget earlier potential crunches
401 prev_potential_marked = false;
402 } else {
403 ok_dict_word = safe_dict_word(word);
404 garbage_level = garbage_word(word, ok_dict_word);
405
406 if ((garbage_level != G_NEVER_CRUNCH) && (terrible_word_crunch(word, garbage_level))) {
407 if (crunch_debug > 0) {
408 tprintf("T CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
409 }
411 if (prev_potential_marked) {
412 while (copy_it.word() != word) {
413 if (crunch_debug > 0) {
414 tprintf("P1 CRUNCHING: \"%s\"\n",
415 copy_it.word()->best_choice->unichar_string().c_str());
416 }
418 copy_it.forward();
419 }
420 prev_potential_marked = false;
421 }
422 found_terrible_word = true;
423 } else if ((garbage_level != G_NEVER_CRUNCH) &&
424 (potential_word_crunch(word, garbage_level, ok_dict_word))) {
425 if (found_terrible_word) {
426 if (crunch_debug > 0) {
427 tprintf("P2 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
428 }
430 } else if (!prev_potential_marked) {
431 copy_it = page_res_it;
432 prev_potential_marked = true;
433 if (crunch_debug > 1) {
434 tprintf("P3 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
435 }
436 }
437 } else {
438 found_terrible_word = false;
439 // Forget earlier potential crunches
440 prev_potential_marked = false;
441 if (crunch_debug > 2) {
442 tprintf("NO CRUNCH: \"%s\"\n", word->best_choice->unichar_string().c_str());
443 }
444 }
445 }
446 page_res_it.forward();
447 }
448}
449
451 float rating_per_ch;
452 int adjusted_len;
453 int crunch_mode = 0;
454
455 if (word->best_choice->unichar_string().empty() ||
456 (strspn(word->best_choice->unichar_string().c_str(), " ") ==
457 word->best_choice->unichar_string().size())) {
458 crunch_mode = 1;
459 } else {
460 adjusted_len = word->reject_map.length();
461 if (adjusted_len > crunch_rating_max) {
462 adjusted_len = crunch_rating_max;
463 }
464 rating_per_ch = word->best_choice->rating() / adjusted_len;
465
466 if (rating_per_ch > crunch_terrible_rating) {
467 crunch_mode = 2;
468 } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
469 crunch_mode = 3;
470 } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) &&
471 (garbage_level != G_OK)) {
472 crunch_mode = 4;
473 } else if ((rating_per_ch > crunch_poor_garbage_rate) && (garbage_level != G_OK)) {
474 crunch_mode = 5;
475 }
476 }
477 if (crunch_mode > 0) {
478 if (crunch_debug > 2) {
479 tprintf("Terrible_word_crunch (%d) on \"%s\"\n", crunch_mode,
480 word->best_choice->unichar_string().c_str());
481 }
482 return true;
483 } else {
484 return false;
485 }
486}
487
489 bool ok_dict_word) {
490 float rating_per_ch;
491 int adjusted_len;
492 const char *str = word->best_choice->unichar_string().c_str();
493 const char *lengths = word->best_choice->unichar_lengths().c_str();
494 bool word_crunchable;
495 int poor_indicator_count = 0;
496
497 word_crunchable =
498 !crunch_leave_accept_strings || word->reject_map.length() < 3 ||
499 (acceptable_word_string(*word->uch_set, str, lengths) == AC_UNACCEPTABLE && !ok_dict_word);
500
501 adjusted_len = word->reject_map.length();
502 if (adjusted_len > 10) {
503 adjusted_len = 10;
504 }
505 rating_per_ch = word->best_choice->rating() / adjusted_len;
506
507 if (rating_per_ch > crunch_pot_poor_rate) {
508 if (crunch_debug > 2) {
509 tprintf("Potential poor rating on \"%s\"\n", word->best_choice->unichar_string().c_str());
510 }
511 poor_indicator_count++;
512 }
513
514 if (word_crunchable && word->best_choice->certainty() < crunch_pot_poor_cert) {
515 if (crunch_debug > 2) {
516 tprintf("Potential poor cert on \"%s\"\n", word->best_choice->unichar_string().c_str());
517 }
518 poor_indicator_count++;
519 }
520
521 if (garbage_level != G_OK) {
522 if (crunch_debug > 2) {
523 tprintf("Potential garbage on \"%s\"\n", word->best_choice->unichar_string().c_str());
524 }
525 poor_indicator_count++;
526 }
527 return poor_indicator_count >= crunch_pot_indicators;
528}
529
531 WERD_RES *word;
532 PAGE_RES_IT copy_it;
533 bool deleting_from_bol = false;
534 bool marked_delete_point = false;
535 int16_t debug_delete_mode;
536 CRUNCH_MODE delete_mode;
537 int16_t x_debug_delete_mode;
538 CRUNCH_MODE x_delete_mode;
539
540 page_res_it.restart_page();
541 while (page_res_it.word() != nullptr) {
542 word = page_res_it.word();
543
544 delete_mode = word_deletable(word, debug_delete_mode);
545 if (delete_mode != CR_NONE) {
546 if (word->word->flag(W_BOL) || deleting_from_bol) {
547 if (crunch_debug > 0) {
548 tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
549 word->best_choice->unichar_string().c_str());
550 }
551 word->unlv_crunch_mode = delete_mode;
552 deleting_from_bol = true;
553 } else if (word->word->flag(W_EOL)) {
554 if (marked_delete_point) {
555 while (copy_it.word() != word) {
556 x_delete_mode = word_deletable(copy_it.word(), x_debug_delete_mode);
557 if (crunch_debug > 0) {
558 tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", x_debug_delete_mode,
559 copy_it.word()->best_choice->unichar_string().c_str());
560 }
561 copy_it.word()->unlv_crunch_mode = x_delete_mode;
562 copy_it.forward();
563 }
564 }
565 if (crunch_debug > 0) {
566 tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
567 word->best_choice->unichar_string().c_str());
568 }
569 word->unlv_crunch_mode = delete_mode;
570 deleting_from_bol = false;
571 marked_delete_point = false;
572 } else {
573 if (!marked_delete_point) {
574 copy_it = page_res_it;
575 marked_delete_point = true;
576 }
577 }
578 } else {
579 deleting_from_bol = false;
580 // Forget earlier potential crunches
581 marked_delete_point = false;
582 }
583 /*
584 The following step has been left till now as the tess fails are used to
585 determine if the word is deletable.
586*/
587 if (!crunch_early_merge_tess_fails) {
588 word->merge_tess_fails();
589 }
590 page_res_it.forward();
591 }
592}
593
595 int i;
596 UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
597 UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
598 UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
599 UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
600 for (i = 0; i < word_res->reject_map.length(); ++i) {
601 if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
602 word_res->best_choice->set_unichar_id(unichar_dash, i);
603 if (word_res->reject_map[i].accepted()) {
604 word_res->reject_map[i].setrej_unlv_rej();
605 }
606 }
607 if (word_res->best_choice->unichar_id(i) == unichar_pow) {
608 word_res->best_choice->set_unichar_id(unichar_space, i);
609 if (word_res->reject_map[i].accepted()) {
610 word_res->reject_map[i].setrej_unlv_rej();
611 }
612 }
613 }
614}
615
617 enum STATES {
618 JUNK,
619 FIRST_UPPER,
620 FIRST_LOWER,
621 FIRST_NUM,
622 SUBSEQUENT_UPPER,
623 SUBSEQUENT_LOWER,
624 SUBSEQUENT_NUM
625 };
626 const char *str = word->best_choice->unichar_string().c_str();
627 const char *lengths = word->best_choice->unichar_lengths().c_str();
628 STATES state = JUNK;
629 int len = 0;
630 int isolated_digits = 0;
631 int isolated_alphas = 0;
632 int bad_char_count = 0;
633 int tess_rejs = 0;
634 int dodgy_chars = 0;
635 int ok_chars;
636 UNICHAR_ID last_char = -1;
637 int alpha_repetition_count = 0;
638 int longest_alpha_repetition_count = 0;
639 int longest_lower_run_len = 0;
640 int lower_string_count = 0;
641 int longest_upper_run_len = 0;
642 int upper_string_count = 0;
643 int total_alpha_count = 0;
644 int total_digit_count = 0;
645
646 for (; *str != '\0'; str += *(lengths++)) {
647 len++;
648 if (word->uch_set->get_isupper(str, *lengths)) {
649 total_alpha_count++;
650 switch (state) {
651 case SUBSEQUENT_UPPER:
652 case FIRST_UPPER:
653 state = SUBSEQUENT_UPPER;
654 upper_string_count++;
655 if (longest_upper_run_len < upper_string_count) {
656 longest_upper_run_len = upper_string_count;
657 }
658 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
659 alpha_repetition_count++;
660 if (longest_alpha_repetition_count < alpha_repetition_count) {
661 longest_alpha_repetition_count = alpha_repetition_count;
662 }
663 } else {
664 last_char = word->uch_set->unichar_to_id(str, *lengths);
665 alpha_repetition_count = 1;
666 }
667 break;
668 case FIRST_NUM:
669 isolated_digits++;
670 // Fall through.
671 default:
672 state = FIRST_UPPER;
673 last_char = word->uch_set->unichar_to_id(str, *lengths);
674 alpha_repetition_count = 1;
675 upper_string_count = 1;
676 break;
677 }
678 } else if (word->uch_set->get_islower(str, *lengths)) {
679 total_alpha_count++;
680 switch (state) {
681 case SUBSEQUENT_LOWER:
682 case FIRST_LOWER:
683 state = SUBSEQUENT_LOWER;
684 lower_string_count++;
685 if (longest_lower_run_len < lower_string_count) {
686 longest_lower_run_len = lower_string_count;
687 }
688 if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
689 alpha_repetition_count++;
690 if (longest_alpha_repetition_count < alpha_repetition_count) {
691 longest_alpha_repetition_count = alpha_repetition_count;
692 }
693 } else {
694 last_char = word->uch_set->unichar_to_id(str, *lengths);
695 alpha_repetition_count = 1;
696 }
697 break;
698 case FIRST_NUM:
699 isolated_digits++;
700 // Fall through.
701 default:
702 state = FIRST_LOWER;
703 last_char = word->uch_set->unichar_to_id(str, *lengths);
704 alpha_repetition_count = 1;
705 lower_string_count = 1;
706 break;
707 }
708 } else if (word->uch_set->get_isdigit(str, *lengths)) {
709 total_digit_count++;
710 switch (state) {
711 case FIRST_NUM:
712 state = SUBSEQUENT_NUM;
713 case SUBSEQUENT_NUM:
714 break;
715 case FIRST_UPPER:
716 case FIRST_LOWER:
717 isolated_alphas++;
718 // Fall through.
719 default:
720 state = FIRST_NUM;
721 break;
722 }
723 } else {
724 if (*lengths == 1 && *str == ' ') {
725 tess_rejs++;
726 } else {
727 bad_char_count++;
728 }
729 switch (state) {
730 case FIRST_NUM:
731 isolated_digits++;
732 break;
733 case FIRST_UPPER:
734 case FIRST_LOWER:
735 isolated_alphas++;
736 default:
737 break;
738 }
739 state = JUNK;
740 }
741 }
742
743 switch (state) {
744 case FIRST_NUM:
745 isolated_digits++;
746 break;
747 case FIRST_UPPER:
748 case FIRST_LOWER:
749 isolated_alphas++;
750 default:
751 break;
752 }
753
754 if (crunch_include_numerals) {
755 total_alpha_count += total_digit_count - isolated_digits;
756 }
757
758 if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len &&
759 longest_alpha_repetition_count < crunch_long_repetitions) {
760 if ((crunch_accept_ok &&
761 acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE) ||
762 longest_lower_run_len > crunch_leave_lc_strings ||
763 longest_upper_run_len > crunch_leave_uc_strings) {
764 return G_NEVER_CRUNCH;
765 }
766 }
767 if (word->reject_map.length() > 1 && strpbrk(str, " ") == nullptr &&
769 word->best_choice->permuter() == FREQ_DAWG_PERM ||
770 word->best_choice->permuter() == USER_DAWG_PERM ||
771 word->best_choice->permuter() == NUMBER_PERM ||
772 acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE || ok_dict_word)) {
773 return G_OK;
774 }
775
776 ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs;
777
778 if (crunch_debug > 3) {
779 tprintf("garbage_word: \"%s\"\n", word->best_choice->unichar_string().c_str());
780 tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n", len, bad_char_count,
781 isolated_digits, isolated_alphas, tess_rejs);
782 }
783 if (bad_char_count == 0 && tess_rejs == 0 &&
784 (len > isolated_digits + isolated_alphas || len <= 2)) {
785 return G_OK;
786 }
787
788 if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) {
789 return G_TERRIBLE;
790 }
791
792 if (len > 4) {
793 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas;
794 if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5) {
795 return G_DODGY;
796 } else {
797 return G_OK;
798 }
799 } else {
800 dodgy_chars = 2 * tess_rejs + bad_char_count;
801 if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) {
802 return G_DODGY;
803 } else {
804 return G_OK;
805 }
806 }
807}
808
809/*************************************************************************
810 * word_deletable()
811 * DELETE WERDS AT ENDS OF ROWS IF
812 * Word is crunched &&
813 * ( string length = 0 OR
814 * > 50% of chars are "|" (before merging) OR
815 * certainty < -10 OR
816 * rating /char > 60 OR
817 * TOP of word is more than 0.5 xht BELOW baseline OR
818 * BOTTOM of word is more than 0.5 xht ABOVE xht OR
819 * length of word < 3xht OR
820 * height of word < 0.7 xht OR
821 * height of word > 3.0 xht OR
822 * >75% of the outline BBs have longest dimension < 0.5xht
823 *************************************************************************/
824
825CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
826 int word_len = word->reject_map.length();
827 float rating_per_ch;
828 TBOX box; // BB of word
829
830 if (word->unlv_crunch_mode == CR_NONE) {
831 delete_mode = 0;
832 return CR_NONE;
833 }
834
835 if (word_len == 0) {
836 delete_mode = 1;
837 return CR_DELETE;
838 }
839
840 if (word->rebuild_word != nullptr) {
841 // Cube leaves rebuild_word nullptr.
842 box = word->rebuild_word->bounding_box();
843 if (box.height() < crunch_del_min_ht * kBlnXHeight) {
844 delete_mode = 4;
845 return CR_DELETE;
846 }
847
848 if (noise_outlines(word->rebuild_word)) {
849 delete_mode = 5;
850 return CR_DELETE;
851 }
852 }
853
854 if ((failure_count(word) * 1.5) > word_len) {
855 delete_mode = 2;
856 return CR_LOOSE_SPACE;
857 }
858
859 if (word->best_choice->certainty() < crunch_del_cert) {
860 delete_mode = 7;
861 return CR_LOOSE_SPACE;
862 }
863
864 rating_per_ch = word->best_choice->rating() / word_len;
865
866 if (rating_per_ch > crunch_del_rating) {
867 delete_mode = 8;
868 return CR_LOOSE_SPACE;
869 }
870
871 if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
872 delete_mode = 9;
873 return CR_LOOSE_SPACE;
874 }
875
876 if (box.bottom() > kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
877 delete_mode = 10;
878 return CR_LOOSE_SPACE;
879 }
880
881 if (box.height() > crunch_del_max_ht * kBlnXHeight) {
882 delete_mode = 11;
883 return CR_LOOSE_SPACE;
884 }
885
886 if (box.width() < crunch_del_min_width * kBlnXHeight) {
887 delete_mode = 3;
888 return CR_LOOSE_SPACE;
889 }
890
891 delete_mode = 0;
892 return CR_NONE;
893}
894
896 const char *str = word->best_choice->unichar_string().c_str();
897 int tess_rejs = 0;
898
899 for (; *str != '\0'; str++) {
900 if (*str == ' ') {
901 tess_rejs++;
902 }
903 }
904 return tess_rejs;
905}
906
908 TBOX box; // BB of outline
909 int16_t outline_count = 0;
910 int16_t small_outline_count = 0;
911 int16_t max_dimension;
912 float small_limit = kBlnXHeight * crunch_small_outlines_size;
913
914 for (unsigned b = 0; b < word->NumBlobs(); ++b) {
915 TBLOB *blob = word->blobs[b];
916 for (TESSLINE *ol = blob->outlines; ol != nullptr; ol = ol->next) {
917 outline_count++;
918 box = ol->bounding_box();
919 if (box.height() > box.width()) {
920 max_dimension = box.height();
921 } else {
922 max_dimension = box.width();
923 }
924 if (max_dimension < small_limit) {
925 small_outline_count++;
926 }
927 }
928 }
929 return small_outline_count >= outline_count;
930}
931
932} // namespace tesseract
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29
@ W_BOL
start of line
Definition: werd.h:34
@ W_EOL
end of line
Definition: werd.h:35
@ CR_NONE
Definition: pageres.h:160
@ CR_KEEP_SPACE
Definition: pageres.h:160
@ CR_LOOSE_SPACE
Definition: pageres.h:160
@ CR_DELETE
Definition: pageres.h:160
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const int kBlnXHeight
Definition: normalis.h:33
int UNICHAR_ID
Definition: unichar.h:34
GARBAGE_LEVEL
Definition: docqual.h:30
@ G_TERRIBLE
Definition: docqual.h:30
@ G_NEVER_CRUNCH
Definition: docqual.h:30
@ G_OK
Definition: docqual.h:30
@ G_DODGY
Definition: docqual.h:30
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:244
@ NUMBER_PERM
Definition: ratngs.h:242
@ USER_DAWG_PERM
Definition: ratngs.h:246
@ FREQ_DAWG_PERM
Definition: ratngs.h:247
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:363
const int kBlnBaselineOffset
Definition: normalis.h:34
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:530
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
Definition: docqual.cpp:616
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:81
int16_t word_blob_quality(WERD_RES *word)
Definition: docqual.cpp:51
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:373
void unrej_good_chs(WERD_RES *word)
Definition: docqual.cpp:98
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:210
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:62
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1692
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:907
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:120
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:593
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:895
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:594
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1799
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:488
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:450
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:825
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:107
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:142
TESSLINE * next
Definition: blobs.h:288
TESSLINE * outlines
Definition: blobs.h:404
int NumOutlines() const
Definition: blobs.cpp:452
TBOX bounding_box() const
Definition: blobs.cpp:863
std::vector< TBLOB * > blobs
Definition: blobs.h:462
unsigned NumBlobs() const
Definition: blobs.h:449
void ProcessMatchedBlobs(const TWERD &other, const std::function< void(int)> &cb) const
Definition: boxword.cpp:201
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
int32_t rej_count
Definition: pageres.h:80
int32_t char_count
Definition: pageres.h:79
int32_t char_count
Definition: pageres.h:121
int32_t whole_word_rej_count
Definition: pageres.h:147
int32_t rej_count
Definition: pageres.h:146
int32_t char_count
Definition: pageres.h:145
WERD_CHOICE * best_choice
Definition: pageres.h:239
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:313
tesseract::BoxWord * bln_boxes
Definition: pageres.h:193
const UNICHARSET * uch_set
Definition: pageres.h:201
TWERD * rebuild_word
Definition: pageres.h:264
BLOCK_RES * block() const
Definition: pageres.h:769
PAGE_RES * page_res
Definition: pageres.h:684
WERD_RES * forward()
Definition: pageres.h:743
WERD_RES * word() const
Definition: pageres.h:763
WERD_RES * restart_page()
Definition: pageres.h:710
ROW_RES * prev_row() const
Definition: pageres.h:757
ROW_RES * row() const
Definition: pageres.h:766
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
int index() const
Definition: pdblock.h:77
bool IsText() const
Definition: polyblk.h:52
float certainty() const
Definition: ratngs.h:315
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
Definition: ratngs.h:344
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:299
uint8_t permuter() const
Definition: ratngs.h:331
const std::string & unichar_lengths() const
Definition: ratngs.h:533
std::string & unichar_string()
Definition: ratngs.h:519
float rating() const
Definition: ratngs.h:312
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
TDimension bottom() const
Definition: rect.h:75
int16_t reject_count() const
Definition: rejctmap.h:339
void rej_word_row_rej()
Definition: rejctmap.cpp:211
int16_t accept_count() const
Definition: rejctmap.cpp:72
uint16_t length() const
Definition: rejctmap.h:333
void rej_word_block_rej()
Definition: rejctmap.cpp:203
bool quality_recoverable_rejects() const
Definition: rejctmap.cpp:91
void rej_word_doc_rej()
Definition: rejctmap.cpp:195
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128
uint8_t space() const
Definition: werd.h:100
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:506
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186