All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
docqual.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: docqual.cpp (Formerly docqual.c)
3  * Description: Document Quality Metrics
4  * Author: Phil Cheatle
5  * Created: Mon May 9 11:27:28 BST 1994
6  *
7  * (C) Copyright 1994, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include <ctype.h>
25 #include "docqual.h"
26 #include "reject.h"
27 #include "tesscallback.h"
28 #include "tessvars.h"
29 #include "globals.h"
30 #include "tesseractclass.h"
31 
32 namespace tesseract{
33 
34 // A little class to provide the callbacks as we have no pre-bound args.
36  explicit DocQualCallbacks(WERD_RES* word0)
37  : word(word0), match_count(0), accepted_match_count(0) {}
38 
39  void CountMatchingBlobs(int index) {
40  ++match_count;
41  }
42 
43  void CountAcceptedBlobs(int index) {
44  if (word->reject_map[index].accepted())
46  ++match_count;
47  }
48 
49  void AcceptIfGoodQuality(int index) {
50  if (word->reject_map[index].accept_if_good_quality())
51  word->reject_map[index].setrej_quality_accept();
52  }
53 
57 };
58 
59 /*************************************************************************
60  * word_blob_quality()
61  * How many blobs in the box_word are identical to those of the inword?
62  * ASSUME blobs in both initial word and box_word are in ascending order of
63  * left hand blob edge.
64  *************************************************************************/
66  if (word->bln_boxes == NULL ||
67  word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
68  return 0;
69 
70  DocQualCallbacks cb(word);
72  *word->rebuild_word,
74  return cb.match_count;
75 }
76 
78  inT16 i = 0;
79  inT16 err_count = 0;
80 
81  if (word->rebuild_word != NULL) {
82  for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
83  TBLOB* blob = word->rebuild_word->blobs[b];
84  err_count += count_outline_errs(word->best_choice->unichar_string()[i],
85  blob->NumOutlines());
86  i++;
87  }
88  }
89  return err_count;
90 }
91 
92 /*************************************************************************
93  * word_char_quality()
94  * Combination of blob quality and outline quality - how many good chars are
95  * there? - I.e chars which pass the blob AND outline tests.
96  *************************************************************************/
98  ROW *row,
99  inT16 *match_count,
100  inT16 *accepted_match_count) {
101  if (word->bln_boxes == NULL ||
102  word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
103  return;
104 
105  DocQualCallbacks cb(word);
107  *word->rebuild_word,
109  *match_count = cb.match_count;
110  *accepted_match_count = cb.accepted_match_count;
111 }
112 
113 /*************************************************************************
114  * unrej_good_chs()
115  * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
116  *************************************************************************/
118  if (word->bln_boxes == NULL ||
119  word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
120  return;
121 
122  DocQualCallbacks cb(word);
124  *word->rebuild_word,
126 }
127 
128 inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
129  int expected_outline_count;
130 
131  if (STRING (outlines_odd).contains (c))
132  return 0; //Dont use this char
133  else if (STRING (outlines_2).contains (c))
134  expected_outline_count = 2;
135  else
136  expected_outline_count = 1;
137  return abs (outline_count - expected_outline_count);
138 }
139 
141  BOOL8 good_quality_doc) {
142  if ((tessedit_good_quality_unrej && good_quality_doc))
143  unrej_good_quality_words(page_res_it);
144  doc_and_block_rejection(page_res_it, good_quality_doc);
145  if (unlv_tilde_crunching) {
146  tilde_crunch(page_res_it);
147  tilde_delete(page_res_it);
148  }
149 }
150 
151 
152 /*************************************************************************
153  * unrej_good_quality_words()
154  * Accept potential rejects in words which pass the following checks:
155  * - Contains a potential reject
156  * - Word looks like a sensible alpha word.
157  * - Word segmentation is the same as the original image
158  * - All characters have the expected number of outlines
159  * NOTE - the rejection counts are recalculated after unrejection
160  * - CANT do it in a single pass without a bit of fiddling
161  * - keep it simple but inefficient
162  *************************************************************************/
163 void Tesseract::unrej_good_quality_words( //unreject potential
164  PAGE_RES_IT &page_res_it) {
165  WERD_RES *word;
166  ROW_RES *current_row;
167  BLOCK_RES *current_block;
168  int i;
169 
170  page_res_it.restart_page ();
171  while (page_res_it.word () != NULL) {
172  check_debug_pt (page_res_it.word (), 100);
173  if (bland_unrej) {
174  word = page_res_it.word ();
175  for (i = 0; i < word->reject_map.length (); i++) {
176  if (word->reject_map[i].accept_if_good_quality ())
177  word->reject_map[i].setrej_quality_accept ();
178  }
179  page_res_it.forward ();
180  }
181  else if ((page_res_it.row ()->char_count > 0) &&
182  ((page_res_it.row ()->rej_count /
183  (float) page_res_it.row ()->char_count) <=
185  word = page_res_it.word ();
189  word->best_choice->unichar_string().string(),
191  != AC_UNACCEPTABLE)) {
192  unrej_good_chs(word, page_res_it.row ()->row);
193  }
194  page_res_it.forward ();
195  }
196  else {
197  /* Skip to end of dodgy row */
198  current_row = page_res_it.row ();
199  while ((page_res_it.word () != NULL) &&
200  (page_res_it.row () == current_row))
201  page_res_it.forward ();
202  }
203  check_debug_pt (page_res_it.word (), 110);
204  }
205  page_res_it.restart_page ();
206  page_res_it.page_res->char_count = 0;
207  page_res_it.page_res->rej_count = 0;
208  current_block = NULL;
209  current_row = NULL;
210  while (page_res_it.word () != NULL) {
211  if (current_block != page_res_it.block ()) {
212  current_block = page_res_it.block ();
213  current_block->char_count = 0;
214  current_block->rej_count = 0;
215  }
216  if (current_row != page_res_it.row ()) {
217  current_row = page_res_it.row ();
218  current_row->char_count = 0;
219  current_row->rej_count = 0;
220  current_row->whole_word_rej_count = 0;
221  }
222  page_res_it.rej_stat_word ();
223  page_res_it.forward ();
224  }
225 }
226 
227 
228 /*************************************************************************
229  * doc_and_block_rejection()
230  *
231  * If the page has too many rejects - reject all of it.
232  * If any block has too many rejects - reject all words in the block
233  *************************************************************************/
234 
235 void Tesseract::doc_and_block_rejection( //reject big chunks
236  PAGE_RES_IT &page_res_it,
237  BOOL8 good_quality_doc) {
238  inT16 block_no = 0;
239  inT16 row_no = 0;
240  BLOCK_RES *current_block;
241  ROW_RES *current_row;
242 
243  BOOL8 rej_word;
244  BOOL8 prev_word_rejected;
245  inT16 char_quality = 0;
246  inT16 accepted_char_quality;
247 
248  if (page_res_it.page_res->rej_count * 100.0 /
250  reject_whole_page(page_res_it);
252  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
253  page_res_it.page_res->char_count,
254  page_res_it.page_res->rej_count);
255  }
256  } else {
258  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
259  page_res_it.page_res->char_count,
260  page_res_it.page_res->rej_count);
261  }
262 
263  /* Walk blocks testing for block rejection */
264 
265  page_res_it.restart_page();
266  WERD_RES* word;
267  while ((word = page_res_it.word()) != NULL) {
268  current_block = page_res_it.block();
269  block_no = current_block->block->index();
270  if (current_block->char_count > 0 &&
271  (current_block->rej_count * 100.0 / current_block->char_count) >
274  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
275  block_no, current_block->char_count,
276  current_block->rej_count);
277  }
278  prev_word_rejected = FALSE;
279  while ((word = page_res_it.word()) != NULL &&
280  (page_res_it.block() == current_block)) {
282  rej_word = word->reject_map.reject_count() > 0 ||
284  if (rej_word && tessedit_dont_blkrej_good_wds &&
287  *word->uch_set,
288  word->best_choice->unichar_string().string(),
289  word->best_choice->unichar_lengths().string()) !=
290  AC_UNACCEPTABLE) {
291  word_char_quality(word, page_res_it.row()->row,
292  &char_quality,
293  &accepted_char_quality);
294  rej_word = char_quality != word->reject_map.length();
295  }
296  } else {
297  rej_word = TRUE;
298  }
299  if (rej_word) {
300  /*
301  Reject spacing if both current and prev words are rejected.
302  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
303  generated more space errors.
304  */
306  prev_word_rejected &&
307  page_res_it.prev_row() == page_res_it.row() &&
308  word->word->space() == 1)
309  word->reject_spaces = TRUE;
311  }
312  prev_word_rejected = rej_word;
313  page_res_it.forward();
314  }
315  } else {
317  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
318  block_no, page_res_it.block()->char_count,
319  page_res_it.block()->rej_count);
320  }
321 
322  /* Walk rows in block testing for row rejection */
323  row_no = 0;
324  while (page_res_it.word() != NULL &&
325  page_res_it.block() == current_block) {
326  current_row = page_res_it.row();
327  row_no++;
328  /* Reject whole row if:
329  fraction of chars on row which are rejected exceed a limit AND
330  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
331  limit
332  */
333  if (current_row->char_count > 0 &&
334  (current_row->rej_count * 100.0 / current_row->char_count) >
336  (current_row->whole_word_rej_count * 100.0 /
337  current_row->rej_count) <
340  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
341  row_no, current_row->char_count,
342  current_row->rej_count);
343  }
344  prev_word_rejected = FALSE;
345  while ((word = page_res_it.word()) != NULL &&
346  page_res_it.row () == current_row) {
347  /* Preserve words on good docs unless they are mostly rejected*/
348  if (!tessedit_row_rej_good_docs && good_quality_doc) {
349  rej_word = word->reject_map.reject_count() /
350  static_cast<float>(word->reject_map.length()) >
353  /* Preserve perfect words anyway */
354  rej_word = word->reject_map.reject_count() > 0 ||
356  if (rej_word && tessedit_dont_rowrej_good_wds &&
359  word->best_choice->unichar_string().string(),
360  word->best_choice->unichar_lengths().string()) !=
361  AC_UNACCEPTABLE) {
362  word_char_quality(word, page_res_it.row()->row,
363  &char_quality,
364  &accepted_char_quality);
365  rej_word = char_quality != word->reject_map.length();
366  }
367  } else {
368  rej_word = TRUE;
369  }
370  if (rej_word) {
371  /*
372  Reject spacing if both current and prev words are rejected.
373  NOTE - this is NOT restricted to FUZZY spaces. - When tried
374  this generated more space errors.
375  */
377  prev_word_rejected &&
378  page_res_it.prev_row() == page_res_it.row() &&
379  word->word->space () == 1)
380  word->reject_spaces = TRUE;
382  }
383  prev_word_rejected = rej_word;
384  page_res_it.forward();
385  }
386  } else {
388  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
389  row_no, current_row->char_count, current_row->rej_count);
390  }
391  while (page_res_it.word() != NULL &&
392  page_res_it.row() == current_row)
393  page_res_it.forward();
394  }
395  }
396  }
397  }
398  }
399 }
400 
401 } // namespace tesseract
402 
403 
404 /*************************************************************************
405  * reject_whole_page()
406  * Dont believe any of it - set the reject map to 00..00 in all words
407  *
408  *************************************************************************/
409 
410 void reject_whole_page(PAGE_RES_IT &page_res_it) {
411  page_res_it.restart_page ();
412  while (page_res_it.word () != NULL) {
413  page_res_it.word ()->reject_map.rej_word_doc_rej ();
414  page_res_it.forward ();
415  }
416  //whole page is rejected
417  page_res_it.page_res->rejected = TRUE;
418 }
419 
420 namespace tesseract {
422  WERD_RES *word;
423  GARBAGE_LEVEL garbage_level;
424  PAGE_RES_IT copy_it;
425  BOOL8 prev_potential_marked = FALSE;
426  BOOL8 found_terrible_word = FALSE;
427  BOOL8 ok_dict_word;
428 
429  page_res_it.restart_page();
430  while (page_res_it.word() != NULL) {
431  POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
432  if (pb != NULL && !pb->IsText()) {
433  page_res_it.forward();
434  continue;
435  }
436  word = page_res_it.word();
437 
439  convert_bad_unlv_chs(word);
440 
442  word->merge_tess_fails();
443 
444  if (word->reject_map.accept_count () != 0) {
445  found_terrible_word = FALSE;
446  //Forget earlier potential crunches
447  prev_potential_marked = FALSE;
448  }
449  else {
450  ok_dict_word = safe_dict_word(word);
451  garbage_level = garbage_word (word, ok_dict_word);
452 
453  if ((garbage_level != G_NEVER_CRUNCH) &&
454  (terrible_word_crunch (word, garbage_level))) {
455  if (crunch_debug > 0) {
456  tprintf ("T CRUNCHING: \"%s\"\n",
457  word->best_choice->unichar_string().string());
458  }
460  if (prev_potential_marked) {
461  while (copy_it.word () != word) {
462  if (crunch_debug > 0) {
463  tprintf ("P1 CRUNCHING: \"%s\"\n",
464  copy_it.word()->best_choice->unichar_string().string());
465  }
466  copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
467  copy_it.forward ();
468  }
469  prev_potential_marked = FALSE;
470  }
471  found_terrible_word = TRUE;
472  }
473  else if ((garbage_level != G_NEVER_CRUNCH) &&
474  (potential_word_crunch (word,
475  garbage_level, ok_dict_word))) {
476  if (found_terrible_word) {
477  if (crunch_debug > 0) {
478  tprintf ("P2 CRUNCHING: \"%s\"\n",
479  word->best_choice->unichar_string().string());
480  }
482  }
483  else if (!prev_potential_marked) {
484  copy_it = page_res_it;
485  prev_potential_marked = TRUE;
486  if (crunch_debug > 1) {
487  tprintf ("P3 CRUNCHING: \"%s\"\n",
488  word->best_choice->unichar_string().string());
489  }
490  }
491  }
492  else {
493  found_terrible_word = FALSE;
494  //Forget earlier potential crunches
495  prev_potential_marked = FALSE;
496  if (crunch_debug > 2) {
497  tprintf ("NO CRUNCH: \"%s\"\n",
498  word->best_choice->unichar_string().string());
499  }
500  }
501  }
502  page_res_it.forward ();
503  }
504 }
505 
506 
508  GARBAGE_LEVEL garbage_level) {
509  float rating_per_ch;
510  int adjusted_len;
511  int crunch_mode = 0;
512 
513  if ((word->best_choice->unichar_string().length () == 0) ||
514  (strspn (word->best_choice->unichar_string().string(), " ") ==
515  word->best_choice->unichar_string().length ()))
516  crunch_mode = 1;
517  else {
518  adjusted_len = word->reject_map.length ();
519  if (adjusted_len > crunch_rating_max)
520  adjusted_len = crunch_rating_max;
521  rating_per_ch = word->best_choice->rating () / adjusted_len;
522 
523  if (rating_per_ch > crunch_terrible_rating)
524  crunch_mode = 2;
525  else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
526  crunch_mode = 3;
527  else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
528  (garbage_level != G_OK))
529  crunch_mode = 4;
530  else if ((rating_per_ch > crunch_poor_garbage_rate) &&
531  (garbage_level != G_OK))
532  crunch_mode = 5;
533  }
534  if (crunch_mode > 0) {
535  if (crunch_debug > 2) {
536  tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
537  crunch_mode, word->best_choice->unichar_string().string());
538  }
539  return TRUE;
540  }
541  else
542  return FALSE;
543 }
544 
546  GARBAGE_LEVEL garbage_level,
547  BOOL8 ok_dict_word) {
548  float rating_per_ch;
549  int adjusted_len;
550  const char *str = word->best_choice->unichar_string().string();
551  const char *lengths = word->best_choice->unichar_lengths().string();
552  BOOL8 word_crunchable;
553  int poor_indicator_count = 0;
554 
555  word_crunchable = !crunch_leave_accept_strings ||
556  word->reject_map.length() < 3 ||
558  str, lengths) == AC_UNACCEPTABLE &&
559  !ok_dict_word);
560 
561  adjusted_len = word->reject_map.length();
562  if (adjusted_len > 10)
563  adjusted_len = 10;
564  rating_per_ch = word->best_choice->rating() / adjusted_len;
565 
566  if (rating_per_ch > crunch_pot_poor_rate) {
567  if (crunch_debug > 2) {
568  tprintf("Potential poor rating on \"%s\"\n",
569  word->best_choice->unichar_string().string());
570  }
571  poor_indicator_count++;
572  }
573 
574  if (word_crunchable &&
576  if (crunch_debug > 2) {
577  tprintf("Potential poor cert on \"%s\"\n",
578  word->best_choice->unichar_string().string());
579  }
580  poor_indicator_count++;
581  }
582 
583  if (garbage_level != G_OK) {
584  if (crunch_debug > 2) {
585  tprintf("Potential garbage on \"%s\"\n",
586  word->best_choice->unichar_string().string());
587  }
588  poor_indicator_count++;
589  }
590  return poor_indicator_count >= crunch_pot_indicators;
591 }
592 
594  WERD_RES *word;
595  PAGE_RES_IT copy_it;
596  BOOL8 deleting_from_bol = FALSE;
597  BOOL8 marked_delete_point = FALSE;
598  inT16 debug_delete_mode;
599  CRUNCH_MODE delete_mode;
600  inT16 x_debug_delete_mode;
601  CRUNCH_MODE x_delete_mode;
602 
603  page_res_it.restart_page();
604  while (page_res_it.word() != NULL) {
605  word = page_res_it.word();
606 
607  delete_mode = word_deletable (word, debug_delete_mode);
608  if (delete_mode != CR_NONE) {
609  if (word->word->flag (W_BOL) || deleting_from_bol) {
610  if (crunch_debug > 0) {
611  tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
612  debug_delete_mode,
613  word->best_choice->unichar_string().string());
614  }
615  word->unlv_crunch_mode = delete_mode;
616  deleting_from_bol = TRUE;
617  } else if (word->word->flag(W_EOL)) {
618  if (marked_delete_point) {
619  while (copy_it.word() != word) {
620  x_delete_mode = word_deletable (copy_it.word (),
621  x_debug_delete_mode);
622  if (crunch_debug > 0) {
623  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
624  x_debug_delete_mode,
625  copy_it.word()->best_choice->unichar_string().string());
626  }
627  copy_it.word ()->unlv_crunch_mode = x_delete_mode;
628  copy_it.forward ();
629  }
630  }
631  if (crunch_debug > 0) {
632  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
633  debug_delete_mode,
634  word->best_choice->unichar_string().string());
635  }
636  word->unlv_crunch_mode = delete_mode;
637  deleting_from_bol = FALSE;
638  marked_delete_point = FALSE;
639  }
640  else {
641  if (!marked_delete_point) {
642  copy_it = page_res_it;
643  marked_delete_point = TRUE;
644  }
645  }
646  }
647  else {
648  deleting_from_bol = FALSE;
649  //Forget earlier potential crunches
650  marked_delete_point = FALSE;
651  }
652  /*
653  The following step has been left till now as the tess fails are used to
654  determine if the word is deletable.
655  */
657  word->merge_tess_fails();
658  page_res_it.forward ();
659  }
660 }
661 
662 
664  int i;
665  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
666  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
667  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
668  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
669  for (i = 0; i < word_res->reject_map.length(); ++i) {
670  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
671  word_res->best_choice->set_unichar_id(unichar_dash, i);
672  if (word_res->reject_map[i].accepted ())
673  word_res->reject_map[i].setrej_unlv_rej ();
674  }
675  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
676  word_res->best_choice->set_unichar_id(unichar_space, i);
677  if (word_res->reject_map[i].accepted ())
678  word_res->reject_map[i].setrej_unlv_rej ();
679  }
680  }
681 }
682 
684  enum STATES
685  {
686  JUNK,
687  FIRST_UPPER,
688  FIRST_LOWER,
689  FIRST_NUM,
690  SUBSEQUENT_UPPER,
691  SUBSEQUENT_LOWER,
692  SUBSEQUENT_NUM
693  };
694  const char *str = word->best_choice->unichar_string().string();
695  const char *lengths = word->best_choice->unichar_lengths().string();
696  STATES state = JUNK;
697  int len = 0;
698  int isolated_digits = 0;
699  int isolated_alphas = 0;
700  int bad_char_count = 0;
701  int tess_rejs = 0;
702  int dodgy_chars = 0;
703  int ok_chars;
704  UNICHAR_ID last_char = -1;
705  int alpha_repetition_count = 0;
706  int longest_alpha_repetition_count = 0;
707  int longest_lower_run_len = 0;
708  int lower_string_count = 0;
709  int longest_upper_run_len = 0;
710  int upper_string_count = 0;
711  int total_alpha_count = 0;
712  int total_digit_count = 0;
713 
714  for (; *str != '\0'; str += *(lengths++)) {
715  len++;
716  if (word->uch_set->get_isupper (str, *lengths)) {
717  total_alpha_count++;
718  switch (state) {
719  case SUBSEQUENT_UPPER:
720  case FIRST_UPPER:
721  state = SUBSEQUENT_UPPER;
722  upper_string_count++;
723  if (longest_upper_run_len < upper_string_count)
724  longest_upper_run_len = upper_string_count;
725  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
726  alpha_repetition_count++;
727  if (longest_alpha_repetition_count < alpha_repetition_count) {
728  longest_alpha_repetition_count = alpha_repetition_count;
729  }
730  }
731  else {
732  last_char = word->uch_set->unichar_to_id(str, *lengths);
733  alpha_repetition_count = 1;
734  }
735  break;
736  case FIRST_NUM:
737  isolated_digits++;
738  default:
739  state = FIRST_UPPER;
740  last_char = word->uch_set->unichar_to_id(str, *lengths);
741  alpha_repetition_count = 1;
742  upper_string_count = 1;
743  break;
744  }
745  }
746  else if (word->uch_set->get_islower (str, *lengths)) {
747  total_alpha_count++;
748  switch (state) {
749  case SUBSEQUENT_LOWER:
750  case FIRST_LOWER:
751  state = SUBSEQUENT_LOWER;
752  lower_string_count++;
753  if (longest_lower_run_len < lower_string_count)
754  longest_lower_run_len = lower_string_count;
755  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
756  alpha_repetition_count++;
757  if (longest_alpha_repetition_count < alpha_repetition_count) {
758  longest_alpha_repetition_count = alpha_repetition_count;
759  }
760  }
761  else {
762  last_char = word->uch_set->unichar_to_id(str, *lengths);
763  alpha_repetition_count = 1;
764  }
765  break;
766  case FIRST_NUM:
767  isolated_digits++;
768  default:
769  state = FIRST_LOWER;
770  last_char = word->uch_set->unichar_to_id(str, *lengths);
771  alpha_repetition_count = 1;
772  lower_string_count = 1;
773  break;
774  }
775  }
776  else if (word->uch_set->get_isdigit (str, *lengths)) {
777  total_digit_count++;
778  switch (state) {
779  case FIRST_NUM:
780  state = SUBSEQUENT_NUM;
781  case SUBSEQUENT_NUM:
782  break;
783  case FIRST_UPPER:
784  case FIRST_LOWER:
785  isolated_alphas++;
786  default:
787  state = FIRST_NUM;
788  break;
789  }
790  }
791  else {
792  if (*lengths == 1 && *str == ' ')
793  tess_rejs++;
794  else
795  bad_char_count++;
796  switch (state) {
797  case FIRST_NUM:
798  isolated_digits++;
799  break;
800  case FIRST_UPPER:
801  case FIRST_LOWER:
802  isolated_alphas++;
803  default:
804  break;
805  }
806  state = JUNK;
807  }
808  }
809 
810  switch (state) {
811  case FIRST_NUM:
812  isolated_digits++;
813  break;
814  case FIRST_UPPER:
815  case FIRST_LOWER:
816  isolated_alphas++;
817  default:
818  break;
819  }
820 
822  total_alpha_count += total_digit_count - isolated_digits;
823  }
824 
825  if (crunch_leave_ok_strings && len >= 4 &&
826  2 * (total_alpha_count - isolated_alphas) > len &&
827  longest_alpha_repetition_count < crunch_long_repetitions) {
828  if ((crunch_accept_ok &&
829  acceptable_word_string(*word->uch_set, str, lengths) !=
830  AC_UNACCEPTABLE) ||
831  longest_lower_run_len > crunch_leave_lc_strings ||
832  longest_upper_run_len > crunch_leave_uc_strings)
833  return G_NEVER_CRUNCH;
834  }
835  if (word->reject_map.length() > 1 &&
836  strpbrk(str, " ") == NULL &&
837  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
838  word->best_choice->permuter() == FREQ_DAWG_PERM ||
839  word->best_choice->permuter() == USER_DAWG_PERM ||
840  word->best_choice->permuter() == NUMBER_PERM ||
841  acceptable_word_string(*word->uch_set, str, lengths) !=
842  AC_UNACCEPTABLE || ok_dict_word))
843  return G_OK;
844 
845  ok_chars = len - bad_char_count - isolated_digits -
846  isolated_alphas - tess_rejs;
847 
848  if (crunch_debug > 3) {
849  tprintf("garbage_word: \"%s\"\n",
850  word->best_choice->unichar_string().string());
851  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
852  len,
853  bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
854  }
855  if (bad_char_count == 0 &&
856  tess_rejs == 0 &&
857  (len > isolated_digits + isolated_alphas || len <= 2))
858  return G_OK;
859 
860  if (tess_rejs > ok_chars ||
861  (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
862  return G_TERRIBLE;
863 
864  if (len > 4) {
865  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
866  isolated_alphas;
867  if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
868  return G_DODGY;
869  else
870  return G_OK;
871  } else {
872  dodgy_chars = 2 * tess_rejs + bad_char_count;
873  if ((len == 4 && dodgy_chars > 2) ||
874  (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
875  return G_DODGY;
876  else
877  return G_OK;
878  }
879 }
880 
881 
882 /*************************************************************************
883  * word_deletable()
884  * DELETE WERDS AT ENDS OF ROWS IF
885  * Word is crunched &&
886  * ( string length = 0 OR
887  * > 50% of chars are "|" (before merging) OR
888  * certainty < -10 OR
889  * rating /char > 60 OR
890  * TOP of word is more than 0.5 xht BELOW baseline OR
891  * BOTTOM of word is more than 0.5 xht ABOVE xht OR
892  * length of word < 3xht OR
893  * height of word < 0.7 xht OR
894  * height of word > 3.0 xht OR
895  * >75% of the outline BBs have longest dimension < 0.5xht
896  *************************************************************************/
897 
899  int word_len = word->reject_map.length ();
900  float rating_per_ch;
901  TBOX box; //BB of word
902 
903  if (word->unlv_crunch_mode == CR_NONE) {
904  delete_mode = 0;
905  return CR_NONE;
906  }
907 
908  if (word_len == 0) {
909  delete_mode = 1;
910  return CR_DELETE;
911  }
912 
913  if (word->rebuild_word != NULL) {
914  // Cube leaves rebuild_word NULL.
915  box = word->rebuild_word->bounding_box();
916  if (box.height () < crunch_del_min_ht * kBlnXHeight) {
917  delete_mode = 4;
918  return CR_DELETE;
919  }
920 
921  if (noise_outlines(word->rebuild_word)) {
922  delete_mode = 5;
923  return CR_DELETE;
924  }
925  }
926 
927  if ((failure_count (word) * 1.5) > word_len) {
928  delete_mode = 2;
929  return CR_LOOSE_SPACE;
930  }
931 
932  if (word->best_choice->certainty () < crunch_del_cert) {
933  delete_mode = 7;
934  return CR_LOOSE_SPACE;
935  }
936 
937  rating_per_ch = word->best_choice->rating () / word_len;
938 
939  if (rating_per_ch > crunch_del_rating) {
940  delete_mode = 8;
941  return CR_LOOSE_SPACE;
942  }
943 
945  delete_mode = 9;
946  return CR_LOOSE_SPACE;
947  }
948 
949  if (box.bottom () >
951  delete_mode = 10;
952  return CR_LOOSE_SPACE;
953  }
954 
955  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
956  delete_mode = 11;
957  return CR_LOOSE_SPACE;
958  }
959 
960  if (box.width () < crunch_del_min_width * kBlnXHeight) {
961  delete_mode = 3;
962  return CR_LOOSE_SPACE;
963  }
964 
965  delete_mode = 0;
966  return CR_NONE;
967 }
968 
970  const char *str = word->best_choice->unichar_string().string();
971  int tess_rejs = 0;
972 
973  for (; *str != '\0'; str++) {
974  if (*str == ' ')
975  tess_rejs++;
976  }
977  return tess_rejs;
978 }
979 
980 
982  TBOX box; // BB of outline
983  inT16 outline_count = 0;
984  inT16 small_outline_count = 0;
985  inT16 max_dimension;
986  float small_limit = kBlnXHeight * crunch_small_outlines_size;
987 
988  for (int b = 0; b < word->NumBlobs(); ++b) {
989  TBLOB* blob = word->blobs[b];
990  for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
991  outline_count++;
992  box = ol->bounding_box();
993  if (box.height() > box.width())
994  max_dimension = box.height();
995  else
996  max_dimension = box.width();
997  if (max_dimension < small_limit)
998  small_outline_count++;
999  }
1000  }
1001  return small_outline_count >= outline_count;
1002 }
1003 
1004 } // namespace tesseract
const int kBlnXHeight
Definition: normalis.h:28
Definition: blobs.h:261
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:663
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:356
void rej_stat_word()
Definition: pageres.cpp:1673
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:77
float rating() const
Definition: ratngs.h:324
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
inT32 length() const
Definition: rejctmap.h:237
BOOL8 potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
Definition: docqual.cpp:545
WERD_CHOICE * best_choice
Definition: pageres.h:219
DocQualCallbacks(WERD_RES *word0)
Definition: docqual.cpp:36
REJMAP reject_map
Definition: pageres.h:271
inT32 char_count
Definition: pageres.h:60
#define tprintf(...)
Definition: tprintf.h:31
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
inT32 whole_word_rej_count
Definition: pageres.h:130
BOOL8 reject_spaces
Definition: pageres.h:317
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:193
PAGE_RES * page_res
Definition: pageres.h:658
const STRING & unichar_lengths() const
Definition: ratngs.h:531
Definition: docqual.h:28
unsigned char BOOL8
Definition: host.h:113
inT32 length() const
Definition: strngs.cpp:188
bool IsText() const
Definition: polyblk.h:52
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:421
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
BLOCK * block
Definition: pageres.h:99
BOOL8 rejected
Definition: pageres.h:63
Definition: ocrrow.h:32
BOOL8 quality_recoverable_rejects()
Definition: rejctmap.cpp:354
const STRING & unichar_string() const
Definition: ratngs.h:524
Definition: werd.h:35
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:163
double tessedit_whole_wd_rej_row_percent
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode)
Definition: docqual.cpp:898
BLOCK_RES * block() const
Definition: pageres.h:739
double tessedit_reject_block_percent
WERD_RES * forward()
Definition: pageres.h:713
int NumBlobs() const
Definition: blobs.h:425
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:140
bool crunch_early_convert_bad_unlv_chs
WERD_RES * restart_page()
Definition: pageres.h:680
inT16 failure_count(WERD_RES *word)
Definition: docqual.cpp:969
Definition: werd.h:36
inT16 count_outline_errs(char c, inT16 outline_count)
Definition: docqual.cpp:128
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1767
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97
CRUNCH_MODE
Definition: pageres.h:145
float certainty() const
Definition: ratngs.h:327
TWERD * rebuild_word
Definition: pageres.h:244
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
inT32 char_count
Definition: pageres.h:100
void rej_word_block_rej()
Definition: rejctmap.cpp:506
const UNICHARSET * uch_set
Definition: pageres.h:192
ROW_RES * row() const
Definition: pageres.h:736
uinT8 permuter() const
Definition: ratngs.h:343
inT32 rej_count
Definition: pageres.h:61
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
const int kBlnBaselineOffset
Definition: normalis.h:29
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
Definition: docqual.cpp:683
void rej_word_row_rej()
Definition: rejctmap.cpp:515
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1663
int UNICHAR_ID
Definition: unichar.h:33
inT16 accept_count()
Definition: rejctmap.cpp:331
inT32 rej_count
Definition: pageres.h:129
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
inT32 char_count
Definition: pageres.h:128
double tessedit_reject_row_percent
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:65
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:235
inT16 bottom() const
Definition: rect.h:61
TBOX bounding_box() const
Definition: blobs.cpp:881
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:507
WERD * word
Definition: pageres.h:175
bool empty() const
Definition: genericvector.h:84
inT32 rej_count
Definition: pageres.h:101
inT16 height() const
Definition: rect.h:104
Unacceptable word.
Definition: control.h:36
inT16 width() const
Definition: rect.h:111
bool tessedit_preserve_row_rej_perfect_wds
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
#define FALSE
Definition: capi.h:29
tesseract::BoxWord * bln_boxes
Definition: pageres.h:184
double tessedit_good_doc_still_rowrej_wd
GARBAGE_LEVEL
Definition: docqual.h:25
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:43
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:117
ROW * row
Definition: pageres.h:127
inT16 reject_count()
Definition: rejctmap.h:243
BOOL8 noise_outlines(TWERD *word)
Definition: docqual.cpp:981
Definition: rect.h:30
#define TRUE
Definition: capi.h:28
int index() const
Definition: pdblock.h:77
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:410
double tessedit_reject_doc_percent
uinT8 space()
Definition: werd.h:104
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
Definition: strngs.h:44
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
#define NULL
Definition: host.h:144
Definition: blobs.h:395
TESSLINE * outlines
Definition: blobs.h:377
void CountMatchingBlobs(int index)
Definition: docqual.cpp:39
const char * string() const
Definition: strngs.cpp:193
inT16 top() const
Definition: rect.h:54
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
ROW_RES * prev_row() const
Definition: pageres.h:727
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:593
void merge_tess_fails()
Definition: pageres.cpp:1061
void rej_word_doc_rej()
Definition: rejctmap.cpp:497
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:49
int NumOutlines() const
Definition: blobs.cpp:469
bool tessedit_preserve_blk_rej_perfect_wds
WERD_RES * word() const
Definition: pageres.h:733
short inT16
Definition: host.h:100