// Source listing: tesseract-ocr.github.io/3.x/a00697_source.html

 /******************************************************************

  * File:        docqual.cpp  (Formerly docqual.c)

  * Description: Document Quality Metrics

  * Author:              Phil Cheatle

  * Created:             Mon May  9 11:27:28 BST 1994

  *

  * (C) Copyright 1994, Hewlett-Packard Ltd.

  ** Licensed under the Apache License, Version 2.0 (the "License");

  ** you may not use this file except in compliance with the License.

  ** You may obtain a copy of the License at

  ** http://www.apache.org/licenses/LICENSE-2.0

  ** Unless required by applicable law or agreed to in writing, software

  ** distributed under the License is distributed on an "AS IS" BASIS,

  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  ** See the License for the specific language governing permissions and

  ** limitations under the License.

  *

  **********************************************************************/


 #ifdef _MSC_VER

 #pragma warning(disable:4244)  // Conversion warnings

 #endif


 #include          <ctype.h>

 #include          "docqual.h"

 #include          "reject.h"

 #include          "tesscallback.h"

 #include          "tessvars.h"

 #include          "globals.h"

 #include          "tesseractclass.h"


 namespace tesseract{


 // A little class to provide the callbacks as we have no pre-bound args.

 // Callback receiver used with ProcessMatchedBlobs(): it carries the word being
 // examined plus the running tallies, because the callbacks themselves are only
 // handed a blob index.
 struct DocQualCallbacks {
   // Starts with both tallies at zero for the given word.
   explicit DocQualCallbacks(WERD_RES* word0)
     : word(word0), match_count(0), accepted_match_count(0) {}

   // Tallies one more matched blob; the index itself is not needed.
   void CountMatchingBlobs(int /* index */) {
     match_count++;
   }

   // Tallies a matched blob and, separately, whether its reject-map entry is
   // currently accepted.
   void CountAcceptedBlobs(int index) {
     match_count++;
     if (word->reject_map[index].accepted()) {
       accepted_match_count++;
     }
   }

   // Marks the reject-map entry as quality-accepted when the entry reports
   // that it may be accepted on good quality.
   void AcceptIfGoodQuality(int index) {
     if (!word->reject_map[index].accept_if_good_quality()) {
       return;
     }
     word->reject_map[index].setrej_quality_accept();
   }

   WERD_RES* word;              // Word whose reject map is consulted/updated.
   inT16 match_count;           // Number of callback hits counted so far.
   inT16 accepted_match_count;  // Hits whose reject-map entry was accepted.
 };


 /*************************************************************************

  * word_blob_quality()

  * How many blobs in the box_word are identical to those of the inword?

  * ASSUME blobs in both initial word and box_word are in ascending order of

  * left hand blob edge.

  *************************************************************************/

 // Returns the number of blobs for which ProcessMatchedBlobs() fires the
 // counting callback, or 0 when the word has no bln_boxes, no rebuild_word or
 // an empty blob list. The row argument is not used.
 inT16 Tesseract::word_blob_quality(WERD_RES *word, ROW *row) {
   if (word->bln_boxes == NULL) {
     return 0;
   }
   if (word->rebuild_word == NULL) {
     return 0;
   }
   if (word->rebuild_word->blobs.empty()) {
     return 0;
   }

   DocQualCallbacks counter(word);
   word->bln_boxes->ProcessMatchedBlobs(
       *word->rebuild_word,
       NewPermanentTessCallback(&counter,
                                &DocQualCallbacks::CountMatchingBlobs));
   return counter.match_count;
 }


 // Sums, over the blobs of the rebuilt word, the difference between each
 // blob's outline count and the count expected for the character recognised
 // at that position (see count_outline_errs()).
 //
 // The best-choice text is a byte string in which one unichar may occupy
 // several bytes, so the character for blob b is located through the
 // per-unichar byte lengths instead of indexing the string with b (which
 // reads the wrong byte after the first multi-byte unichar).
 //
 // Returns 0 when there is no rebuilt word or no best choice.
 inT16 Tesseract::word_outline_errs(WERD_RES *word) {
   if (word->rebuild_word == NULL || word->best_choice == NULL)
     return 0;

   const char *str = word->best_choice->unichar_string().string();
   const char *lengths = word->best_choice->unichar_lengths().string();
   const int num_blobs = word->rebuild_word->NumBlobs();
   inT16 err_count = 0;
   int offset = 0;  // Byte offset of the current unichar within str.

   // Stop at the end of the text as well as at the last blob, so that a
   // mismatch between blob count and character count cannot read past the
   // end of the string.
   for (int b = 0; b < num_blobs && lengths[b] != '\0'; ++b) {
     TBLOB* blob = word->rebuild_word->blobs[b];
     err_count += count_outline_errs(str[offset], blob->NumOutlines());
     offset += lengths[b];
   }
   return err_count;
 }


 /*************************************************************************

  * word_char_quality()

  * Combination of blob quality and outline quality - how many good chars are

  * there? - I.e chars which pass the blob AND outline tests.

  *************************************************************************/

 // Counts the blobs reported by ProcessMatchedBlobs() and how many of those
 // have an accepted reject-map entry.
 //   word                  word to examine.
 //   row                   not used.
 //   match_count           out: number of matched blobs.
 //   accepted_match_count  out: matched blobs whose reject-map entry is
 //                         accepted.
 // Both outputs are always written: they are set to 0 when the word has no
 // bln_boxes, no rebuild_word or no blobs, so callers never read a stale or
 // uninitialised value.
 void Tesseract::word_char_quality(WERD_RES *word,
                                   ROW *row,
                                   inT16 *match_count,
                                   inT16 *accepted_match_count) {
   if (word->bln_boxes == NULL ||
       word->rebuild_word == NULL || word->rebuild_word->blobs.empty()) {
     *match_count = 0;
     *accepted_match_count = 0;
     return;
   }

   DocQualCallbacks cb(word);
   word->bln_boxes->ProcessMatchedBlobs(
       *word->rebuild_word,
       NewPermanentTessCallback(&cb, &DocQualCallbacks::CountAcceptedBlobs));
   *match_count = cb.match_count;
   *accepted_match_count = cb.accepted_match_count;
 }


 /*************************************************************************

  * unrej_good_chs()

  * Unreject POTENTIAL rejects if the blob passes the blob and outline checks

  *************************************************************************/

 // Runs the AcceptIfGoodQuality callback over the blobs reported by
 // ProcessMatchedBlobs(). Does nothing when the word has no bln_boxes, no
 // rebuild_word or an empty blob list. The row argument is not used.
 void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) {
   if (word->bln_boxes == NULL) {
     return;
   }
   if (word->rebuild_word == NULL) {
     return;
   }
   if (word->rebuild_word->blobs.empty()) {
     return;
   }

   DocQualCallbacks acceptor(word);
   word->bln_boxes->ProcessMatchedBlobs(
       *word->rebuild_word,
       NewPermanentTessCallback(&acceptor,
                                &DocQualCallbacks::AcceptIfGoodQuality));
 }


 // Returns how far outline_count is from the number of outlines expected for
 // character c: characters listed in outlines_odd are never scored (0),
 // characters listed in outlines_2 are expected to have two outlines, and
 // everything else one.
 inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
   if (STRING(outlines_odd).contains(c)) {
     return 0;  // Character is excluded from the outline test.
   }
   const int expected_outlines = STRING(outlines_2).contains(c) ? 2 : 1;
   return abs(outline_count - expected_outlines);
 }


 // Top-level driver for the quality-based passes: optional un-rejection on
 // good-quality documents, then document/block rejection, then (when
 // unlv_tilde_crunching is set) tilde crunching followed by tilde deletion.
 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
                                         BOOL8 good_quality_doc) {
   if (tessedit_good_quality_unrej) {
     if (good_quality_doc) {
       unrej_good_quality_words(page_res_it);
     }
   }

   doc_and_block_rejection(page_res_it, good_quality_doc);

   if (!unlv_tilde_crunching) {
     return;
   }
   tilde_crunch(page_res_it);
   tilde_delete(page_res_it);
 }


 /*************************************************************************
  * unrej_good_quality_words()
  * Accept potential rejects in words which pass the following checks:
  *    - Contains a potential reject
  *    - Word looks like a sensible alpha word.
  *    - Word segmentation is the same as the original image
  *    - All characters have the expected number of outlines
  * NOTE - the rejection counts are recalculated after unrejection
  *      - CAN'T do it in a single pass without a bit of fiddling
  *      - keep it simple but inefficient
  *************************************************************************/

 // Two passes over the page.
 //   Pass 1: un-reject characters. With bland_unrej every reject-map entry
 //           that may be accepted on good quality is accepted. Otherwise
 //           only words on rows whose reject fraction is at most
 //           quality_rowrej_pc are considered, and rows above that limit are
 //           skipped entirely.
 //   Pass 2: zero the page, block and row counters and rebuild them by
 //           calling rej_stat_word() for every word.
 void Tesseract::unrej_good_quality_words(PAGE_RES_IT &page_res_it) {
   // ---- Pass 1: un-rejection ----
   page_res_it.restart_page();
   while (page_res_it.word() != NULL) {
     check_debug_pt(page_res_it.word(), 100);

     if (bland_unrej) {
       WERD_RES *bland_word = page_res_it.word();
       for (int pos = 0; pos < bland_word->reject_map.length(); ++pos) {
         if (bland_word->reject_map[pos].accept_if_good_quality()) {
           bland_word->reject_map[pos].setrej_quality_accept();
         }
       }
       page_res_it.forward();
     } else if (page_res_it.row()->char_count > 0 &&
                (page_res_it.row()->rej_count /
                     static_cast<float>(page_res_it.row()->char_count)) <=
                    quality_rowrej_pc) {
       WERD_RES *candidate = page_res_it.word();
       if (candidate->reject_map.quality_recoverable_rejects() &&
           (tessedit_unrej_any_wd ||
            acceptable_word_string(
                *candidate->uch_set,
                candidate->best_choice->unichar_string().string(),
                candidate->best_choice->unichar_lengths().string()) !=
                AC_UNACCEPTABLE)) {
         unrej_good_chs(candidate, page_res_it.row()->row);
       }
       page_res_it.forward();
     } else {
       // Row is too poor (or empty): step over all of its remaining words.
       ROW_RES *dodgy_row = page_res_it.row();
       while (page_res_it.word() != NULL && page_res_it.row() == dodgy_row) {
         page_res_it.forward();
       }
     }

     check_debug_pt(page_res_it.word(), 110);
   }

   // ---- Pass 2: recompute the reject statistics ----
   page_res_it.restart_page();
   page_res_it.page_res->char_count = 0;
   page_res_it.page_res->rej_count = 0;
   BLOCK_RES *seen_block = NULL;
   ROW_RES *seen_row = NULL;
   for (; page_res_it.word() != NULL; page_res_it.forward()) {
     if (page_res_it.block() != seen_block) {
       // First word of a new block: reset that block's counters.
       seen_block = page_res_it.block();
       seen_block->char_count = 0;
       seen_block->rej_count = 0;
     }
     if (page_res_it.row() != seen_row) {
       // First word of a new row: reset that row's counters.
       seen_row = page_res_it.row();
       seen_row->char_count = 0;
       seen_row->rej_count = 0;
       seen_row->whole_word_rej_count = 0;
     }
     page_res_it.rej_stat_word();
   }
 }


 /*************************************************************************

  * doc_and_block_rejection()

  *

  * If the page has too many rejects - reject all of it.

  * If any block has too many rejects - reject all words in the block

  *************************************************************************/


 // Applies page-, block- and row-level rejection from the reject counters
 // stored in the page results.
 //   page_res_it       iterator over the page results; restarted and advanced
 //                     by this function.
 //   good_quality_doc  when TRUE and tessedit_row_rej_good_docs is off, row
 //                     rejection only rejects words whose own reject fraction
 //                     exceeds tessedit_good_doc_still_rowrej_wd.
 // Order of decisions: whole page first; otherwise each block; and for blocks
 // that are not rejected, each row within the block.
 void Tesseract::doc_and_block_rejection(  //reject big chunks

                                         PAGE_RES_IT &page_res_it,

                                         BOOL8 good_quality_doc) {

   inT16 block_no = 0;

   inT16 row_no = 0;

   BLOCK_RES *current_block;

   ROW_RES *current_row;


   BOOL8 rej_word;

   BOOL8 prev_word_rejected;

   inT16 char_quality = 0;

   // Filled in by word_char_quality() but not read in this function.
   inT16 accepted_char_quality;


   // Page-level test: percentage of rejected characters on the whole page.
   // NOTE(review): unlike the block and row tests below, this ratio is not
   // guarded by a char_count > 0 check - confirm an empty page is harmless.
   if (page_res_it.page_res->rej_count * 100.0 /

       page_res_it.page_res->char_count > tessedit_reject_doc_percent) {

     reject_whole_page(page_res_it);

     if (tessedit_debug_doc_rejection) {

       tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",

               page_res_it.page_res->char_count,

               page_res_it.page_res->rej_count);

     }

   } else {

     if (tessedit_debug_doc_rejection) {

       tprintf("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n",

               page_res_it.page_res->char_count,

               page_res_it.page_res->rej_count);

     }


     /* Walk blocks testing for block rejection */


     page_res_it.restart_page();

     WERD_RES* word;

     // Each iteration of this loop consumes one whole block: both branches
     // below advance the iterator until the block changes or words run out.
     while ((word = page_res_it.word()) != NULL) {

       current_block = page_res_it.block();

       block_no = current_block->block->index();

       if (current_block->char_count > 0 &&

           (current_block->rej_count * 100.0 / current_block->char_count) >

            tessedit_reject_block_percent) {

         if (tessedit_debug_block_rejection) {

           tprintf("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n",

                   block_no, current_block->char_count,

                   current_block->rej_count);

         }

         prev_word_rejected = FALSE;

         while ((word = page_res_it.word()) != NULL &&

                (page_res_it.block() == current_block)) {

           if (tessedit_preserve_blk_rej_perfect_wds) {

             // Keep words that have no rejects and are at least
             // tessedit_preserve_min_wd_len long.
             rej_word = word->reject_map.reject_count() > 0 ||

                 word->reject_map.length () < tessedit_preserve_min_wd_len;

             // Second chance for long-enough, acceptable-looking words:
             // keep them if every character passes the blob match.
             if (rej_word && tessedit_dont_blkrej_good_wds &&

                 word->reject_map.length() >= tessedit_preserve_min_wd_len &&

                 acceptable_word_string(

                     *word->uch_set,

                     word->best_choice->unichar_string().string(),

                     word->best_choice->unichar_lengths().string()) !=

                 AC_UNACCEPTABLE) {

               word_char_quality(word, page_res_it.row()->row,

                                 &char_quality,

                                 &accepted_char_quality);

               rej_word = char_quality !=  word->reject_map.length();

             }

           } else {

             rej_word = TRUE;

           }

           if (rej_word) {

             /*

               Reject spacing if both current and prev words are rejected.

               NOTE - this is NOT restricted to FUZZY spaces. - When tried this

               generated more space errors.

             */

             if (tessedit_use_reject_spaces &&

                 prev_word_rejected &&

                 page_res_it.prev_row() == page_res_it.row() &&

                 word->word->space() == 1)

               word->reject_spaces = TRUE;

             word->reject_map.rej_word_block_rej();

           }

           prev_word_rejected = rej_word;

           page_res_it.forward();

         }

       } else {

         if (tessedit_debug_block_rejection) {

           tprintf("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n",

                   block_no, page_res_it.block()->char_count,

                   page_res_it.block()->rej_count);

         }


         /* Walk rows in block testing for row rejection */

         // row_no is a 1-based counter within the block, used only in the
         // debug output.
         row_no = 0;

         while (page_res_it.word() != NULL &&

                page_res_it.block() == current_block) {

           current_row = page_res_it.row();

           row_no++;

           /* Reject whole row if:

             fraction of chars on row which are rejected exceed a limit AND

             fraction rejects which occur in WHOLE WERD rejects is LESS THAN a

             limit

           */

           if (current_row->char_count > 0 &&

               (current_row->rej_count * 100.0 / current_row->char_count) >

               tessedit_reject_row_percent &&

               (current_row->whole_word_rej_count * 100.0 /

                   current_row->rej_count) <

               tessedit_whole_wd_rej_row_percent) {

             if (tessedit_debug_block_rejection) {

               tprintf("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n",

                       row_no, current_row->char_count,

                       current_row->rej_count);

             }

             prev_word_rejected = FALSE;

             while ((word = page_res_it.word()) != NULL &&

                    page_res_it.row () == current_row) {

               /* Preserve words on good docs unless they are mostly rejected*/

               if (!tessedit_row_rej_good_docs && good_quality_doc) {

                 rej_word = word->reject_map.reject_count() /

                     static_cast<float>(word->reject_map.length()) >

                     tessedit_good_doc_still_rowrej_wd;

               } else if (tessedit_preserve_row_rej_perfect_wds) {

                 /* Preserve perfect words anyway */

                 rej_word = word->reject_map.reject_count() > 0 ||

                     word->reject_map.length () < tessedit_preserve_min_wd_len;

                 // Same second-chance test as in the block-rejection path,
                 // controlled by tessedit_dont_rowrej_good_wds.
                 if (rej_word && tessedit_dont_rowrej_good_wds &&

                     word->reject_map.length() >= tessedit_preserve_min_wd_len &&

                     acceptable_word_string(*word->uch_set,

                         word->best_choice->unichar_string().string(),

                         word->best_choice->unichar_lengths().string()) !=

                             AC_UNACCEPTABLE) {

                   word_char_quality(word, page_res_it.row()->row,

                                     &char_quality,

                                     &accepted_char_quality);

                   rej_word = char_quality != word->reject_map.length();

                 }

               } else {

                 rej_word = TRUE;

               }

               if (rej_word) {

                 /*

                   Reject spacing if both current and prev words are rejected.

                   NOTE - this is NOT restricted to FUZZY spaces. - When tried

                   this generated more space errors.

                 */

                 if (tessedit_use_reject_spaces &&

                     prev_word_rejected &&

                     page_res_it.prev_row() == page_res_it.row() &&

                     word->word->space () == 1)

                   word->reject_spaces = TRUE;

                 word->reject_map.rej_word_row_rej();

               }

               prev_word_rejected = rej_word;

               page_res_it.forward();

             }

           } else {

             if (tessedit_debug_block_rejection) {

               tprintf("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n",

                       row_no, current_row->char_count, current_row->rej_count);

             }

             // Row is kept: skip over all of its words.
             while (page_res_it.word() != NULL &&

                    page_res_it.row() == current_row)

               page_res_it.forward();

           }

         }

       }

     }

   }

 }


 }  // namespace tesseract


 /*************************************************************************
  * reject_whole_page()
  * Don't believe any of it - set the reject map to 00..00 in all words
  *
  *************************************************************************/


 // Applies the document-level rejection to every word on the page and then
 // flags the page result itself as rejected.
 void reject_whole_page(PAGE_RES_IT &page_res_it) {
   for (page_res_it.restart_page();
        page_res_it.word() != NULL;
        page_res_it.forward()) {
     page_res_it.word()->reject_map.rej_word_doc_rej();
   }
   // The whole page is rejected.
   page_res_it.page_res->rejected = TRUE;
 }


 namespace tesseract {

 // Marks words for crunching (unlv_crunch_mode = CR_KEEP_SPACE).
 //   - Words in non-text poly blocks are skipped untouched.
 //   - A word with any accepted character ends the current run.
 //   - A "terrible" word is crunched, and so are the potential words
 //     remembered just before it.
 //   - A "potential" word is crunched at once if it follows a terrible word;
 //     otherwise the first one in a run is remembered as a pending start.
 //   - Any other word ends the current run.
 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
   PAGE_RES_IT pending_start;        // First potential word of the current run.
   BOOL8 have_pending = FALSE;       // pending_start is valid.
   BOOL8 after_terrible = FALSE;     // Previous word(s) were terrible.

   for (page_res_it.restart_page();
        page_res_it.word() != NULL;
        page_res_it.forward()) {
     POLY_BLOCK* poly = page_res_it.block()->block->poly_block();
     if (poly != NULL && !poly->IsText()) {
       continue;
     }

     WERD_RES *word = page_res_it.word();

     if (crunch_early_convert_bad_unlv_chs) {
       convert_bad_unlv_chs(word);
     }
     if (crunch_early_merge_tess_fails) {
       word->merge_tess_fails();
     }

     if (word->reject_map.accept_count() != 0) {
       // Forget earlier potential crunches.
       after_terrible = FALSE;
       have_pending = FALSE;
       continue;
     }

     const BOOL8 ok_dict_word = safe_dict_word(word);
     const GARBAGE_LEVEL garbage_level = garbage_word(word, ok_dict_word);
     const bool may_crunch = garbage_level != G_NEVER_CRUNCH;

     if (may_crunch && terrible_word_crunch(word, garbage_level)) {
       if (crunch_debug > 0) {
         tprintf("T CRUNCHING: \"%s\"\n",
                 word->best_choice->unichar_string().string());
       }
       word->unlv_crunch_mode = CR_KEEP_SPACE;
       if (have_pending) {
         // Crunch everything from the remembered start up to this word.
         for (; pending_start.word() != word; pending_start.forward()) {
           if (crunch_debug > 0) {
             tprintf("P1 CRUNCHING: \"%s\"\n",
                     pending_start.word()->best_choice->unichar_string().string());
           }
           pending_start.word()->unlv_crunch_mode = CR_KEEP_SPACE;
         }
         have_pending = FALSE;
       }
       after_terrible = TRUE;
     } else if (may_crunch &&
                potential_word_crunch(word, garbage_level, ok_dict_word)) {
       if (after_terrible) {
         if (crunch_debug > 0) {
           tprintf("P2 CRUNCHING: \"%s\"\n",
                   word->best_choice->unichar_string().string());
         }
         word->unlv_crunch_mode = CR_KEEP_SPACE;
       } else if (!have_pending) {
         pending_start = page_res_it;
         have_pending = TRUE;
         if (crunch_debug > 1) {
           tprintf("P3 CRUNCHING: \"%s\"\n",
                   word->best_choice->unichar_string().string());
         }
       }
     } else {
       // Forget earlier potential crunches.
       after_terrible = FALSE;
       have_pending = FALSE;
       if (crunch_debug > 2) {
         tprintf("NO CRUNCH: \"%s\"\n",
                 word->best_choice->unichar_string().string());
       }
     }
   }
 }


 // Decides whether a word is bad enough to crunch outright. Returns TRUE
 // when any of these holds (the number is the mode shown in debug output):
 //   1 - the text is empty or consists only of spaces
 //   2 - rating per character exceeds crunch_terrible_rating
 //   3 - crunch_terrible_garbage is set and garbage_level is G_TERRIBLE
 //   4 - garbage_level is not G_OK and certainty is below
 //       crunch_poor_garbage_cert
 //   5 - garbage_level is not G_OK and rating per character exceeds
 //       crunch_poor_garbage_rate
 // The per-character rating divides by the reject-map length, capped at
 // crunch_rating_max.
 BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word,
                                       GARBAGE_LEVEL garbage_level) {
   int crunch_mode = 0;

   if ((word->best_choice->unichar_string().length() == 0) ||
       (strspn(word->best_choice->unichar_string().string(), " ") ==
        word->best_choice->unichar_string().length())) {
     crunch_mode = 1;
   } else {
     int adjusted_len = word->reject_map.length();
     if (adjusted_len > crunch_rating_max) {
       adjusted_len = crunch_rating_max;
     }
     const float rating_per_ch = word->best_choice->rating() / adjusted_len;

     if (rating_per_ch > crunch_terrible_rating) {
       crunch_mode = 2;
     } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
       crunch_mode = 3;
     } else if (garbage_level != G_OK) {
       if (word->best_choice->certainty() < crunch_poor_garbage_cert) {
         crunch_mode = 4;
       } else if (rating_per_ch > crunch_poor_garbage_rate) {
         crunch_mode = 5;
       }
     }
   }

   if (crunch_mode <= 0) {
     return FALSE;
   }
   if (crunch_debug > 2) {
     tprintf("Terrible_word_crunch (%d) on \"%s\"\n",
             crunch_mode, word->best_choice->unichar_string().string());
   }
   return TRUE;
 }


 // Counts up to three "poor" indicators for the word and returns TRUE when
 // the count reaches crunch_pot_indicators:
 //   - rating per character (reject-map length capped at 10) above
 //     crunch_pot_poor_rate
 //   - the word is crunchable and its certainty is below crunch_pot_poor_cert
 //   - garbage_level is anything other than G_OK
 // A word is crunchable unless crunch_leave_accept_strings is set, it has at
 // least 3 characters, and it is either an acceptable word string or an OK
 // dictionary word.
 BOOL8 Tesseract::potential_word_crunch(WERD_RES *word,
                                        GARBAGE_LEVEL garbage_level,
                                        BOOL8 ok_dict_word) {
   const char *text = word->best_choice->unichar_string().string();
   const char *text_lengths = word->best_choice->unichar_lengths().string();

   BOOL8 word_crunchable = TRUE;
   if (crunch_leave_accept_strings && word->reject_map.length() >= 3) {
     word_crunchable =
         acceptable_word_string(*word->uch_set, text, text_lengths) ==
             AC_UNACCEPTABLE &&
         !ok_dict_word;
   }

   int adjusted_len = word->reject_map.length();
   if (adjusted_len > 10) {
     adjusted_len = 10;
   }
   const float rating_per_ch = word->best_choice->rating() / adjusted_len;

   int poor_indicator_count = 0;

   if (rating_per_ch > crunch_pot_poor_rate) {
     if (crunch_debug > 2) {
       tprintf("Potential poor rating on \"%s\"\n",
               word->best_choice->unichar_string().string());
     }
     ++poor_indicator_count;
   }

   if (word_crunchable &&
       word->best_choice->certainty() < crunch_pot_poor_cert) {
     if (crunch_debug > 2) {
       tprintf("Potential poor cert on \"%s\"\n",
               word->best_choice->unichar_string().string());
     }
     ++poor_indicator_count;
   }

   if (garbage_level != G_OK) {
     if (crunch_debug > 2) {
       tprintf("Potential garbage on \"%s\"\n",
               word->best_choice->unichar_string().string());
     }
     ++poor_indicator_count;
   }

   return poor_indicator_count >= crunch_pot_indicators;
 }


 // Assigns the delete mode returned by word_deletable() to runs of deletable
 // words that touch either end of a line.
 //   - From a W_BOL word onwards, consecutive deletable words are marked
 //     immediately.
 //   - In mid-line the first deletable word of a run is remembered; if the
 //     run reaches a W_EOL word, every word from the remembered point up to
 //     and including the W_EOL word is marked.
 //   - A non-deletable word cancels both kinds of run.
 // When crunch_early_merge_tess_fails is off, merge_tess_fails() is applied
 // to each word after it has been examined.
 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
   PAGE_RES_IT run_start;            // First word of a pending mid-line run.
   BOOL8 deleting_from_bol = FALSE;
   BOOL8 have_run_start = FALSE;
   inT16 debug_delete_mode;          // Reason code for the current word.
   inT16 run_debug_delete_mode;      // Reason code for a word in the run.

   for (page_res_it.restart_page();
        page_res_it.word() != NULL;
        page_res_it.forward()) {
     WERD_RES *word = page_res_it.word();
     const CRUNCH_MODE delete_mode = word_deletable(word, debug_delete_mode);

     if (delete_mode == CR_NONE) {
       // Forget earlier potential crunches.
       deleting_from_bol = FALSE;
       have_run_start = FALSE;
     } else if (word->word->flag(W_BOL) || deleting_from_bol) {
       if (crunch_debug > 0) {
         tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n",
                 debug_delete_mode,
                 word->best_choice->unichar_string().string());
       }
       word->unlv_crunch_mode = delete_mode;
       deleting_from_bol = TRUE;
     } else if (word->word->flag(W_EOL)) {
       if (have_run_start) {
         // Flush the pending run that leads up to this end-of-line word.
         for (; run_start.word() != word; run_start.forward()) {
           const CRUNCH_MODE run_delete_mode =
               word_deletable(run_start.word(), run_debug_delete_mode);
           if (crunch_debug > 0) {
             tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n",
                     run_debug_delete_mode,
                     run_start.word()->best_choice->unichar_string().string());
           }
           run_start.word()->unlv_crunch_mode = run_delete_mode;
         }
       }
       if (crunch_debug > 0) {
         tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n",
                 debug_delete_mode,
                 word->best_choice->unichar_string().string());
       }
       word->unlv_crunch_mode = delete_mode;
       deleting_from_bol = FALSE;
       have_run_start = FALSE;
     } else if (!have_run_start) {
       run_start = page_res_it;
       have_run_start = TRUE;
     }

     // Left until now because the tess fails are used to decide whether the
     // word is deletable.
     if (!crunch_early_merge_tess_fails) {
       word->merge_tess_fails();
     }
   }
 }


 // Rewrites two characters in the best choice: '~' becomes '-' and '^'
 // becomes ' '. Any converted position that was accepted is flagged with
 // setrej_unlv_rej().
 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
   const UNICHAR_ID dash_id = word_res->uch_set->unichar_to_id("-");
   const UNICHAR_ID space_id = word_res->uch_set->unichar_to_id(" ");
   const UNICHAR_ID tilde_id = word_res->uch_set->unichar_to_id("~");
   const UNICHAR_ID pow_id = word_res->uch_set->unichar_to_id("^");

   int pos = 0;
   while (pos < word_res->reject_map.length()) {
     // The two tests are deliberately independent (not else-if): the second
     // one looks at the id as it stands after the first replacement.
     if (word_res->best_choice->unichar_id(pos) == tilde_id) {
       word_res->best_choice->set_unichar_id(dash_id, pos);
       if (word_res->reject_map[pos].accepted()) {
         word_res->reject_map[pos].setrej_unlv_rej();
       }
     }
     if (word_res->best_choice->unichar_id(pos) == pow_id) {
       word_res->best_choice->set_unichar_id(space_id, pos);
       if (word_res->reject_map[pos].accepted()) {
         word_res->reject_map[pos].setrej_unlv_rej();
       }
     }
     ++pos;
   }
 }


 // Classifies how garbage-like the best choice of a word is.
 //   word          word whose best choice, reject map and unicharset are read.
 //   ok_dict_word  caller-supplied flag; when TRUE the word qualifies for the
 //                 early G_OK return below.
 // Returns G_NEVER_CRUNCH, G_OK, G_DODGY or G_TERRIBLE.
 //
 // The text is scanned once, one unichar at a time, through a small state
 // machine that counts upper/lower/digit characters, isolated alphas and
 // digits, the longest same-case runs, the longest repetition of one letter,
 // spaces (counted as tess rejects) and other characters (counted as bad).
 //
 // The scan uses its own cursors so that str and lengths still refer to the
 // START of the word afterwards. The later acceptable_word_string() and
 // strpbrk() calls need the whole word; handing them the advanced pointers
 // would make them look at an empty string.
 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
   enum STATES
   {
     JUNK,
     FIRST_UPPER,
     FIRST_LOWER,
     FIRST_NUM,
     SUBSEQUENT_UPPER,
     SUBSEQUENT_LOWER,
     SUBSEQUENT_NUM
   };
   const char *str = word->best_choice->unichar_string().string();
   const char *lengths = word->best_choice->unichar_lengths().string();
   STATES state = JUNK;
   int len = 0;
   int isolated_digits = 0;
   int isolated_alphas = 0;
   int bad_char_count = 0;
   int tess_rejs = 0;
   int dodgy_chars = 0;
   int ok_chars;
   UNICHAR_ID last_char = -1;
   int alpha_repetition_count = 0;
   int longest_alpha_repetition_count = 0;
   int longest_lower_run_len = 0;
   int lower_string_count = 0;
   int longest_upper_run_len = 0;
   int upper_string_count = 0;
   int total_alpha_count = 0;
   int total_digit_count = 0;

   // Scan cursors: ch points at the current unichar, ch_len at its byte length.
   const char *ch = str;
   const char *ch_len = lengths;

   for (; *ch != '\0'; ch += *(ch_len++)) {
     len++;
     if (word->uch_set->get_isupper (ch, *ch_len)) {
       total_alpha_count++;
       switch (state) {
         case SUBSEQUENT_UPPER:
         case FIRST_UPPER:
           state = SUBSEQUENT_UPPER;
           upper_string_count++;
           if (longest_upper_run_len < upper_string_count)
             longest_upper_run_len = upper_string_count;
           if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
             alpha_repetition_count++;
             if (longest_alpha_repetition_count < alpha_repetition_count) {
               longest_alpha_repetition_count = alpha_repetition_count;
             }
           }
           else {
             last_char = word->uch_set->unichar_to_id(ch, *ch_len);
             alpha_repetition_count = 1;
           }
           break;
         case FIRST_NUM:
           isolated_digits++;
           // Fall through: a new upper-case run starts here.
         default:
           state = FIRST_UPPER;
           last_char = word->uch_set->unichar_to_id(ch, *ch_len);
           alpha_repetition_count = 1;
           upper_string_count = 1;
           break;
       }
     }
     else if (word->uch_set->get_islower (ch, *ch_len)) {
       total_alpha_count++;
       switch (state) {
         case SUBSEQUENT_LOWER:
         case FIRST_LOWER:
           state = SUBSEQUENT_LOWER;
           lower_string_count++;
           if (longest_lower_run_len < lower_string_count)
             longest_lower_run_len = lower_string_count;
           if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
             alpha_repetition_count++;
             if (longest_alpha_repetition_count < alpha_repetition_count) {
               longest_alpha_repetition_count = alpha_repetition_count;
             }
           }
           else {
             last_char = word->uch_set->unichar_to_id(ch, *ch_len);
             alpha_repetition_count = 1;
           }
           break;
         case FIRST_NUM:
           isolated_digits++;
           // Fall through: a new lower-case run starts here.
         default:
           state = FIRST_LOWER;
           last_char = word->uch_set->unichar_to_id(ch, *ch_len);
           alpha_repetition_count = 1;
           lower_string_count = 1;
           break;
       }
     }
     else if (word->uch_set->get_isdigit (ch, *ch_len)) {
       total_digit_count++;
       switch (state) {
         case FIRST_NUM:
           state = SUBSEQUENT_NUM;
           // Fall through.
         case SUBSEQUENT_NUM:
           break;
         case FIRST_UPPER:
         case FIRST_LOWER:
           isolated_alphas++;
           // Fall through: a new digit run starts here.
         default:
           state = FIRST_NUM;
           break;
       }
     }
     else {
       // A single-byte space counts as a tess reject; anything else that is
       // not upper, lower or digit is a bad character.
       if (*ch_len == 1 && *ch == ' ')
         tess_rejs++;
       else
         bad_char_count++;
       switch (state) {
         case FIRST_NUM:
           isolated_digits++;
           break;
         case FIRST_UPPER:
         case FIRST_LOWER:
           isolated_alphas++;
           // Fall through.
         default:
           break;
       }
       state = JUNK;
     }
   }

   // Account for a single alpha or digit left dangling at the end of the word.
   switch (state) {
     case FIRST_NUM:
       isolated_digits++;
       break;
     case FIRST_UPPER:
     case FIRST_LOWER:
       isolated_alphas++;
       // Fall through.
     default:
       break;
   }

   if (crunch_include_numerals) {
     total_alpha_count += total_digit_count - isolated_digits;
   }

   if (crunch_leave_ok_strings && len >= 4 &&
       2 * (total_alpha_count - isolated_alphas) > len &&
       longest_alpha_repetition_count < crunch_long_repetitions) {
     if ((crunch_accept_ok &&
          acceptable_word_string(*word->uch_set, str, lengths) !=
              AC_UNACCEPTABLE) ||
         longest_lower_run_len > crunch_leave_lc_strings ||
         longest_upper_run_len > crunch_leave_uc_strings)
       return G_NEVER_CRUNCH;
   }
   // Multi-character words without any space that came from a dictionary or
   // number permuter, or that look acceptable, are OK.
   if (word->reject_map.length() > 1 &&
       strpbrk(str, " ") == NULL &&
       (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
        word->best_choice->permuter() == FREQ_DAWG_PERM ||
        word->best_choice->permuter() == USER_DAWG_PERM ||
        word->best_choice->permuter() == NUMBER_PERM ||
        acceptable_word_string(*word->uch_set, str, lengths) !=
            AC_UNACCEPTABLE || ok_dict_word))
     return G_OK;

   ok_chars = len - bad_char_count - isolated_digits -
     isolated_alphas - tess_rejs;

   if (crunch_debug > 3) {
     tprintf("garbage_word: \"%s\"\n",
             word->best_choice->unichar_string().string());
     tprintf("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
             len,
             bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
   }
   if (bad_char_count == 0 &&
       tess_rejs == 0 &&
       (len > isolated_digits + isolated_alphas || len <= 2))
     return G_OK;

   if (tess_rejs > ok_chars ||
       (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
     return G_TERRIBLE;

   if (len > 4) {
     // Longer words: tess rejects weigh double, isolated chars count too.
     dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
         isolated_alphas;
     if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
       return G_DODGY;
     else
       return G_OK;
   } else {
     // Short words: isolated characters are not held against the word.
     dodgy_chars = 2 * tess_rejs + bad_char_count;
     if ((len == 4 && dodgy_chars > 2) ||
         (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
       return G_DODGY;
     else
       return G_OK;
   }
 }


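 /*************************************************************************
  * word_deletable()
  *     DELETE WERDS AT ENDS OF ROWS IF
  *        Word is crunched &&
  *        ( string length = 0                                          OR
  *          > 50% of chars are "|" (before merging)                    OR
  *          certainty < -10                                            OR
  *          rating /char > 60                                          OR
  *          TOP of word is more than 0.5 xht BELOW baseline            OR
  *          BOTTOM of word is more than 0.5 xht ABOVE xht              OR
  *          length of word < 3xht                                      OR
  *          height of word < 0.7 xht                                   OR
  *          height of word > 3.0 xht                                   OR
  *          >75% of the outline BBs have longest dimension < 0.5xht )
  *
  *     NOTE: most of the limits listed above are read from parameters
  *     rather than hard-coded (crunch_del_cert, crunch_del_rating,
  *     crunch_del_low_word, crunch_del_high_word, crunch_del_min_width,
  *     crunch_del_min_ht, crunch_del_max_ht, and crunch_small_outlines_size
  *     for the outline test), so the configured values may differ from the
  *     figures quoted here.
  *************************************************************************/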


 // Classifies a word as deletable (CR_DELETE), space-loosenable
 // (CR_LOOSE_SPACE) or neither (CR_NONE).  delete_mode receives a numeric code
 // identifying the test that fired, or 0 when none did:
 //    1  reject map is empty            4  word height below minimum
 //    5  noise_outlines() reported it   2  too many failure_count() chars
 //    7  certainty below limit          8  rating per character above limit
 //    9  top of word too low           10  bottom of word too high
 //   11  word height above maximum      3  word width below minimum
 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, inT16 &delete_mode) {
   // Words that have not been crunched are never candidates.
   if (word->unlv_crunch_mode == CR_NONE) {
     delete_mode = 0;
     return CR_NONE;
   }

   const int num_chars = word->reject_map.length();
   if (num_chars == 0) {
     delete_mode = 1;
     return CR_DELETE;
   }

   // Bounding box of the word.  It stays default-constructed when
   // rebuild_word is NULL (Cube leaves it NULL); the geometric tests further
   // down are then applied to that default box.
   TBOX word_box;
   if (word->rebuild_word != NULL) {
     word_box = word->rebuild_word->bounding_box();
     if (word_box.height() < crunch_del_min_ht * kBlnXHeight) {
       delete_mode = 4;
       return CR_DELETE;
     }
     if (noise_outlines(word->rebuild_word)) {
       delete_mode = 5;
       return CR_DELETE;
     }
   }

   // Everything from here on can only loosen the space.  The tests are tried
   // in a fixed order and the first one that fires supplies the code.
   const float rating_per_char = word->best_choice->rating() / num_chars;
   inT16 reason = 0;
   if ((failure_count(word) * 1.5) > num_chars) {
     reason = 2;
   } else if (word->best_choice->certainty() < crunch_del_cert) {
     reason = 7;
   } else if (rating_per_char > crunch_del_rating) {
     reason = 8;
   } else if (word_box.top() <
              kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
     reason = 9;
   } else if (word_box.bottom() >
              kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
     reason = 10;
   } else if (word_box.height() > crunch_del_max_ht * kBlnXHeight) {
     reason = 11;
   } else if (word_box.width() < crunch_del_min_width * kBlnXHeight) {
     reason = 3;
   }

   delete_mode = reason;
   return reason == 0 ? CR_NONE : CR_LOOSE_SPACE;
 }


 // Returns the number of ' ' characters in the unichar string of the word's
 // best choice.  (Presumably a space marks a character the classifier failed
 // on -- the original local was named tess_rejs; confirm against callers.)
 inT16 Tesseract::failure_count(WERD_RES *word) {
   int space_total = 0;
   const char *cursor = word->best_choice->unichar_string().string();

   // Walk the NUL-terminated string one byte at a time.
   while (*cursor != '\0') {
     if (*cursor == ' ')
       ++space_total;
     ++cursor;
   }
   return space_total;
 }


 // Reports whether a word consists only of small outlines.  An outline is
 // small when the longer side of its bounding box is below
 // kBlnXHeight * crunch_small_outlines_size.  The result is true when the
 // number of small outlines is at least the total number of outlines, which
 // also holds for a word that has no outlines at all.
 BOOL8 Tesseract::noise_outlines(TWERD *word) {
   const float size_threshold = kBlnXHeight * crunch_small_outlines_size;
   inT16 total_outlines = 0;
   inT16 tiny_outlines = 0;

   for (int blob_idx = 0; blob_idx < word->NumBlobs(); ++blob_idx) {
     // Follow the linked list of outlines belonging to this blob.
     TESSLINE* outline = word->blobs[blob_idx]->outlines;
     while (outline != NULL) {
       ++total_outlines;
       const TBOX outline_box = outline->bounding_box();
       const inT16 longest_side =
           outline_box.height() > outline_box.width() ? outline_box.height()
                                                      : outline_box.width();
       if (longest_side < size_threshold)
         ++tiny_outlines;
       outline = outline->next;
     }
   }
   return tiny_outlines >= total_outlines;
 }


 }  // namespace tesseract

kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:28

TBLOB
Definition: blobs.h:261

globals.h

tesseract::Tesseract::convert_bad_unlv_chs
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:663

POLY_BLOCK
Definition: polyblk.h:28

WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:356

CR_LOOSE_SPACE
Definition: pageres.h:149

PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1673

tesseract::Tesseract::word_outline_errs
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:77

WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:324

UNICHARSET::unichar_to_id
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

WERD_RES
Definition: pageres.h:155

REJMAP::length
inT32 length() const
Definition: rejctmap.h:237

tesseract::Tesseract::crunch_include_numerals
bool crunch_include_numerals
Definition: tesseractclass.h:961

tesseract::Tesseract::crunch_terrible_rating
double crunch_terrible_rating
Definition: tesseractclass.h:938

tesseract::Tesseract::crunch_del_max_ht
double crunch_del_max_ht
Definition: tesseractclass.h:949

CR_KEEP_SPACE
Definition: pageres.h:148

tesseract::Tesseract::potential_word_crunch
BOOL8 potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
Definition: docqual.cpp:545

WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:219

tesseract::DocQualCallbacks::DocQualCallbacks
DocQualCallbacks(WERD_RES *word0)
Definition: docqual.cpp:36

ROW_RES
Definition: pageres.h:125

WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:271

USER_DAWG_PERM
Definition: ratngs.h:251

TESSLINE
Definition: blobs.h:180

PAGE_RES_IT
Definition: pageres.h:656

tesseract::Tesseract::outlines_odd
char * outlines_odd
Definition: tesseractclass.h:894

PAGE_RES::char_count
inT32 char_count
Definition: pageres.h:60

tesseract::Tesseract::outlines_2
char * outlines_2
Definition: tesseractclass.h:895

tprintf
#define tprintf(...)
Definition: tprintf.h:31

UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463

ROW_RES::whole_word_rej_count
inT32 whole_word_rej_count
Definition: pageres.h:130

WERD_RES::reject_spaces
BOOL8 reject_spaces
Definition: pageres.h:317

tesseract::DocQualCallbacks
Definition: docqual.cpp:35

tesseract::BoxWord::ProcessMatchedBlobs
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:193

tesseract::Tesseract::tessedit_debug_block_rejection
bool tessedit_debug_block_rejection
Definition: tesseractclass.h:842

tesseract::Tesseract::crunch_long_repetitions
int crunch_long_repetitions
Definition: tesseractclass.h:966

PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:658

tesseract::Tesseract::quality_rowrej_pc
double quality_rowrej_pc
Definition: tesseractclass.h:931

WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:531

G_OK
Definition: docqual.h:28

BOOL8
unsigned char BOOL8
Definition: host.h:113

STRING::length
inT32 length() const
Definition: strngs.cpp:188

tesseract::Tesseract::tessedit_use_reject_spaces
bool tessedit_use_reject_spaces
Definition: tesseractclass.h:900

POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:52

tesseract::Tesseract::crunch_leave_ok_strings
bool crunch_leave_ok_strings
Definition: tesseractclass.h:957

tesseract::Tesseract::tilde_crunch
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:421

tesseract::DocQualCallbacks::accepted_match_count
inT16 accepted_match_count
Definition: docqual.cpp:56

tesseract::DocQualCallbacks::word
WERD_RES * word
Definition: docqual.cpp:54

tesseract::Tesseract::safe_dict_word
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607

BLOCK_RES::block
BLOCK * block
Definition: pageres.h:99

G_NEVER_CRUNCH
Definition: docqual.h:27

PAGE_RES::rejected
BOOL8 rejected
Definition: pageres.h:63

ROW
Definition: ocrrow.h:32

REJMAP::quality_recoverable_rejects
BOOL8 quality_recoverable_rejects()
Definition: rejctmap.cpp:354

WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:524

W_BOL
Definition: werd.h:35

tesseract::Tesseract::unrej_good_quality_words
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:163

tesseract::Tesseract::tessedit_good_quality_unrej
bool tessedit_good_quality_unrej
Definition: tesseractclass.h:899

tesseract::Tesseract::tessedit_whole_wd_rej_row_percent
double tessedit_whole_wd_rej_row_percent
Definition: tesseractclass.h:909

tesseract::Tesseract::word_deletable
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode)
Definition: docqual.cpp:898

PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:739

tesseract::Tesseract::crunch_small_outlines_size
double crunch_small_outlines_size
Definition: tesseractclass.h:954

tesseract::Tesseract::tessedit_reject_block_percent
double tessedit_reject_block_percent
Definition: tesseractclass.h:904

PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:713

TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:425

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470

tessvars.h

tesseract::Tesseract::quality_based_rejection
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:140

tesseract::Tesseract::crunch_pot_indicators
int crunch_pot_indicators
Definition: tesseractclass.h:956

tesseract::Tesseract::tessedit_dont_blkrej_good_wds
bool tessedit_dont_blkrej_good_wds
Definition: tesseractclass.h:915

tesseract::Tesseract::crunch_early_convert_bad_unlv_chs
bool crunch_early_convert_bad_unlv_chs
Definition: tesseractclass.h:937

PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:680

tesseract::Tesseract::failure_count
inT16 failure_count(WERD_RES *word)
Definition: docqual.cpp:969

W_EOL
Definition: werd.h:36

tesseract::Tesseract::count_outline_errs
inT16 count_outline_errs(char c, inT16 outline_count)
Definition: docqual.cpp:128

tesseract::Tesseract::check_debug_pt
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1767

tesseract::Tesseract::word_char_quality
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97

CRUNCH_MODE
CRUNCH_MODE
Definition: pageres.h:145

WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:327

WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:244

WERD_CHOICE::unichar_id
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312

BLOCK_RES::char_count
inT32 char_count
Definition: pageres.h:100

tesseract::DocQualCallbacks::match_count
inT16 match_count
Definition: docqual.cpp:55

REJMAP::rej_word_block_rej
void rej_word_block_rej()
Definition: rejctmap.cpp:506

WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:192

tesseract::Tesseract::crunch_rating_max
int crunch_rating_max
Definition: tesseractclass.h:955

FREQ_DAWG_PERM
Definition: ratngs.h:252

NUMBER_PERM
Definition: ratngs.h:247

PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:736

SYSTEM_DAWG_PERM
Definition: ratngs.h:249

tesseract::Tesseract::unlv_tilde_crunching
bool unlv_tilde_crunching
Definition: tesseractclass.h:933

WERD_CHOICE::permuter
uinT8 permuter() const
Definition: ratngs.h:343

tesseract::Tesseract::crunch_debug
int crunch_debug
Definition: tesseractclass.h:967

G_DODGY
Definition: docqual.h:29

tesseract::Tesseract::crunch_poor_garbage_cert
double crunch_poor_garbage_cert
Definition: tesseractclass.h:941

tesseract::Tesseract::crunch_del_min_ht
double crunch_del_min_ht
Definition: tesseractclass.h:948

tesseract::Tesseract::tessedit_unrej_any_wd
bool tessedit_unrej_any_wd
Definition: tesseractclass.h:836

PAGE_RES::rej_count
inT32 rej_count
Definition: pageres.h:61

NewPermanentTessCallback
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116

kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:29

tesseract::Tesseract::crunch_pot_poor_rate
double crunch_pot_poor_rate
Definition: tesseractclass.h:943

tesseract::Tesseract::garbage_word
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
Definition: docqual.cpp:683

REJMAP::rej_word_row_rej
void rej_word_row_rej()
Definition: rejctmap.cpp:515

tesseract::Tesseract::acceptable_word_string
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1663

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:33

tesscallback.h

REJMAP::accept_count
inT16 accept_count()
Definition: rejctmap.cpp:331

ROW_RES::rej_count
inT32 rej_count
Definition: pageres.h:129

UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456

tesseract::Tesseract::crunch_leave_uc_strings
int crunch_leave_uc_strings
Definition: tesseractclass.h:965

ROW_RES::char_count
inT32 char_count
Definition: pageres.h:128

tesseract::Tesseract::tessedit_reject_row_percent
double tessedit_reject_row_percent
Definition: tesseractclass.h:906

tesseract::Tesseract::word_blob_quality
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:65

tesseract::Tesseract::doc_and_block_rejection
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:235

reject.h

tesseract::Tesseract::tessedit_debug_doc_rejection
bool tessedit_debug_doc_rejection
Definition: tesseractclass.h:926

TBOX::bottom
inT16 bottom() const
Definition: rect.h:61

G_TERRIBLE
Definition: docqual.h:30

TWERD::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:881

tesseract::Tesseract::terrible_word_crunch
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:507

tesseract::Tesseract::crunch_early_merge_tess_fails
bool crunch_early_merge_tess_fails
Definition: tesseractclass.h:936

WERD_RES::word
WERD * word
Definition: pageres.h:175

GenericVector::empty
bool empty() const
Definition: genericvector.h:84

BLOCK_RES::rej_count
inT32 rej_count
Definition: pageres.h:101

TBOX::height
inT16 height() const
Definition: rect.h:104

tesseract::Tesseract::crunch_del_rating
double crunch_del_rating
Definition: tesseractclass.h:946

tesseract::Tesseract::tessedit_row_rej_good_docs
bool tessedit_row_rej_good_docs
Definition: tesseractclass.h:921

docqual.h

AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:36

TBOX::width
inT16 width() const
Definition: rect.h:111

tesseract::Tesseract::crunch_del_cert
double crunch_del_cert
Definition: tesseractclass.h:947

BLOCK_RES
Definition: pageres.h:97

tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds
bool tessedit_preserve_row_rej_perfect_wds
Definition: tesseractclass.h:913

TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:436

FALSE
#define FALSE
Definition: capi.h:29

tesseract
Definition: baseapi.cpp:83

WERD_RES::bln_boxes
tesseract::BoxWord * bln_boxes
Definition: pageres.h:184

tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd
double tessedit_good_doc_still_rowrej_wd
Definition: tesseractclass.h:923

GARBAGE_LEVEL
GARBAGE_LEVEL
Definition: docqual.h:25

tesseract::DocQualCallbacks::CountAcceptedBlobs
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:43

tesseract::Tesseract::unrej_good_chs
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:117

ROW_RES::row
ROW * row
Definition: pageres.h:127

REJMAP::reject_count
inT16 reject_count()
Definition: rejctmap.h:243

tesseract::Tesseract::bland_unrej
bool bland_unrej
Definition: tesseractclass.h:929

tesseract::Tesseract::noise_outlines
BOOL8 noise_outlines(TWERD *word)
Definition: docqual.cpp:981

tesseract::Tesseract::tessedit_dont_rowrej_good_wds
bool tessedit_dont_rowrej_good_wds
Definition: tesseractclass.h:917

TBOX
Definition: rect.h:30

CR_NONE
Definition: pageres.h:147

TRUE
#define TRUE
Definition: capi.h:28

tesseract::Tesseract::crunch_pot_poor_cert
double crunch_pot_poor_cert
Definition: tesseractclass.h:944

PDBLK::index
int index() const
Definition: pdblock.h:77

reject_whole_page
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:410

tesseract::Tesseract::tessedit_reject_doc_percent
double tessedit_reject_doc_percent
Definition: tesseractclass.h:902

WERD::space
uinT8 space()
Definition: werd.h:104

WERD::flag
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128

tesseract::Tesseract::crunch_del_low_word
double crunch_del_low_word
Definition: tesseractclass.h:953

STRING
Definition: strngs.h:44

tesseract::Tesseract::tessedit_preserve_min_wd_len
int tessedit_preserve_min_wd_len
Definition: tesseractclass.h:919

tesseract::Tesseract::crunch_del_high_word
double crunch_del_high_word
Definition: tesseractclass.h:952

WERD_RES::unlv_crunch_mode
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294

NULL
#define NULL
Definition: host.h:144

tesseract::Tesseract::crunch_del_min_width
double crunch_del_min_width
Definition: tesseractclass.h:950

tesseract::Tesseract::crunch_accept_ok
bool crunch_accept_ok
Definition: tesseractclass.h:958

CR_DELETE
Definition: pageres.h:150

tesseract::Tesseract::crunch_leave_lc_strings
int crunch_leave_lc_strings
Definition: tesseractclass.h:963

TWERD
Definition: blobs.h:395

TBLOB::outlines
TESSLINE * outlines
Definition: blobs.h:377

tesseract::Tesseract::crunch_poor_garbage_rate
double crunch_poor_garbage_rate
Definition: tesseractclass.h:942

tesseract::DocQualCallbacks::CountMatchingBlobs
void CountMatchingBlobs(int index)
Definition: docqual.cpp:39

STRING::string
const char * string() const
Definition: strngs.cpp:193

TBOX::top
inT16 top() const
Definition: rect.h:54

PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59

tesseract::Tesseract::crunch_leave_accept_strings
bool crunch_leave_accept_strings
Definition: tesseractclass.h:960

PAGE_RES_IT::prev_row
ROW_RES * prev_row() const
Definition: pageres.h:727

tesseract::Tesseract::tilde_delete
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:593

tesseract::Tesseract::crunch_terrible_garbage
bool crunch_terrible_garbage
Definition: tesseractclass.h:939

WERD_RES::merge_tess_fails
void merge_tess_fails()
Definition: pageres.cpp:1061

REJMAP::rej_word_doc_rej
void rej_word_doc_rej()
Definition: rejctmap.cpp:497

tesseract::DocQualCallbacks::AcceptIfGoodQuality
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:49

TBLOB::NumOutlines
int NumOutlines() const
Definition: blobs.cpp:469

tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds
bool tessedit_preserve_blk_rej_perfect_wds
Definition: tesseractclass.h:911

PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:733

inT16
short inT16
Definition: host.h:100

tesseractclass.h