tesseract-ocr.github.io/3.x/a00724_source.html

 /**********************************************************************

  * File:        reject.cpp  (Formerly reject.c)

  * Description: Rejection functions used in tessedit

  * Author:              Phil Cheatle

  * Created:             Wed Sep 23 16:50:21 BST 1992

  *

  * (C) Copyright 1992, Hewlett-Packard Ltd.

  ** Licensed under the Apache License, Version 2.0 (the "License");

  ** you may not use this file except in compliance with the License.

  ** You may obtain a copy of the License at

  ** http://www.apache.org/licenses/LICENSE-2.0

  ** Unless required by applicable law or agreed to in writing, software

  ** distributed under the License is distributed on an "AS IS" BASIS,

  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  ** See the License for the specific language governing permissions and

  ** limitations under the License.

  *

  **********************************************************************/


 #ifdef _MSC_VER

 #pragma warning(disable:4244)  // Conversion warnings

 #pragma warning(disable:4305)  // int/float warnings

 #endif


 #include          "tessvars.h"

 #ifdef __UNIX__

 #include          <assert.h>

 #include          <errno.h>

 #endif

 #include          "scanutils.h"

 #include          <ctype.h>

 #include          <string.h>

 #include          "genericvector.h"

 #include          "reject.h"

 #include          "control.h"

 #include          "docqual.h"

 #include          "globaloc.h"  // For err_exit.

 #include          "globals.h"

 #include          "helpers.h"


 #include "tesseractclass.h"


 // Include automatically generated configuration file if running autoconf.

 #ifdef HAVE_CONFIG_H

 #include "config_auto.h"

 #endif


 // NOTE(review): presumably expands the CLIST list-class boilerplate
 // (declarations first, then definitions) for STRING -- confirm against the
 // CLISTIZEH/CLISTIZE macro definitions, which are not visible in this file.
 CLISTIZEH (STRING) CLISTIZE (STRING)


 /*************************************************************************

  * set_done()

  *

  * Set the done flag based on the word acceptability criteria

  *************************************************************************/


 namespace tesseract {

 void Tesseract::set_done(WERD_RES *word, inT16 pass) {
   // Computes word->done from the acceptability of the word's best choice.
   //   word: result whose done flag is (re)written.
   //   pass: recognition pass number; the I/l/1 conflict test runs on pass 1 only.
   WERD_CHOICE *choice = word->best_choice;

   // Starting point: tess accepted the word and its text contains no blank.
   if (word->tess_accepted) {
     word->done = strchr(choice->unichar_string().string(), ' ') == NULL;
   } else {
     word->done = FALSE;
   }

   const bool ambiguous = choice->dangerous_ambig_found();
   const int permuter_type = choice->permuter();
   bool in_dictionary = false;
   switch (permuter_type) {
     case SYSTEM_DAWG_PERM:
     case FREQ_DAWG_PERM:
     case USER_DAWG_PERM:
       in_dictionary = true;
       break;
     default:
       break;
   }

   // On pass 1, a word that is not from a dictionary (or is ambiguous) loses
   // its done status when it has an I/l/1 conflict. The reject map is not
   // updated here (update_map == FALSE).
   const bool needs_conflict_test = !in_dictionary || ambiguous;
   if (word->done && pass == 1 && needs_conflict_test &&
       one_ell_conflict(word, FALSE)) {
     if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
     word->done = FALSE;
   }

   // Only dictionary words and number-permuter words stay done, and only
   // when no dangerous ambiguity was found.
   if (word->done) {
     const bool unverified = !in_dictionary && permuter_type != NUMBER_PERM;
     if (unverified || ambiguous) {
       if (tessedit_rejection_debug)
         tprintf("non-dict or ambig word detected\n");
       word->done = FALSE;
     }
   }

   if (tessedit_rejection_debug) {
     tprintf("set_done(): done=%d\n", word->done);
     choice->print("");
   }
 }


 /*************************************************************************

  * make_reject_map()

  *

  * Sets the done flag to indicate whether the result is acceptable.

  *

  * Sets a reject map for the word.

  *************************************************************************/

 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
   // Sets word->done (via set_done) and fills word->reject_map according to
   // tessedit_reject_mode.
   //   word: result to examine; its best choice and reject map are updated.
   //   row:  not referenced by this function.
   //   pass: recognition pass number, forwarded to set_done().
   // Any reject mode other than 0 or 5 prints a message and calls err_exit().
   flip_0O(word);
   check_debug_pt(word, -1);  // For trap only
   set_done(word, pass);      // Set acceptance
   word->reject_map.initialise(word->best_choice->unichar_lengths().length());
   reject_blanks(word);

   if (tessedit_reject_mode == 0) {
     // Mode 0: Ray's original heuristic - the baseline.
     if (!word->done) {
       reject_poor_matches(word);
     }
   } else if (tessedit_reject_mode == 5) {
     // Mode 5: reject I/1/l from words where there is no strong contextual
     // confirmation; the whole of any unacceptable words (incl PERM rej of
     // dubious 1/I/ls); and the whole of any words which are very small.
     if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
       word->reject_map.rej_word_small_xht();
     } else {
       one_ell_conflict(word, TRUE);

       // The conditions that make up the done flag are repeated here one by
       // one so that each mechanism can be switched on or off independently,
       // without affecting the done flag itself.
       if (rej_use_tess_accepted && !word->tess_accepted) {
         word->reject_map.rej_word_not_tess_accepted();
       }

       if (rej_use_tess_blanks &&
           strchr(word->best_choice->unichar_string().string(), ' ') != NULL) {
         word->reject_map.rej_word_contains_blanks();
       }

       if (rej_use_good_perm) {
         WERD_CHOICE *choice = word->best_choice;
         const bool from_dawg = choice->permuter() == SYSTEM_DAWG_PERM ||
                                choice->permuter() == FREQ_DAWG_PERM ||
                                choice->permuter() == USER_DAWG_PERM;
         // The sensible-word test is evaluated only for dawg words and only
         // when rej_use_sensible_wd is on.
         const bool passed =
             from_dawg &&
             (!rej_use_sensible_wd ||
              acceptable_word_string(*word->uch_set,
                                     choice->unichar_string().string(),
                                     choice->unichar_lengths().string()) !=
                  AC_UNACCEPTABLE);
         if (!passed) {
           if (choice->permuter() == NUMBER_PERM) {
             if (rej_alphas_in_number_perm) {
               // Reject every still-accepted alpha inside a number word.
               int blob = 0;
               int byte_pos = 0;
               while (choice->unichar_string()[byte_pos] != '\0') {
                 if (word->reject_map[blob].accepted() &&
                     word->uch_set->get_isalpha(
                         choice->unichar_string().string() + byte_pos,
                         choice->unichar_lengths()[blob])) {
                   word->reject_map[blob].setrej_bad_permuter();
                 }
                 byte_pos += choice->unichar_lengths()[blob];
                 ++blob;
               }
             }
           } else {
             word->reject_map.rej_word_bad_permuter();
           }
         }
       }
       /* Ambig word rejection was here once !!*/
     }
   } else {
     tprintf("BAD tessedit_reject_mode\n");
     err_exit();
   }

   if (tessedit_image_border > -1) {
     reject_edge_blobs(word);
   }

   check_debug_pt(word, 10);
   if (tessedit_rejection_debug) {
     tprintf("Permuter Type = %d\n", word->best_choice->permuter());
     tprintf("Certainty: %f     Rating: %f\n",
             word->best_choice->certainty(), word->best_choice->rating());
     tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
   }

   flip_hyphens(word);
   check_debug_pt(word, 20);
 }

 }  // namespace tesseract


 void reject_blanks(WERD_RES *word) {
   // Marks every character of the best choice whose first byte is a blank as
   // a tess failure in word->reject_map.
   WERD_CHOICE *choice = word->best_choice;
   inT16 blob = 0;      // index into the reject map / lengths string
   inT16 byte_pos = 0;  // byte offset into the unichar string

   while (choice->unichar_string()[byte_pos] != '\0') {
     if (choice->unichar_string()[byte_pos] == ' ') {
       // rej unrecognised blobs
       word->reject_map[blob].setrej_tess_failure();
     }
     byte_pos += choice->unichar_lengths()[blob];
     blob += 1;
   }
 }


 namespace tesseract {

 void Tesseract::reject_I_1_L(WERD_RES *word) {
   // Flags every character of the best choice whose first byte belongs to
   // conflict_set_I_l_1 as a 1Il conflict in word->reject_map.
   WERD_CHOICE *choice = word->best_choice;
   STRING conflict_chars = STRING(conflict_set_I_l_1);
   inT16 blob = 0;      // index into the reject map / lengths string
   inT16 byte_pos = 0;  // byte offset into the unichar string

   while (choice->unichar_string()[byte_pos] != '\0') {
     if (conflict_chars.contains(choice->unichar_string()[byte_pos])) {
       // rej 1Il conflict
       word->reject_map[blob].setrej_1Il_conflict();
     }
     byte_pos += choice->unichar_lengths()[blob];
     blob += 1;
   }
 }

 }  // namespace tesseract


 void reject_poor_matches(WERD_RES *word) {
   // Rejects the blanks of the best choice as tess failures, and every other
   // character whose certainty lies below the threshold returned by
   // compute_reject_threshold() as a poor match.
   WERD_CHOICE *choice = word->best_choice;
   const float cutoff = compute_reject_threshold(choice);

   int idx = 0;
   while (idx < choice->length()) {
     if (choice->unichar_id(idx) == UNICHAR_SPACE) {
       word->reject_map[idx].setrej_tess_failure();
     } else if (choice->certainty(idx) < cutoff) {
       word->reject_map[idx].setrej_poor_match();
     }
     ++idx;
   }
 }


 /**********************************************************************

  * compute_reject_threshold

  *

  * Set a rejection threshold for this word.

  * Initially this is a trivial function which looks for the largest

  * gap in the certainty value.

  **********************************************************************/


 float compute_reject_threshold(WERD_CHOICE* word) {
   // Returns the certainty threshold below which a character of `word`
   // should be rejected.
   //
   // The per-character certainties are sorted in ascending order. For words
   // of three or more characters the threshold is the midpoint of the widest
   // gap between two neighbouring certainties. For shorter words the gap
   // search is skipped and the threshold is one below the lowest certainty.
   //
   // An empty word has no certainties to inspect; 0.0f is returned instead of
   // reading element 0 of an empty vector.
   const int blob_count = word->length();
   if (blob_count <= 0) {
     return 0.0f;
   }

   GenericVector<float> ratings;
   ratings.init_to_size(blob_count, 0.0f);
   for (int i = 0; i < blob_count; ++i) {
     ratings[i] = word->certainty(i);
   }
   ratings.sort();

   float bestgap = 0.0f;             // biggest gap
   float gapstart = ratings[0] - 1;  // bottom of gap; all reject if none better
   if (blob_count >= 3) {
     for (int index = 0; index < blob_count - 1; index++) {
       const float gap = ratings[index + 1] - ratings[index];
       if (gap > bestgap) {
         // find biggest
         bestgap = gap;
         gapstart = ratings[index];
       }
     }
   }

   return gapstart + bestgap / 2;
 }


 /*************************************************************************

  * reject_edge_blobs()

  *

  * If the word is perilously close to the edge of the image, reject those blobs

  * in the word which are too close to the edge as they could be clipped.

  *************************************************************************/

 namespace tesseract {

 void Tesseract::reject_edge_blobs(WERD_RES *word) {
   // When the word's bounding box lies within tessedit_image_border of any
   // image edge, marks each blob whose own box is that close to an edge as an
   // edge character in word->reject_map.
   TBOX word_box = word->word->bounding_box();
   // Use the box_word as it is already denormed back to image coordinates.
   const int blobcount = word->box_word->length();

   const bool word_near_edge =
       word_box.left() < tessedit_image_border ||
       word_box.bottom() < tessedit_image_border ||
       word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
       word_box.top() + tessedit_image_border > ImageHeight() - 1;
   if (!word_near_edge) {
     return;
   }

   ASSERT_HOST(word->reject_map.length() == blobcount);
   int blobindex = 0;
   while (blobindex < blobcount) {
     TBOX blob_box = word->box_word->BlobBox(blobindex);
     const bool blob_near_edge =
         blob_box.left() < tessedit_image_border ||
         blob_box.bottom() < tessedit_image_border ||
         blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
         blob_box.top() + tessedit_image_border > ImageHeight() - 1;
     if (blob_near_edge) {
       // Close to edge
       word->reject_map[blobindex].setrej_edge_char();
     }
     ++blobindex;
   }
 }


 /**********************************************************************

  * one_ell_conflict()

  *

  * Identify words where there is a potential I/l/1 error.

  * - A bundle of contextual heuristics!

  **********************************************************************/

 BOOL8 Tesseract::one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {
   // Decides whether the best choice of word_res contains a potential I/l/1
   // confusion. Returns TRUE when a conflict is reported and FALSE otherwise.
   // When update_map is TRUE the affected reject_map entries are flagged too,
   // either one at a time via setrej_1Il_conflict() or for every conflict
   // character via reject_I_1_L().

   const char *word;

   const char *lengths;

   inT16 word_len;                //its length

   inT16 first_alphanum_index_;

   inT16 first_alphanum_offset_;

   inT16 i;

   inT16 offset;

   BOOL8 non_conflict_set_char;   //non conf set a/n?

   BOOL8 conflict = FALSE;

   BOOL8 allow_1s;

   ACCEPTABLE_WERD_TYPE word_type;

   BOOL8 dict_perm_type;

   BOOL8 dict_word_ok;

   int dict_word_type;


   word = word_res->best_choice->unichar_string().string ();

   lengths = word_res->best_choice->unichar_lengths().string();

   word_len = strlen (lengths);

   /*

     If there are no occurrences of the conflict set characters then the word

     is OK.

   */

   if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)

     return FALSE;


   /*

     There is a conflict if there are NO other (confirmed) alphanumerics apart

     from those in the conflict set.

   */


   // The scan stops at the first alphanumeric whose first byte is not in the
   // conflict set.
   for (i = 0, offset = 0, non_conflict_set_char = FALSE;

        (i < word_len) && !non_conflict_set_char; offset += lengths[i++])

     non_conflict_set_char =

         (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||

             word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&

         !STRING (conflict_set_I_l_1).contains (word[offset]);

   if (!non_conflict_set_char) {

     if (update_map)

       reject_I_1_L(word_res);

     return TRUE;

   }


   /*

     If the word is accepted by a dawg permuter, and the first alpha character

     is "I" or "l", check to see if the alternative is also a dawg word. If it

     is, then there is a potential error otherwise the word is ok.

   */


   dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||

     (word_res->best_choice->permuter () == USER_DAWG_PERM) ||

     (rej_trust_doc_dawg &&

     (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||

     (word_res->best_choice->permuter () == FREQ_DAWG_PERM);

   dict_word_type = dict_word(*(word_res->best_choice));

   dict_word_ok = (dict_word_type > 0) &&

     (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));


   if ((rej_1Il_use_dict_word && dict_word_ok) ||

     (rej_1Il_trust_permuter_type && dict_perm_type) ||

   (dict_perm_type && dict_word_ok)) {

     // NOTE(review): first_alphanum_index()/first_alphanum_offset() return -1
     // when they find no alphanumeric, and the results are used as indices
     // below without a check. The scan above found an alphanumeric through
     // word_res->uch_set, while the helpers consult the `unicharset` member --
     // presumably the same character set; confirm.
     first_alphanum_index_ = first_alphanum_index (word, lengths);

     first_alphanum_offset_ = first_alphanum_offset (word, lengths);

     if (lengths[first_alphanum_index_] == 1 &&

         word[first_alphanum_offset_] == 'I') {

       // Temporarily swap the leading 'I' for 'l' and test the result with
       // safe_dict_word(); the original character is put back on both paths.
       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';

       if (safe_dict_word(word_res) > 0) {

         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';

         if (update_map)

           word_res->reject_map[first_alphanum_index_].

             setrej_1Il_conflict();

         return TRUE;

       }

       else {

         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';

         return FALSE;

       }

     }


     if (lengths[first_alphanum_index_] == 1 &&

         word[first_alphanum_offset_] == 'l') {

       // Mirror image of the case above: try 'I' in place of the leading 'l'.
       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';

       if (safe_dict_word(word_res) > 0) {

         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';

         if (update_map)

           word_res->reject_map[first_alphanum_index_].

             setrej_1Il_conflict();

         return TRUE;

       }

       else {

         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';

         return FALSE;

       }

     }

     return FALSE;

   }


   /*

     NEW 1Il code. The old code relied on permuter types too much. In fact,

     tess will use TOP_CHOICE permute for good things like "palette".

     In this code the string is examined independently to see if it looks like

     a well formed word.

   */


   /*

     REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a

     dictionary word.

   */

   first_alphanum_index_ = first_alphanum_index (word, lengths);

   first_alphanum_offset_ = first_alphanum_offset (word, lengths);

   // NOTE(review): in both branches below the early `return FALSE` leaves the
   // flipped character in the string returned by unichar_string(), unlike the
   // dictionary branch above which always restores it -- confirm that this
   // is intended.
   if (lengths[first_alphanum_index_] == 1 &&

       word[first_alphanum_offset_] == 'l') {

     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';

     if (safe_dict_word(word_res) > 0)

       return FALSE;

     else

       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';

   }

   else if (lengths[first_alphanum_index_] == 1 &&

            word[first_alphanum_offset_] == 'I') {

     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';

     if (safe_dict_word(word_res) > 0)

       return FALSE;

     else

       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';

   }

   /*

     For strings containing digits:

       If there are no alphas OR the numeric permuter liked the word,

         reject any non 1 conflict chs

       Else reject all conflict chs

   */

   if (word_contains_non_1_digit (word, lengths)) {

     allow_1s = (alpha_count (word, lengths) == 0) ||

       (word_res->best_choice->permuter () == NUMBER_PERM);


     // NOTE: this declaration shadows the function-level `offset`.
     inT16 offset;

     conflict = FALSE;

     for (i = 0, offset = 0; word[offset] != '\0';

          offset += word_res->best_choice->unichar_lengths()[i++]) {

       if ((!allow_1s || (word[offset] != '1')) &&

       STRING (conflict_set_I_l_1).contains (word[offset])) {

         if (update_map)

           word_res->reject_map[i].setrej_1Il_conflict ();

         conflict = TRUE;

       }

     }

     return conflict;

   }

   /*

     For anything else. See if it conforms to an acceptable word type. If so,

     treat accordingly.

   */

   word_type = acceptable_word_string(*word_res->uch_set, word, lengths);

   if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {

     // Lower-case and initial-cap words: only the first alphanumeric is
     // checked against the conflict set.
     first_alphanum_index_ = first_alphanum_index (word, lengths);

     first_alphanum_offset_ = first_alphanum_offset (word, lengths);

     if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {

       if (update_map)

         word_res->reject_map[first_alphanum_index_].

             setrej_1Il_conflict ();

       return TRUE;

     }

     else

       return FALSE;

   }

   else if (word_type == AC_UPPER_CASE) {

     return FALSE;

   }

   else {

     // Any other word type: report a conflict and, if requested, reject all
     // conflict-set characters.
     if (update_map)

       reject_I_1_L(word_res);

     return TRUE;

   }

 }


 inT16 Tesseract::first_alphanum_index(const char *word,
                                       const char *word_lengths) {
   // Returns the index (position in word_lengths) of the first character of
   // `word` that unicharset classes as alpha or digit, or -1 if there is none.
   //   word:         NUL-terminated text.
   //   word_lengths: byte length of each character of `word`, one per entry.
   inT16 index = 0;
   inT16 byte_pos = 0;

   while (word[byte_pos] != '\0') {
     const char *unichar = word + byte_pos;
     if (unicharset.get_isalpha(unichar, word_lengths[index]) ||
         unicharset.get_isdigit(unichar, word_lengths[index])) {
       return index;
     }
     byte_pos += word_lengths[index];
     ++index;
   }
   return -1;
 }


 inT16 Tesseract::first_alphanum_offset(const char *word,
                                        const char *word_lengths) {
   // Returns the byte offset into `word` of the first character that
   // unicharset classes as alpha or digit, or -1 if there is none.
   //   word:         NUL-terminated text.
   //   word_lengths: byte length of each character of `word`, one per entry.
   inT16 index = 0;
   inT16 byte_pos = 0;

   while (word[byte_pos] != '\0') {
     const char *unichar = word + byte_pos;
     if (unicharset.get_isalpha(unichar, word_lengths[index]) ||
         unicharset.get_isdigit(unichar, word_lengths[index])) {
       return byte_pos;
     }
     byte_pos += word_lengths[index];
     ++index;
   }
   return -1;
 }


 inT16 Tesseract::alpha_count(const char *word,
                              const char *word_lengths) {
   // Returns how many characters of `word` unicharset classes as alpha.
   //   word:         NUL-terminated text.
   //   word_lengths: byte length of each character of `word`, one per entry.
   inT16 alphas = 0;
   inT16 index = 0;
   inT16 byte_pos = 0;

   while (word[byte_pos] != '\0') {
     if (unicharset.get_isalpha(word + byte_pos, word_lengths[index])) {
       alphas++;
     }
     byte_pos += word_lengths[index];
     ++index;
   }
   return alphas;
 }


 BOOL8 Tesseract::word_contains_non_1_digit(const char *word,
                                            const char *word_lengths) {
   // Returns TRUE if `word` holds at least one character that unicharset
   // classes as a digit and that is not the single byte '1'.
   //   word:         NUL-terminated text.
   //   word_lengths: byte length of each character of `word`, one per entry.
   inT16 index = 0;
   inT16 byte_pos = 0;

   while (word[byte_pos] != '\0') {
     const bool is_plain_one =
         word_lengths[index] == 1 && word[byte_pos] == '1';
     if (unicharset.get_isdigit(word + byte_pos, word_lengths[index]) &&
         !is_plain_one) {
       return TRUE;
     }
     byte_pos += word_lengths[index];
     ++index;
   }
   return FALSE;
 }


 /*************************************************************************

  * dont_allow_1Il()

  * Dont unreject LONE accepted 1Il conflict set chars

  *************************************************************************/

 void Tesseract::dont_allow_1Il(WERD_RES *word) {
   // If the only accepted alphanumerics in the word are conflict-set (1/I/l)
   // characters, rejects those accepted conflict characters. Returns without
   // changes as soon as an accepted alpha or digit outside the conflict set
   // is seen, or when no conflict character is accepted at all.
   const int num_chars = word->reject_map.length();
   const char *text = word->best_choice->unichar_string().string();
   const char *char_lengths = word->best_choice->unichar_lengths().string();
   BOOL8 conflict_char_accepted = FALSE;

   // First pass: look for accepted conflict chars and accepted other a/n.
   int idx = 0;
   int byte_pos = 0;
   while (idx < num_chars) {
     if (word->reject_map[idx].accepted()) {
       if (STRING(conflict_set_I_l_1).contains(text[byte_pos])) {
         conflict_char_accepted = TRUE;
       } else if (word->uch_set->get_isalpha(text + byte_pos,
                                             char_lengths[idx]) ||
                  word->uch_set->get_isdigit(text + byte_pos,
                                             char_lengths[idx])) {
         return;  // >=1 non 1Il ch accepted
       }
     }
     byte_pos += word->best_choice->unichar_lengths()[idx];
     ++idx;
   }

   if (!conflict_char_accepted) {
     return;  // Nothing to worry about
   }

   // Second pass: reject every accepted conflict char.
   for (idx = 0, byte_pos = 0; idx < num_chars;
        byte_pos += word->best_choice->unichar_lengths()[idx], ++idx) {
     if (STRING(conflict_set_I_l_1).contains(text[byte_pos]) &&
         word->reject_map[idx].accepted()) {
       word->reject_map[idx].setrej_postNN_1Il();
     }
   }
 }


 inT16 Tesseract::count_alphanums(WERD_RES *word_res) {
   // Returns the number of characters of the best choice that are accepted
   // in the reject map and that uch_set classes as alpha or digit.
   const WERD_CHOICE *choice = word_res->best_choice;
   int accepted_alphanums = 0;

   int idx = 0;
   while (idx < word_res->reject_map.length()) {
     if (word_res->reject_map[idx].accepted()) {
       if (word_res->uch_set->get_isalpha(choice->unichar_id(idx)) ||
           word_res->uch_set->get_isdigit(choice->unichar_id(idx))) {
         accepted_alphanums++;
       }
     }
     ++idx;
   }
   return accepted_alphanums;
 }


 // reject all if most rejected.

 void Tesseract::reject_mostly_rejects(WERD_RES *word) {
   // Rejects the whole word when the fraction of rejected characters in its
   // reject map reaches rej_whole_of_mostly_reject_word_fract.
   //
   // The test is written as !(x >= limit) rather than (x < limit) so that a
   // NaN fraction (0 rejects over 0 characters) still leaves the word alone.
   if (!(static_cast<float>(word->reject_map.reject_count()) /
             word->reject_map.length() >=
         rej_whole_of_mostly_reject_word_fract)) {
     return;
   }
   word->reject_map.rej_word_mostly_rej();
 }


 BOOL8 Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
   // Returns TRUE when the word is a run of two or more copies of one
   // character from ok_repeated_ch_non_alphanum_wds, and word_char_quality()
   // reports both quality counts equal to the number of characters.
   WERD_CHOICE *choice = word->best_choice;

   if (choice->unichar_lengths().length() <= 1) {
     return FALSE;
   }
   if (!STRING(ok_repeated_ch_non_alphanum_wds).
           contains(choice->unichar_string()[0])) {
     return FALSE;
   }

   // Every character must match the first one.
   const UNICHAR_ID first_id = choice->unichar_id(0);
   int idx = 1;
   while (idx < choice->length()) {
     if (choice->unichar_id(idx) != first_id) {
       return FALSE;
     }
     ++idx;
   }

   inT16 char_quality;
   inT16 accepted_char_quality;
   word_char_quality(word, row, &char_quality, &accepted_char_quality);

   const bool all_good =
       (choice->unichar_lengths().length() == char_quality) &&
       (char_quality == accepted_char_quality);
   return all_good ? TRUE : FALSE;
 }


 inT16 Tesseract::safe_dict_word(const WERD_RES *werd_res) {
   // Returns the dict_word() type of the best choice, except that a
   // DOC_DAWG_PERM result is reported as 0.
   const int word_type =
       werd_res->tesseract->dict_word(*werd_res->best_choice);
   if (word_type == DOC_DAWG_PERM) {
     return 0;
   }
   return word_type;
 }


 // Note: After running this function word_res->ratings

 // might not contain the right BLOB_CHOICE corresponding to each character

 // in word_res->best_choice.

 void Tesseract::flip_hyphens(WERD_RES *word_res) {
   // Uses blob aspect ratios to correct '.'/'-' confusions in the best
   // choice and to adjust the matching reject_map entries. Does nothing when
   // tessedit_lower_flip_hyphen <= 1. Only blobs that are wide enough and do
   // not overlap their horizontal neighbours are considered.
   if (tessedit_lower_flip_hyphen <= 1)
     return;

   WERD_CHOICE *choice = word_res->best_choice;
   const int num_blobs = word_res->rebuild_word->NumBlobs();
   const UNICHAR_ID dash_id = word_res->uch_set->unichar_to_id("-");
   int prev_right = -9999;  // right edge of the previous blob

   int idx = 0;
   while (idx < choice->length() && idx < num_blobs) {
     TBOX blob_box = word_res->rebuild_word->blobs[idx]->bounding_box();

     // Left edge of the next blob, or a large sentinel for the last blob.
     int next_left = 9999;
     if (idx + 1 != num_blobs) {
       next_left =
           word_res->rebuild_word->blobs[idx + 1]->bounding_box().left();
     }

     // Dont touch small or touching blobs - it is too dangerous.
     if ((blob_box.width() > 8 * word_res->denorm.x_scale()) &&
         (blob_box.left() > prev_right) && (blob_box.right() < next_left)) {
       const float aspect_ratio = blob_box.width() / (float) blob_box.height();

       if (word_res->uch_set->eq(choice->unichar_id(idx), ".")) {
         if (aspect_ratio >= tessedit_upper_flip_hyphen &&
             word_res->uch_set->contains_unichar_id(dash_id) &&
             word_res->uch_set->get_enabled(dash_id)) {
           /* Certain HYPHEN */
           choice->set_unichar_id(dash_id, idx);
           if (word_res->reject_map[idx].rejected()) {
             word_res->reject_map[idx].setrej_hyphen_accept();
           }
         }
         if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
             word_res->reject_map[idx].accepted()) {
           // Suspected HYPHEN
           word_res->reject_map[idx].setrej_hyphen();
         }
       } else if (choice->unichar_id(idx) == dash_id) {
         if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
             (word_res->reject_map[idx].rejected())) {
           // Certain HYPHEN
           word_res->reject_map[idx].setrej_hyphen_accept();
         }
         if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
             (word_res->reject_map[idx].accepted())) {
           // Suspected HYPHEN
           word_res->reject_map[idx].setrej_hyphen();
         }
       }
     }

     prev_right = blob_box.right();
     ++idx;
   }
 }


 // Note: After running this function word_res->ratings

 // might not contain the right BLOB_CHOICE corresponding to each character

 // in word_res->best_choice.

 void Tesseract::flip_0O(WERD_RES *word_res) {
   // Rewrites '0' and 'O' characters in the best choice of word_res according
   // to their neighbours (upper-case letters favour 'O', digits favour '0').
   // Does nothing when tessedit_flip_0O is off, when an upper-case or digit
   // blob fails the height test below, or when either '0' or 'O' is missing
   // from or disabled in the word's unicharset.

   WERD_CHOICE *best_choice = word_res->best_choice;

   int i;

   TBOX out_box;


   if (!tessedit_flip_0O)

     return;


   // Pre-check: give up if any upper-case or digit blob has its top below
   // kBlnBaselineOffset + kBlnXHeight or its bottom above
   // kBlnBaselineOffset + kBlnXHeight / 4.
   int num_blobs = word_res->rebuild_word->NumBlobs();

   for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {

     TBLOB* blob = word_res->rebuild_word->blobs[i];

     if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||

         word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {

       out_box = blob->bounding_box();

       if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||

         (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))

         return;                  //Beware words with sub/superscripts

     }

   }

   UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");

   UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");

   if (unichar_0 == INVALID_UNICHAR_ID ||

       !word_res->uch_set->get_enabled(unichar_0) ||

       unichar_O == INVALID_UNICHAR_ID ||

       !word_res->uch_set->get_enabled(unichar_O)) {

     return;  // 0 or O are not present/enabled in unicharset

   }

   // The scan starts at index 1, so position i-1 always exists on entry to
   // the loop body.
   // NOTE(review): several cases below advance i inside the body (i++,
   // i += 2, the trailing while loop); the cases that follow in the same
   // iteration then test the advanced position -- presumably intended,
   // confirm before reordering anything here.
   for (i = 1; i < best_choice->length(); ++i) {

     if (best_choice->unichar_id(i) == unichar_0 ||

         best_choice->unichar_id(i) == unichar_O) {

       /* A0A */

       if ((i+1) < best_choice->length() &&

           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&

           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {

         best_choice->set_unichar_id(unichar_O, i);

       }

       /* A00A */

       // Only position i is rewritten here; i then moves on to the second
       // character of the pair.
       if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&

           (i+1) < best_choice->length() &&

           (best_choice->unichar_id(i+1) == unichar_0 ||

            best_choice->unichar_id(i+1) == unichar_O) &&

           (i+2) < best_choice->length() &&

           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {

         best_choice->set_unichar_id(unichar_O, i);

         i++;

       }

       /* AA0<non digit or end of word> */

       if ((i > 1) &&

           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&

           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&

           (((i+1) < best_choice->length() &&

             !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&

             !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&

             !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||

            (i == best_choice->length() - 1))) {

         best_choice->set_unichar_id(unichar_O, i);

       }

       /* 9O9 */

       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&

           (i+1) < best_choice->length() &&

           non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {

         best_choice->set_unichar_id(unichar_0, i);

       }

       /* 9OOO */

       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&

           (i+2) < best_choice->length() &&

           (best_choice->unichar_id(i+1) == unichar_0 ||

            best_choice->unichar_id(i+1) == unichar_O) &&

           (best_choice->unichar_id(i+2) == unichar_0 ||

            best_choice->unichar_id(i+2) == unichar_O)) {

         best_choice->set_unichar_id(unichar_0, i);

         best_choice->set_unichar_id(unichar_0, i+1);

         best_choice->set_unichar_id(unichar_0, i+2);

         i += 2;

       }

       /* 9OO<non upper> */

       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&

           (i+2) < best_choice->length() &&

           (best_choice->unichar_id(i+1) == unichar_0 ||

           best_choice->unichar_id(i+1) == unichar_O) &&

           !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {

         best_choice->set_unichar_id(unichar_0, i);

         best_choice->set_unichar_id(unichar_0, i+1);

         i++;

       }

       /* 9O<non upper> */

       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&

           (i+1) < best_choice->length() &&

           !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {

         best_choice->set_unichar_id(unichar_0, i);

       }

       /* 9[.,]OOO.. */

       if ((i > 1) &&

           (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||

               word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&

           (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||

            best_choice->unichar_id(i-2) == unichar_O)) {

         if (best_choice->unichar_id(i-2) == unichar_O) {

           best_choice->set_unichar_id(unichar_0, i-2);

         }

         // Turn the whole run of 0/O characters starting at i into zeros.
         while (i < best_choice->length() &&

                (best_choice->unichar_id(i) == unichar_O ||

                 best_choice->unichar_id(i) == unichar_0)) {

           best_choice->set_unichar_id(unichar_0, i);

           i++;

         }

         // Step back one so that the for loop's ++i resumes at the position
         // where the while loop stopped.
         i--;

       }

     }

   }

 }


 BOOL8 Tesseract::non_O_upper(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
   // TRUE when unichar_id is an upper-case character other than "O".
   if (!ch_set.get_isupper(unichar_id)) {
     return FALSE;
   }
   return !ch_set.eq(unichar_id, "O");
 }


 BOOL8 Tesseract::non_0_digit(const UNICHARSET& ch_set, UNICHAR_ID unichar_id) {
   // TRUE when unichar_id is a digit other than "0".
   if (!ch_set.get_isdigit(unichar_id)) {
     return FALSE;
   }
   return !ch_set.eq(unichar_id, "0");
 }

 }  // namespace tesseract

kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:28

WERD_RES::tess_accepted
BOOL8 tess_accepted
Definition: pageres.h:280

TBLOB
Definition: blobs.h:261

globals.h

WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:356

tesseract::Tesseract::first_alphanum_offset
inT16 first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:482

WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:250

tesseract::Tesseract::reject_mostly_rejects
void reject_mostly_rejects(WERD_RES *word)
Definition: reject.cpp:573

WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:324

UNICHARSET::unichar_to_id
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

err_exit
void err_exit()
Definition: globaloc.cpp:74

WERD_RES
Definition: pageres.h:155

REJMAP::length
inT32 length() const
Definition: rejctmap.h:237

control.h

CLISTIZEH
CLISTIZEH(STRING) CLISTIZE(STRING) namespace tesseract
Definition: reject.cpp:48

tesseract::Tesseract::one_ell_conflict
BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map)
Definition: reject.cpp:292

UNICHARSET::eq
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:656

WERD_CHOICE::length
int length() const
Definition: ratngs.h:300

WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:219

tesseract::Tesseract::dont_allow_1Il
void dont_allow_1Il(WERD_RES *word)
Definition: reject.cpp:526

WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:271

USER_DAWG_PERM
Definition: ratngs.h:251

tprintf
#define tprintf(...)
Definition: tprintf.h:31

tesseract::Tesseract::ImageHeight
int ImageHeight() const
Definition: tesseractclass.h:228

UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463

tesseract::BoxWord::BlobBox
const TBOX & BlobBox(int index) const
Definition: boxword.h:88

tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds
char * ok_repeated_ch_non_alphanum_wds
Definition: tesseractclass.h:1042

flip_0O
void flip_0O(WERD_RES *word)

GenericVector::sort
void sort()
Definition: genericvector.h:998

tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:72

reject_blanks
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:178

flip_hyphens
void flip_hyphens(WERD_RES *word)

WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:531

BOOL8
unsigned char BOOL8
Definition: host.h:113

globaloc.h

WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:160

STRING::length
inT32 length() const
Definition: strngs.cpp:188

WERD_CHOICE
Definition: ratngs.h:271

DOC_DAWG_PERM
Definition: ratngs.h:250

TBOX::right
inT16 right() const
Definition: rect.h:75

tesseract::Wordrec::dict_word
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:124

tesseract::Tesseract::rej_1Il_trust_permuter_type
bool rej_1Il_trust_permuter_type
Definition: tesseractclass.h:1033

UNICHAR_SPACE
Definition: unicharset.h:35

tesseract::Tesseract::safe_dict_word
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607

scanutils.h

WERD_CHOICE::dangerous_ambig_found
bool dangerous_ambig_found() const
Definition: ratngs.h:360

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:84

ROW
Definition: ocrrow.h:32

tesseract::Tesseract::count_alphanums
inT16 count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:410

WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:524

AC_UPPER_CASE
ALL upper case.
Definition: control.h:38

tesseract::Tesseract::ImageWidth
int ImageWidth() const
Definition: tesseractclass.h:225

TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:425

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470

REJMAP::rej_word_small_xht
void rej_word_small_xht()
Definition: rejctmap.cpp:416

tessvars.h

tesseract::Tesseract::alpha_count
inT16 alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:495

tesseract::Tesseract::first_alphanum_index
inT16 first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:469

CLISTIZE
#define CLISTIZE(CLASSNAME)
Definition: clst.h:958

tesseract::Tesseract::reject_edge_blobs
void reject_edge_blobs(WERD_RES *word)
Definition: reject.cpp:263

tesseract::Tesseract::word_char_quality
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97

TBOX::left
inT16 left() const
Definition: rect.h:68

WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:327

WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:244

WERD_CHOICE::unichar_id
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312

WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:192

tesseract::Tesseract::conflict_set_I_l_1
char * conflict_set_I_l_1
Definition: tesseractclass.h:1043

ACCEPTABLE_WERD_TYPE
ACCEPTABLE_WERD_TYPE
Definition: control.h:34

GenericVector::init_to_size
void init_to_size(int size, T t)
Definition: genericvector.h:646

FREQ_DAWG_PERM
Definition: ratngs.h:252

NUMBER_PERM
Definition: ratngs.h:247

SYSTEM_DAWG_PERM
Definition: ratngs.h:249

WERD_RES::denorm
DENORM denorm
Definition: pageres.h:190

WERD_CHOICE::permuter
uinT8 permuter() const
Definition: ratngs.h:343

tesseract::Tesseract::tessedit_flip_0O
bool tessedit_flip_0O
Definition: tesseractclass.h:1026

tesseract::Tesseract::word_contains_non_1_digit
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:509

tesseract::Tesseract::tessedit_lower_flip_hyphen
double tessedit_lower_flip_hyphen
Definition: tesseractclass.h:1028

WERD_RES::tesseract
tesseract::Tesseract * tesseract
Definition: pageres.h:266

kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:29

tesseract::Tesseract::acceptable_word_string
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1663

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:33

reject.h

TBOX::bottom
inT16 bottom() const
Definition: rect.h:61

WERD_RES::done
BOOL8 done
Definition: pageres.h:282

REJMAP::rej_word_mostly_rej
void rej_word_mostly_rej()
Definition: rejctmap.cpp:479

tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract
double rej_whole_of_mostly_reject_word_fract
Definition: tesseractclass.h:1039

helpers.h

WERD_RES::word
WERD * word
Definition: pageres.h:175

TBOX::height
inT16 height() const
Definition: rect.h:104

AC_LOWER_CASE
ALL lower case.
Definition: control.h:37

docqual.h

AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:36

TBOX::width
inT16 width() const
Definition: rect.h:111

TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:436

tesseract::Tesseract::tessedit_upper_flip_hyphen
double tessedit_upper_flip_hyphen
Definition: tesseractclass.h:1030

tesseract::BoxWord::length
const int length() const
Definition: boxword.h:85

FALSE
#define FALSE
Definition: capi.h:29

tesseract
Definition: baseapi.cpp:83

tesseract::Tesseract::non_O_upper
BOOL8 non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:785

count
int count(LIST var_list)
Definition: oldlist.cpp:108

tesseract::Tesseract::rej_1Il_use_dict_word
bool rej_1Il_use_dict_word
Definition: tesseractclass.h:1032

UNICHARSET::get_enabled
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:826

UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449

tesseract::Tesseract::flip_hyphens
void flip_hyphens(WERD_RES *word)
Definition: reject.cpp:616

REJMAP::reject_count
inT16 reject_count()
Definition: rejctmap.h:243

DENORM::x_scale
float x_scale() const
Definition: normalis.h:269

TBOX
Definition: rect.h:30

TRUE
#define TRUE
Definition: capi.h:28

DENORM::y_scale
float y_scale() const
Definition: normalis.h:272

UNICHARSET
Definition: unicharset.h:139

WERD_CHOICE::print
void print() const
Definition: ratngs.h:563

AC_INITIAL_CAP
All lower case except for the initial character.
Definition: control.h:39

STRING
Definition: strngs.h:44

REJMAP::initialise
void initialise(inT16 length)
Definition: rejctmap.cpp:318

NULL
#define NULL
Definition: host.h:144

compute_reject_threshold
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:226

TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:482

tesseract::Tesseract::reject_I_1_L
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:191

GenericVector< float >

STRING::string
const char * string() const
Definition: strngs.cpp:193

tesseract::Tesseract::repeated_nonalphanum_wd
BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
Definition: reject.cpp:582

TBOX::top
inT16 top() const
Definition: rect.h:54

REJMAP::rej_word_contains_blanks
void rej_word_contains_blanks()
Definition: rejctmap.cpp:443

REJMAP::rej_word_bad_permuter
void rej_word_bad_permuter()
Definition: rejctmap.cpp:452

reject_poor_matches
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:207

tesseract::Tesseract::rej_trust_doc_dawg
bool rej_trust_doc_dawg
Definition: tesseractclass.h:1031

tesseract::Tesseract::flip_0O
void flip_0O(WERD_RES *word)
Definition: reject.cpp:673

genericvector.h

tesseract::Tesseract::tessedit_image_border
int tessedit_image_border
Definition: tesseractclass.h:1040

STRING::contains
BOOL8 contains(const char c) const
Definition: strngs.cpp:184

REJMAP::rej_word_not_tess_accepted
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:434

tesseract::Tesseract::non_0_digit
BOOL8 non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:789

inT16
short inT16
Definition: host.h:100

UNICHARSET::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:242

tesseractclass.h