tesseract-ocr.github.io/5.3.3/a00056_source.html

/**********************************************************************

 * File:        reject.cpp  (Formerly reject.c)

 * Description: Rejection functions used in tessedit

 * Author:      Phil Cheatle

 *

 * (C) Copyright 1992, Hewlett-Packard Ltd.

 ** Licensed under the Apache License, Version 2.0 (the "License");

 ** you may not use this file except in compliance with the License.

 ** You may obtain a copy of the License at

 ** http://www.apache.org/licenses/LICENSE-2.0

 ** Unless required by applicable law or agreed to in writing, software

 ** distributed under the License is distributed on an "AS IS" BASIS,

 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 ** See the License for the specific language governing permissions and

 ** limitations under the License.

 *

 **********************************************************************/


// Include automatically generated configuration file if running autoconf.

#ifdef HAVE_CONFIG_H

#  include "config_auto.h"

#endif


#include "reject.h"


#ifdef DISABLED_LEGACY_ENGINE


#  include "tesseractclass.h"


namespace tesseract {


int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {

  const WERD_CHOICE &word = *werd_res->best_choice;

  int dict_word_type = werd_res->tesseract->dict_word(word);

  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;

}

} // namespace tesseract


#else


#  include "control.h"

#  include "docqual.h"

#  include "tesseractclass.h"

#  include "tessvars.h"


#  include "helpers.h"


#  include <algorithm> // for std::sort

#  include <cctype>

#  include <cerrno>

#  include <cstring>

#  include <vector> // for std::vector


namespace tesseract {


/*************************************************************************

 * set_done()

 *

 * Set the done flag based on the word acceptability criteria

 *************************************************************************/


// Computes word->done for the given pass.
//
// The flag starts out true only when the word was tess_accepted and its
// best-choice text contains no ' '. It is then cleared again when either
//  - this is pass 1, the word is non-dictionary or ambiguous, and
//    one_ell_conflict() reports a conflict, or
//  - the word is ambiguous, or is neither from a dictionary permuter nor
//    from NUMBER_PERM.
// With tessedit_rejection_debug set, each decision and the final state are
// printed.
void Tesseract::set_done(WERD_RES *word, int16_t pass) {
  // Only inspect the text when the word was accepted in the first place.
  bool acceptable = word->tess_accepted;
  if (acceptable) {
    acceptable = strchr(word->best_choice->unichar_string().c_str(), ' ') == nullptr;
  }
  word->done = acceptable;

  const bool ambiguous = word->best_choice->dangerous_ambig_found();
  const auto perm = word->best_choice->permuter();
  const bool in_dictionary =
      perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == USER_DAWG_PERM;
  const bool suspect = !in_dictionary || ambiguous;

  // The 1/I/l check only runs on pass 1 and only for suspect words.
  if (word->done && pass == 1 && suspect && one_ell_conflict(word, false)) {
    if (tessedit_rejection_debug) {
      tprintf("one_ell_conflict detected\n");
    }
    word->done = false;
  }

  // Number-permuter words are tolerated here unless they are ambiguous.
  const bool non_dict_non_number =
      !in_dictionary && word->best_choice->permuter() != NUMBER_PERM;
  if (word->done && (non_dict_non_number || ambiguous)) {
    if (tessedit_rejection_debug) {
      tprintf("non-dict or ambig word detected\n");
    }
    word->done = false;
  }

  if (tessedit_rejection_debug) {
    tprintf("set_done(): done=%d\n", word->done);
    word->best_choice->print("");
  }
}


/*************************************************************************

 * make_reject_map()

 *

 * Sets the done flag to indicate whether the result is acceptable.

 *

 * Sets a reject map for the word.

 *************************************************************************/

// Builds word->reject_map and sets word->done.
//
// Steps, in order: flip_0O(), set_done(), initialise the map to one entry
// per character of the best choice, reject blanks, apply the heuristic
// selected by tessedit_reject_mode (0 or 5; anything else is reported and
// asserted on), optionally reject blobs near the image edge, and finally
// flip_hyphens(). `row` is not used by this function.
void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
  flip_0O(word);
  check_debug_pt(word, -1); // For trap only
  set_done(word, pass);     // Set acceptance
  word->reject_map.initialise(word->best_choice->unichar_lengths().length());
  reject_blanks(word);

  if (tessedit_reject_mode == 0) {
    // Mode 0: Ray's original heuristic - the baseline.
    if (!word->done) {
      reject_poor_matches(word);
    }
  } else if (tessedit_reject_mode == 5) {
    // Mode 5: reject I/1/l from words where there is no strong contextual
    // confirmation; the whole of any unacceptable words (incl PERM rej of
    // dubious 1/I/ls); and the whole of any words which are very small.
    if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
      word->reject_map.rej_word_small_xht();
    } else {
      one_ell_conflict(word, true);

      // The checks below repeat, one by one, the conditions that feed the
      // done flag, so that each can be switched on or off independently.
      // None of them modifies the done flag itself.
      if (rej_use_tess_accepted && !word->tess_accepted) {
        word->reject_map.rej_word_not_tess_accepted();
      }

      if (rej_use_tess_blanks &&
          (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr)) {
        word->reject_map.rej_word_contains_blanks();
      }

      WERD_CHOICE *best_choice = word->best_choice;
      if (rej_use_good_perm) {
        const auto perm = best_choice->permuter();
        // A dictionary permuter passes, unless the sensible-word check is
        // enabled and the string is classed as unacceptable.
        bool good_perm =
            perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM || perm == USER_DAWG_PERM;
        if (good_perm && rej_use_sensible_wd) {
          good_perm =
              acceptable_word_string(*word->uch_set, best_choice->unichar_string().c_str(),
                                     best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE;
        }

        if (!good_perm) {
          if (perm == NUMBER_PERM) {
            if (rej_alphas_in_number_perm) {
              // Reject every still-accepted alphabetic character.
              const auto &text = best_choice->unichar_string();
              const auto &char_lengths = best_choice->unichar_lengths();
              int blob = 0;
              int byte_pos = 0;
              while (text[byte_pos] != '\0') {
                if (word->reject_map[blob].accepted() &&
                    word->uch_set->get_isalpha(text.c_str() + byte_pos, char_lengths[blob])) {
                  word->reject_map[blob].setrej_bad_permuter();
                }
                byte_pos += char_lengths[blob];
                ++blob;
              }
            }
          } else {
            word->reject_map.rej_word_bad_permuter();
          }
        }
      }
      /* Ambig word rejection was here once !!*/
    }
  } else {
    tprintf("BAD tessedit_reject_mode\n");
    ASSERT_HOST("Fatal error encountered!" == nullptr);
  }

  if (tessedit_image_border > -1) {
    reject_edge_blobs(word);
  }

  check_debug_pt(word, 10);
  if (tessedit_rejection_debug) {
    tprintf("Permuter Type = %d\n", word->best_choice->permuter());
    tprintf("Certainty: %f     Rating: %f\n", word->best_choice->certainty(),
            word->best_choice->rating());
    tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
  }

  flip_hyphens(word);
  check_debug_pt(word, 20);
}


// Marks every character of the best choice whose first byte is ' ' as a
// tess failure in word->reject_map. The text is walked using the per-character
// byte lengths from unichar_lengths().
void reject_blanks(WERD_RES *word) {
  const auto &text = word->best_choice->unichar_string();
  const auto &char_lengths = word->best_choice->unichar_lengths();

  int16_t blob = 0;     // index into the reject map
  int16_t byte_pos = 0; // byte offset of the current character in text
  while (text[byte_pos] != '\0') {
    if (text[byte_pos] == ' ') {
      // rej unrecognised blobs
      word->reject_map[blob].setrej_tess_failure();
    }
    byte_pos += char_lengths[blob];
    ++blob;
  }
}


// Flags every character of the best choice whose first byte belongs to
// conflict_set_I_l_1 as a 1/I/l conflict in word->reject_map.
void Tesseract::reject_I_1_L(WERD_RES *word) {
  const auto &text = word->best_choice->unichar_string();
  const auto &char_lengths = word->best_choice->unichar_lengths();

  int16_t blob = 0;     // index into the reject map
  int16_t byte_pos = 0; // byte offset of the current character in text
  while (text[byte_pos] != '\0') {
    if (conflict_set_I_l_1.contains(text[byte_pos])) {
      // rej 1Il conflict
      word->reject_map[blob].setrej_1Il_conflict();
    }
    byte_pos += char_lengths[blob];
    ++blob;
  }
}


// Rejects individual characters of the best choice: UNICHAR_SPACE entries are
// marked as tess failures, and any other character whose certainty is below
// the threshold from compute_reject_threshold() is marked as a poor match.
void reject_poor_matches(WERD_RES *word) {
  auto *choice = word->best_choice;
  const float cutoff = compute_reject_threshold(choice);

  for (unsigned pos = 0; pos < choice->length(); ++pos) {
    if (choice->unichar_id(pos) == UNICHAR_SPACE) {
      word->reject_map[pos].setrej_tess_failure();
      continue;
    }
    if (choice->certainty(pos) < cutoff) {
      word->reject_map[pos].setrej_poor_match();
    }
  }
}


/**********************************************************************

 * compute_reject_threshold

 *

 * Set a rejection threshold for this word.

 * Initially this is a trivial function which looks for the largest

 * gap in the certainty value.

 **********************************************************************/


// Computes the certainty threshold below which a character of `word` is
// treated as a poor match.
//
// The per-character certainties are sorted. For words with fewer than three
// characters the threshold is (lowest certainty - 1), so nothing falls below
// it. For longer words the threshold is the midpoint of the largest gap
// between adjacent sorted certainties (the first such gap on ties); if all
// certainties are equal it stays at (lowest certainty - 1).
//
// An empty word has no certainties to inspect; 0 is returned in that case
// instead of indexing into an empty vector.
float compute_reject_threshold(WERD_CHOICE *word) {
  const auto blob_count = word->length();
  if (blob_count == 0) {
    // Nothing to threshold; avoid reading ratings[0] of an empty vector.
    return 0.0f;
  }

  std::vector<float> ratings;
  ratings.reserve(blob_count);
  for (unsigned i = 0; i < blob_count; ++i) {
    ratings.push_back(word->certainty(i));
  }
  std::sort(ratings.begin(), ratings.end());

  float bestgap = 0.0f;            // biggest gap
  float gapstart = ratings[0] - 1; // bottom of gap; all accepted if none better
  if (blob_count >= 3) {
    for (unsigned index = 0; index + 1 < blob_count; ++index) {
      const float gap = ratings[index + 1] - ratings[index];
      if (gap > bestgap) {
        // find biggest
        bestgap = gap;
        gapstart = ratings[index];
      }
    }
  }
  return gapstart + bestgap / 2;
}


/*************************************************************************

 * reject_edge_blobs()

 *

 * If the word is perilously close to the edge of the image, reject those blobs

 * in the word which are too close to the edge as they could be clipped.

 *************************************************************************/

// If the word's bounding box lies closer than tessedit_image_border to any
// side of the image, marks each blob whose own box is that close as an edge
// character in word->reject_map. Asserts that the reject map and the box_word
// have the same length before doing so.
void Tesseract::reject_edge_blobs(WERD_RES *word) {
  // True when `box` violates the border margin on any of the four sides.
  const auto near_image_edge = [this](const TBOX &box) {
    return box.left() < tessedit_image_border || box.bottom() < tessedit_image_border ||
           box.right() + tessedit_image_border > ImageWidth() - 1 ||
           box.top() + tessedit_image_border > ImageHeight() - 1;
  };

  TBOX word_box = word->word->bounding_box();
  // Use the box_word as it is already denormed back to image coordinates.
  int blobcount = word->box_word->length();

  if (!near_image_edge(word_box)) {
    return;
  }

  ASSERT_HOST(word->reject_map.length() == blobcount);
  for (int blobindex = 0; blobindex < blobcount; ++blobindex) {
    if (near_image_edge(word->box_word->BlobBox(blobindex))) {
      // Close to edge
      word->reject_map[blobindex].setrej_edge_char();
    }
  }
}


/**********************************************************************

 * one_ell_conflict()

 *

 * Identify words where there is a potential I/l/1 error.

 * - A bundle of contextual heuristics!

 **********************************************************************/

// Decides whether the best choice of word_res contains a potential 1/I/l
// confusion.
//
// word_res   - word to examine; characters of its best-choice string are
//              temporarily overwritten while alternative spellings are tried.
// update_map - when true, the affected entries of word_res->reject_map are
//              marked with setrej_1Il_conflict() (directly or through
//              reject_I_1_L()).
// Returns true when a conflict is reported, false otherwise.
bool Tesseract::one_ell_conflict(WERD_RES *word_res, bool update_map) {
  const char *word;
  const char *lengths;
  int16_t word_len; // its length
  int16_t first_alphanum_index_;
  int16_t first_alphanum_offset_;
  int16_t i;
  int16_t offset;
  bool non_conflict_set_char; // non conf set a/n?
  bool conflict = false;
  bool allow_1s;
  ACCEPTABLE_WERD_TYPE word_type;
  bool dict_perm_type;
  bool dict_word_ok;
  int dict_word_type;

  // word/lengths point into the best choice's own string buffers; the
  // in-place edits made further down are therefore visible through `word`.
  word = word_res->best_choice->unichar_string().c_str();
  lengths = word_res->best_choice->unichar_lengths().c_str();
  word_len = strlen(lengths);
  /*
  If there are no occurrences of the conflict set characters then the word
  is OK.
*/
  if (strpbrk(word, conflict_set_I_l_1.c_str()) == nullptr) {
    return false;
  }

  /*
  There is a conflict if there are NO other (confirmed) alphanumerics apart
  from those in the conflict set.
*/

  // Stops at the first alphanumeric character that is outside the conflict set.
  for (i = 0, offset = 0, non_conflict_set_char = false; (i < word_len) && !non_conflict_set_char;
       offset += lengths[i++]) {
    non_conflict_set_char = (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
                             word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
                            !conflict_set_I_l_1.contains(word[offset]);
  }
  if (!non_conflict_set_char) {
    if (update_map) {
      reject_I_1_L(word_res);
    }
    return true;
  }

  /*
  If the word is accepted by a dawg permuter, and the first alpha character
  is "I" or "l", check to see if the alternative is also a dawg word. If it
  is, then there is a potential error otherwise the word is ok.
*/

  // DOC_DAWG_PERM only counts as a dictionary permuter / dictionary hit when
  // rej_trust_doc_dawg is set.
  dict_perm_type = (word_res->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
                   (word_res->best_choice->permuter() == USER_DAWG_PERM) ||
                   (rej_trust_doc_dawg && (word_res->best_choice->permuter() == DOC_DAWG_PERM)) ||
                   (word_res->best_choice->permuter() == FREQ_DAWG_PERM);
  dict_word_type = dict_word(*(word_res->best_choice));
  dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));

  if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) ||
      (dict_perm_type && dict_word_ok)) {
    // NOTE(review): first_alphanum_index()/first_alphanum_offset() return -1
    // when nothing is found and the result is used as an index unchecked.
    // The loop above found an alphanumeric via word_res->uch_set, whereas the
    // helpers consult this->unicharset - confirm the two always agree.
    first_alphanum_index_ = first_alphanum_index(word, lengths);
    first_alphanum_offset_ = first_alphanum_offset(word, lengths);
    if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
      // Try the word with the leading 'I' replaced by 'l'; the original
      // character is restored on both outcomes.
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
      if (safe_dict_word(word_res) > 0) {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
        if (update_map) {
          word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
        }
        return true;
      } else {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
        return false;
      }
    }

    if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
      // Mirror case: try the leading 'l' as 'I', then restore it.
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
      if (safe_dict_word(word_res) > 0) {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
        if (update_map) {
          word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
        }
        return true;
      } else {
        word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
        return false;
      }
    }
    // Leading alphanumeric is neither a single-byte 'I' nor 'l'.
    return false;
  }

  /*
  NEW 1Il code. The old code relied on permuter types too much. In fact,
  tess will use TOP_CHOICE permute for good things like "palette".
  In this code the string is examined independently to see if it looks like
  a well formed word.
*/

  /*
  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
  dictionary word.
*/
  // NOTE(review): when the flipped spelling is a dictionary word this returns
  // false WITHOUT restoring the original character in the string, unlike the
  // branch above - confirm that this is intended.
  first_alphanum_index_ = first_alphanum_index(word, lengths);
  first_alphanum_offset_ = first_alphanum_offset(word, lengths);
  if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'l') {
    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
    if (safe_dict_word(word_res) > 0) {
      return false;
    } else {
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
    }
  } else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] == 'I') {
    word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
    if (safe_dict_word(word_res) > 0) {
      return false;
    } else {
      word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
    }
  }
  /*
  For strings containing digits:
    If there are no alphas OR the numeric permuter liked the word,
      reject any non 1 conflict chs
    Else reject all conflict chs
*/
  if (word_contains_non_1_digit(word, lengths)) {
    allow_1s =
        (alpha_count(word, lengths) == 0) || (word_res->best_choice->permuter() == NUMBER_PERM);

    // NOTE(review): this declaration shadows the function-level `offset`, and
    // `conflict` is already false at this point.
    int16_t offset;
    conflict = false;
    for (i = 0, offset = 0; word[offset] != '\0';
         offset += word_res->best_choice->unichar_lengths()[i++]) {
      // A '1' is tolerated only when allow_1s is set; every other conflict-set
      // character counts as a conflict.
      if ((!allow_1s || (word[offset] != '1')) &&
          conflict_set_I_l_1.contains(word[offset])) {
        if (update_map) {
          word_res->reject_map[i].setrej_1Il_conflict();
        }
        conflict = true;
      }
    }
    return conflict;
  }
  /*
  For anything else. See if it conforms to an acceptable word type. If so,
  treat accordingly.
*/
  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
    // Only the first alphanumeric character is checked for these word types.
    first_alphanum_index_ = first_alphanum_index(word, lengths);
    first_alphanum_offset_ = first_alphanum_offset(word, lengths);
    if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {
      if (update_map) {
        word_res->reject_map[first_alphanum_index_].setrej_1Il_conflict();
      }
      return true;
    } else {
      return false;
    }
  } else if (word_type == AC_UPPER_CASE) {
    return false;
  } else {
    // Any other word type: every conflict-set character is suspect.
    if (update_map) {
      reject_I_1_L(word_res);
    }
    return true;
  }
}


// Returns the character index (not byte offset) of the first character of
// `word` that unicharset classes as alphabetic or as a digit, or -1 if there
// is none. `word_lengths` holds the byte length of each character of `word`.
int16_t Tesseract::first_alphanum_index(const char *word, const char *word_lengths) {
  int16_t char_index = 0;
  int16_t byte_offset = 0;
  while (word[byte_offset] != '\0') {
    const char *current = word + byte_offset;
    const auto current_len = word_lengths[char_index];
    if (unicharset.get_isalpha(current, current_len) ||
        unicharset.get_isdigit(current, current_len)) {
      return char_index;
    }
    byte_offset += current_len;
    ++char_index;
  }
  return -1;
}


// Returns the byte offset within `word` of the first character that
// unicharset classes as alphabetic or as a digit, or -1 if there is none.
// `word_lengths` holds the byte length of each character of `word`.
int16_t Tesseract::first_alphanum_offset(const char *word, const char *word_lengths) {
  int16_t char_index = 0;
  int16_t byte_offset = 0;
  while (word[byte_offset] != '\0') {
    const char *current = word + byte_offset;
    const auto current_len = word_lengths[char_index];
    if (unicharset.get_isalpha(current, current_len) ||
        unicharset.get_isdigit(current, current_len)) {
      return byte_offset;
    }
    byte_offset += current_len;
    ++char_index;
  }
  return -1;
}


// Counts the characters of `word` that unicharset classes as alphabetic.
// `word_lengths` holds the byte length of each character of `word`.
int16_t Tesseract::alpha_count(const char *word, const char *word_lengths) {
  int16_t total = 0;
  int16_t char_index = 0;
  int16_t byte_offset = 0;
  while (word[byte_offset] != '\0') {
    const auto current_len = word_lengths[char_index];
    if (unicharset.get_isalpha(word + byte_offset, current_len)) {
      ++total;
    }
    byte_offset += current_len;
    ++char_index;
  }
  return total;
}


// Returns true if `word` contains a character that unicharset classes as a
// digit and that is not the single-byte character '1'. `word_lengths` holds
// the byte length of each character of `word`.
bool Tesseract::word_contains_non_1_digit(const char *word, const char *word_lengths) {
  int16_t char_index = 0;
  int16_t byte_offset = 0;
  while (word[byte_offset] != '\0') {
    const auto current_len = word_lengths[char_index];
    if (unicharset.get_isdigit(word + byte_offset, current_len) &&
        (current_len != 1 || word[byte_offset] != '1')) {
      return true;
    }
    byte_offset += current_len;
    ++char_index;
  }
  return false;
}


/*************************************************************************

 * dont_allow_1Il()

 * Don't unreject LONE accepted 1Il conflict set chars

 *************************************************************************/

// If the only accepted alphanumeric characters of the word belong to
// conflict_set_I_l_1, rejects those characters with setrej_postNN_1Il().
// Does nothing when an accepted alphanumeric outside the conflict set exists,
// or when no conflict-set character is accepted at all.
void Tesseract::dont_allow_1Il(WERD_RES *word) {
  const int num_chars = word->reject_map.length();
  const char *s = word->best_choice->unichar_string().c_str();
  const char *lengths = word->best_choice->unichar_lengths().c_str();

  // First pass: look for accepted conflict-set characters, bailing out as
  // soon as any other accepted alphanumeric is seen.
  bool saw_accepted_1Il = false;
  int byte_pos = 0;
  for (int i = 0; i < num_chars; byte_pos += lengths[i], ++i) {
    if (!word->reject_map[i].accepted()) {
      continue;
    }
    if (conflict_set_I_l_1.contains(s[byte_pos])) {
      saw_accepted_1Il = true;
      continue;
    }
    if (word->uch_set->get_isalpha(s + byte_pos, lengths[i]) ||
        word->uch_set->get_isdigit(s + byte_pos, lengths[i])) {
      return; // >=1 non 1Il ch accepted
    }
  }
  if (!saw_accepted_1Il) {
    return; // Nothing to worry about
  }

  // Second pass: reject every accepted conflict-set character.
  byte_pos = 0;
  for (int i = 0; i < num_chars; byte_pos += lengths[i], ++i) {
    if (conflict_set_I_l_1.contains(s[byte_pos]) && word->reject_map[i].accepted()) {
      word->reject_map[i].setrej_postNN_1Il();
    }
  }
}


// Counts the characters of the best choice that are accepted in
// word_res->reject_map and that word_res->uch_set classes as alphabetic or
// as a digit.
int16_t Tesseract::count_alphanums(WERD_RES *word_res) {
  const WERD_CHOICE *best_choice = word_res->best_choice;
  int total = 0;
  for (unsigned pos = 0; pos < word_res->reject_map.length(); ++pos) {
    if (!word_res->reject_map[pos].accepted()) {
      continue;
    }
    const auto id = best_choice->unichar_id(pos);
    if (word_res->uch_set->get_isalpha(id) || word_res->uch_set->get_isdigit(id)) {
      ++total;
    }
  }
  return total;
}


// reject all if most rejected.

// Rejects the whole word when the fraction of rejected entries in its reject
// map is at least rej_whole_of_mostly_reject_word_fract.
// An empty reject map is left untouched; the explicit check avoids computing
// 0 / 0 for it.
void Tesseract::reject_mostly_rejects(WERD_RES *word) {
  const auto map_length = word->reject_map.length();
  if (map_length == 0) {
    return; // no entries, so no fraction to evaluate
  }

  const float rejected_fraction =
      static_cast<float>(word->reject_map.reject_count()) / map_length;
  if (rejected_fraction >= rej_whole_of_mostly_reject_word_fract) {
    word->reject_map.rej_word_mostly_rej();
  }
}


// Returns true when the best choice has more than one character, its first
// byte is listed in ok_repeated_ch_non_alphanum_wds, all its unichar ids are
// identical, and the two counts reported by word_char_quality() are equal to
// each other and to the number of characters. `row` is not used.
bool Tesseract::repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {
  auto *choice = word->best_choice;

  if (choice->unichar_lengths().length() <= 1) {
    return false;
  }
  if (!ok_repeated_ch_non_alphanum_wds.contains(choice->unichar_string()[0])) {
    return false;
  }

  // Every character must repeat the first one.
  const UNICHAR_ID first_id = choice->unichar_id(0);
  for (unsigned pos = 1; pos < choice->length(); ++pos) {
    if (choice->unichar_id(pos) != first_id) {
      return false;
    }
  }

  int16_t char_quality;
  int16_t accepted_char_quality;
  word_char_quality(word, &char_quality, &accepted_char_quality);

  return (choice->unichar_lengths().length() == static_cast<size_t>(char_quality)) &&
         (char_quality == accepted_char_quality);
}


int16_t Tesseract::safe_dict_word(const WERD_RES *werd_res) {

  const WERD_CHOICE &word = *werd_res->best_choice;

  int dict_word_type = werd_res->tesseract->dict_word(word);

  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;

}


// Note: After running this function word_res->ratings

// might not contain the right BLOB_CHOICE corresponding to each character

// in word_res->best_choice.

// Adjusts "." and "-" characters of the best choice based on the width/height
// ratio of their blobs in rebuild_word. Does nothing when
// tessedit_lower_flip_hyphen <= 1. Only blobs that are wider than
// 8 * denorm.x_scale() and that do not overlap their horizontal neighbours
// are considered.
//  - A "." with ratio >= tessedit_upper_flip_hyphen becomes "-" (when "-" is
//    present and enabled in the unicharset) and, if rejected, is accepted.
//  - A "." with ratio > tessedit_lower_flip_hyphen that is accepted is
//    rejected as a suspected hyphen.
//  - A rejected "-" with ratio >= tessedit_upper_flip_hyphen is accepted.
//  - An accepted "-" with ratio <= tessedit_lower_flip_hyphen is rejected.
void Tesseract::flip_hyphens(WERD_RES *word_res) {
  if (tessedit_lower_flip_hyphen <= 1) {
    return;
  }

  WERD_CHOICE *choice = word_res->best_choice;
  auto blob_total = word_res->rebuild_word->NumBlobs();
  UNICHAR_ID dash_id = word_res->uch_set->unichar_to_id("-");

  int prev_right = -9999; // right edge of the previous blob
  for (unsigned i = 0; i < choice->length() && i < blob_total; ++i) {
    TBLOB *blob = word_res->rebuild_word->blobs[i];
    TBOX box = blob->bounding_box();

    int next_left = 9999; // left edge of the following blob, if any
    if (i + 1 != blob_total) {
      next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
    }

    // Don't touch small or touching blobs - it is too dangerous.
    const bool wide_and_isolated = (box.width() > 8 * word_res->denorm.x_scale()) &&
                                   (box.left() > prev_right) && (box.right() < next_left);
    if (wide_and_isolated) {
      const float aspect_ratio = box.width() / static_cast<float>(box.height());

      if (word_res->uch_set->eq(choice->unichar_id(i), ".")) {
        if (aspect_ratio >= tessedit_upper_flip_hyphen &&
            word_res->uch_set->contains_unichar_id(dash_id) &&
            word_res->uch_set->get_enabled(dash_id)) {
          /* Certain HYPHEN */
          choice->set_unichar_id(dash_id, i);
          if (word_res->reject_map[i].rejected()) {
            word_res->reject_map[i].setrej_hyphen_accept();
          }
        }
        if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->reject_map[i].accepted()) {
          // Suspected HYPHEN
          word_res->reject_map[i].setrej_hyphen();
        }
      } else if (choice->unichar_id(i) == dash_id) {
        // Certain HYPHEN
        if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->reject_map[i].rejected())) {
          word_res->reject_map[i].setrej_hyphen_accept();
        }
        // Suspected HYPHEN
        if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->reject_map[i].accepted())) {
          word_res->reject_map[i].setrej_hyphen();
        }
      }
    }
    prev_right = box.right();
  }
}


// Note: After running this function word_res->ratings

// might not contain the right BLOB_CHOICE corresponding to each character

// in word_res->best_choice.

// Rewrites "0" (zero) and "O" (capital o) characters of the best choice in
// place, based on the characters around them. Does nothing unless
// tessedit_flip_0O is set, if the size check on upper-case/digit blobs
// fails, or if either "0" or "O" is missing or disabled in the unicharset.
// The pattern rules are applied one after another to the same position, and
// several of them advance the loop index, so their order matters.
void Tesseract::flip_0O(WERD_RES *word_res) {
  WERD_CHOICE *best_choice = word_res->best_choice;
  TBOX out_box;

  if (!tessedit_flip_0O) {
    return;
  }

  // Give up on the whole word if any upper-case or digit blob has its top
  // below kBlnBaselineOffset + kBlnXHeight or its bottom above
  // kBlnBaselineOffset + kBlnXHeight / 4.
  auto num_blobs = word_res->rebuild_word->NumBlobs();
  for (unsigned i = 0; i < best_choice->length() && i < num_blobs; ++i) {
    TBLOB *blob = word_res->rebuild_word->blobs[i];
    if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
        word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
      out_box = blob->bounding_box();
      if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
          (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4)) {
        return; // Beware words with sub/superscripts
      }
    }
  }
  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
  if (unichar_0 == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_0) ||
      unichar_O == INVALID_UNICHAR_ID || !word_res->uch_set->get_enabled(unichar_O)) {
    return; // 0 or O are not present/enabled in unicharset
  }
  // Starts at 1 because every rule looks at the character at i - 1.
  for (unsigned i = 1; i < best_choice->length(); ++i) {
    if (best_choice->unichar_id(i) == unichar_0 || best_choice->unichar_id(i) == unichar_O) {
      /* A0A */
      // Between two non-O capitals: make it an O.
      if ((i + 1) < best_choice->length() &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
        best_choice->set_unichar_id(unichar_O, i);
      }
      /* A00A */
      // Pair of 0/O between two non-O capitals: set the first to O and move
      // on to the second, which the following rules then examine.
      if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
          (i + 1) < best_choice->length() &&
          (best_choice->unichar_id(i + 1) == unichar_0 ||
           best_choice->unichar_id(i + 1) == unichar_O) &&
          (i + 2) < best_choice->length() &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i + 2))) {
        best_choice->set_unichar_id(unichar_O, i);
        i++;
      }
      /* AA0<non digit or end of word> */
      // After two non-O capitals, and followed by the end of the word or by
      // something that is not a digit, "l" or "I": make it an O.
      if ((i > 1) && non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 2)) &&
          non_O_upper(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
          (((i + 1) < best_choice->length() &&
            !word_res->uch_set->get_isdigit(best_choice->unichar_id(i + 1)) &&
            !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "l") &&
            !word_res->uch_set->eq(best_choice->unichar_id(i + 1), "I")) ||
           (i == best_choice->length() - 1))) {
        best_choice->set_unichar_id(unichar_O, i);
      }
      /* 9O9 */
      // Between two non-zero digits: make it a 0.
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
          (i + 1) < best_choice->length() &&
          non_0_digit(*word_res->uch_set, best_choice->unichar_id(i + 1))) {
        best_choice->set_unichar_id(unichar_0, i);
      }
      /* 9OOO */
      // Non-zero digit followed by three 0/O characters: all three become 0
      // and the index skips past them.
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
          (i + 2) < best_choice->length() &&
          (best_choice->unichar_id(i + 1) == unichar_0 ||
           best_choice->unichar_id(i + 1) == unichar_O) &&
          (best_choice->unichar_id(i + 2) == unichar_0 ||
           best_choice->unichar_id(i + 2) == unichar_O)) {
        best_choice->set_unichar_id(unichar_0, i);
        best_choice->set_unichar_id(unichar_0, i + 1);
        best_choice->set_unichar_id(unichar_0, i + 2);
        i += 2;
      }
      /* 9OO<non upper> */
      // Non-zero digit, two 0/O characters, then something that is not upper
      // case: both become 0.
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
          (i + 2) < best_choice->length() &&
          (best_choice->unichar_id(i + 1) == unichar_0 ||
           best_choice->unichar_id(i + 1) == unichar_O) &&
          !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 2))) {
        best_choice->set_unichar_id(unichar_0, i);
        best_choice->set_unichar_id(unichar_0, i + 1);
        i++;
      }
      /* 9O<non upper> */
      // Non-zero digit before, non-upper-case character after: make it a 0.
      if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i - 1)) &&
          (i + 1) < best_choice->length() &&
          !word_res->uch_set->get_isupper(best_choice->unichar_id(i + 1))) {
        best_choice->set_unichar_id(unichar_0, i);
      }
      /* 9[.,]OOO.. */
      // Preceded by "." or "," which itself follows a digit or an O: the
      // leading O (if any) and the whole following run of 0/O become 0.
      if ((i > 1) &&
          (word_res->uch_set->eq(best_choice->unichar_id(i - 1), ".") ||
           word_res->uch_set->eq(best_choice->unichar_id(i - 1), ",")) &&
          (word_res->uch_set->get_isdigit(best_choice->unichar_id(i - 2)) ||
           best_choice->unichar_id(i - 2) == unichar_O)) {
        if (best_choice->unichar_id(i - 2) == unichar_O) {
          best_choice->set_unichar_id(unichar_0, i - 2);
        }
        while (i < best_choice->length() && (best_choice->unichar_id(i) == unichar_O ||
                                             best_choice->unichar_id(i) == unichar_0)) {
          best_choice->set_unichar_id(unichar_0, i);
          i++;
        }
        // Step back so the for loop's ++i resumes right after the run.
        i--;
      }
    }
  }
}


// Returns true when unichar_id is upper case in ch_set and is not "O".
bool Tesseract::non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
  if (!ch_set.get_isupper(unichar_id)) {
    return false;
  }
  return !ch_set.eq(unichar_id, "O");
}


// Returns true when unichar_id is a digit in ch_set and is not "0".
bool Tesseract::non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id) {
  if (!ch_set.get_isdigit(unichar_id)) {
    return false;
  }
  return !ch_set.eq(unichar_id, "0");
}

} // namespace tesseract


#endif // def DISABLED_LEGACY_ENGINE

reject.h

docqual.h

control.h

ACCEPTABLE_WERD_TYPE
ACCEPTABLE_WERD_TYPE
Definition: control.h:28

AC_INITIAL_CAP
@ AC_INITIAL_CAP
ALL but initial lc.
Definition: control.h:32

AC_UNACCEPTABLE
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29

AC_UPPER_CASE
@ AC_UPPER_CASE
ALL upper case.
Definition: control.h:31

AC_LOWER_CASE
@ AC_LOWER_CASE
ALL lower case.
Definition: control.h:30

tessvars.h

tesseractclass.h

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:54

helpers.h

i
int i
Definition: gmock-matchers_test.cc:718

count
int * count
Definition: gmock_stress_test.cc:96

tesseract
Definition: baseapi.h:39

tesseract::compute_reject_threshold
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:227

tesseract::tprintf
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

tesseract::kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:33

tesseract::reject_poor_matches
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:208

tesseract::UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:34

tesseract::UNICHAR_SPACE
@ UNICHAR_SPACE
Definition: unicharset.h:36

tesseract::SYSTEM_DAWG_PERM
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:244

tesseract::NUMBER_PERM
@ NUMBER_PERM
Definition: ratngs.h:242

tesseract::USER_DAWG_PERM
@ USER_DAWG_PERM
Definition: ratngs.h:246

tesseract::DOC_DAWG_PERM
@ DOC_DAWG_PERM
Definition: ratngs.h:245

tesseract::FREQ_DAWG_PERM
@ FREQ_DAWG_PERM
Definition: ratngs.h:247

tesseract::kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:34

tesseract::reject_blanks
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:182

tesseract::Tesseract::first_alphanum_index
int16_t first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:457

tesseract::Tesseract::reject_edge_blobs
void reject_edge_blobs(WERD_RES *word)
Definition: reject.cpp:260

tesseract::Tesseract::word_char_quality
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:81

tesseract::Tesseract::first_alphanum_offset
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:470

tesseract::Tesseract::non_O_upper
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:768

tesseract::Tesseract::alpha_count
int16_t alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:483

tesseract::Tesseract::dont_allow_1Il
void dont_allow_1Il(WERD_RES *word)
Definition: reject.cpp:513

tesseract::Tesseract::count_alphanums
int16_t count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:375

tesseract::Tesseract::acceptable_word_string
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1692

tesseract::Tesseract::one_ell_conflict
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
Definition: reject.cpp:287

tesseract::Tesseract::safe_dict_word
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:593

tesseract::Tesseract::set_done
void set_done(WERD_RES *word, int16_t pass)
Definition: reject.cpp:62

tesseract::Tesseract::non_0_digit
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:772

tesseract::Tesseract::check_debug_pt
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1799

tesseract::Tesseract::flip_hyphens
void flip_hyphens(WERD_RES *word)
Definition: reject.cpp:602

tesseract::Tesseract::ImageHeight
int ImageHeight() const
Definition: tesseractclass.h:260

tesseract::Tesseract::reject_I_1_L
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:195

tesseract::Tesseract::reject_mostly_rejects
void reject_mostly_rejects(WERD_RES *word)
Definition: reject.cpp:556

tesseract::Tesseract::ImageWidth
int ImageWidth() const
Definition: tesseractclass.h:257

tesseract::Tesseract::word_contains_non_1_digit
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:496

tesseract::Tesseract::repeated_nonalphanum_wd
bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
Definition: reject.cpp:565

tesseract::Tesseract::flip_0O
void flip_0O(WERD_RES *word)
Definition: reject.cpp:660

tesseract::Tesseract::make_reject_map
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
Definition: reject.cpp:96

tesseract::TBLOB
Definition: blobs.h:291

tesseract::TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:466

tesseract::TWERD::blobs
std::vector< TBLOB * > blobs
Definition: blobs.h:462

tesseract::TWERD::NumBlobs
unsigned NumBlobs() const
Definition: blobs.h:449

tesseract::BoxWord::length
unsigned length() const
Definition: boxword.h:81

tesseract::BoxWord::BlobBox
const TBOX & BlobBox(unsigned index) const
Definition: boxword.h:84

tesseract::DENORM::y_scale
float y_scale() const
Definition: normalis.h:262

tesseract::DENORM::x_scale
float x_scale() const
Definition: normalis.h:259

tesseract::ROW
Definition: ocrrow.h:39

tesseract::WERD_RES
Definition: pageres.h:164

tesseract::WERD_RES::tesseract
tesseract::Tesseract * tesseract
Definition: pageres.h:278

tesseract::WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:239

tesseract::WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:301

tesseract::WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:292

tesseract::WERD_RES::done
bool done
Definition: pageres.h:303

tesseract::WERD_RES::denorm
DENORM denorm
Definition: pageres.h:199

tesseract::WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:201

tesseract::WERD_RES::word
WERD * word
Definition: pageres.h:184

tesseract::WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:270

tesseract::WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:264

tesseract::WERD_CHOICE
Definition: ratngs.h:258

tesseract::WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:315

tesseract::WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
Definition: ratngs.h:344

tesseract::WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:299

tesseract::WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:331

tesseract::WERD_CHOICE::dangerous_ambig_found
bool dangerous_ambig_found() const
Definition: ratngs.h:348

tesseract::WERD_CHOICE::length
unsigned length() const
Definition: ratngs.h:287

tesseract::WERD_CHOICE::print
void print() const
Definition: ratngs.h:561

tesseract::WERD_CHOICE::unichar_lengths
const std::string & unichar_lengths() const
Definition: ratngs.h:533

tesseract::WERD_CHOICE::unichar_string
std::string & unichar_string()
Definition: ratngs.h:519

tesseract::WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:312

tesseract::TBOX
Definition: rect.h:37

tesseract::TBOX::left
TDimension left() const
Definition: rect.h:82

tesseract::TBOX::height
TDimension height() const
Definition: rect.h:118

tesseract::TBOX::width
TDimension width() const
Definition: rect.h:126

tesseract::TBOX::top
TDimension top() const
Definition: rect.h:68

tesseract::TBOX::right
TDimension right() const
Definition: rect.h:89

tesseract::TBOX::bottom
TDimension bottom() const
Definition: rect.h:75

tesseract::REJMAP::rej_word_not_tess_accepted
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:139

tesseract::REJMAP::reject_count
int16_t reject_count() const
Definition: rejctmap.h:339

tesseract::REJMAP::rej_word_contains_blanks
void rej_word_contains_blanks()
Definition: rejctmap.cpp:147

tesseract::REJMAP::rej_word_small_xht
void rej_word_small_xht()
Definition: rejctmap.cpp:127

tesseract::REJMAP::length
uint16_t length() const
Definition: rejctmap.h:333

tesseract::REJMAP::initialise
void initialise(uint16_t length)
Definition: rejctmap.cpp:67

tesseract::REJMAP::rej_word_bad_permuter
void rej_word_bad_permuter()
Definition: rejctmap.cpp:155

tesseract::REJMAP::rej_word_mostly_rej
void rej_word_mostly_rej()
Definition: rejctmap.cpp:179

tesseract::WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:155

tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:61

tesseract::UNICHARSET
Definition: unicharset.h:164

tesseract::UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497

tesseract::UNICHARSET::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:303

tesseract::UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515

tesseract::UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524

tesseract::UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186

tesseract::UNICHARSET::get_enabled
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:911

tesseract::UNICHARSET::eq
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:713

tesseract::Wordrec::dict_word
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:86