tesseract-ocr.github.io/5.3.3/a00125_source.html

/******************************************************************

 * File:        docqual.cpp  (Formerly docqual.c)

 * Description: Document Quality Metrics

 * Author:      Phil Cheatle

 *

 * (C) Copyright 1994, Hewlett-Packard Ltd.

 ** Licensed under the Apache License, Version 2.0 (the "License");

 ** you may not use this file except in compliance with the License.

 ** You may obtain a copy of the License at

 ** http://www.apache.org/licenses/LICENSE-2.0

 ** Unless required by applicable law or agreed to in writing, software

 ** distributed under the License is distributed on an "AS IS" BASIS,

 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 ** See the License for the specific language governing permissions and

 ** limitations under the License.

 *

 **********************************************************************/


#include "docqual.h"

#include <cctype>

#include "reject.h"

#include "tesseractclass.h"

#include "tessvars.h"


namespace tesseract {


// Callback handed to ProcessMatchedBlobs: adds one to the running tally each
// time it is invoked. The blob index supplied by the caller is not used.
static void countMatchingBlobs(int16_t &match_count, int /*index*/) {
  match_count += 1;
}


// Callback handed to ProcessMatchedBlobs: every call counts one blob in
// match_count; the blob is also counted in accepted_match_count when the
// word's reject map reports the entry at `index` as accepted.
static void countAcceptedBlobs(WERD_RES *word, int16_t &match_count, int16_t &accepted_match_count,
                               int index) {
  accepted_match_count += word->reject_map[index].accepted() ? 1 : 0;
  match_count += 1;
}


// Callback handed to ProcessMatchedBlobs: if the reject map entry at `index`
// reports accept_if_good_quality(), flag it via setrej_quality_accept().
static void acceptIfGoodQuality(WERD_RES *word, int index) {
  auto &&entry = word->reject_map[index];
  if (!entry.accept_if_good_quality()) {
    return;
  }
  entry.setrej_quality_accept();
}


/*************************************************************************
 * word_blob_quality()
 * How many blobs in the box_word are identical to those of the inword?
 * ASSUME blobs in both initial word and box_word are in ascending order of
 * left hand blob edge.
 *
 * Returns the number of times ProcessMatchedBlobs invoked the callback, or 0
 * when the word has no bln_boxes, no rebuild_word, or no blobs.
 *************************************************************************/
int16_t Tesseract::word_blob_quality(WERD_RES *word) {
  int16_t match_count = 0;
  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
      !word->rebuild_word->blobs.empty()) {
    // The counter must be captured by reference. std::bind stores a copy of
    // every bound argument, so binding match_count directly only incremented
    // that private copy and this function always returned 0.
    word->bln_boxes->ProcessMatchedBlobs(
        *word->rebuild_word,
        [&match_count](int index) { countMatchingBlobs(match_count, index); });
  }
  return match_count;
}


// Sums count_outline_errs() over the blobs of word->rebuild_word, pairing the
// n-th blob with the n-th char of the best choice's unichar_string().
// Returns 0 when there is no rebuild_word.
// NOTE(review): the string is indexed per blob, i.e. per byte; presumably
// this assumes one byte per character and at least as many bytes as blobs -
// TODO confirm.
int16_t Tesseract::word_outline_errs(WERD_RES *word) {
  int16_t total_errs = 0;
  if (word->rebuild_word == nullptr) {
    return total_errs;
  }

  int16_t char_pos = 0;
  for (unsigned blob_pos = 0; blob_pos < word->rebuild_word->NumBlobs(); ++blob_pos, ++char_pos) {
    TBLOB *blob = word->rebuild_word->blobs[blob_pos];
    total_errs +=
        count_outline_errs(word->best_choice->unichar_string()[char_pos], blob->NumOutlines());
  }
  return total_errs;
}


/*************************************************************************
 * word_char_quality()
 * Combination of blob quality and outline quality - how many good chars are
 * there?
 *
 * Outputs (both reset to 0 first):
 *   *match_count          - number of blobs ProcessMatchedBlobs reported.
 *   *accepted_match_count - how many of those have an accepted reject map
 *                           entry.
 * Both stay 0 when the word has no bln_boxes, no rebuild_word, or no blobs.
 *************************************************************************/
void Tesseract::word_char_quality(WERD_RES *word, int16_t *match_count,
                                  int16_t *accepted_match_count) {
  *match_count = 0;
  *accepted_match_count = 0;
  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
      !word->rebuild_word->blobs.empty()) {
    // Capture the output pointers and dereference them inside the callback.
    // std::bind copies its bound arguments, so binding *match_count and
    // *accepted_match_count updated private copies and left the caller's
    // outputs at 0.
    word->bln_boxes->ProcessMatchedBlobs(
        *word->rebuild_word, [word, match_count, accepted_match_count](int index) {
          countAcceptedBlobs(word, *match_count, *accepted_match_count, index);
        });
  }
}


/*************************************************************************
 * unrej_good_chs()
 * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
 *
 * For every blob reported by ProcessMatchedBlobs, the matching reject map
 * entry is flagged with setrej_quality_accept() when it reports
 * accept_if_good_quality(). Does nothing when the word has no bln_boxes, no
 * rebuild_word, or no blobs.
 *************************************************************************/
void Tesseract::unrej_good_chs(WERD_RES *word) {
  // The blob list must be NON-empty, as in word_blob_quality() and
  // word_char_quality(). The previous test (blobs.empty()) only ran the
  // matching when there was nothing to match.
  if (word->bln_boxes != nullptr && word->rebuild_word != nullptr &&
      !word->rebuild_word->blobs.empty()) {
    using namespace std::placeholders; // for _1
    word->bln_boxes->ProcessMatchedBlobs(*word->rebuild_word,
                                         std::bind(acceptIfGoodQuality, word, _1));
  }
}


// Returns how far outline_count is from the number of outlines expected for
// character c: 2 for characters listed in outlines_2, 1 otherwise.
// Characters listed in outlines_odd are never penalised (result 0).
int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
  if (outlines_odd.contains(c)) {
    return 0; // Don't use this char
  }
  const int expected_outline_count = outlines_2.contains(c) ? 2 : 1;
  return abs(outline_count - expected_outline_count);
}


// Runs the quality-driven rejection steps in order:
//   1. unrej_good_quality_words(), only when tessedit_good_quality_unrej is
//      set and the caller says the document is of good quality;
//   2. doc_and_block_rejection(), always;
//   3. tilde_crunch() followed by tilde_delete(), only when
//      unlv_tilde_crunching is set.
void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc) {
  if (tessedit_good_quality_unrej && good_quality_doc) {
    unrej_good_quality_words(page_res_it);
  }

  doc_and_block_rejection(page_res_it, good_quality_doc);

  if (!unlv_tilde_crunching) {
    return;
  }
  tilde_crunch(page_res_it);
  tilde_delete(page_res_it);
}


/*************************************************************************
 * unrej_good_quality_words()
 * Accept potential rejects in words which pass the following checks:
 *    - Contains a potential reject
 *    - Word looks like a sensible alpha word.
 *    - Word segmentation is the same as the original image
 *    - All characters have the expected number of outlines
 * When bland_unrej is set, every reject map entry that reports
 * accept_if_good_quality() is accepted without those checks.
 * Rows whose reject ratio exceeds quality_rowrej_pc (or which have no
 * chars) are skipped as a whole.
 * NOTE - the rejection counts are recalculated after unrejection
 *      - CAN'T do it in a single pass without a bit of fiddling
 *      - keep it simple but inefficient
 *************************************************************************/
void Tesseract::unrej_good_quality_words( // unreject potential
    PAGE_RES_IT &page_res_it) {
  // Pass 1: unreject wherever the checks allow it.
  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    check_debug_pt(page_res_it.word(), 100);
    if (bland_unrej) {
      WERD_RES *word = page_res_it.word();
      for (int pos = 0; pos < word->reject_map.length(); pos++) {
        if (word->reject_map[pos].accept_if_good_quality()) {
          word->reject_map[pos].setrej_quality_accept();
        }
      }
      page_res_it.forward();
    } else {
      ROW_RES *row = page_res_it.row();
      const bool row_good_enough =
          (row->char_count > 0) &&
          ((row->rej_count / static_cast<float>(row->char_count)) <= quality_rowrej_pc);
      if (row_good_enough) {
        WERD_RES *word = page_res_it.word();
        if (word->reject_map.quality_recoverable_rejects()) {
          const bool word_looks_ok =
              tessedit_unrej_any_wd ||
              acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),
                                     word->best_choice->unichar_lengths().c_str()) !=
                  AC_UNACCEPTABLE;
          if (word_looks_ok) {
            unrej_good_chs(word);
          }
        }
        page_res_it.forward();
      } else {
        // Skip to end of dodgy row.
        while ((page_res_it.word() != nullptr) && (page_res_it.row() == row)) {
          page_res_it.forward();
        }
      }
    }
    check_debug_pt(page_res_it.word(), 110);
  }

  // Pass 2: rebuild the page/block/row statistics from scratch.
  page_res_it.restart_page();
  page_res_it.page_res->char_count = 0;
  page_res_it.page_res->rej_count = 0;
  BLOCK_RES *stats_block = nullptr;
  ROW_RES *stats_row = nullptr;
  for (; page_res_it.word() != nullptr; page_res_it.forward()) {
    if (page_res_it.block() != stats_block) {
      // First word of a new block: zero its counters.
      stats_block = page_res_it.block();
      stats_block->char_count = 0;
      stats_block->rej_count = 0;
    }
    if (page_res_it.row() != stats_row) {
      // First word of a new row: zero its counters.
      stats_row = page_res_it.row();
      stats_row->char_count = 0;
      stats_row->rej_count = 0;
      stats_row->whole_word_rej_count = 0;
    }
    page_res_it.rej_stat_word();
  }
}


/*************************************************************************

 * doc_and_block_rejection()

 *

 * If the page has too many rejects - reject all of it.

 * If any block has too many rejects - reject all words in the block

 * Otherwise each row of the block is tested, and rows with too many rejects

 * have their words rejected.

 *

 * page_res_it      - iterator over the page results; it is restarted here.

 * good_quality_doc - when true and tessedit_row_rej_good_docs is off, row

 *                    rejection only hits words whose reject fraction

 *                    exceeds tessedit_good_doc_still_rowrej_wd.

 *************************************************************************/


void Tesseract::doc_and_block_rejection( // reject big chunks

    PAGE_RES_IT &page_res_it, bool good_quality_doc) {

  int16_t block_no = 0;

  int16_t row_no = 0;

  BLOCK_RES *current_block;

  ROW_RES *current_row;


  bool rej_word;

  bool prev_word_rejected;

  int16_t char_quality = 0;

  int16_t accepted_char_quality;


  // Page level test: percentage of rejected chars on the whole page.

  // NOTE(review): unlike the block and row tests below there is no

  // char_count > 0 guard here; the division is floating point - TODO confirm

  // that an empty page is meant to fall through to the else branch.

  if (page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count >

      tessedit_reject_doc_percent) {

    reject_whole_page(page_res_it);

    if (tessedit_debug_doc_rejection) {

      tprintf("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count,

              page_res_it.page_res->rej_count);

    }

  } else {

    if (tessedit_debug_doc_rejection) {

      tprintf("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n", page_res_it.page_res->char_count,

              page_res_it.page_res->rej_count);

    }


    /* Walk blocks testing for block rejection */


    page_res_it.restart_page();

    WERD_RES *word;

    // Each pass of this loop consumes one whole block: both branches below

    // advance the iterator until the block changes or the page ends.

    while ((word = page_res_it.word()) != nullptr) {

      current_block = page_res_it.block();

      block_no = current_block->block->pdblk.index();

      if (current_block->char_count > 0 &&

          (current_block->rej_count * 100.0 / current_block->char_count) >

              tessedit_reject_block_percent) {

        if (tessedit_debug_block_rejection) {

          tprintf("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n", block_no,

                  current_block->char_count, current_block->rej_count);

        }

        prev_word_rejected = false;

        while ((word = page_res_it.word()) != nullptr && (page_res_it.block() == current_block)) {

          if (tessedit_preserve_blk_rej_perfect_wds) {

            // Words without rejects and of at least the minimum length are

            // preserved; everything else is a candidate for rejection.

            rej_word = word->reject_map.reject_count() > 0 ||

                       word->reject_map.length() < tessedit_preserve_min_wd_len;

            // Second chance for long enough, acceptable looking words: they

            // are kept when word_char_quality() reports a match count equal

            // to the word length.

            if (rej_word && tessedit_dont_blkrej_good_wds &&

                word->reject_map.length() >= tessedit_preserve_min_wd_len &&

                acceptable_word_string(*word->uch_set, word->best_choice->unichar_string().c_str(),

                                       word->best_choice->unichar_lengths().c_str()) !=

                    AC_UNACCEPTABLE) {

              word_char_quality(word, &char_quality, &accepted_char_quality);

              rej_word = char_quality != word->reject_map.length();

            }

          } else {

            rej_word = true;

          }

          if (rej_word) {

            /*
             * Reject spacing if both current and prev words are rejected.
             * NOTE - this is NOT restricted to FUZZY spaces. - When tried this
             * generated more space errors.
             */

            if (tessedit_use_reject_spaces && prev_word_rejected &&

                page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {

              word->reject_spaces = true;

            }

            word->reject_map.rej_word_block_rej();

          }

          prev_word_rejected = rej_word;

          page_res_it.forward();

        }

      } else {

        if (tessedit_debug_block_rejection) {

          tprintf("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n", block_no,

                  page_res_it.block()->char_count, page_res_it.block()->rej_count);

        }


        /* Walk rows in block testing for row rejection */

        // row_no restarts at 0 for every block and is only used in the debug

        // output below.

        row_no = 0;

        while (page_res_it.word() != nullptr && page_res_it.block() == current_block) {

          current_row = page_res_it.row();

          row_no++;

          /* Reject whole row if:
           *   fraction of chars on row which are rejected exceed a limit AND
           *   fraction rejects which occur in WHOLE WERD rejects is LESS THAN
           *   a limit
           */

          if (current_row->char_count > 0 &&

              (current_row->rej_count * 100.0 / current_row->char_count) >

                  tessedit_reject_row_percent &&

              (current_row->whole_word_rej_count * 100.0 / current_row->rej_count) <

                  tessedit_whole_wd_rej_row_percent) {

            if (tessedit_debug_block_rejection) {

              tprintf("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n", row_no,

                      current_row->char_count, current_row->rej_count);

            }

            prev_word_rejected = false;

            while ((word = page_res_it.word()) != nullptr && page_res_it.row() == current_row) {

              /* Preserve words on good docs unless they are mostly rejected*/

              if (!tessedit_row_rej_good_docs && good_quality_doc) {

                rej_word = word->reject_map.reject_count() /

                               static_cast<float>(word->reject_map.length()) >

                           tessedit_good_doc_still_rowrej_wd;

              } else if (tessedit_preserve_row_rej_perfect_wds) {

                /* Preserve perfect words anyway */

                rej_word = word->reject_map.reject_count() > 0 ||

                           word->reject_map.length() < tessedit_preserve_min_wd_len;

                // Same second chance as in the block case, controlled by

                // tessedit_dont_rowrej_good_wds.

                if (rej_word && tessedit_dont_rowrej_good_wds &&

                    word->reject_map.length() >= tessedit_preserve_min_wd_len &&

                    acceptable_word_string(

                        *word->uch_set, word->best_choice->unichar_string().c_str(),

                        word->best_choice->unichar_lengths().c_str()) != AC_UNACCEPTABLE) {

                  word_char_quality(word, &char_quality, &accepted_char_quality);

                  rej_word = char_quality != word->reject_map.length();

                }

              } else {

                rej_word = true;

              }

              if (rej_word) {

                /*
                 * Reject spacing if both current and prev words are rejected.
                 * NOTE - this is NOT restricted to FUZZY spaces. - When tried
                 * this generated more space errors.
                 */

                if (tessedit_use_reject_spaces && prev_word_rejected &&

                    page_res_it.prev_row() == page_res_it.row() && word->word->space() == 1) {

                  word->reject_spaces = true;

                }

                word->reject_map.rej_word_row_rej();

              }

              prev_word_rejected = rej_word;

              page_res_it.forward();

            }

          } else {

            if (tessedit_debug_block_rejection) {

              tprintf("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n", row_no,

                      current_row->char_count, current_row->rej_count);

            }

            // Row is kept: just step over all of its words.

            while (page_res_it.word() != nullptr && page_res_it.row() == current_row) {

              page_res_it.forward();

            }

          }

        }

      }

    }

  }

}


/*************************************************************************
 * reject_whole_page()
 * Don't believe any of it - calls rej_word_doc_rej() on the reject map of
 * every word on the page, then marks the page itself as rejected.
 *************************************************************************/

void reject_whole_page(PAGE_RES_IT &page_res_it) {
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
    page_res_it.word()->reject_map.rej_word_doc_rej();
  }
  // whole page is rejected
  page_res_it.page_res->rejected = true;
}


// Walks all words in text blocks and sets unlv_crunch_mode = CR_KEEP_SPACE
// on:
//   - words with no accepted chars that terrible_word_crunch() flags, plus
//     the run of "potential" crunch words immediately before them;
//   - "potential" crunch words that directly follow a terrible word.
// A word with accepted chars, or one that is neither terrible nor potential,
// ends any such run.
void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
  PAGE_RES_IT run_start_it;          // first word of the pending potential run
  bool potential_run_open = false;   // run_start_it marks a pending run
  bool after_terrible_word = false;  // previous words ended with a terrible one

  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
    // Words in non-text polygon blocks are ignored altogether.
    POLY_BLOCK *pb = page_res_it.block()->block->pdblk.poly_block();
    if (pb != nullptr && !pb->IsText()) {
      continue;
    }
    WERD_RES *word = page_res_it.word();

    if (crunch_early_convert_bad_unlv_chs) {
      convert_bad_unlv_chs(word);
    }

    if (crunch_early_merge_tess_fails) {
      word->merge_tess_fails();
    }

    if (word->reject_map.accept_count() != 0) {
      after_terrible_word = false;
      // Forget earlier potential crunches
      potential_run_open = false;
      continue;
    }

    const bool ok_dict_word = safe_dict_word(word);
    const GARBAGE_LEVEL garbage_level = garbage_word(word, ok_dict_word);
    const bool may_crunch = garbage_level != G_NEVER_CRUNCH;

    if (may_crunch && terrible_word_crunch(word, garbage_level)) {
      if (crunch_debug > 0) {
        tprintf("T CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
      }
      word->unlv_crunch_mode = CR_KEEP_SPACE;
      if (potential_run_open) {
        // Crunch everything from the remembered start up to this word.
        for (; run_start_it.word() != word; run_start_it.forward()) {
          if (crunch_debug > 0) {
            tprintf("P1 CRUNCHING: \"%s\"\n",
                    run_start_it.word()->best_choice->unichar_string().c_str());
          }
          run_start_it.word()->unlv_crunch_mode = CR_KEEP_SPACE;
        }
        potential_run_open = false;
      }
      after_terrible_word = true;
    } else if (may_crunch && potential_word_crunch(word, garbage_level, ok_dict_word)) {
      if (after_terrible_word) {
        if (crunch_debug > 0) {
          tprintf("P2 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
        }
        word->unlv_crunch_mode = CR_KEEP_SPACE;
      } else if (!potential_run_open) {
        // Remember where this run of potential crunches starts.
        run_start_it = page_res_it;
        potential_run_open = true;
        if (crunch_debug > 1) {
          tprintf("P3 CRUNCHING: \"%s\"\n", word->best_choice->unichar_string().c_str());
        }
      }
    } else {
      after_terrible_word = false;
      // Forget earlier potential crunches
      potential_run_open = false;
      if (crunch_debug > 2) {
        tprintf("NO CRUNCH: \"%s\"\n", word->best_choice->unichar_string().c_str());
      }
    }
  }
}


// Decides whether a word is bad enough to be crunched outright.
// Returns true when any of these holds (the number is the mode printed in
// the debug output):
//   1 - the best choice string is empty or consists only of spaces;
//   2 - rating per char exceeds crunch_terrible_rating;
//   3 - crunch_terrible_garbage is set and garbage_level is G_TERRIBLE;
//   4 - certainty is below crunch_poor_garbage_cert and the word is not G_OK;
//   5 - rating per char exceeds crunch_poor_garbage_rate and the word is not
//       G_OK.
// The per-char rating divides by the reject map length capped at
// crunch_rating_max.
bool Tesseract::terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {
  int crunch_mode = 0;

  const bool blank_word = word->best_choice->unichar_string().empty() ||
                          (strspn(word->best_choice->unichar_string().c_str(), " ") ==
                           word->best_choice->unichar_string().size());
  if (blank_word) {
    crunch_mode = 1;
  } else {
    int adjusted_len = word->reject_map.length();
    if (adjusted_len > crunch_rating_max) {
      adjusted_len = crunch_rating_max;
    }
    const float rating_per_ch = word->best_choice->rating() / adjusted_len;

    if (rating_per_ch > crunch_terrible_rating) {
      crunch_mode = 2;
    } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
      crunch_mode = 3;
    } else if ((word->best_choice->certainty() < crunch_poor_garbage_cert) &&
               (garbage_level != G_OK)) {
      crunch_mode = 4;
    } else if ((rating_per_ch > crunch_poor_garbage_rate) && (garbage_level != G_OK)) {
      crunch_mode = 5;
    }
  }

  if (crunch_mode <= 0) {
    return false;
  }
  if (crunch_debug > 2) {
    tprintf("Terrible_word_crunch (%d) on \"%s\"\n", crunch_mode,
            word->best_choice->unichar_string().c_str());
  }
  return true;
}


// Counts up to three "poor word" indicators and returns true when at least
// crunch_pot_indicators of them fire:
//   - rating per char (reject map length capped at 10) exceeds
//     crunch_pot_poor_rate;
//   - the word is crunchable and its certainty is below crunch_pot_poor_cert;
//   - garbage_level is anything other than G_OK.
// A word is crunchable when crunch_leave_accept_strings is off, when it has
// fewer than 3 chars, or when it is neither an acceptable word string nor an
// ok dictionary word.
bool Tesseract::potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level,
                                      bool ok_dict_word) {
  const char *str = word->best_choice->unichar_string().c_str();
  const char *lengths = word->best_choice->unichar_lengths().c_str();

  const bool word_crunchable =
      !crunch_leave_accept_strings || word->reject_map.length() < 3 ||
      (acceptable_word_string(*word->uch_set, str, lengths) == AC_UNACCEPTABLE && !ok_dict_word);

  const int word_len = word->reject_map.length();
  const int adjusted_len = word_len > 10 ? 10 : word_len;
  const float rating_per_ch = word->best_choice->rating() / adjusted_len;

  int poor_indicator_count = 0;

  if (rating_per_ch > crunch_pot_poor_rate) {
    if (crunch_debug > 2) {
      tprintf("Potential poor rating on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    ++poor_indicator_count;
  }

  if (word_crunchable && word->best_choice->certainty() < crunch_pot_poor_cert) {
    if (crunch_debug > 2) {
      tprintf("Potential poor cert on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    ++poor_indicator_count;
  }

  if (garbage_level != G_OK) {
    if (crunch_debug > 2) {
      tprintf("Potential garbage on \"%s\"\n", word->best_choice->unichar_string().c_str());
    }
    ++poor_indicator_count;
  }

  return poor_indicator_count >= crunch_pot_indicators;
}


// Walks all words and, using word_deletable(), assigns delete modes to
// deletable words at the ends of rows:
//   - a deletable word flagged W_BOL, and deletable words following it
//     without interruption, get their delete mode;
//   - a deletable word flagged W_EOL gets its delete mode, as does the run
//     of deletable words remembered before it;
//   - other deletable words only mark the start of a possible trailing run.
// A non-deletable word cancels both kinds of run.
// merge_tess_fails() is applied here to each word unless
// crunch_early_merge_tess_fails is set.
void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
  PAGE_RES_IT run_start_it;          // first word of a possible end-of-line run
  bool deleting_from_bol = false;
  bool marked_delete_point = false;  // run_start_it is valid

  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
    WERD_RES *word = page_res_it.word();

    int16_t debug_delete_mode;
    const CRUNCH_MODE delete_mode = word_deletable(word, debug_delete_mode);

    if (delete_mode == CR_NONE) {
      deleting_from_bol = false;
      // Forget earlier potential crunches
      marked_delete_point = false;
    } else if (word->word->flag(W_BOL) || deleting_from_bol) {
      if (crunch_debug > 0) {
        tprintf("BOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
                word->best_choice->unichar_string().c_str());
      }
      word->unlv_crunch_mode = delete_mode;
      deleting_from_bol = true;
    } else if (word->word->flag(W_EOL)) {
      if (marked_delete_point) {
        // Apply the delete mode to every remembered word before this one.
        for (; run_start_it.word() != word; run_start_it.forward()) {
          int16_t run_debug_mode;
          const CRUNCH_MODE run_mode = word_deletable(run_start_it.word(), run_debug_mode);
          if (crunch_debug > 0) {
            tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", run_debug_mode,
                    run_start_it.word()->best_choice->unichar_string().c_str());
          }
          run_start_it.word()->unlv_crunch_mode = run_mode;
        }
      }
      if (crunch_debug > 0) {
        tprintf("EOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode,
                word->best_choice->unichar_string().c_str());
      }
      word->unlv_crunch_mode = delete_mode;
      deleting_from_bol = false;
      marked_delete_point = false;
    } else if (!marked_delete_point) {
      run_start_it = page_res_it;
      marked_delete_point = true;
    }

    // The following step has been left till now as the tess fails are used
    // to determine if the word is deletable.
    if (!crunch_early_merge_tess_fails) {
      word->merge_tess_fails();
    }
  }
}


// Rewrites two characters in the best choice of word_res: "~" becomes "-"
// and "^" becomes " ". Whenever a character is replaced and its reject map
// entry is still accepted, the entry is flagged with setrej_unlv_rej().
void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
  const UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
  const UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
  const UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
  const UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");

  // Replaces `from` by `to` at position pos and rejects the position.
  const auto substitute = [word_res](int pos, UNICHAR_ID from, UNICHAR_ID to) {
    if (word_res->best_choice->unichar_id(pos) != from) {
      return;
    }
    word_res->best_choice->set_unichar_id(to, pos);
    if (word_res->reject_map[pos].accepted()) {
      word_res->reject_map[pos].setrej_unlv_rej();
    }
  };

  for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
    // The two checks are sequential, not exclusive: the second one sees the
    // result of the first.
    substitute(pos, unichar_tilde, unichar_dash);
    substitute(pos, unichar_pow, unichar_space);
  }
}


// Grades how garbage-like the best choice of a word is.
//
// The string is scanned once, character by character, with a small state
// machine that tracks runs of upper case, lower case and digits. The scan
// collects: isolated digits/alphas (runs of length one), tess rejects (single
// space chars), other bad chars, the longest upper/lower runs and the longest
// repetition of one letter.
//
// Returns:
//   G_NEVER_CRUNCH - crunch_leave_ok_strings is set and the word is long
//                    enough, mostly alphabetic, without long repetitions, and
//                    is either acceptable (with crunch_accept_ok) or has a
//                    long enough lower/upper case run;
//   G_OK           - the word has no spaces and comes from a dictionary or
//                    number permuter, is an acceptable word string or an ok
//                    dictionary word; or the counts show little damage;
//   G_TERRIBLE     - tess rejects dominate the word;
//   G_DODGY        - too many dodgy chars for the word length.
//
// ok_dict_word is supplied by the caller and only feeds the G_OK test.
GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
  enum STATES {
    JUNK,
    FIRST_UPPER,
    FIRST_LOWER,
    FIRST_NUM,
    SUBSEQUENT_UPPER,
    SUBSEQUENT_LOWER,
    SUBSEQUENT_NUM
  };
  // str/lengths always point at the START of the word. The scan below uses
  // its own cursors; previously it advanced str and lengths themselves, so
  // the whole-word tests after the loop (acceptable_word_string, strpbrk)
  // were handed the empty tail of the string.
  const char *str = word->best_choice->unichar_string().c_str();
  const char *lengths = word->best_choice->unichar_lengths().c_str();
  STATES state = JUNK;
  int len = 0;
  int isolated_digits = 0;
  int isolated_alphas = 0;
  int bad_char_count = 0;
  int tess_rejs = 0;
  int dodgy_chars = 0;
  int ok_chars;
  UNICHAR_ID last_char = -1;
  int alpha_repetition_count = 0;
  int longest_alpha_repetition_count = 0;
  int longest_lower_run_len = 0;
  int lower_string_count = 0;
  int longest_upper_run_len = 0;
  int upper_string_count = 0;
  int total_alpha_count = 0;
  int total_digit_count = 0;

  const char *ch = str;         // current character (may span several bytes)
  const char *ch_len = lengths; // byte length of the current character
  for (; *ch != '\0'; ch += *(ch_len++)) {
    len++;
    if (word->uch_set->get_isupper(ch, *ch_len)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_UPPER:
        case FIRST_UPPER:
          state = SUBSEQUENT_UPPER;
          upper_string_count++;
          if (longest_upper_run_len < upper_string_count) {
            longest_upper_run_len = upper_string_count;
          }
          if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          } else {
            last_char = word->uch_set->unichar_to_id(ch, *ch_len);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          // A single digit followed by a letter was an isolated digit.
          isolated_digits++;
          // Fall through.
        default:
          state = FIRST_UPPER;
          last_char = word->uch_set->unichar_to_id(ch, *ch_len);
          alpha_repetition_count = 1;
          upper_string_count = 1;
          break;
      }
    } else if (word->uch_set->get_islower(ch, *ch_len)) {
      total_alpha_count++;
      switch (state) {
        case SUBSEQUENT_LOWER:
        case FIRST_LOWER:
          state = SUBSEQUENT_LOWER;
          lower_string_count++;
          if (longest_lower_run_len < lower_string_count) {
            longest_lower_run_len = lower_string_count;
          }
          if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
            alpha_repetition_count++;
            if (longest_alpha_repetition_count < alpha_repetition_count) {
              longest_alpha_repetition_count = alpha_repetition_count;
            }
          } else {
            last_char = word->uch_set->unichar_to_id(ch, *ch_len);
            alpha_repetition_count = 1;
          }
          break;
        case FIRST_NUM:
          isolated_digits++;
          // Fall through.
        default:
          state = FIRST_LOWER;
          last_char = word->uch_set->unichar_to_id(ch, *ch_len);
          alpha_repetition_count = 1;
          lower_string_count = 1;
          break;
      }
    } else if (word->uch_set->get_isdigit(ch, *ch_len)) {
      total_digit_count++;
      switch (state) {
        case FIRST_NUM:
          state = SUBSEQUENT_NUM;
          // Fall through.
        case SUBSEQUENT_NUM:
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          // A single letter followed by a digit was an isolated alpha.
          isolated_alphas++;
          // Fall through.
        default:
          state = FIRST_NUM;
          break;
      }
    } else {
      // Neither letter nor digit: a lone space is a tess reject, anything
      // else is a bad char.
      if (*ch_len == 1 && *ch == ' ') {
        tess_rejs++;
      } else {
        bad_char_count++;
      }
      switch (state) {
        case FIRST_NUM:
          isolated_digits++;
          break;
        case FIRST_UPPER:
        case FIRST_LOWER:
          isolated_alphas++;
          // Fall through.
        default:
          break;
      }
      state = JUNK;
    }
  }

  // Account for a length-one run that ends the word.
  switch (state) {
    case FIRST_NUM:
      isolated_digits++;
      break;
    case FIRST_UPPER:
    case FIRST_LOWER:
      isolated_alphas++;
      // Fall through.
    default:
      break;
  }

  if (crunch_include_numerals) {
    total_alpha_count += total_digit_count - isolated_digits;
  }

  if (crunch_leave_ok_strings && len >= 4 && 2 * (total_alpha_count - isolated_alphas) > len &&
      longest_alpha_repetition_count < crunch_long_repetitions) {
    if ((crunch_accept_ok &&
         acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE) ||
        longest_lower_run_len > crunch_leave_lc_strings ||
        longest_upper_run_len > crunch_leave_uc_strings) {
      return G_NEVER_CRUNCH;
    }
  }
  if (word->reject_map.length() > 1 && strpbrk(str, " ") == nullptr &&
      (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
       word->best_choice->permuter() == FREQ_DAWG_PERM ||
       word->best_choice->permuter() == USER_DAWG_PERM ||
       word->best_choice->permuter() == NUMBER_PERM ||
       acceptable_word_string(*word->uch_set, str, lengths) != AC_UNACCEPTABLE || ok_dict_word)) {
    return G_OK;
  }

  ok_chars = len - bad_char_count - isolated_digits - isolated_alphas - tess_rejs;

  if (crunch_debug > 3) {
    tprintf("garbage_word: \"%s\"\n", word->best_choice->unichar_string().c_str());
    tprintf("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n", len, bad_char_count,
            isolated_digits, isolated_alphas, tess_rejs);
  }
  if (bad_char_count == 0 && tess_rejs == 0 &&
      (len > isolated_digits + isolated_alphas || len <= 2)) {
    return G_OK;
  }

  if (tess_rejs > ok_chars || (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len)) {
    return G_TERRIBLE;
  }

  if (len > 4) {
    // Longer words: isolated chars count as dodgy too.
    dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits + isolated_alphas;
    if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5) {
      return G_DODGY;
    } else {
      return G_OK;
    }
  } else {
    dodgy_chars = 2 * tess_rejs + bad_char_count;
    if ((len == 4 && dodgy_chars > 2) || (len == 3 && dodgy_chars > 2) || dodgy_chars >= len) {
      return G_DODGY;
    } else {
      return G_OK;
    }
  }
}


/*************************************************************************
 * word_deletable()
 *     DELETE WERDS AT ENDS OF ROWS IF
 *        Word is crunched &&
 *        ( string length = 0                                          OR
 *          > 50% of chars are "|" (before merging)                    OR
 *          certainty < -10                                            OR
 *          rating /char > 60                                          OR
 *          TOP of word is more than 0.5 xht BELOW baseline            OR
 *          BOTTOM of word is more than 0.5 xht ABOVE xht              OR
 *          length of word < 3xht                                      OR
 *          height of word < 0.7 xht                                   OR
 *          height of word > 3.0 xht                                   OR
 *          >75% of the outline BBs have longest dimension < 0.5xht
 *
 * The actual limits come from the crunch_del_* parameters. The tests run in
 * the order written below and the first one that fires decides the result.
 * delete_mode receives the number of that test (0 when none fired).
 * Returns CR_DELETE for modes 1, 4 and 5, CR_LOOSE_SPACE for the others and
 * CR_NONE when the word is not deletable.
 *************************************************************************/

CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
  // Records the mode for the caller and hands back the verdict.
  const auto verdict = [&delete_mode](int16_t mode, CRUNCH_MODE result) {
    delete_mode = mode;
    return result;
  };

  const int word_len = word->reject_map.length();
  TBOX box; // BB of word

  if (word->unlv_crunch_mode == CR_NONE) {
    return verdict(0, CR_NONE);
  }
  if (word_len == 0) {
    return verdict(1, CR_DELETE);
  }

  if (word->rebuild_word != nullptr) {
    // Cube leaves rebuild_word nullptr.
    box = word->rebuild_word->bounding_box();
    if (box.height() < crunch_del_min_ht * kBlnXHeight) {
      return verdict(4, CR_DELETE);
    }
    if (noise_outlines(word->rebuild_word)) {
      return verdict(5, CR_DELETE);
    }
  }

  if ((failure_count(word) * 1.5) > word_len) {
    return verdict(2, CR_LOOSE_SPACE);
  }
  if (word->best_choice->certainty() < crunch_del_cert) {
    return verdict(7, CR_LOOSE_SPACE);
  }

  const float rating_per_ch = word->best_choice->rating() / word_len;
  if (rating_per_ch > crunch_del_rating) {
    return verdict(8, CR_LOOSE_SPACE);
  }

  // Geometry tests. When rebuild_word is null these see the default
  // constructed box.
  if (box.top() < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
    return verdict(9, CR_LOOSE_SPACE);
  }
  if (box.bottom() > kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
    return verdict(10, CR_LOOSE_SPACE);
  }
  if (box.height() > crunch_del_max_ht * kBlnXHeight) {
    return verdict(11, CR_LOOSE_SPACE);
  }
  if (box.width() < crunch_del_min_width * kBlnXHeight) {
    return verdict(3, CR_LOOSE_SPACE);
  }

  return verdict(0, CR_NONE);
}


// Returns the number of space bytes (' ') in the best choice string of the
// word.
int16_t Tesseract::failure_count(WERD_RES *word) {
  int space_count = 0;
  for (const char *p = word->best_choice->unichar_string().c_str(); *p != '\0'; ++p) {
    space_count += (*p == ' ') ? 1 : 0;
  }
  return space_count;
}


// Tests whether a word consists only of small outlines.
//
// Every outline of every blob in the word is visited; an outline counts as
// small when the longer side of its bounding box is below
// kBlnXHeight * crunch_small_outlines_size. Returns true when the number of
// small outlines is at least the total number of outlines, which is also the
// case for a word with no outlines at all.
bool Tesseract::noise_outlines(TWERD *word) {
  const float size_cutoff = kBlnXHeight * crunch_small_outlines_size;
  int16_t total_outlines = 0;
  int16_t tiny_outlines = 0;

  const unsigned blob_total = word->NumBlobs();
  for (unsigned i = 0; i != blob_total; ++i) {
    // Walk this blob's linked list of outlines.
    TESSLINE *outline = word->blobs[i]->outlines;
    while (outline != nullptr) {
      const TBOX bounds = outline->bounding_box(); // BB of outline
      const int16_t longest_side =
          bounds.height() > bounds.width() ? bounds.height() : bounds.width();

      ++total_outlines;
      if (longest_side < size_cutoff) {
        ++tiny_outlines;
      }
      outline = outline->next;
    }
  }

  return tiny_outlines >= total_outlines;
}


} // namespace tesseract

reject.h

docqual.h

AC_UNACCEPTABLE
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29

tessvars.h

tesseractclass.h

i
int i
Definition: gmock-matchers_test.cc:718

tesseract
Definition: baseapi.h:39

tesseract::W_BOL
@ W_BOL
start of line
Definition: werd.h:34

tesseract::W_EOL
@ W_EOL
end of line
Definition: werd.h:35

tesseract::CRUNCH_MODE
CRUNCH_MODE
Definition: pageres.h:160

tesseract::CR_NONE
@ CR_NONE
Definition: pageres.h:160

tesseract::CR_KEEP_SPACE
@ CR_KEEP_SPACE
Definition: pageres.h:160

tesseract::CR_LOOSE_SPACE
@ CR_LOOSE_SPACE
Definition: pageres.h:160

tesseract::CR_DELETE
@ CR_DELETE
Definition: pageres.h:160

tesseract::tprintf
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

tesseract::kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:33

tesseract::UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:34

tesseract::GARBAGE_LEVEL
GARBAGE_LEVEL
Definition: docqual.h:30

tesseract::G_TERRIBLE
@ G_TERRIBLE
Definition: docqual.h:30

tesseract::G_NEVER_CRUNCH
@ G_NEVER_CRUNCH
Definition: docqual.h:30

tesseract::G_OK
@ G_OK
Definition: docqual.h:30

tesseract::G_DODGY
@ G_DODGY
Definition: docqual.h:30

tesseract::SYSTEM_DAWG_PERM
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:244

tesseract::NUMBER_PERM
@ NUMBER_PERM
Definition: ratngs.h:242

tesseract::USER_DAWG_PERM
@ USER_DAWG_PERM
Definition: ratngs.h:246

tesseract::FREQ_DAWG_PERM
@ FREQ_DAWG_PERM
Definition: ratngs.h:247

tesseract::reject_whole_page
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:363

tesseract::kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:34

tesseract::Tesseract::tilde_delete
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:530

tesseract::Tesseract::garbage_word
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
Definition: docqual.cpp:616

tesseract::Tesseract::word_char_quality
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:81

tesseract::Tesseract::word_blob_quality
int16_t word_blob_quality(WERD_RES *word)
Definition: docqual.cpp:51

tesseract::Tesseract::tilde_crunch
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:373

tesseract::Tesseract::unrej_good_chs
void unrej_good_chs(WERD_RES *word)
Definition: docqual.cpp:98

tesseract::Tesseract::doc_and_block_rejection
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:210

tesseract::Tesseract::word_outline_errs
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:62

tesseract::Tesseract::acceptable_word_string
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1692

tesseract::Tesseract::noise_outlines
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:907

tesseract::Tesseract::quality_based_rejection
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:120

tesseract::Tesseract::safe_dict_word
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:593

tesseract::Tesseract::failure_count
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:895

tesseract::Tesseract::convert_bad_unlv_chs
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:594

tesseract::Tesseract::check_debug_pt
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1799

tesseract::Tesseract::potential_word_crunch
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:488

tesseract::Tesseract::terrible_word_crunch
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:450

tesseract::Tesseract::word_deletable
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:825

tesseract::Tesseract::count_outline_errs
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:107

tesseract::Tesseract::unrej_good_quality_words
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:142

tesseract::TESSLINE
Definition: blobs.h:211

tesseract::TESSLINE::next
TESSLINE * next
Definition: blobs.h:288

tesseract::TBLOB
Definition: blobs.h:291

tesseract::TBLOB::outlines
TESSLINE * outlines
Definition: blobs.h:404

tesseract::TBLOB::NumOutlines
int NumOutlines() const
Definition: blobs.cpp:452

tesseract::TWERD
Definition: blobs.h:421

tesseract::TWERD::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:863

tesseract::TWERD::blobs
std::vector< TBLOB * > blobs
Definition: blobs.h:462

tesseract::TWERD::NumBlobs
unsigned NumBlobs() const
Definition: blobs.h:449

tesseract::BoxWord::ProcessMatchedBlobs
void ProcessMatchedBlobs(const TWERD &other, const std::function< void(int)> &cb) const
Definition: boxword.cpp:201

tesseract::BLOCK::pdblk
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185

tesseract::PAGE_RES::rej_count
int32_t rej_count
Definition: pageres.h:80

tesseract::PAGE_RES::char_count
int32_t char_count
Definition: pageres.h:79

tesseract::PAGE_RES::rejected
bool rejected
Definition: pageres.h:82

tesseract::BLOCK_RES
Definition: pageres.h:118

tesseract::BLOCK_RES::block
BLOCK * block
Definition: pageres.h:120

tesseract::BLOCK_RES::rej_count
int32_t rej_count
Definition: pageres.h:122

tesseract::BLOCK_RES::char_count
int32_t char_count
Definition: pageres.h:121

tesseract::ROW_RES
Definition: pageres.h:142

tesseract::ROW_RES::whole_word_rej_count
int32_t whole_word_rej_count
Definition: pageres.h:147

tesseract::ROW_RES::rej_count
int32_t rej_count
Definition: pageres.h:146

tesseract::ROW_RES::char_count
int32_t char_count
Definition: pageres.h:145

tesseract::WERD_RES
Definition: pageres.h:164

tesseract::WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:239

tesseract::WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:292

tesseract::WERD_RES::unlv_crunch_mode
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:313

tesseract::WERD_RES::bln_boxes
tesseract::BoxWord * bln_boxes
Definition: pageres.h:193

tesseract::WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:201

tesseract::WERD_RES::word
WERD * word
Definition: pageres.h:184

tesseract::WERD_RES::merge_tess_fails
void merge_tess_fails()
Definition: pageres.cpp:1099

tesseract::WERD_RES::reject_spaces
bool reject_spaces
Definition: pageres.h:339

tesseract::WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:264

tesseract::PAGE_RES_IT
Definition: pageres.h:682

tesseract::PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:769

tesseract::PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:684

tesseract::PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:743

tesseract::PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:763

tesseract::PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:710

tesseract::PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1722

tesseract::PAGE_RES_IT::prev_row
ROW_RES * prev_row() const
Definition: pageres.h:757

tesseract::PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:766

tesseract::PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59

tesseract::PDBLK::index
int index() const
Definition: pdblock.h:77

tesseract::POLY_BLOCK
Definition: polyblk.h:30

tesseract::POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:52

tesseract::WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:315

tesseract::WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
Definition: ratngs.h:344

tesseract::WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:299

tesseract::WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:331

tesseract::WERD_CHOICE::unichar_lengths
const std::string & unichar_lengths() const
Definition: ratngs.h:533

tesseract::WERD_CHOICE::unichar_string
std::string & unichar_string()
Definition: ratngs.h:519

tesseract::WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:312

tesseract::TBOX
Definition: rect.h:37

tesseract::TBOX::height
TDimension height() const
Definition: rect.h:118

tesseract::TBOX::width
TDimension width() const
Definition: rect.h:126

tesseract::TBOX::top
TDimension top() const
Definition: rect.h:68

tesseract::TBOX::bottom
TDimension bottom() const
Definition: rect.h:75

tesseract::REJMAP::reject_count
int16_t reject_count() const
Definition: rejctmap.h:339

tesseract::REJMAP::rej_word_row_rej
void rej_word_row_rej()
Definition: rejctmap.cpp:211

tesseract::REJMAP::accept_count
int16_t accept_count() const
Definition: rejctmap.cpp:72

tesseract::REJMAP::length
uint16_t length() const
Definition: rejctmap.h:333

tesseract::REJMAP::rej_word_block_rej
void rej_word_block_rej()
Definition: rejctmap.cpp:203

tesseract::REJMAP::quality_recoverable_rejects
bool quality_recoverable_rejects() const
Definition: rejctmap.cpp:91

tesseract::REJMAP::rej_word_doc_rej
void rej_word_doc_rej()
Definition: rejctmap.cpp:195

tesseract::WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128

tesseract::WERD::space
uint8_t space() const
Definition: werd.h:100

tesseract::UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:506

tesseract::UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515

tesseract::UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524

tesseract::UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186