tesseract-ocr.github.io/5.3.3/a00083_source.html

/******************************************************************

 * File:        fixspace.cpp  (Formerly fixspace.c)

 * Description: Implements a pass over the page res, exploring the alternative

 *              spacing possibilities, trying to use context to improve the

 *              word spacing

 * Author:      Phil Cheatle

 *

 * (C) Copyright 1993, Hewlett-Packard Ltd.

 ** Licensed under the Apache License, Version 2.0 (the "License");

 ** you may not use this file except in compliance with the License.

 ** You may obtain a copy of the License at

 ** http://www.apache.org/licenses/LICENSE-2.0

 ** Unless required by applicable law or agreed to in writing, software

 ** distributed under the License is distributed on an "AS IS" BASIS,

 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 ** See the License for the specific language governing permissions and

 ** limitations under the License.

 *

 **********************************************************************/


#include "fixspace.h"


#include "blobs.h"          // for TWERD, TBLOB, TESSLINE

#include "boxword.h"        // for BoxWord

#include "errcode.h"        // for ASSERT_HOST

#include "normalis.h"       // for kBlnXHeight, kBlnBaselineOffset

#include "pageres.h"        // for WERD_RES_IT, WERD_RES, WERD_RES_LIST

#include "params.h"         // for IntParam, StringParam, BoolParam, DoubleParam, ...

#include "ratngs.h"         // for WERD_CHOICE, FREQ_DAWG_PERM, NUMBER_PERM

#include "rect.h"           // for TBOX

#include "stepblob.h"       // for C_BLOB_IT, C_BLOB_LIST, C_BLOB

#include "tesseractclass.h" // for Tesseract, TesseractStats, WordData

#include "tessvars.h"       // for debug_fp

#include "tprintf.h"        // for tprintf

#include "unicharset.h"     // for UNICHARSET

#include "werd.h"           // for WERD, W_EOL, W_FUZZY_NON, W_FUZZY_SP


#include <tesseract/ocrclass.h> // for ETEXT_DESC

#include <tesseract/unichar.h>  // for UNICHAR_ID


#include <cstdint> // for INT16_MAX, int16_t, int32_t


namespace tesseract {


class BLOCK;

class ROW;


#define PERFECT_WERDS 999


/**********************************************************************

 *  c_blob_comparator()

 *

 *  Blob comparator used to sort a blob list so that blobs are in increasing

 *  order of left edge.

 **********************************************************************/


static int c_blob_comparator( // sort blobs

    const void *blob1p,       // ptr to ptr to blob1

    const void *blob2p        // ptr to ptr to blob2

) {

  const C_BLOB *blob1 = *reinterpret_cast<const C_BLOB *const *>(blob1p);

  const C_BLOB *blob2 = *reinterpret_cast<const C_BLOB *const *>(blob2p);


  return blob1->bounding_box().left() - blob2->bounding_box().left();

}


/**
 * fix_fuzzy_spaces()
 *
 * Walks every word of every row of every block in page_res.
 *  - Words that are not followed by a word flagged W_FUZZY_NON / W_FUZZY_SP
 *    are handed one at a time to fix_sp_fp_word().
 *  - A run of words linked by such fuzzy flags is cut out of the row list,
 *    passed to fix_fuzzy_space_list() and spliced back in, unless any word
 *    of the run has an empty cblob list, in which case the run is skipped.
 *
 * @param monitor     optional progress/cancel descriptor; may be nullptr.
 *                    When set, ocr_alive and progress are updated per word
 *                    and the function returns early if the deadline is
 *                    exceeded or the cancel callback returns true.
 * @param word_count  divisor used for the progress computation.
 *                    NOTE(review): a value of 0 would divide by zero once a
 *                    monitor is supplied and a word is processed - confirm
 *                    callers never pass 0.
 * @param page_res    page results whose word lists are edited in place.
 */
void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) {

  BLOCK_RES_IT block_res_it;

  ROW_RES_IT row_res_it;

  WERD_RES_IT word_res_it_from;

  WERD_RES_IT word_res_it_to;

  WERD_RES *word_res;

  WERD_RES_LIST fuzzy_space_words;

  int16_t new_length;

  bool prevent_null_wd_fixsp; // DON'T process blobless wds

  int32_t word_index;         // current word


  block_res_it.set_to_list(&page_res->block_res_list);

  word_index = 0;

  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list(); block_res_it.forward()) {

    row_res_it.set_to_list(&block_res_it.data()->row_res_list);

    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list(); row_res_it.forward()) {

      word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);

      while (!word_res_it_from.at_last()) {

        word_res = word_res_it_from.data();

        // Step over words that are neither combinations nor followed by a
        // fuzzy-flagged word, treating each one on its own.
        while (!word_res_it_from.at_last() &&

               !(word_res->combination ||

                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||

                 word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {

          fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);

          word_res = word_res_it_from.forward();

          word_index++;

          if (monitor != nullptr) {

            monitor->ocr_alive = true;

            // Integer arithmetic: progress moves from 90 towards 95.
            monitor->progress = 90 + 5 * word_index / word_count;

            if (monitor->deadline_exceeded() ||

                (monitor->cancel != nullptr &&

                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {

              return;

            }

          }

        }


        // Not at the end of the row, so word_res_it_from sits on the first
        // word of a run; word_res_it_to is advanced to the last word of it.
        if (!word_res_it_from.at_last()) {

          word_res_it_to = word_res_it_from;

          prevent_null_wd_fixsp = word_res->word->cblob_list()->empty();

          if (check_debug_pt(word_res, 60)) {

            debug_fix_space_level.set_value(10);

          }

          word_res_it_to.forward();

          word_index++;

          if (monitor != nullptr) {

            monitor->ocr_alive = true;

            monitor->progress = 90 + 5 * word_index / word_count;

            if (monitor->deadline_exceeded() ||

                (monitor->cancel != nullptr &&

                 (*monitor->cancel)(monitor->cancel_this, stats_.dict_words))) {

              return;

            }

          }

          // Extend the run while the following word is still fuzzy-flagged.
          while (!word_res_it_to.at_last() &&

                 (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||

                  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {

            if (check_debug_pt(word_res, 60)) {

              debug_fix_space_level.set_value(10);

            }

            if (word_res->word->cblob_list()->empty()) {

              prevent_null_wd_fixsp = true;

            }

            word_res = word_res_it_to.forward();

          }

          if (check_debug_pt(word_res, 60)) {

            debug_fix_space_level.set_value(10);

          }

          if (word_res->word->cblob_list()->empty()) {

            prevent_null_wd_fixsp = true;

          }

          if (prevent_null_wd_fixsp) {

            // A blobless word is in the run: skip the whole run untouched.
            word_res_it_from = word_res_it_to;

          } else {

            // Cut the run out of the row, search for better spacing, then
            // splice the (possibly different length) result back in.
            fuzzy_space_words.assign_to_sublist(&word_res_it_from, &word_res_it_to);

            fix_fuzzy_space_list(fuzzy_space_words, row_res_it.data()->row,

                                 block_res_it.data()->block);

            new_length = fuzzy_space_words.length();

            word_res_it_from.add_list_before(&fuzzy_space_words);

            // Move forward once per re-inserted word, stopping at the last
            // word of the row.
            for (; !word_res_it_from.at_last() && new_length > 0; new_length--) {

              word_res_it_from.forward();

            }

          }

          // Undo any debug level raised by check_debug_pt() above.
          if (test_pt) {

            debug_fix_space_level.set_value(0);

          }

        }

        fix_sp_fp_word(word_res_it_from, row_res_it.data()->row, block_res_it.data()->block);

        // Last word in row

      }

    }

  }

}


/**
 * fix_fuzzy_space_list()
 *
 * Searches for a better spacing of the words in best_perm. A working copy
 * of the words is repeatedly re-recognised, scored with
 * eval_word_spacing() and transformed with transform_to_next_perm(). Each
 * time the working copy scores higher than the best so far, best_perm is
 * replaced by a deep copy of it. The search stops when the best score is
 * PERFECT_WERDS or the working copy has been emptied.
 *
 * @param best_perm  in: words to improve; out: best scoring permutation
 * @param row        row forwarded to match_current_words()
 * @param block      block forwarded to match_current_words()
 */
void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
  bool found_better = false;

  // Score of the words as they were handed in.
  int16_t top_score = eval_word_spacing(best_perm);
  dump_words(best_perm, top_score, 1, found_better);

  WERD_RES_LIST candidate;
  if (top_score != PERFECT_WERDS) {
    initialise_search(best_perm, candidate);
  }

  while (top_score != PERFECT_WERDS && !candidate.empty()) {
    match_current_words(candidate, row, block);
    const int16_t candidate_score = eval_word_spacing(candidate);
    dump_words(candidate, candidate_score, 2, found_better);

    if (candidate_score > top_score) {
      // Keep a private copy: the candidate list keeps being transformed.
      best_perm.clear();
      best_perm.deep_copy(&candidate, &WERD_RES::deep_copy);
      top_score = candidate_score;
      found_better = true;
    }

    if (candidate_score < PERFECT_WERDS) {
      transform_to_next_perm(candidate);
    }
  }

  dump_words(best_perm, top_score, 3, found_better);
}


/**
 * initialise_search()
 *
 * Appends to new_list a deep copy of every word in src_list that is not a
 * combination. The copies have both their combination and part_of_combo
 * flags cleared.
 *
 * @param src_list  words to copy from
 * @param new_list  list receiving the copies, in source order
 */
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
  WERD_RES_IT reader(&src_list);
  WERD_RES_IT writer(&new_list);

  for (reader.mark_cycle_pt(); !reader.cycled_list(); reader.forward()) {
    WERD_RES *original = reader.data();
    if (original->combination) {
      continue; // combinations are not carried into the search list
    }
    WERD_RES *clone = WERD_RES::deep_copy(original);
    clone->combination = false;
    clone->part_of_combo = false;
    writer.add_after_then_move(clone);
  }
}


/**
 * match_current_words()
 *
 * Runs pass 2 classification on every word of the list that is not part of
 * a combination and has no box_word yet. prev_word_best_choice_ is
 * maintained by hand because the words are not visited through a PAGE_RES
 * iterator: it is reset first and then set to each visited word's
 * best_choice.
 *
 * @param words  words to classify
 * @param row    row used to build the WordData for each word
 * @param block  block used to build the WordData for each word
 */
void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block) {
  prev_word_best_choice_ = nullptr;

  WERD_RES_IT it(&words);
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    WERD_RES *current = it.data();
    const bool needs_classifying = !current->part_of_combo && current->box_word == nullptr;
    if (needs_classifying) {
      WordData word_data(block, row, current);
      SetupWordPassN(2, &word_data);
      classify_word_and_language(2, nullptr, &word_data);
    }
    prev_word_best_choice_ = current->best_choice;
  }
}


/**
 * eval_word_spacing()
 *
 * Scores a list of words for the fuzzy-space search. Words flagged
 * part_of_combo are skipped. For every remaining word:
 *  - a word that fixspace_thinks_word_done() accepts contributes its
 *    reject_map length, but the contribution is held back in
 *    prev_word_score and only added once the NEXT word shows no 1/digit
 *    conflict across the gap (or the list ends);
 *  - one point is added for each '1' joined to a neighbour inside a word;
 *  - if tessedit_prefer_joined_punct is set, one point is added for each
 *    punctuation character joined to a neighbour inside a word.
 *
 * @param word_res_list  words to score; data() is read before any
 *                       emptiness check, so the list is presumably expected
 *                       to be non-empty - TODO confirm against callers.
 * @return PERFECT_WERDS when every counted word ended up counted as done,
 *         otherwise the accumulated score.
 */
int16_t Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {

  WERD_RES_IT word_res_it(&word_res_list);

  int16_t total_score = 0;

  int16_t word_count = 0;

  int16_t done_word_count = 0;

  int i;

  int16_t offset;

  int16_t prev_word_score = 0;

  bool prev_word_done = false;

  bool prev_char_1 = false;     // prev ch a "1/I/l"?

  bool prev_char_digit = false; // prev ch 2..9 or 0

  const char *punct_chars = "!\"`',.:;";

  bool prev_char_punct = false;


  do {

    // current word

    WERD_RES *word = word_res_it.data();

    bool word_done = fixspace_thinks_word_done(word);

    word_count++;

    if (word->tess_failed) {

      // A failed word cannot conflict with its predecessor: bank the
      // previous word's pending score and reset all carried state.
      total_score += prev_word_score;

      if (prev_word_done) {

        done_word_count++;

      }

      prev_word_score = 0;

      prev_char_1 = false;

      prev_char_digit = false;

      prev_word_done = false;

    } else {

      /*

  Can we add the prev word score and potentially count this word?

  Yes IF it didn't end in a 1 when the first char of this word is a digit

    AND it didn't end in a digit when the first char of this word is a 1

*/

      auto word_len = word->reject_map.length();

      bool current_word_ok_so_far = false;

      if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||

            (prev_char_digit &&

             ((word_done && word->best_choice->unichar_lengths().c_str()[0] == 1 &&

               word->best_choice->unichar_string()[0] == '1') ||

              (!word_done &&

               conflict_set_I_l_1.contains(word->best_choice->unichar_string()[0])))))) {

        total_score += prev_word_score;

        if (prev_word_done) {

          done_word_count++;

        }

        current_word_ok_so_far = word_done;

      }


      // Hold this word's score back until the next word has been examined.
      if (current_word_ok_so_far) {

        prev_word_done = true;

        prev_word_score = word_len;

      } else {

        prev_word_done = false;

        prev_word_score = 0;

      }


      /* Add 1 to total score for every joined 1 regardless of context and

   rejtn */

      // Note: this loop indexes unichar_string() directly with i, without
      // going through unichar_lengths() as the punctuation loop below does.
      for (i = 0, prev_char_1 = false; i < word_len; i++) {

        bool current_char_1 = word->best_choice->unichar_string()[i] == '1';

        if (prev_char_1 || (current_char_1 && (i > 0))) {

          total_score++;

        }

        prev_char_1 = current_char_1;

      }


      /* Add 1 to total score for every joined punctuation regardless of context

  and rejtn */

      if (tessedit_prefer_joined_punct) {

        // offset tracks the byte position of unichar i in unichar_string().
        for (i = 0, offset = 0, prev_char_punct = false; i < word_len;

             offset += word->best_choice->unichar_lengths()[i++]) {

          bool current_char_punct =

              strchr(punct_chars, word->best_choice->unichar_string()[offset]) != nullptr;

          if (prev_char_punct || (current_char_punct && i > 0)) {

            total_score++;

          }

          prev_char_punct = current_char_punct;

        }

      }

      // Remember how this word ends, for the conflict test on the next word.
      prev_char_digit = digit_or_numeric_punct(word, word_len - 1);

      // Empty-bodied loop: leaves offset at the byte position of the last
      // unichar of the word.
      for (i = 0, offset = 0; i < word_len - 1;

           offset += word->best_choice->unichar_lengths()[i++]) {

        ;

      }

      prev_char_1 =

          ((word_done && (word->best_choice->unichar_string()[offset] == '1')) ||

           (!word_done &&

            conflict_set_I_l_1.contains(word->best_choice->unichar_string()[offset])));

    }

    /* Find next word */

    do {

      word_res_it.forward();

    } while (word_res_it.data()->part_of_combo);

  } while (!word_res_it.at_first());

  // Bank the score that was still pending for the final word.
  total_score += prev_word_score;

  if (prev_word_done) {

    done_word_count++;

  }

  if (done_word_count == word_count) {

    return PERFECT_WERDS;

  } else {

    return total_score;

  }

}


/**
 * digit_or_numeric_punct()
 *
 * Tests the unichar at char_position in the word's best choice.
 *
 * @param word           word whose best_choice is examined
 * @param char_position  index of the unichar to test; the byte offset into
 *                       unichar_string() is found by summing the preceding
 *                       unichar_lengths() entries
 * @return true if the unichar set reports the unichar as a digit, or if
 *         the best choice's permuter is NUMBER_PERM and the byte at that
 *         offset is contained in numeric_punctuation
 */
bool Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
  auto *choice = word->best_choice;

  // Walk the per-unichar byte lengths up to the requested position.
  int unichar_index = 0;
  int byte_offset = 0;
  while (unichar_index < char_position) {
    byte_offset += choice->unichar_lengths()[unichar_index];
    ++unichar_index;
  }

  if (word->uch_set->get_isdigit(choice->unichar_string().c_str() + byte_offset,
                                 choice->unichar_lengths()[unichar_index])) {
    return true;
  }
  return choice->permuter() == NUMBER_PERM &&
         numeric_punctuation.contains(choice->unichar_string().c_str()[byte_offset]);
}


/**
 * transform_to_next_perm()
 *
 * Produces the next candidate spacing by closing the narrowest gap(s).
 * A first pass over the words that are not part_of_combo finds the
 * smallest horizontal gap between consecutive word bounding boxes. A
 * second pass joins every neighbouring pair whose gap is <= that minimum,
 * building or extending combination words that are inserted in front of
 * the words they replace. Joined source words stay in the list flagged
 * part_of_combo.
 *
 * If no gap exists (fewer than two such words), the list is cleared, which
 * the callers use as the signal that the search is finished.
 *
 * @param words  list to transform in place
 */
void transform_to_next_perm(WERD_RES_LIST &words) {

  WERD_RES_IT word_it(&words);

  WERD_RES_IT prev_word_it(&words);

  WERD_RES *word;

  WERD_RES *prev_word;

  WERD_RES *combo;

  WERD *copy_word;

  // -INT16_MAX is the "no previous word seen yet" sentinel.
  int16_t prev_right = -INT16_MAX;

  TBOX box;

  int16_t gap;

  // INT16_MAX is the "no gap found" sentinel.
  int16_t min_gap = INT16_MAX;


  // Pass 1: find the smallest gap between consecutive visible words.
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {

    word = word_it.data();

    if (!word->part_of_combo) {

      box = word->word->bounding_box();

      if (prev_right > -INT16_MAX) {

        gap = box.left() - prev_right;

        if (gap < min_gap) {

          min_gap = gap;

        }

      }

      prev_right = box.right();

    }

  }

  if (min_gap < INT16_MAX) {

    prev_right = -INT16_MAX; // back to start

    word_it.set_to_list(&words);

    // Note: we can't use cycle_pt due to inserted combos at start of list.

    // Pass 2: prev_word_it trails on the word (or combo) to the left of the
    // gap being examined; word_it is on the word to its right.
    for (; (prev_right == -INT16_MAX) || !word_it.at_first(); word_it.forward()) {

      word = word_it.data();

      if (!word->part_of_combo) {

        box = word->word->bounding_box();

        if (prev_right > -INT16_MAX) {

          gap = box.left() - prev_right;

          if (gap <= min_gap) {

            prev_word = prev_word_it.data();

            if (prev_word->combination) {

              // Left neighbour is already a combo: keep extending it.
              combo = prev_word;

            } else {

              /* Make a new combination and insert before

               * the first word being joined. */

              copy_word = new WERD;

              *copy_word = *(prev_word->word);

              // deep copy

              combo = new WERD_RES(copy_word);

              combo->combination = true;

              combo->x_height = prev_word->x_height;

              prev_word->part_of_combo = true;

              prev_word_it.add_before_then_move(combo);

            }

            // The combo takes its end-of-line flag from the right-hand word.
            combo->word->set_flag(W_EOL, word->word->flag(W_EOL));

            if (word->combination) {

              combo->word->join_on(word->word);

              // Move blobs to combo

              // old combo no longer needed

              delete word_it.extract();

            } else {

              // Copy current wd to combo

              combo->copy_on(word);

              word->part_of_combo = true;

            }

            // The combo's contents changed, so discard its old results.
            combo->done = false;

            combo->ClearResults();

          } else {

            prev_word_it = word_it; // catch up

          }

        }

        prev_right = box.right();

      }

    }

  } else {

    words.clear(); // signal termination

  }

}


/**
 * dump_words()
 *
 * Debug output for the spacing search; does nothing unless
 * debug_fix_space_level is above 0.
 *
 * @param perm      words to report (words flagged part_of_combo are omitted)
 * @param score     score printed alongside the words
 * @param mode      1 = extracted, 2 = tested, 3 = returned. Mode 1 also
 *                  records the words in stats_.dump_words_str.
 * @param improved  when the debug level is exactly 1, a
 *                  "FIX SPACING before => after" line is printed only if
 *                  this is true
 */
void Tesseract::dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) {
  if (debug_fix_space_level <= 0) {
    return;
  }

  WERD_RES_IT it(&perm);

  if (mode == 1) {
    // Remember the starting words for a later "FIX SPACING" report.
    stats_.dump_words_str = "";
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
      WERD_RES *entry = it.data();
      if (entry->part_of_combo) {
        continue;
      }
      stats_.dump_words_str += entry->best_choice->unichar_string();
      stats_.dump_words_str += ' ';
    }
  }

  // Prints every visible word as "text/permuter " and closes the quote.
  const auto print_words_and_close = [&it]() {
    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
      WERD_RES *entry = it.data();
      if (entry->part_of_combo) {
        continue;
      }
      tprintf("%s/%1d ", entry->best_choice->unichar_string().c_str(),
              static_cast<int>(entry->best_choice->permuter()));
    }
    tprintf("\"\n");
  };

  if (debug_fix_space_level > 1) {
    if (mode == 1) {
      tprintf("EXTRACTED (%d): \"", score);
    } else if (mode == 2) {
      tprintf("TESTED (%d): \"", score);
    } else if (mode == 3) {
      tprintf("RETURNED (%d): \"", score);
    }
    print_words_and_close();
    return;
  }

  if (improved) {
    tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.c_str());
    print_words_and_close();
  }
}


/**
 * fixspace_thinks_word_done()
 *
 * Decides whether a word counts as "done" for spacing purposes. A word
 * whose done flag is set always qualifies. Otherwise all of the following
 * must hold:
 *  - fixsp_done_mode is above 0;
 *  - the word is tess_accepted, or the mode is 2 and nothing in the reject
 *    map is rejected, or the mode is 3;
 *  - the best choice text contains no space;
 *  - the best choice permuter is one of the system, frequent or user
 *    dictionary permuters, or the number permuter.
 * Ambiguity of the word is deliberately not considered here.
 *
 * @param word  word to test
 * @return true if the word is regarded as done
 */
bool Tesseract::fixspace_thinks_word_done(WERD_RES *word) {
  if (word->done) {
    return true;
  }

  if (fixsp_done_mode <= 0) {
    return false;
  }

  const bool accepted_for_mode =
      word->tess_accepted ||
      (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
      fixsp_done_mode == 3;
  if (!accepted_for_mode) {
    return false;
  }

  if (strchr(word->best_choice->unichar_string().c_str(), ' ') != nullptr) {
    return false;
  }

  const auto permuter = word->best_choice->permuter();
  return permuter == SYSTEM_DAWG_PERM || permuter == FREQ_DAWG_PERM ||
         permuter == USER_DAWG_PERM || permuter == NUMBER_PERM;
}


/**
 * fix_sp_fp_word()
 *
 * Tries to split the word under the iterator at noise blobs. Nothing
 * happens unless the word has W_DONT_CHOP set, is not a repeated-character
 * word, is neither a combination nor part of one, and worst_noise_blob()
 * finds a blob to break at. Otherwise the word is taken out of its list,
 * handed to fix_noisy_space_list() and the resulting word(s) are inserted
 * back at the same place.
 *
 * @param word_res_it  iterator positioned on the word; on return it has
 *                     been advanced past all but one of the re-inserted
 *                     words (never beyond the last list element)
 * @param row          row forwarded to fix_noisy_space_list()
 * @param block        block forwarded to fix_noisy_space_list()
 */
void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) {
  WERD_RES *target = word_res_it.data();

  const bool not_eligible = target->word->flag(W_REP_CHAR) || target->combination ||
                            target->part_of_combo || !target->word->flag(W_DONT_CHOP);
  if (not_eligible) {
    return;
  }

  float unused_score; // only the returned blob index matters here
  if (worst_noise_blob(target, &unused_score) < 0) {
    return;
  }

  if (debug_fix_space_level > 1) {
    tprintf("FP fixspace working on \"%s\"\n", target->best_choice->unichar_string().c_str());
  }

  // Order the rejected blobs by left edge before the word gets split.
  target->word->rej_cblob_list()->sort(c_blob_comparator);

  WERD_RES_LIST split_words;
  WERD_RES_IT split_it(&split_words);
  split_it.add_after_stay_put(word_res_it.extract());
  fix_noisy_space_list(split_words, row, block);

  int16_t remaining = split_words.length();
  word_res_it.add_list_before(&split_words);
  while (!word_res_it.at_last() && remaining > 1) {
    word_res_it.forward();
    remaining--;
  }
}


/**
 * fix_noisy_space_list()
 *
 * Searches for a better result by repeatedly breaking words at their
 * noisiest blob. A working copy of the first word of best_perm is split
 * with break_noisiest_blob_word(), re-recognised and scored with
 * fp_eval_word_spacing(). Whenever the working copy beats the best score
 * so far, best_perm is replaced by a deep copy of it. The search ends when
 * the best score is PERFECT_WERDS or the working copy has been emptied.
 *
 * @param best_perm  in: list holding the word to split; out: best result
 * @param row        row forwarded to match_current_words()
 * @param block      block forwarded to match_current_words()
 */
void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) {
  bool found_better = false;

  // Score of the word as it was handed in.
  int16_t top_score = fp_eval_word_spacing(best_perm);
  dump_words(best_perm, top_score, 1, found_better);

  WERD_RES_IT best_it(&best_perm);
  WERD_RES *seed = best_it.data();

  WERD_RES_LIST candidate;
  WERD_RES_IT candidate_it(&candidate);

  // deep_copy only duplicates the underlying WERD when the combination
  // flag is set, so raise the flag just for the duration of the copy.
  seed->combination = true;
  candidate_it.add_to_end(WERD_RES::deep_copy(seed));
  seed->combination = false;

  break_noisiest_blob_word(candidate);

  while (top_score != PERFECT_WERDS && !candidate.empty()) {
    match_current_words(candidate, row, block);
    const int16_t candidate_score = fp_eval_word_spacing(candidate);
    dump_words(candidate, candidate_score, 2, found_better);

    if (candidate_score > top_score) {
      best_perm.clear();
      best_perm.deep_copy(&candidate, &WERD_RES::deep_copy);
      top_score = candidate_score;
      found_better = true;
    }

    if (candidate_score < PERFECT_WERDS) {
      break_noisiest_blob_word(candidate);
    }
  }

  dump_words(best_perm, top_score, 3, found_better);
}


/**
 * break_noisiest_blob_word()
 *
 * Picks, over all words in the list, the blob with the lowest noise score
 * reported by worst_noise_blob() and splits its word there:
 *  - the blobs in front of the noise blob move to a new WERD;
 *  - the noise blob itself is deleted;
 *  - rejected blobs whose left edge lies before the noise blob move to the
 *    new word's rejected-blob list;
 *  - the new word is wrapped in a WERD_RES flagged as a combination and
 *    inserted in front of the original word, whose results are cleared.
 *
 * If no word offers a blob to break at, the list is cleared, which the
 * caller uses as the signal that the search is finished.
 *
 * @param words  list to modify in place
 */
void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {

  WERD_RES_IT word_it(&words);

  WERD_RES_IT worst_word_it;

  // Start above any expected score; lower scores replace it below.
  float worst_noise_score = 9999;

  int worst_blob_index = -1; // Noisiest blob of noisiest wd

  int blob_index;            // of wds noisiest blob

  float noise_score;         // of wds noisiest blob

  WERD_RES *word_res;

  C_BLOB_IT blob_it;

  C_BLOB_IT rej_cblob_it;

  C_BLOB_LIST new_blob_list;

  C_BLOB_IT new_blob_it;

  C_BLOB_IT new_rej_cblob_it;

  WERD *new_word;

  int16_t start_of_noise_blob;

  int16_t i;


  // noise_score is only consulted when blob_index > -1, i.e. when
  // worst_noise_blob() actually selected a blob.
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {

    blob_index = worst_noise_blob(word_it.data(), &noise_score);

    if (blob_index > -1 && worst_noise_score > noise_score) {

      worst_noise_score = noise_score;

      worst_blob_index = blob_index;

      worst_word_it = word_it;

    }

  }

  if (worst_blob_index < 0) {

    words.clear(); // signal termination

    return;

  }


  /* Now split the worst_word_it */


  word_res = worst_word_it.data();


  /* Move blobs before noise blob to a new bloblist */


  new_blob_it.set_to_list(&new_blob_list);

  blob_it.set_to_list(word_res->word->cblob_list());

  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {

    new_blob_it.add_after_then_move(blob_it.extract());

  }

  // blob_it is now on the noise blob; remember where it started.
  start_of_noise_blob = blob_it.data()->bounding_box().left();

  delete blob_it.extract(); // throw out noise blob


  new_word = new WERD(&new_blob_list, word_res->word);

  // The new word is the left part, so it cannot end the line and the
  // remaining right part cannot begin it.
  new_word->set_flag(W_EOL, false);

  word_res->word->set_flag(W_BOL, false);

  word_res->word->set_blanks(1); // After break


  // Move the rejected blobs that lie left of the noise blob to the new word.
  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());

  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());

  for (; (!rej_cblob_it.empty() &&

          (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));

       rej_cblob_it.forward()) {

    new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());

  }


  auto *new_word_res = new WERD_RES(new_word);

  new_word_res->combination = true;

  worst_word_it.add_before_then_move(new_word_res);


  // The original word lost blobs, so its old results no longer apply.
  word_res->ClearResults();

}


/**
 * worst_noise_blob()
 *
 * Finds the blob of a word that looks most like noise and is far enough
 * from both ends of the word to be a sensible place to break it.
 *
 * Every blob gets a noise score: blobs accepted in the reject map are
 * given non_noise_limit, all others the value of blob_noise_score(). The
 * candidate range starts after the first fixsp_non_noise_limit non-noise
 * blobs counted from the left and ends before the first
 * fixsp_non_noise_limit non-noise blobs counted from the right. Within
 * that range the blob with the lowest score below small_limit wins.
 *
 * @param word_res           word to examine
 * @param worst_noise_score  out: score of the returned blob, or small_limit
 *                           if the range held no blob below it. Left
 *                           untouched on the early -1 returns.
 * @return index of the noisiest blob, or -1 if the word has no
 *         rebuild_word, has fewer than 5 blobs, lacks enough non-noise
 *         blobs at either end, or has no sufficiently noisy blob
 */
int16_t Tesseract::worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) {
  float noise_score[512];
  const float small_limit = kBlnXHeight * fixsp_small_outlines_size;
  const float non_noise_limit = kBlnXHeight * 0.8;

  if (word_res->rebuild_word == nullptr) {
    return -1; // Can't handle cube words.
  }

  // Normalised.
  unsigned blob_count = word_res->box_word->length();
  ASSERT_HOST(blob_count <= 512);

  // Only blobs that exist in rebuild_word receive a score below. Never look
  // past them: the rest of noise_score[] would be uninitialised, and an
  // index beyond the rebuilt blobs must not be returned.
  const unsigned rebuilt_count = word_res->rebuild_word->NumBlobs();
  if (rebuilt_count < blob_count) {
    blob_count = rebuilt_count;
  }
  if (blob_count < 5) {
    return -1; // too short to split
  }

  /* Get the noise scores for all blobs */

#ifndef SECURE_NAMES
  if (debug_fix_space_level > 5) {
    tprintf("FP fixspace Noise metrics for \"%s\": ",
            word_res->best_choice->unichar_string().c_str());
  }
#endif

  for (unsigned b = 0; b < blob_count; b++) {
    TBLOB *blob = word_res->rebuild_word->blobs[b];
    if (word_res->reject_map[b].accepted()) {
      noise_score[b] = non_noise_limit;
    } else {
      noise_score[b] = blob_noise_score(blob);
    }

    if (debug_fix_space_level > 5) {
      tprintf("%1.1f ", noise_score[b]);
    }
  }
  if (debug_fix_space_level > 5) {
    tprintf("\n");
  }

  /* Now find the worst one which is far enough away from the end of the word */

  // Skip the leading blobs up to and including the required number of
  // non-noise blobs; the first candidate is the blob after them.
  int non_noise_count = 0;
  int index = 0;
  for (; static_cast<unsigned>(index) < blob_count && non_noise_count < fixsp_non_noise_limit;
       index++) {
    if (noise_score[index] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit) {
    return -1;
  }
  const int min_noise_blob = index; // 1st contender

  // Same from the right-hand end; the last candidate is the blob before them.
  non_noise_count = 0;
  for (index = static_cast<int>(blob_count) - 1;
       index >= 0 && non_noise_count < fixsp_non_noise_limit; index--) {
    if (noise_score[index] >= non_noise_limit) {
      non_noise_count++;
    }
  }
  if (non_noise_count < fixsp_non_noise_limit) {
    return -1;
  }
  const int max_noise_blob = index; // last contender

  if (min_noise_blob > max_noise_blob) {
    return -1;
  }

  // Lowest score strictly below small_limit wins; ties keep the leftmost.
  *worst_noise_score = small_limit;
  int worst_blob = -1;
  for (int candidate = min_noise_blob; candidate <= max_noise_blob; candidate++) {
    if (noise_score[candidate] < *worst_noise_score) {
      worst_blob = candidate;
      *worst_noise_score = noise_score[candidate];
    }
  }
  return worst_blob;
}


/**
 * blob_noise_score()
 *
 * Scores a blob by the larger bounding-box dimension of its largest
 * outline. The value is doubled when the blob has more than 5 outlines and
 * halved when the blob's bounding box has its bottom above
 * 4 * kBlnBaselineOffset or its top below kBlnBaselineOffset / 2.
 *
 * @param blob  blob whose outline list is walked
 * @return the resulting score; callers in this file treat lower values as
 *         more noise-like
 */
float Tesseract::blob_noise_score(TBLOB *blob) {
  int16_t num_outlines = 0;
  int16_t biggest_extent = 0;

  TESSLINE *outline = blob->outlines;
  while (outline != nullptr) {
    ++num_outlines;
    const TBOX outline_box = outline->bounding_box();
    const int16_t extent = outline_box.height() > outline_box.width() ? outline_box.height()
                                                                      : outline_box.width();
    if (biggest_extent < extent) {
      biggest_extent = extent;
    }
    outline = outline->next;
  }

  // Many outlines: double the score.
  if (num_outlines > 5) {
    biggest_extent *= 2;
  }

  // Blob sitting unusually high or low: halve the score.
  const TBOX blob_box = blob->bounding_box();
  if (blob_box.bottom() > kBlnBaselineOffset * 4 || blob_box.top() < kBlnBaselineOffset / 2) {
    biggest_extent /= 2;
  }

  return biggest_extent;
}


/**
 * fixspace_dbg()
 *
 * Prints a debug summary of a word: its bounding box, best choice text,
 * blob counts (word, rebuild word and box word), reject map, and the
 * tess_accepted and done flags. The per-character reject map detail is
 * compiled in but switched off by a local constant.
 *
 * @param word  word to describe; its best_choice, rebuild_word and
 *              box_word are dereferenced without a null check
 */
void fixspace_dbg(WERD_RES *word) {
  const bool show_map_detail = false;
  const auto flag_text = [](bool flag) { return flag ? "TRUE" : "FALSE"; };

  TBOX word_box = word->word->bounding_box();
  word_box.print();

  tprintf(" \"%s\" ", word->best_choice->unichar_string().c_str());
  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n", word->word->cblob_list()->length(),
          word->rebuild_word->NumBlobs(), word->box_word->length());
  word->reject_map.print(debug_fp);
  tprintf("\n");

  if (show_map_detail) {
    tprintf("\"%s\"\n", word->best_choice->unichar_string().c_str());
    int16_t pos = 0;
    while (word->best_choice->unichar_string()[pos] != '\0') {
      tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[pos]);
      word->reject_map[pos].full_print(debug_fp);
      pos++;
    }
  }

  tprintf("Tess Accepted: %s\n", flag_text(word->tess_accepted));
  tprintf("Done flag: %s\n\n", flag_text(word->done));
}


/**
 * fp_eval_word_spacing()
 *
 * Scores a list of words for the noise-blob splitting search. Words
 * without a rebuild_word are ignored. Only words that are done,
 * tess_accepted, chosen by a system/frequent/user dictionary permuter, or
 * accepted by safe_dict_word() contribute. For each character of such a
 * word (limited to the number of rebuilt blobs):
 *  - a space character or a blob scoring below small_limit costs a point;
 *  - otherwise a character accepted in the reject map earns a point.
 *
 * @param word_res_list  words to score
 * @return the total, never less than 0
 */
int16_t Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
  const float small_limit = kBlnXHeight * fixsp_small_outlines_size;
  int16_t total = 0;

  WERD_RES_IT it(&word_res_list);
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    WERD_RES *entry = it.data();
    if (entry->rebuild_word == nullptr) {
      continue; // Can't handle cube words.
    }

    const bool trusted = entry->done || entry->tess_accepted ||
                         entry->best_choice->permuter() == SYSTEM_DAWG_PERM ||
                         entry->best_choice->permuter() == FREQ_DAWG_PERM ||
                         entry->best_choice->permuter() == USER_DAWG_PERM ||
                         safe_dict_word(entry) > 0;
    if (!trusted) {
      continue;
    }

    const auto num_blobs = entry->rebuild_word->NumBlobs();
    const UNICHAR_ID space = entry->uch_set->unichar_to_id(" ");
    for (unsigned pos = 0; pos < entry->best_choice->length() && pos < num_blobs; ++pos) {
      TBLOB *blob = entry->rebuild_word->blobs[pos];
      const bool suspicious =
          entry->best_choice->unichar_id(pos) == space || blob_noise_score(blob) < small_limit;
      if (suspicious) {
        --total; // penalise possibly erroneous non-space
      } else if (entry->reject_map[pos].accepted()) {
        ++total;
      }
    }
  }

  return total < 0 ? 0 : total;
}


} // namespace tesseract

ocrclass.h

unichar.h

fixspace.h

PERFECT_WERDS
#define PERFECT_WERDS
Definition: fixspace.cpp:48

tessvars.h

debug_fp
FILE * debug_fp
Definition: tessvars.cpp:24

tesseractclass.h

unicharset.h

params.h

errcode.h

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:54

tprintf.h

pageres.h

normalis.h

ratngs.h

boxword.h

werd.h

blobs.h

rect.h

stepblob.h

i
int i
Definition: gmock-matchers_test.cc:718

tesseract
Definition: baseapi.h:39

tesseract::W_BOL
@ W_BOL
start of line
Definition: werd.h:34

tesseract::W_FUZZY_SP
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41

tesseract::W_EOL
@ W_EOL
end of line
Definition: werd.h:35

tesseract::W_DONT_CHOP
@ W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:39

tesseract::W_REP_CHAR
@ W_REP_CHAR
repeated character
Definition: werd.h:40

tesseract::W_FUZZY_NON
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42

tesseract::tprintf
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

tesseract::kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:33

tesseract::transform_to_next_perm
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:391

tesseract::UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:34

tesseract::initialise_search
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:201

tesseract::SYSTEM_DAWG_PERM
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:244

tesseract::NUMBER_PERM
@ NUMBER_PERM
Definition: ratngs.h:242

tesseract::USER_DAWG_PERM
@ USER_DAWG_PERM
Definition: ratngs.h:246

tesseract::FREQ_DAWG_PERM
@ FREQ_DAWG_PERM
Definition: ratngs.h:247

tesseract::fixspace_dbg
void fixspace_dbg(WERD_RES *word)
Definition: fixspace.cpp:806

tesseract::kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:34

tesseract::ETEXT_DESC
Definition: ocrclass.h:102

tesseract::ETEXT_DESC::ocr_alive
volatile int8_t ocr_alive
true if not last
Definition: ocrclass.h:110

tesseract::ETEXT_DESC::deadline_exceeded
bool deadline_exceeded() const
Definition: ocrclass.h:136

tesseract::ETEXT_DESC::cancel_this
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:116

tesseract::ETEXT_DESC::progress
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:105

tesseract::ETEXT_DESC::cancel
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:112

tesseract::TesseractStats::dump_words_str
std::string dump_words_str
Definition: tesseractclass.h:142

tesseract::TesseractStats::dict_words
int32_t dict_words
Definition: tesseractclass.h:141

tesseract::WordData
Definition: tesseractclass.h:151

tesseract::Tesseract::fixspace_thinks_word_done
bool fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:514

tesseract::Tesseract::classify_word_and_language
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1302

tesseract::Tesseract::fix_noisy_space_list
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:577

tesseract::Tesseract::worst_noise_blob
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:685

tesseract::Tesseract::fix_sp_fp_word
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:545

tesseract::Tesseract::SetupWordPassN
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:166

tesseract::Tesseract::blob_noise_score
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:772

tesseract::Tesseract::fix_fuzzy_space_list
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:171

tesseract::Tesseract::dump_words
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
Definition: fixspace.cpp:467

tesseract::Tesseract::digit_or_numeric_punct
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:366

tesseract::Tesseract::safe_dict_word
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:593

tesseract::Tesseract::break_noisiest_blob_word
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:621

tesseract::Tesseract::check_debug_pt
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1799

tesseract::Tesseract::fp_eval_word_spacing
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:837

tesseract::Tesseract::eval_word_spacing
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:260

tesseract::Tesseract::match_current_words
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:218

tesseract::Tesseract::fix_fuzzy_spaces
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:77

tesseract::TESSLINE
Definition: blobs.h:211

tesseract::TESSLINE::next
TESSLINE * next
Definition: blobs.h:288

tesseract::TBLOB
Definition: blobs.h:291

tesseract::TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:466

tesseract::TBLOB::outlines
TESSLINE * outlines
Definition: blobs.h:404

tesseract::TWERD::blobs
std::vector< TBLOB * > blobs
Definition: blobs.h:462

tesseract::TWERD::NumBlobs
unsigned NumBlobs() const
Definition: blobs.h:449

tesseract::BoxWord::length
unsigned length() const
Definition: boxword.h:81

tesseract::BLOCK
Definition: ocrblock.h:34

tesseract::ROW
Definition: ocrrow.h:39

tesseract::PAGE_RES
Definition: pageres.h:77

tesseract::PAGE_RES::block_res_list
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81

tesseract::WERD_RES
Definition: pageres.h:164

tesseract::WERD_RES::copy_on
void copy_on(WERD_RES *word_res)
Definition: pageres.h:667

tesseract::WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:239

tesseract::WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:301

tesseract::WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:292

tesseract::WERD_RES::done
bool done
Definition: pageres.h:303

tesseract::WERD_RES::ClearResults
void ClearResults()
Definition: pageres.cpp:1138

tesseract::WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:201

tesseract::WERD_RES::word
WERD * word
Definition: pageres.h:184

tesseract::WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:270

tesseract::WERD_RES::tess_failed
bool tess_failed
Definition: pageres.h:293

tesseract::WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:264

tesseract::WERD_RES::deep_copy
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:655

tesseract::WERD_RES::part_of_combo
bool part_of_combo
Definition: pageres.h:338

tesseract::WERD_RES::x_height
float x_height
Definition: pageres.h:314

tesseract::WERD_RES::combination
bool combination
Definition: pageres.h:337

tesseract::WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:299

tesseract::WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:331

tesseract::WERD_CHOICE::length
unsigned length() const
Definition: ratngs.h:287

tesseract::WERD_CHOICE::unichar_lengths
const std::string & unichar_lengths() const
Definition: ratngs.h:533

tesseract::WERD_CHOICE::unichar_string
std::string & unichar_string()
Definition: ratngs.h:519

tesseract::TBOX
Definition: rect.h:37

tesseract::TBOX::left
TDimension left() const
Definition: rect.h:82

tesseract::TBOX::height
TDimension height() const
Definition: rect.h:118

tesseract::TBOX::width
TDimension width() const
Definition: rect.h:126

tesseract::TBOX::top
TDimension top() const
Definition: rect.h:68

tesseract::TBOX::print
void print() const
Definition: rect.h:289

tesseract::TBOX::right
TDimension right() const
Definition: rect.h:89

tesseract::TBOX::bottom
TDimension bottom() const
Definition: rect.h:75

tesseract::REJMAP::print
void print(FILE *fp) const
Definition: rejctmap.cpp:112

tesseract::REJMAP::reject_count
int16_t reject_count() const
Definition: rejctmap.h:339

tesseract::REJMAP::length
uint16_t length() const
Definition: rejctmap.h:333

tesseract::REJMAP::full_print
void full_print(FILE *fp) const
Definition: rejctmap.cpp:120

tesseract::WERD
Definition: werd.h:58

tesseract::WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128

tesseract::WERD::set_flag
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131

tesseract::WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:155

tesseract::WERD::join_on
void join_on(WERD *other)
Definition: werd.cpp:208

tesseract::WERD::rej_cblob_list
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:91

tesseract::WERD::set_blanks
void set_blanks(uint8_t new_blanks)
Definition: werd.h:103

tesseract::WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:96

tesseract::UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524

tesseract::UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186

tesseract::Wordrec::prev_word_best_choice_
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:387