// Source listing: tesseract-ocr.github.io/5.3.3/a00557_source.html

/**********************************************************************

 * File:        pageres.cpp  (Formerly page_res.c)

 * Description: Hierarchy of results classes from PAGE_RES to WERD_RES

 *              and an iterator class to iterate over the words.

 * Main purposes:

 *              Easy way to iterate over the words without a 3-nested loop.

 *              Holds data used during word recognition.

 *              Holds information about alternative spacing paths.

 * Author:      Phil Cheatle

 *

 * (C) Copyright 1992, Hewlett-Packard Ltd.

 ** Licensed under the Apache License, Version 2.0 (the "License");

 ** you may not use this file except in compliance with the License.

 ** You may obtain a copy of the License at

 ** http://www.apache.org/licenses/LICENSE-2.0

 ** Unless required by applicable law or agreed to in writing, software

 ** distributed under the License is distributed on an "AS IS" BASIS,

 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 ** See the License for the specific language governing permissions and

 ** limitations under the License.

 *

 **********************************************************************/


#include "pageres.h"


#include "blamer.h"   // for BlamerBundle

#include "blobs.h"    // for TWERD, TBLOB

#include "boxword.h"  // for BoxWord

#include "errcode.h"  // for ASSERT_HOST

#include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)

#include "ocrrow.h"   // for ROW, ROW_IT

#include "pdblock.h"  // for PDBLK

#include "polyblk.h"  // for POLY_BLOCK

#include "seam.h"     // for SEAM, start_seam_list

#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST

#include "tprintf.h"  // for tprintf


#include <tesseract/publictypes.h> // for OcrEngineMode, OEM_LSTM_ONLY


#include <cassert> // for assert
#include <cstdint> // for INT32_MAX
#include <cstring> // for strlen
#include <utility> // for std::move


struct Pix;


namespace tesseract {


// Gain factor for computing thresholds that determine the ambiguity of a
// word. StopperAmbigThreshold multiplies the difference of the two adjust
// factors by this value.
static const double kStopperAmbiguityThresholdGain = 8.0;
// Constant offset for computing thresholds that determine the ambiguity of a
// word. StopperAmbigThreshold subtracts this value from the scaled
// difference.
static const double kStopperAmbiguityThresholdOffset = 1.5;
// Max number of broken pieces to associate. Passed as the second argument
// when WERD_RES::SetupForRecognition builds the ratings MATRIX.
const int kWordrecMaxNumJoinChunks = 4;
// Max ratio of word box height to line size to allow it to be processed as
// a line with other words.
const double kMaxWordSizeRatio = 1.25;
// Max ratio of line box height to line size to allow a new word to be added.
// The "line box" is the union of the boxes of the words merged so far.
const double kMaxLineSizeRatio = 1.25;
// Max ratio of word gap to line size to allow a new word to be added.
const double kMaxWordGapRatio = 2.0;


// Returns the certainty-difference threshold used to decide which words to
// keep. f1 and f2 are the adjustment factors of the two words being compared;
// the result is (f2 - f1) scaled by kStopperAmbiguityThresholdGain and then
// reduced by kStopperAmbiguityThresholdOffset.
// TODO(rays) This is horrible. Replace with an enhanced params training model.
static double StopperAmbigThreshold(double f1, double f2) {
  const double factor_delta = f2 - f1;
  const double scaled_delta = factor_delta * kStopperAmbiguityThresholdGain;
  return scaled_delta - kStopperAmbiguityThresholdOffset;
}


/*************************************************************************
 * PAGE_RES::PAGE_RES
 *
 * Constructor for page results.
 * Builds one BLOCK_RES (appended to block_res_list, in list order) for
 * every BLOCK in the_block_list and stores prev_word_best_choice_ptr.
 *************************************************************************/
PAGE_RES::PAGE_RES(bool merge_similar_words, BLOCK_LIST *the_block_list,
                   WERD_CHOICE **prev_word_best_choice_ptr) {
  Init();

  BLOCK_IT blocks(the_block_list);
  BLOCK_RES_IT results(&block_res_list);
  for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
    BLOCK *current_block = blocks.data();
    auto *block_result = new BLOCK_RES(merge_similar_words, current_block);
    results.add_to_end(block_result);
  }

  prev_word_best_choice = prev_word_best_choice_ptr;
}


/*************************************************************************
 * BLOCK_RES::BLOCK_RES
 *
 * Constructor for BLOCK results.
 * Resets the counters and font fields, remembers the_block, and appends
 * one ROW_RES to row_res_list for every ROW of the block.
 *************************************************************************/

BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
  block = the_block;

  // Counters start at zero.
  char_count = 0;
  rej_count = 0;
  row_count = 0;

  // Font information is not known yet.
  font_class = -1; // not assigned
  font_assigned = false;
  x_height = -1.0;

  ROW_IT rows(the_block->row_list());
  ROW_RES_IT results(&row_res_list);
  for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
    ROW *current_row = rows.data();
    results.add_to_end(new ROW_RES(merge_similar_words, current_row));
  }
}


/*************************************************************************
 * ROW_RES::ROW_RES
 *
 * Constructor for ROW results.
 * Creates one WERD_RES per WERD in the row's word list and appends it to
 * word_res_list. A run of words that belong together is additionally
 * represented by a "combo" WERD_RES (combination == true) that is appended
 * just before the first word of the run; later words of the run are merged
 * into it with copy_on(), and every member of the run has part_of_combo set.
 *
 * If merge_similar_words is true, the runs are decided here from the word
 * box heights and gaps relative to the line height, and the decision is
 * written to the W_FUZZY_NON flag of the following word. Otherwise the
 * W_FUZZY_NON flag already present on the following word is used.
 *************************************************************************/

ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
  WERD_IT word_it(the_row->word_list());
  WERD_RES_IT word_res_it(&word_res_list);
  WERD_RES *combo = nullptr; // current combination of fuzzies
  WERD *copy_word;

  char_count = 0;
  rej_count = 0;
  whole_word_rej_count = 0;

  row = the_row;
  // True when the word handled by the NEXT iteration must join the combo.
  bool add_next_word = false;
  // Bounding box of the words merged so far in the current run.
  TBOX union_box;
  // Reference height that the k*Ratio constants are applied to.
  float line_height =
      the_row->x_height() + the_row->ascenders() - the_row->descenders();
  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
    auto *word_res = new WERD_RES(word_it.data());
    word_res->x_height = the_row->x_height();
    if (add_next_word) {
      // Decided by the previous iteration: this word continues the run.
      ASSERT_HOST(combo != nullptr);
      // We are adding this word to the combination.
      word_res->part_of_combo = true;
      combo->copy_on(word_res);
    } else if (merge_similar_words) {
      // This word may start a new run, but only if it is not a repeated-char
      // word and its box is not too tall for the line.
      union_box = word_res->word->bounding_box();
      add_next_word = !word_res->word->flag(W_REP_CHAR) &&
                      union_box.height() <= line_height * kMaxWordSizeRatio;
      word_res->odd_size = !add_next_word;
    }
    // NOTE(review): for the last word of the row this presumably wraps round
    // to the first word (the list is iterated with mark_cycle_pt /
    // cycled_list) -- confirm against WERD_IT::data_relative.
    WERD *next_word = word_it.data_relative(1);
    if (merge_similar_words) {
      if (add_next_word && !next_word->flag(W_REP_CHAR)) {
        // Next word will be added on if all of the following are true:
        // Not a rep char.
        // Box height small enough.
        // Union box height small enough.
        // Horizontal gap small enough.
        TBOX next_box = next_word->bounding_box();
        // Right edge of the run before next_box is merged into union_box.
        int prev_right = union_box.right();
        union_box += next_box;
        if (next_box.height() > line_height * kMaxWordSizeRatio ||
            union_box.height() > line_height * kMaxLineSizeRatio ||
            next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
          add_next_word = false;
        }
      }
      // Record the decision on the next word itself.
      next_word->set_flag(W_FUZZY_NON, add_next_word);
    } else {
      // Not merging here: reuse whatever decision the flag already holds.
      add_next_word = next_word->flag(W_FUZZY_NON);
    }
    if (add_next_word) {
      if (combo == nullptr) {
        // First word of a new run: start the combo from a copy of it and
        // append the combo ahead of the word itself.
        copy_word = new WERD;
        *copy_word = *(word_it.data()); // deep copy
        combo = new WERD_RES(copy_word);
        combo->x_height = the_row->x_height();
        combo->combination = true;
        word_res_it.add_to_end(combo);
      }
      word_res->part_of_combo = true;
    } else {
      // The run (if any) ends with this word.
      combo = nullptr;
    }
    word_res_it.add_to_end(word_res);
  }
}


// Copy assignment.
// Discards the current contents with Clear() and then copies from source:
//  - word is deep-copied when source is a combination, otherwise the WERD
//    pointer is shared with source.
//  - bln_boxes, chopped_word, rebuild_word, box_word, raw_choice, ep_choice,
//    blamer_bundle and every entry of best_choices are deep-copied when
//    present; best_choice is pointed at the first copied choice.
//  - the seam_array and the ratings matrix are NOT copied.
// Self-assignment is a no-op: without the guard, Clear() would destroy the
// very data that is about to be read from source.
WERD_RES &WERD_RES::operator=(const WERD_RES &source) {
  if (this == &source) {
    return *this;
  }
  this->ELIST_LINK::operator=(source);
  Clear();
  if (source.combination) {
    word = new WERD;
    *word = *(source.word); // deep copy
  } else {
    word = source.word; // pt to same word
  }
  if (source.bln_boxes != nullptr) {
    bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
  }
  if (source.chopped_word != nullptr) {
    chopped_word = new TWERD(*source.chopped_word);
  }
  if (source.rebuild_word != nullptr) {
    rebuild_word = new TWERD(*source.rebuild_word);
  }
  // TODO(rays) Do we ever need to copy the seam_array?
  blob_row = source.blob_row;
  denorm = source.denorm;
  if (source.box_word != nullptr) {
    box_word = new tesseract::BoxWord(*source.box_word);
  }
  best_state = source.best_state;
  correct_text = source.correct_text;
  blob_widths = source.blob_widths;
  blob_gaps = source.blob_gaps;
  // None of the uses of operator= require the ratings matrix to be copied,
  // so don't as it would be really slow.

  // Copy the cooked choices, preserving their order.
  // The const_cast is only needed to build an iterator; source is not
  // modified.
  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&source.best_choices));
  WERD_CHOICE_IT wc_dest_it(&best_choices);
  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
    const WERD_CHOICE *choice = wc_it.data();
    wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
  }
  if (!wc_dest_it.empty()) {
    wc_dest_it.move_to_first();
    best_choice = wc_dest_it.data();
  } else {
    best_choice = nullptr;
  }

  if (source.raw_choice != nullptr) {
    raw_choice = new WERD_CHOICE(*source.raw_choice);
  } else {
    raw_choice = nullptr;
  }
  if (source.ep_choice != nullptr) {
    ep_choice = new WERD_CHOICE(*source.ep_choice);
  } else {
    ep_choice = nullptr;
  }
  reject_map = source.reject_map;
  combination = source.combination;
  part_of_combo = source.part_of_combo;
  CopySimpleFields(source);
  if (source.blamer_bundle != nullptr) {
    blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
  }
  return *this;
}


// Copies basic fields that don't involve pointers that might be useful
// to copy when making one WERD_RES from another.
// Every field is a plain member-wise assignment from source. Where a field
// is itself a pointer (uch_set is assigned an address in SetupWordScript,
// tesseract is assigned a Tesseract* in SetupForRecognition), only the
// pointer value is copied here; nothing is allocated or deleted.
void WERD_RES::CopySimpleFields(const WERD_RES &source) {
  // Recognition status flags.
  tess_failed = source.tess_failed;
  tess_accepted = source.tess_accepted;
  tess_would_adapt = source.tess_would_adapt;
  done = source.done;
  unlv_crunch_mode = source.unlv_crunch_mode;
  small_caps = source.small_caps;
  odd_size = source.odd_size;
  // Font information.
  fontinfo = source.fontinfo;
  fontinfo2 = source.fontinfo2;
  fontinfo_id_count = source.fontinfo_id_count;
  fontinfo_id2_count = source.fontinfo_id2_count;
  // Size and baseline measurements.
  x_height = source.x_height;
  caps_height = source.caps_height;
  baseline_shift = source.baseline_shift;
  guessed_x_ht = source.guessed_x_ht;
  guessed_caps_ht = source.guessed_caps_ht;
  reject_spaces = source.reject_spaces;
  // Borrowed references to the unicharset and the recognizer.
  uch_set = source.uch_set;
  tesseract = source.tesseract;
}


// Prepares a blank (default constructed) WERD_RES for a retry, starting from
// a source that has already been recognized: shares source's word pointer,
// copies the simple fields, and, if source has a blamer bundle, creates a
// new bundle that carries only the truth data from it.
// Use SetupFor*Recognition afterwards to complete the setup and make
// it ready for a retry recognition.
void WERD_RES::InitForRetryRecognition(const WERD_RES &source) {
  word = source.word;
  CopySimpleFields(source);
  if (source.blamer_bundle == nullptr) {
    return;
  }
  blamer_bundle = new BlamerBundle();
  blamer_bundle->CopyTruth(*source.blamer_bundle);
}


// Sets up the members used in recognition: bln_boxes, chopped_word,

// seam_array, denorm.  Returns false if

// the word is empty and sets up fake results.  If use_body_size is

// true and row->body_size is set, then body_size will be used for

// blob normalization instead of xheight + ascrise. This flag is for

// those languages that are using CJK pitch model and thus it has to

// be true if and only if tesseract->textord_use_cjk_fp_model is

// true.

// If allow_detailed_fx is true, the feature extractor will receive fine

// precision outline information, allowing smoother features and better

// features on low resolution images.

// The norm_mode_hint sets the default mode for normalization in absence

// of any of the above flags.

// norm_box is used to override the word bounding box to determine the

// normalization scale and offset.

// Returns false if the word is empty and sets up fake results.

bool WERD_RES::SetupForRecognition(const UNICHARSET &unicharset_in,

                                   tesseract::Tesseract *tess, Image pix,

                                   int norm_mode, const TBOX *norm_box,

                                   bool numeric_mode, bool use_body_size,

                                   bool allow_detailed_fx, ROW *row,

                                   const BLOCK *block) {

  auto norm_mode_hint = static_cast<tesseract::OcrEngineMode>(norm_mode);

  tesseract = tess;

  POLY_BLOCK *pb = block != nullptr ? block->pdblk.poly_block() : nullptr;

  if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY &&

       word->cblob_list()->empty()) ||

      (pb != nullptr && !pb->IsText())) {

    // Empty words occur when all the blobs have been moved to the rej_blobs

    // list, which seems to occur frequently in junk.

    SetupFake(unicharset_in);

    word->set_flag(W_REP_CHAR, false);

    return false;

  }

  ClearResults();

  SetupWordScript(unicharset_in);

  chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);

  float word_xheight =

      use_body_size && row != nullptr && row->body_size() > 0.0f

          ? row->body_size()

          : x_height;

  chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),

                            word_xheight, baseline_shift, numeric_mode,

                            norm_mode_hint, norm_box, &denorm);

  blob_row = row;

  SetupBasicsFromChoppedWord(unicharset_in);

  SetupBlamerBundle();

  int num_blobs = chopped_word->NumBlobs();

  ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);

  tess_failed = false;

  return true;

}


// Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
// accumulators from a made chopped word.  We presume the fields are already
// empty.
// chopped_word must already be set. unicharset_in is not used by the body
// shown here.
void WERD_RES::SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in) {
  // Boxes of the normalized blobs.
  bln_boxes = tesseract::BoxWord::CopyFromNormalized(chopped_word);
  // Initial seam list for the chopped word.
  start_seam_list(chopped_word, &seam_array);
  // Widths of, and gaps between, the chopped blobs.
  SetupBlobWidthsAndGaps();
  ClearWordChoices();
}


// Sets up the members used in recognition for an empty recognition result:

// bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.

void WERD_RES::SetupFake(const UNICHARSET &unicharset_in) {

  ClearResults();

  SetupWordScript(unicharset_in);

  chopped_word = new TWERD;

  rebuild_word = new TWERD;

  bln_boxes = new tesseract::BoxWord;

  box_word = new tesseract::BoxWord;

  int blob_count = word->cblob_list()->length();

  if (blob_count > 0) {

    auto **fake_choices = new BLOB_CHOICE *[blob_count];

    // For non-text blocks, just pass any blobs through to the box_word

    // and call the word failed with a fake classification.

    C_BLOB_IT b_it(word->cblob_list());

    int blob_id = 0;

    for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {

      TBOX box = b_it.data()->bounding_box();

      box_word->InsertBox(box_word->length(), box);

      fake_choices[blob_id++] = new BLOB_CHOICE;

    }

    FakeClassifyWord(blob_count, fake_choices);

    delete[] fake_choices;

  } else {

    auto *word = new WERD_CHOICE(&unicharset_in);

    word->make_bad();

    LogNewRawChoice(word);

    // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.

    LogNewCookedChoice(1, false, word);

  }

  tess_failed = true;

  done = true;

}


// Remembers uch as this word's unicharset and stamps the word with the
// unicharset's default script id, plus the W_SCRIPT_HAS_XHEIGHT and
// W_SCRIPT_IS_LATIN flags derived from it.
void WERD_RES::SetupWordScript(const UNICHARSET &uch) {
  uch_set = &uch;
  const int default_script = uch.default_sid();
  word->set_script_id(default_script);
  word->set_flag(W_SCRIPT_HAS_XHEIGHT, uch.script_has_xheight());
  const bool is_latin = (default_script == uch.latin_sid());
  word->set_flag(W_SCRIPT_IS_LATIN, is_latin);
}


// If a blamer_bundle is attached, sets up its normalized truth word from the
// already-initialized denorm. Does nothing when there is no bundle.
void WERD_RES::SetupBlamerBundle() {
  if (blamer_bundle == nullptr) {
    return;
  }
  blamer_bundle->SetupNormTruthWord(denorm);
}


// Computes the blob_widths and blob_gaps from the chopped_word.

void WERD_RES::SetupBlobWidthsAndGaps() {

  blob_widths.clear();

  blob_gaps.clear();

  int num_blobs = chopped_word->NumBlobs();

  for (int b = 0; b < num_blobs; ++b) {

    TBLOB *blob = chopped_word->blobs[b];

    TBOX box = blob->bounding_box();

    blob_widths.push_back(box.width());

    if (b + 1 < num_blobs) {

      blob_gaps.push_back(chopped_word->blobs[b + 1]->bounding_box().left() -

                          box.right());

    }

  }

}


// Updates internal data to account for a new SEAM (chop) at the given

// blob_number. Fixes the ratings matrix and states in the choices, as well

// as the blob widths and gaps.

void WERD_RES::InsertSeam(int blob_number, SEAM *seam) {

  // Insert the seam into the SEAMS array.

  seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);

  seam_array.insert(seam_array.begin() + blob_number, seam);

  if (ratings != nullptr) {

    // Expand the ratings matrix.

    ratings = ratings->ConsumeAndMakeBigger(blob_number);

    // Fix all the segmentation states.

    if (raw_choice != nullptr) {

      raw_choice->UpdateStateForSplit(blob_number);

    }

    WERD_CHOICE_IT wc_it(&best_choices);

    for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {

      WERD_CHOICE *choice = wc_it.data();

      choice->UpdateStateForSplit(blob_number);

    }

    SetupBlobWidthsAndGaps();

  }

}


// Returns true if all the word choices except the first have adjust_factors

// worse than the given threshold.

bool WERD_RES::AlternativeChoiceAdjustmentsWorseThan(float threshold) const {

  // The choices are not changed by this iteration.

  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST *>(&best_choices));

  for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {

    WERD_CHOICE *choice = wc_it.data();

    if (choice->adjust_factor() <= threshold) {

      return false;

    }

  }

  return true;

}


// Returns true if the current word is ambiguous: either best_choices does
// not hold exactly one answer, or the best choice reports that a dangerous
// ambiguity was found.
bool WERD_RES::IsAmbiguous() {
  if (!best_choices.singleton()) {
    return true;
  }
  return best_choice->dangerous_ambig_found();
}


// Returns true if the ratings matrix size matches the sum of each of the

// segmentation states.

bool WERD_RES::StatesAllValid() {

  unsigned ratings_dim = ratings->dimension();

  if (raw_choice->TotalOfStates() != ratings_dim) {

    tprintf("raw_choice has total of states = %u vs ratings dim of %u\n",

            raw_choice->TotalOfStates(), ratings_dim);

    return false;

  }

  WERD_CHOICE_IT it(&best_choices);

  unsigned index = 0;

  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {

    WERD_CHOICE *choice = it.data();

    if (choice->TotalOfStates() != ratings_dim) {

      tprintf("Cooked #%u has total of states = %u vs ratings dim of %u\n",

              index, choice->TotalOfStates(), ratings_dim);

      return false;

    }

  }

  return true;

}


// Prints a list of words found if debug is true or the word result matches

// the word_to_debug.

void WERD_RES::DebugWordChoices(bool debug, const char *word_to_debug) {

  if (debug || (word_to_debug != nullptr && *word_to_debug != '\0' &&

                best_choice != nullptr &&

                best_choice->unichar_string() == std::string(word_to_debug))) {

    if (raw_choice != nullptr) {

      raw_choice->print("\nBest Raw Choice");

    }


    WERD_CHOICE_IT it(&best_choices);

    int index = 0;

    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {

      WERD_CHOICE *choice = it.data();

      std::string label;

      label += "\nCooked Choice #" + std::to_string(index);

      choice->print(label.c_str());

    }

  }

}


// Prints the accepted/adaptable/done flags followed by the best choice
// (labelled with msg), or "<Null choice>" when there is no best choice.
void WERD_RES::DebugTopChoice(const char *msg) const {
  tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ", tess_accepted,
          tess_would_adapt, done);
  if (best_choice != nullptr) {
    best_choice->print(msg);
    return;
  }
  tprintf("<Null choice>\n");
}


// Removes from best_choices all choices which are not within a reasonable
// range of the best choice.
// A choice is removed (and deleted) as soon as one of its blobs disagrees
// with the overlapping blob of best_choice and is less certain than it by
// more than the StopperAmbigThreshold for the two adjust factors.
// Does nothing if there is no best_choice or only a single choice.
// debug_level >= 2 enables diagnostic output.
// TODO(rays) incorporate the information used here into the params training
// re-ranker, in place of this heuristic that is based on the previous
// adjustment factor.
void WERD_RES::FilterWordChoices(int debug_level) {
  if (best_choice == nullptr || best_choices.singleton()) {
    return;
  }

  if (debug_level >= 2) {
    best_choice->print("\nFiltering against best choice");
  }
  WERD_CHOICE_IT it(&best_choices);
  // Counts the alternatives visited; only used in the debug message.
  int index = 0;
  // Skip the first list entry and visit every other entry once.
  for (it.forward(); !it.at_first(); it.forward(), ++index) {
    WERD_CHOICE *choice = it.data();
    float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
                                            choice->adjust_factor());
    // i, j index the blob choice in choice, best_choice.
    // chunk is an index into the chopped_word blobs (AKA chunks).
    // Since the two words may use different segmentations of the chunks, we
    // iterate over the chunks to find out whether a comparable blob
    // classification is much worse than the best result.
    unsigned i = 0, j = 0, chunk = 0;
    // Each iteration of the while deals with 1 chunk. On entry choice_chunk
    // and best_chunk are the indices of the first chunk in the NEXT blob,
    // i.e. we don't have to increment i, j while chunk < choice_chunk and
    // best_chunk respectively.
    auto choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
    while (i < choice->length() && j < best_choice->length()) {
      if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
          choice->certainty(i) - best_choice->certainty(j) < threshold) {
        if (debug_level >= 2) {
          choice->print("WorstCertaintyDiffWorseThan");
          tprintf(
              "i %u j %u Choice->Blob[i].Certainty %.4g"
              " WorstOtherChoiceCertainty %g Threshold %g\n",
              i, j, choice->certainty(i), best_choice->certainty(j), threshold);
          tprintf("Discarding bad choice #%d\n", index);
        }
        // Unlink the choice from the list and free it; no need to look at
        // its remaining blobs.
        delete it.extract();
        break;
      }
      ++chunk;
      // If needed, advance choice_chunk to keep up with chunk.
      while (choice_chunk < chunk && ++i < choice->length()) {
        choice_chunk += choice->state(i);
      }
      // If needed, advance best_chunk to keep up with chunk.
      while (best_chunk < chunk && ++j < best_choice->length()) {
        best_chunk += best_choice->state(j);
      }
    }
  }
}


// Computes one adaption threshold per blob of best_choice and writes them to
// thresholds[0 .. best_choice->length() - 1] (the caller must provide at
// least that many floats).
// For blob i, every chunk it spans is compared with the raw_choice blob that
// covers the same chunk. If any chunk disagrees, the threshold is the mean
// raw certainty of the disagreeing chunks, divided by -certainty_scale and
// scaled by (1 - rating_margin); otherwise it is max_rating. The result is
// then limited to at most max_rating and at least min_rating.
//
// Fix: the end of the current blob's chunk range used to be initialised from
// state(0) and never advanced, so only blob 0 was ever examined and all
// later blobs unconditionally received max_rating. The range is now extended
// by state(i) for every blob.
void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
                                         float min_rating, float max_rating,
                                         float rating_margin,
                                         float *thresholds) {
  int chunk = 0;
  int end_chunk = 0;
  int end_raw_chunk = raw_choice->state(0);
  unsigned raw_blob = 0;
  for (unsigned i = 0; i < best_choice->length(); i++, thresholds++) {
    float avg_rating = 0.0f;
    int num_error_chunks = 0;

    // Chunks [chunk, end_chunk) belong to best choice blob i.
    end_chunk += best_choice->state(i);

    // For each chunk in best choice blob i, count non-matching raw results.
    while (chunk < end_chunk) {
      // Move to the next raw blob once the current one is used up. The bound
      // check keeps raw_blob valid even if the two choices do not cover the
      // same number of chunks.
      if (chunk >= end_raw_chunk && raw_blob + 1 < raw_choice->length()) {
        ++raw_blob;
        end_raw_chunk += raw_choice->state(raw_blob);
      }
      if (best_choice->unichar_id(i) != raw_choice->unichar_id(raw_blob)) {
        avg_rating += raw_choice->certainty(raw_blob);
        ++num_error_chunks;
      }
      ++chunk;
    }

    if (num_error_chunks > 0) {
      avg_rating /= num_error_chunks;
      *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
    } else {
      *thresholds = max_rating;
    }

    // Limit to [min_rating, max_rating]; min_rating wins if they conflict.
    if (*thresholds > max_rating) {
      *thresholds = max_rating;
    }
    if (*thresholds < min_rating) {
      *thresholds = min_rating;
    }
  }
}


// Keeps a copy of word_choice as raw_choice if there is no raw_choice yet or
// word_choice has a strictly lower rating than the current one. The stored
// copy gets the TOP_CHOICE_PERM permuter; word_choice itself is not consumed.
// Returns true if the word_choice was the new best.
bool WERD_RES::LogNewRawChoice(WERD_CHOICE *word_choice) {
  const bool is_new_best =
      raw_choice == nullptr || word_choice->rating() < raw_choice->rating();
  if (!is_new_best) {
    return false;
  }
  delete raw_choice;
  raw_choice = new WERD_CHOICE(*word_choice);
  raw_choice->set_permuter(TOP_CHOICE_PERM);
  return true;
}


// Consumes word_choice by adding it to best_choices, (taking ownership) if
// the certainty for word_choice is within some distance of the best choice in
// best_choices, or by deleting the word_choice and returning false.
// The best_choices list is kept in sorted order by rating. Duplicates are
// removed, and the list is kept no longer than max_num_choices in length.
// If debug is true, the outcome is reported with tprintf.
// Returns true if the word_choice is still a valid pointer, i.e. it was
// inserted into best_choices; returns false if it was deleted.
bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
                                  WERD_CHOICE *word_choice) {
  if (best_choice != nullptr) {
    // Throw out obviously bad choices to save some work.
    // TODO(rays) Get rid of this! This piece of code produces different
    // results according to the order in which words are found, which is an
    // undesirable behavior. It would be better to keep all the choices and
    // prune them later when more information is available.
    float max_certainty_delta = StopperAmbigThreshold(
        best_choice->adjust_factor(), word_choice->adjust_factor());
    // Never allow the delta to be less strict than -offset.
    if (max_certainty_delta > -kStopperAmbiguityThresholdOffset) {
      max_certainty_delta = -kStopperAmbiguityThresholdOffset;
    }
    if (word_choice->certainty() - best_choice->certainty() <
        max_certainty_delta) {
      if (debug) {
        std::string bad_string;
        word_choice->string_and_lengths(&bad_string, nullptr);
        tprintf(
            "Discarding choice \"%s\" with an overly low certainty"
            " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
            bad_string.c_str(), word_choice->certainty(),
            best_choice->certainty(),
            max_certainty_delta + best_choice->certainty());
      }
      delete word_choice;
      return false;
    }
  }

  // Insert in the list in order of increasing rating, but knock out worse
  // string duplicates.
  WERD_CHOICE_IT it(&best_choices);
  const std::string &new_str = word_choice->unichar_string();
  bool inserted = false;
  // Number of choices kept so far, including word_choice once inserted.
  int num_choices = 0;
  if (!it.empty()) {
    do {
      WERD_CHOICE *choice = it.data();
      if (choice->rating() > word_choice->rating() && !inserted) {
        // Time to insert.
        it.add_before_stay_put(word_choice);
        inserted = true;
        if (num_choices == 0) {
          best_choice = word_choice; // This is the new best.
        }
        ++num_choices;
      }
      if (choice->unichar_string() == new_str) {
        if (inserted) {
          // New is better.
          delete it.extract();
        } else {
          // Old is better.
          if (debug) {
            tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
                    new_str.c_str(), word_choice->rating(), choice->rating());
          }
          delete word_choice;
          return false;
        }
      } else {
        // A different string: keep it only while there is room.
        ++num_choices;
        if (num_choices > max_num_choices) {
          delete it.extract();
        }
      }
      it.forward();
    } while (!it.at_first());
  }
  // Not better than any existing entry: append if there is still room.
  if (!inserted && num_choices < max_num_choices) {
    it.add_to_end(word_choice);
    inserted = true;
    if (num_choices == 0) {
      best_choice = word_choice; // This is the new best.
    }
  }
  if (debug) {
    if (inserted) {
      tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
    } else {
      tprintf("Poor");
    }
    word_choice->print(" Word Choice");
  }
  if (!inserted) {
    delete word_choice;
    return false;
  }
  return true;
}


// Simple helper moves the ownership of the pointer data from src to dest,
// first deleting anything in dest, and nulling out src afterwards.
// If dest and src refer to the same pointer this is a no-op; without the
// check the object would be deleted and the pointer set to nullptr.
template <class T>
static void MovePointerData(T **dest, T **src) {
  if (dest == src) {
    return;
  }
  delete *dest;
  *dest = *src;
  *src = nullptr;
}


// Prints a brief list of all the best choices.

void WERD_RES::PrintBestChoices() const {

  std::string alternates_str;

  WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST *>(&best_choices));

  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {

    if (!it.at_first()) {

      alternates_str += "\", \"";

    }

    alternates_str += it.data()->unichar_string();

  }

  tprintf("Alternates for \"%s\": {\"%s\"}\n",

          best_choice->unichar_string().c_str(), alternates_str.c_str());

}


// Returns the total horizontal extent of the blobs from start_blob to
// last_blob inclusive: the sum of their widths plus the gaps between them.
// Returns 0 when start_blob > last_blob.
int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) const {
  if (start_blob > last_blob) {
    return 0;
  }
  int total = blob_widths[start_blob];
  for (int b = start_blob + 1; b <= last_blob; ++b) {
    total += blob_gaps[b - 1]; // gap between blob b - 1 and blob b
    total += blob_widths[b];
  }
  return total;
}

// Returns the gap between the blob at blob_index and the one after it, or 0
// if blob_index has no entry in blob_gaps.
int WERD_RES::GetBlobsGap(unsigned blob_index) const {
  return blob_index < blob_gaps.size() ? blob_gaps[blob_index] : 0;
}


// Looks up, in the ratings MATRIX cell for position index of the best choice
// word, the BLOB_CHOICE whose unichar_id matches the best choice at that
// position.
// Borrowed pointer, so do not delete. Returns nullptr if index is past the
// end of best_choice or if there is no BLOB_CHOICE matching the unichar_id
// at the given index.
BLOB_CHOICE *WERD_RES::GetBlobChoice(unsigned index) const {
  if (index < best_choice->length()) {
    BLOB_CHOICE_LIST *cell_choices = GetBlobChoices(index);
    return FindMatchingChoice(best_choice->unichar_id(index), cell_choices);
  }
  return nullptr;
}


// Returns the BLOB_CHOICE_LIST corresponding to the given index in the
// best choice word taken from the appropriate cell in the ratings MATRIX.
// Borrowed pointer, so do not delete.
// Note: index is not range-checked here and best_choice is dereferenced
// unconditionally.
BLOB_CHOICE_LIST *WERD_RES::GetBlobChoices(int index) const {
  return best_choice->blob_choices(index, ratings);
}


// Moves the results fields from word to this. This takes ownership of all
// the data, so word can be destructed afterwards.
// Anything this object previously owned in the affected fields (chopped_word,
// rebuild_word, box_word, the seams, the ratings matrix and its contents,
// raw_choice, best_choices) is released first. The containers are moved
// rather than copied and the source containers are left empty.
// If word has a blamer_bundle, this object must already have one; only the
// results are copied into it.
// Note: the parameter named word hides the member of the same name inside
// this function.
void WERD_RES::ConsumeWordResults(WERD_RES *word) {
  denorm = word->denorm;
  blob_row = word->blob_row;
  MovePointerData(&chopped_word, &word->chopped_word);
  MovePointerData(&rebuild_word, &word->rebuild_word);
  MovePointerData(&box_word, &word->box_word);
  // The seams are owned, so free the old ones before taking over the new.
  for (auto data : seam_array) {
    delete data;
  }
  seam_array = std::move(word->seam_array);
  word->seam_array.clear();
  // A moved-from container is valid but unspecified, so clear() each source
  // explicitly to guarantee it ends up empty.
  best_state = std::move(word->best_state);
  word->best_state.clear();
  correct_text = std::move(word->correct_text);
  word->correct_text.clear();
  blob_widths = std::move(word->blob_widths);
  word->blob_widths.clear();
  blob_gaps = std::move(word->blob_gaps);
  word->blob_gaps.clear();
  // Release the contents of the old matrix before the matrix itself is
  // deleted by MovePointerData.
  if (ratings != nullptr) {
    ratings->delete_matrix_pointers();
  }
  MovePointerData(&ratings, &word->ratings);
  // best_choice points into best_choices, which is transferred below.
  best_choice = word->best_choice;
  MovePointerData(&raw_choice, &word->raw_choice);
  best_choices.clear();
  WERD_CHOICE_IT wc_it(&best_choices);
  wc_it.add_list_after(&word->best_choices);
  reject_map = word->reject_map;
  if (word->blamer_bundle != nullptr) {
    assert(blamer_bundle != nullptr);
    blamer_bundle->CopyResults(*(word->blamer_bundle));
  }
  CopySimpleFields(*word);
}


// Makes choice the best choice, then rebuilds the best state and the box
// word from it, resets the reject map, marks the word as done, accepted and
// adaptable, and recomputes the script positions.
// choice must be from the current best_choices list.
void WERD_RES::ReplaceBestChoice(WERD_CHOICE *choice) {
  best_choice = choice;
  RebuildBestState();
  SetupBoxWord();

  // Make up a fake reject map of the right length to keep the
  // rejection pass happy.
  reject_map.initialise(best_state.size());

  tess_would_adapt = true;
  tess_accepted = true;
  done = true;

  SetScriptPositions();
}


// Builds the rebuild_word and sets the best_state from the chopped_word and

// the best_choice->state.

void WERD_RES::RebuildBestState() {

  ASSERT_HOST(best_choice != nullptr);

  delete rebuild_word;

  rebuild_word = new TWERD;

  if (seam_array.empty()) {

    start_seam_list(chopped_word, &seam_array);

  }

  best_state.clear();

  int start = 0;

  for (unsigned i = 0; i < best_choice->length(); ++i) {

    int length = best_choice->state(i);

    best_state.push_back(length);

    if (length > 1) {

      SEAM::JoinPieces(seam_array, chopped_word->blobs, start,

                       start + length - 1);

    }

    TBLOB *blob = chopped_word->blobs[start];

    rebuild_word->blobs.push_back(new TBLOB(*blob));

    if (length > 1) {

      SEAM::BreakPieces(seam_array, chopped_word->blobs, start,

                        start + length - 1);

    }

    start += length;

  }

}


// Makes rebuild_word a copy of chopped_word and sets up the output box_word.
// One entry per box is then appended to best_state (value 1) and to
// correct_text (empty string).
void WERD_RES::CloneChoppedToRebuild() {
  delete rebuild_word;
  rebuild_word = new TWERD(*chopped_word);
  SetupBoxWord();
  const auto blob_total = box_word->length();
  best_state.reserve(blob_total);
  correct_text.reserve(blob_total);
  for (auto remaining = blob_total; remaining > 0; --remaining) {
    best_state.push_back(1);
    correct_text.emplace_back("");
  }
}


// Discards any existing box_word and builds a new one from the rebuild_word,
// clipped to the original word.
void WERD_RES::SetupBoxWord() {
  delete box_word;
  rebuild_word->ComputeBoundingBoxes();
  auto *fresh_boxes = tesseract::BoxWord::CopyFromNormalized(rebuild_word);
  fresh_boxes->ClipToOriginalWord(denorm.block(), word);
  box_word = fresh_boxes;
}


// Sets up the script positions in the output best_choice using the best_choice

// to get the unichars, and the unicharset to get the target positions.

void WERD_RES::SetScriptPositions() {

  best_choice->SetScriptPositions(small_caps, chopped_word);

}

// Forces every blob of the raw choice and of each entry in best_choices to
// the given script position. (A sub/superscript recognized as a separate
// word falls victim to the rule that a whole word cannot be sub or
// superscript; this function overrides that.)
void WERD_RES::SetAllScriptPositions(tesseract::ScriptPos position) {
  raw_choice->SetAllScriptPositions(position);
  WERD_CHOICE_IT choice_it(&best_choices);
  choice_it.mark_cycle_pt();
  while (!choice_it.cycled_list()) {
    choice_it.data()->SetAllScriptPositions(position);
    choice_it.forward();
  }
}


// Classifies the word with some already-calculated BLOB_CHOICEs.

// The choices are an array of blob_count pointers to BLOB_CHOICE,

// providing a single classifier result for each blob.

// The BLOB_CHOICEs are consumed and the word takes ownership.

// The number of blobs in the box_word must match blob_count.

void WERD_RES::FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices) {

  // Setup the WERD_RES.

  ASSERT_HOST(box_word != nullptr);

  ASSERT_HOST(blob_count == box_word->length());

  ClearWordChoices();

  ClearRatings();

  ratings = new MATRIX(blob_count, 1);

  for (unsigned c = 0; c < blob_count; ++c) {

    auto *choice_list = new BLOB_CHOICE_LIST;

    BLOB_CHOICE_IT choice_it(choice_list);

    choice_it.add_after_then_move(choices[c]);

    ratings->put(c, c, choice_list);

  }

  FakeWordFromRatings(TOP_CHOICE_PERM);

  reject_map.initialise(blob_count);

  best_state.clear();

  best_state.resize(blob_count, 1);

  done = true;

}


// Builds a WERD_CHOICE from the first choice in each cell on the leading
// diagonal of the ratings matrix, and logs it as both the raw choice and a
// cooked choice. Cells that are null or empty contribute UNICHAR_SPACE with
// the worst rating/certainty.
void WERD_RES::FakeWordFromRatings(PermuterType permuter) {
  const int blob_total = ratings->dimension();
  auto *fake_choice = new WERD_CHOICE(uch_set, blob_total);
  fake_choice->set_permuter(permuter);
  for (int diag = 0; diag < blob_total; ++diag) {
    BLOB_CHOICE_LIST *cell = ratings->get(diag, diag);
    if (cell == nullptr || cell->empty()) {
      // Same defaults as WERD_CHOICE::make_bad().
      const UNICHAR_ID space_id = UNICHAR_SPACE;
      const float bad_rating = WERD_CHOICE::kBadRating;
      const float bad_certainty = -FLT_MAX;
      fake_choice->append_unichar_id_space_allocated(space_id, 1, bad_rating,
                                                     bad_certainty);
      continue;
    }
    BLOB_CHOICE_IT top_it(cell);
    BLOB_CHOICE *top = top_it.data();
    const UNICHAR_ID top_id = top->unichar_id();
    const float top_rating = top->rating();
    const float top_certainty = top->certainty();
    fake_choice->append_unichar_id_space_allocated(top_id, 1, top_rating,
                                                   top_certainty);
  }
  LogNewRawChoice(fake_choice);
  // Ownership of fake_choice is taken by the word here.
  LogNewCookedChoice(1, false, fake_choice);
}


// Copies the best_choice strings to the correct_text for adaption/training.

void WERD_RES::BestChoiceToCorrectText() {

  correct_text.clear();

  ASSERT_HOST(best_choice != nullptr);

  for (unsigned i = 0; i < best_choice->length(); ++i) {

    UNICHAR_ID choice_id = best_choice->unichar_id(i);

    const char *blob_choice = uch_set->id_to_unichar(choice_id);

    correct_text.emplace_back(blob_choice);

  }

}


// Scans the best_choice for adjacent character pairs and merges each pair
// for which class_cb returns something other than INVALID_UNICHAR_ID AND
// box_cb is empty (compares equal to nullptr) or returns true for the pair's
// boxes. The merged character takes the id returned by class_cb.
//   class_cb: called with the unichar ids of characters i and i + 1.
//   box_cb:   optional; called with the box_word boxes of blobs i and i + 1.
// Requires ratings to be non-null unless the best_choice is empty.
// Returns true if anything was merged.
bool WERD_RES::ConditionalBlobMerge(
    const std::function<UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb,
    const std::function<bool(const TBOX &, const TBOX &)> &box_cb) {
  ASSERT_HOST(best_choice->empty() || ratings != nullptr);
  bool modified = false;
  // best_choice->length() is re-read every iteration because a merge
  // shortens the choice. After a merge at i the loop still advances to
  // i + 1, so the merged result is not compared with its new neighbour.
  for (unsigned i = 0; i + 1 < best_choice->length(); ++i) {
    UNICHAR_ID new_id =
        class_cb(best_choice->unichar_id(i), best_choice->unichar_id(i + 1));
    if (new_id != INVALID_UNICHAR_ID &&
        (box_cb == nullptr ||
         box_cb(box_word->BlobBox(i), box_word->BlobBox(i + 1)))) {
      // Raw choice should not be fixed.
      best_choice->set_unichar_id(new_id, i);
      modified = true;
      MergeAdjacentBlobs(i);
      // The merged character's matrix coordinate may fall outside the
      // current band of the ratings matrix; widen the band if so.
      const MATRIX_COORD &coord = best_choice->MatrixCoord(i);
      if (!coord.Valid(*ratings)) {
        ratings->IncreaseBandSize(coord.row + 1 - coord.col);
      }
      // Make sure the choice list for the merged blob contains new_id.
      BLOB_CHOICE_LIST *blob_choices = GetBlobChoices(i);
      if (FindMatchingChoice(new_id, blob_choices) == nullptr) {
        // Insert a fake result.
        auto *blob_choice = new BLOB_CHOICE;
        blob_choice->set_unichar_id(new_id);
        BLOB_CHOICE_IT bc_it(blob_choices);
        bc_it.add_before_then_move(blob_choice);
      }
    }
  }
  return modified;
}


// Merges the blobs at index and index + 1 and updates the reject map,
// best_choice, rebuild_word, box_word and best_state to match.
void WERD_RES::MergeAdjacentBlobs(unsigned index) {
  const unsigned next = index + 1;
  // Only shrink the reject map while it is still in step with the choice.
  if (reject_map.length() == best_choice->length()) {
    reject_map.remove_pos(index);
  }
  best_choice->remove_unichar_id(next);
  rebuild_word->MergeBlobs(index, index + 2);
  box_word->MergeBoxes(index, index + 2);
  if (next < best_state.size()) {
    // Fold the piece count of the second blob into the first.
    best_state[index] += best_state[next];
    best_state.erase(best_state.begin() + next);
  }
}


// TODO(tkielbus) Decide between keeping this behavior here or modifying the

// training data.


// Utility function for fix_quotes.
// signed_str points at the next character of a string and length is that
// character's UTF-8 length in bytes. Returns non-zero if the character is a
// single quote: ASCII ' or `, or the 3-byte sequences E2 80 98 / E2 80 99.
static int is_simple_quote(const char *signed_str, int length) {
  const auto *bytes = reinterpret_cast<const unsigned char *>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 byte curved quotes share the first two bytes.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return false;
}


// Callback helper for fix_quotes. Returns the id of the double quote
// character if both arguments are simple quotes, otherwise
// INVALID_UNICHAR_ID.
UNICHAR_ID WERD_RES::BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2) {
  const char *first = uch_set->id_to_unichar(id1);
  const char *second = uch_set->id_to_unichar(id2);
  if (!is_simple_quote(first, strlen(first)) ||
      !is_simple_quote(second, strlen(second))) {
    return INVALID_UNICHAR_ID;
  }
  return uch_set->unichar_to_id("\"");
}


// Change pairs of quotes to double quotes.

void WERD_RES::fix_quotes() {

  if (!uch_set->contains_unichar("\"") ||

      !uch_set->get_enabled(uch_set->unichar_to_id("\""))) {

    return; // Don't create it if it is disallowed.

  }


  using namespace std::placeholders; // for _1, _2

  ConditionalBlobMerge(std::bind(&WERD_RES::BothQuotes, this, _1, _2), nullptr);

}


// Callback helper for fix_hyphens. Returns the UNICHAR_ID of "-" if both
// arguments are single-byte '-' or '~' characters, otherwise
// INVALID_UNICHAR_ID.
UNICHAR_ID WERD_RES::BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2) {
  const auto is_dash = [](const char *text) {
    return strlen(text) == 1 && (*text == '-' || *text == '~');
  };
  const char *first = uch_set->id_to_unichar(id1);
  const char *second = uch_set->id_to_unichar(id2);
  if (is_dash(first) && is_dash(second)) {
    return uch_set->unichar_to_id("-");
  }
  return INVALID_UNICHAR_ID;
}


// Callback helper for fix_hyphens. Returns true if box1 reaches or passes
// the left edge of box2, i.e. the boxes touch or overlap horizontally
// (assuming both are on the same textline, are in order and are the parts
// of a chopped em dash).
bool WERD_RES::HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2) {
  return box2.left() <= box1.right();
}


// Change pairs of hyphens to a single hyphen if the bounding boxes touch

// Typically a long dash which has been segmented.

void WERD_RES::fix_hyphens() {

  if (!uch_set->contains_unichar("-") ||

      !uch_set->get_enabled(uch_set->unichar_to_id("-"))) {

    return; // Don't create it if it is disallowed.

  }


  using namespace std::placeholders; // for _1, _2

  ConditionalBlobMerge(std::bind(&WERD_RES::BothHyphens, this, _1, _2),

                       std::bind(&WERD_RES::HyphenBoxesOverlap, this, _1, _2));

}


// Callback helper for merge_tess_fails. Returns the space id if both
// arguments are the space character, otherwise INVALID_UNICHAR_ID.
UNICHAR_ID WERD_RES::BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2) {
  if (id1 != id2 || id1 != uch_set->unichar_to_id(" ")) {
    return INVALID_UNICHAR_ID;
  }
  return id1;
}


// Change pairs of tess failures to a single one

void WERD_RES::merge_tess_fails() {

  using namespace std::placeholders; // for _1, _2

  if (ConditionalBlobMerge(std::bind(&WERD_RES::BothSpaces, this, _1, _2),

                           nullptr)) {

    unsigned len = best_choice->length();

    ASSERT_HOST(reject_map.length() == len);

    ASSERT_HOST(box_word->length() == len);

  }

}


// Returns true if the count pieces starting at start are all natural
// connected components, i.e. none of the seams between them has any splits.
// Seam indices outside seam_array, and null seams, are ignored.
bool WERD_RES::PiecesAllNatural(int start, int count) const {
  const int seam_limit = start + count - 1;
  for (int seam_index = start; seam_index < seam_limit; ++seam_index) {
    if (seam_index < 0 ||
        static_cast<size_t>(seam_index) >= seam_array.size()) {
      continue;
    }
    SEAM *seam = seam_array[seam_index];
    if (seam != nullptr && seam->HasAnySplits()) {
      return false;
    }
  }
  return true;
}


// Destructor: all clean-up is delegated to Clear().
WERD_RES::~WERD_RES() {
  this->Clear();
}


void WERD_RES::Clear() {

  if (combination) {

    delete word;

  }

  word = nullptr;

  delete blamer_bundle;

  blamer_bundle = nullptr;

  ClearResults();

}


void WERD_RES::ClearResults() {

  done = false;

  fontinfo = nullptr;

  fontinfo2 = nullptr;

  fontinfo_id_count = 0;

  fontinfo_id2_count = 0;

  delete bln_boxes;

  bln_boxes = nullptr;

  blob_row = nullptr;

  delete chopped_word;

  chopped_word = nullptr;

  delete rebuild_word;

  rebuild_word = nullptr;

  delete box_word;

  box_word = nullptr;

  best_state.clear();

  correct_text.clear();

  for (auto data : seam_array) {

    delete data;

  }

  seam_array.clear();

  blob_widths.clear();

  blob_gaps.clear();

  ClearRatings();

  ClearWordChoices();

  if (blamer_bundle != nullptr) {

    blamer_bundle->ClearResults();

  }

}

// Drops all word choices: best_choice is only nulled (it is not deleted
// here), raw_choice and ep_choice are deleted, and best_choices is emptied.
void WERD_RES::ClearWordChoices() {
  best_choice = nullptr;

  delete raw_choice;
  raw_choice = nullptr;

  best_choices.clear();

  delete ep_choice;
  ep_choice = nullptr;
}

// Deletes the ratings matrix, if any, together with the pointers it holds.
void WERD_RES::ClearRatings() {
  if (ratings == nullptr) {
    return;
  }
  ratings->delete_matrix_pointers();
  delete ratings;
  ratings = nullptr;
}


// Compares the position of this iterator with other, which must iterate the
// same PAGE_RES.
// Returns 0 if both are at the same position (or in the same block when
// either has no row), -1 if this comes before other, and 1 if this comes
// after other. A null block_res is treated as the end of the page.
int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
  ASSERT_HOST(page_res == other.page_res);
  if (other.block_res == nullptr) {
    // other points to the end of the page.
    if (block_res == nullptr) {
      return 0;
    }
    return -1;
  }
  if (block_res == nullptr) {
    return 1; // we point to the end of the page.
  }
  if (block_res == other.block_res) {
    if (other.row_res == nullptr || row_res == nullptr) {
      // this should only happen if we hit an image block.
      return 0;
    }
    if (row_res == other.row_res) {
      // we point to the same block and row.
      ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
      if (word_res == other.word_res) {
        // we point to the same word!
        return 0;
      }

      // Different words in the same row: whichever word is met first when
      // walking the row's word list is the earlier one.
      WERD_RES_IT word_res_it(&row_res->word_res_list);
      for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
           word_res_it.forward()) {
        if (word_res_it.data() == word_res) {
          return -1;
        } else if (word_res_it.data() == other.word_res) {
          return 1;
        }
      }
      // Neither word was found in the row. The string literal is never
      // null, so this assert always fails.
      ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
    }

    // we both point to the same block, but different rows.
    ROW_RES_IT row_res_it(&block_res->row_res_list);
    for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
         row_res_it.forward()) {
      if (row_res_it.data() == row_res) {
        return -1;
      } else if (row_res_it.data() == other.row_res) {
        return 1;
      }
    }
    ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
  }

  // We point to different blocks.
  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
       block_res_it.forward()) {
    if (block_res_it.data() == block_res) {
      return -1;
    } else if (block_res_it.data() == other.block_res) {
      return 1;
    }
  }
  // Shouldn't happen...
  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
  return 0;
}


// Wraps new_word in a new WERD_RES marked as a combination and inserts it
// into the current row's word_res_list immediately before the current word.
// The simple fields of the new WERD_RES are copied from clone_res, and it is
// returned for further setup with best_choice etc.
WERD_RES *PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES &clone_res,
                                             WERD *new_word) {
  auto *inserted = new WERD_RES(new_word);
  inserted->CopySimpleFields(clone_res);
  inserted->combination = true;
  // Locate the current word in the ROW_RES; it must be there.
  WERD_RES_IT pos_it(&row()->word_res_list);
  pos_it.mark_cycle_pt();
  while (!pos_it.cycled_list() && pos_it.data() != word_res) {
    pos_it.forward();
  }
  ASSERT_HOST(!pos_it.cycled_list());
  pos_it.add_before_then_move(inserted);
  if (pos_it.at_first()) {
    // The inserted word is the new first word, so reset the member iterator
    // so it detects the cycled_list state correctly.
    ResetWordIterator();
  }
  return inserted;
}


// Helper computes the boundaries between blobs in the word. The blob bounds
// are likely very poor, if they come from LSTM, where it only outputs the
// character at one pixel within it, so we find the midpoints between them.
//   word:            its best_state gives the number of blobs per character
//                    and word.word->cblob_list() supplies the blobs.
//   clip_box:        every boundary is clipped to [left(), right()].
//   next_word_blobs: blobs of the following word, or nullptr if none; used
//                    for the boundary after the last character.
//   blob_ends:       output; one x-coordinate is appended per best_state
//                    entry, and the final entry is set to clip_box.right().
static void ComputeBlobEnds(const WERD_RES &word, const TBOX &clip_box,
                            C_BLOB_LIST *next_word_blobs,
                            std::vector<int> *blob_ends) {
  C_BLOB_IT blob_it(word.word->cblob_list());
  for (int length : word.best_state) {
    // Get the bounding box of the fake blobs
    TBOX blob_box = blob_it.data()->bounding_box();
    blob_it.forward();
    for (int b = 1; b < length; ++b) {
      blob_box += blob_it.data()->bounding_box();
      blob_it.forward();
    }
    // This blob_box is unreliable, so for now we are only looking for the
    // boundaries between them.
    int blob_end = INT32_MAX;
    if (!blob_it.at_first() || next_word_blobs != nullptr) {
      if (blob_it.at_first()) {
        // Wrapped round to the start of this word: take the boundary from
        // the first blob of the next word instead.
        blob_it.set_to_list(next_word_blobs);
      }
      blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
    }
    blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right());
    blob_ends->push_back(blob_end);
  }
  // The last boundary is always the right edge of the clip box. Guarded
  // because back() on an empty vector is undefined behaviour, which would
  // otherwise happen for a word with an empty best_state.
  if (!blob_ends->empty()) {
    blob_ends->back() = clip_box.right();
  }
}


// Helper computes the bounds of words[w_index] by restricting it to existing
// words that significantly overlap.
//   words:    the replacement words; words[w_index] is the one being bounded
//             and words[w_index + 1], if present, supplies the next box.
//   prev_box: bounds computed for the previous replacement word.
//   w_it:     iterator (taken by value) positioned on the current word; the
//             part_of_combo words that follow it are examined.
// Returns the union of the significantly overlapping words, falling back to
// the word's own bounding box where that union has no height or width.
static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES> &words,
                              int w_index, TBOX prev_box, WERD_RES_IT w_it) {
  constexpr int kSignificantOverlapFraction = 4;
  TBOX clipped_box;
  TBOX current_box = words[w_index]->word->bounding_box();
  TBOX next_box;
  if (static_cast<size_t>(w_index + 1) < words.size() &&
      words[w_index + 1] != nullptr && words[w_index + 1]->word != nullptr) {
    next_box = words[w_index + 1]->word->bounding_box();
  }
  for (w_it.forward(); !w_it.at_first(); w_it.forward()) {
    WERD_RES *part = w_it.data();
    // Test for null BEFORE looking at part_of_combo. The original checked
    // part_of_combo in the loop condition, dereferencing the pointer ahead
    // of the null test in the body.
    if (part == nullptr) {
      continue;
    }
    if (!part->part_of_combo) {
      break; // Reached the end of the run of combination parts.
    }
    if (part->word == nullptr) {
      continue;
    }
    TBOX w_box = part->word->bounding_box();
    int height_limit = std::min<int>(w_box.height(), w_box.width() / 2);
    int width_limit = w_box.width() / kSignificantOverlapFraction;
    int min_significant_overlap = std::max(height_limit, width_limit);
    int overlap = w_box.intersection(current_box).width();
    int prev_overlap = w_box.intersection(prev_box).width();
    int next_overlap = w_box.intersection(next_box).width();
    if (overlap > min_significant_overlap) {
      if (prev_overlap > min_significant_overlap) {
        // We have no choice but to use the LSTM word edge.
        clipped_box.set_left(current_box.left());
      } else if (next_overlap > min_significant_overlap) {
        // We have no choice but to use the LSTM word edge.
        clipped_box.set_right(current_box.right());
      } else {
        clipped_box += w_box;
      }
    }
  }
  if (clipped_box.height() <= 0) {
    clipped_box.set_top(current_box.top());
    clipped_box.set_bottom(current_box.bottom());
  }
  if (clipped_box.width() <= 0) {
    clipped_box = current_box;
  }
  return clipped_box;
}


// Helper extracts the current blob from src_it and adds it after the current
// position of dest_it. If the blob's bounding box is not contained by
// clip_box, the blob is deleted and replaced by a fake blob whose box has
// been clipped to lie inside clip_box.
// Returns the bounding box of the blob that was added.
static TBOX MoveAndClipBlob(C_BLOB_IT *src_it, C_BLOB_IT *dest_it,
                            const TBOX &clip_box) {
  C_BLOB *moved_blob = src_it->extract();
  TBOX box = moved_blob->bounding_box();
  if (clip_box.contains(box)) {
    dest_it->add_after_then_move(moved_blob);
    return box;
  }
  // Clip each edge separately; the +/- 1 offsets keep left < right and
  // bottom < top.
  const int left =
      ClipToRange<int>(box.left(), clip_box.left(), clip_box.right() - 1);
  const int right =
      ClipToRange<int>(box.right(), clip_box.left() + 1, clip_box.right());
  const int top =
      ClipToRange<int>(box.top(), clip_box.bottom() + 1, clip_box.top());
  const int bottom =
      ClipToRange<int>(box.bottom(), clip_box.bottom(), clip_box.top() - 1);
  box = TBOX(left, bottom, right, top);
  delete moved_blob;
  dest_it->add_after_then_move(C_BLOB::FakeBlob(box));
  return box;
}


// Replaces the current WERD/WERD_RES with the given words. The given words
// contain fake blobs that indicate the position of the characters. These are
// replaced with real blobs from the current word as much as possible.
// If words is empty, the current word is simply deleted.
// Ownership of every WERD_RES in words is taken: each is inserted before the
// current word in the row's word_res_list and words is left empty. The
// current word is then deleted and the word iterator reset.
void PAGE_RES_IT::ReplaceCurrentWord(
    tesseract::PointerVector<WERD_RES> *words) {
  if (words->empty()) {
    DeleteCurrentWord();
    return;
  }
  WERD_RES *input_word = word();
  // Set the BOL/EOL flags on the words from the input word.
  if (input_word->word->flag(W_BOL)) {
    (*words)[0]->word->set_flag(W_BOL, true);
  } else {
    (*words)[0]->word->set_blanks(input_word->word->space());
  }
  words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));

  // Move the blobs from the input word to the new set of words.
  // If the input word_res is a combination, then the replacements will also be
  // combinations, and will own their own words. If the input word_res is not a
  // combination, then the final replacements will not be either, (although it
  // is allowed for the input words to be combinations) and their words
  // will get put on the row list. This maintains the ownership rules.
  WERD_IT w_it(row()->row->word_list());
  if (!input_word->combination) {
    for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
      WERD *word = w_it.data();
      if (word == input_word->word) {
        break;
      }
    }
    // w_it is now set to the input_word's word.
    ASSERT_HOST(!w_it.cycled_list());
  }
  // Insert into the appropriate place in the ROW_RES.
  WERD_RES_IT wr_it(&row()->word_res_list);
  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
    WERD_RES *word = wr_it.data();
    if (word == input_word) {
      break;
    }
  }
  ASSERT_HOST(!wr_it.cycled_list());
  // Since we only have an estimate of the bounds between blobs, use the blob
  // x-middle as the determiner of where to put the blobs
  C_BLOB_IT src_b_it(input_word->word->cblob_list());
  src_b_it.sort(&C_BLOB::SortByXMiddle);
  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
  rej_b_it.sort(&C_BLOB::SortByXMiddle);
  // clip_box carries over between iterations: each word's bounds are passed
  // to ComputeWordBounds as the previous box for the next word.
  TBOX clip_box;
  for (size_t w = 0; w < words->size(); ++w) {
    WERD_RES *word_w = (*words)[w];
    clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
    // Compute blob boundaries.
    std::vector<int> blob_ends;
    C_BLOB_LIST *next_word_blobs =
        w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
    ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
    // Remove the fake blobs on the current word, but keep safe for back-up if
    // no blob can be found.
    C_BLOB_LIST fake_blobs;
    C_BLOB_IT fake_b_it(&fake_blobs);
    fake_b_it.add_list_after(word_w->word->cblob_list());
    fake_b_it.move_to_first();
    word_w->word->cblob_list()->clear();
    C_BLOB_IT dest_it(word_w->word->cblob_list());
    // Build the box word as we move the blobs.
    auto *box_word = new tesseract::BoxWord;
    // fake_b_it advances in step with i, so it always sits on the fake blob
    // for the character whose boundary is blob_ends[i].
    for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
      int end_x = blob_ends[i];
      TBOX blob_box;
      // Add the blobs up to end_x.
      while (!src_b_it.empty() &&
             src_b_it.data()->bounding_box().x_middle() < end_x) {
        blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
        src_b_it.forward();
      }
      // Rejected blobs are moved across in the same way.
      while (!rej_b_it.empty() &&
             rej_b_it.data()->bounding_box().x_middle() < end_x) {
        blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
        rej_b_it.forward();
      }
      if (blob_box.null_box()) {
        // Use the original box as a back-up.
        blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
      }
      box_word->InsertBox(i, blob_box);
    }
    delete word_w->box_word;
    word_w->box_word = box_word;
    if (!input_word->combination) {
      // Insert word_w->word into the ROW. It doesn't own its word, so the
      // ROW needs to own it.
      w_it.add_before_stay_put(word_w->word);
      word_w->combination = false;
    }
    (*words)[w] = nullptr; // We are taking ownership.
    wr_it.add_before_stay_put(word_w);
  }
  // We have taken ownership of the words.
  words->clear();
  // Delete the current word, which has been replaced. We could just call
  // DeleteCurrentWord, but that would iterate both lists again, and we know
  // we are already in the right place.
  if (!input_word->combination) {
    delete w_it.extract();
  }
  delete wr_it.extract();
  ResetWordIterator();
}


// Deletes the current WERD_RES and, unless it is a combination, its
// underlying WERD from the row's word list. The word iterator is reset
// afterwards.
void PAGE_RES_IT::DeleteCurrentWord() {
  // Check that this word is as we expect. part_of_combos are NEVER iterated
  // by the normal iterator, so we should never be trying to delete them.
  ASSERT_HOST(!word_res->part_of_combo);
  if (!word_res->combination) {
    // Combinations own their own word, so it would not be found on the
    // row's word_list; for any other word, find and delete it from the ROW.
    WERD_IT w_it(row()->row->word_list());
    w_it.mark_cycle_pt();
    while (!w_it.cycled_list() && w_it.data() != word_res->word) {
      w_it.forward();
    }
    ASSERT_HOST(!w_it.cycled_list());
    delete w_it.extract();
  }
  // Find the WERD_RES in the ROW_RES; it must be there.
  WERD_RES_IT wr_it(&row()->word_res_list);
  wr_it.mark_cycle_pt();
  while (!wr_it.cycled_list() && wr_it.data() != word_res) {
    wr_it.forward();
  }
  ASSERT_HOST(!wr_it.cycled_list());
  // Forget the pointer before the object goes away, then remove and delete.
  word_res = nullptr;
  delete wr_it.extract();
  ResetWordIterator();
}


// Makes the current word a fuzzy space if not already fuzzy (neither
// W_FUZZY_SP nor W_FUZZY_NON set). If the current word is a combination, the
// word of the WERD_RES that follows it in the row, which must be a
// part_of_combo, is flagged as well.
void PAGE_RES_IT::MakeCurrentWordFuzzy() {
  WERD *real_word = word_res->word;
  if (real_word->flag(W_FUZZY_SP) || real_word->flag(W_FUZZY_NON)) {
    return; // Already fuzzy; nothing to do.
  }
  real_word->set_flag(W_FUZZY_SP, true);
  if (!word_res->combination) {
    return;
  }
  // The next word should be the corresponding part of combo, but we have
  // already stepped past it, so find it by search.
  WERD_RES_IT wr_it(&row()->word_res_list);
  for (wr_it.mark_cycle_pt();
       !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
  }
  // The current word must be on its own row's list. Without this check a
  // failed search would leave wr_it back at the start and the code below
  // would inspect an unrelated word.
  ASSERT_HOST(!wr_it.cycled_list());
  wr_it.forward();
  ASSERT_HOST(wr_it.data()->part_of_combo);
  real_word = wr_it.data()->word;
  ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
              !real_word->flag(W_FUZZY_NON));
  real_word->set_flag(W_FUZZY_SP, true);
}


/*************************************************************************
 * PAGE_RES_IT::start_page
 *
 * Set things up at the start of the page: point the block iterator at the
 * page's block list, forget every previous/current/next pointer, then step
 * forward twice so that the first word moves from "next" into "current".
 * empty_ok is passed through to internal_forward.
 * Returns the current word after the second step.
 *************************************************************************/

WERD_RES *PAGE_RES_IT::start_page(bool empty_ok) {
  block_res_it.set_to_list(&page_res->block_res_list);
  block_res_it.mark_cycle_pt();

  prev_block_res = block_res = next_block_res = nullptr;
  prev_row_res = row_res = next_row_res = nullptr;
  prev_word_res = word_res = next_word_res = nullptr;

  // First step: start a new block and load the "next" pointers.
  internal_forward(true, empty_ok);
  // Second step: shift "next" into "current".
  return internal_forward(false, empty_ok);
}


// Recovers from operations on the current word, such as in
// InsertSimpleCloneWord, ReplaceCurrentWord and DeleteCurrentWord.
// Resets the word_res_it so that it is one past the next_word_res, as
// it should be after internal_forward. If next_row_res != row_res,
// then the next_word_res is in the next row, so there is no need to do
// anything to word_res_it, but it is still a good idea to reset the pointers
// word_res and prev_word_res, which are still in the current row.
void PAGE_RES_IT::ResetWordIterator() {
  if (row_res == next_row_res) {
    // Reset the member iterator so it can move forward and detect the
    // cycled_list state correctly.
    word_res_it.move_to_first();
    // Walk up to next_word_res. Each word that is not a part_of_combo
    // becomes word_res in turn, so when the loop stops word_res is the last
    // such word before next_word_res.
    for (word_res_it.mark_cycle_pt();
         !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
         word_res_it.forward()) {
      if (!word_res_it.data()->part_of_combo) {
        if (prev_row_res == row_res) {
          prev_word_res = word_res;
        }
        word_res = word_res_it.data();
      }
    }
    // next_word_res must have been found in this row.
    ASSERT_HOST(!word_res_it.cycled_list());
    wr_it_of_next_word = word_res_it;
    // Leave the member iterator one past next_word_res.
    word_res_it.forward();
  } else {
    // word_res_it is OK, but reset word_res and prev_word_res if needed.
    // The whole row is scanned, so word_res ends up as the last word in the
    // row that is not a part_of_combo.
    WERD_RES_IT wr_it(&row_res->word_res_list);
    for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
      if (!wr_it.data()->part_of_combo) {
        if (prev_row_res == row_res) {
          prev_word_res = word_res;
        }
        word_res = wr_it.data();
      }
    }
  }
}


/*************************************************************************
 * PAGE_RES_IT::internal_forward
 *
 * Find the next word on the page. If empty_ok is true, then non-text blocks
 * and text blocks with no text are visited as if they contain a single
 * imaginary word in a single imaginary row. (word() and row() both return
 * nullptr in such a block and the return value is nullptr.) If empty_ok is
 * false, the old behaviour is maintained. Each real word is visited and empty
 * and non-text blocks and rows are skipped. new_block is used to initialize
 * the iterators for a new block. The iterator maintains pointers to block,
 * row and word for the previous, current and next words. These are correct,
 * regardless of block/row boundaries. nullptr values denote start and end of
 * the page.
 *
 * Returns the new current word (the word that was "next" before the call).
 *************************************************************************/

WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
  bool new_row = false;

  // Shift the window: current -> previous, next -> current.
  prev_block_res = block_res;
  prev_row_res = row_res;
  prev_word_res = word_res;
  block_res = next_block_res;
  row_res = next_row_res;
  word_res = next_word_res;
  wr_it_of_current_word = wr_it_of_next_word;
  // The "next" pointers stay null if the end of the page is reached below.
  next_block_res = nullptr;
  next_row_res = nullptr;
  next_word_res = nullptr;

  while (!block_res_it.cycled_list()) {
    if (new_block) {
      new_block = false;
      row_res_it.set_to_list(&block_res_it.data()->row_res_list);
      row_res_it.mark_cycle_pt();
      if (row_res_it.empty() && empty_ok) {
        // A block with no rows is reported on its own, with null row and
        // word.
        next_block_res = block_res_it.data();
        break;
      }
      new_row = true;
    }
    while (!row_res_it.cycled_list()) {
      if (new_row) {
        new_row = false;
        word_res_it.set_to_list(&row_res_it.data()->word_res_list);
        word_res_it.mark_cycle_pt();
      }
      // Skip any part_of_combo words.
      while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo) {
        word_res_it.forward();
      }
      if (!word_res_it.cycled_list()) {
        next_block_res = block_res_it.data();
        next_row_res = row_res_it.data();
        next_word_res = word_res_it.data();
        // Remember where the next word is, then leave word_res_it one past
        // it.
        wr_it_of_next_word = word_res_it;
        word_res_it.forward();
        goto foundword;
      }
      // end of row reached
      row_res_it.forward();
      new_row = true;
    }
    // end of block reached
    block_res_it.forward();
    new_block = true;
  }
foundword:
  // Update prev_word_best_choice pointer.
  // It is set to null when a new block is pending or there is no previous
  // word, otherwise to the previous word's best_choice.
  if (page_res != nullptr && page_res->prev_word_best_choice != nullptr) {
    *page_res->prev_word_best_choice = (new_block || prev_word_res == nullptr)
                                           ? nullptr
                                           : prev_word_res->best_choice;
  }
  return word_res;
}


/*************************************************************************
 * PAGE_RES_IT::restart_row()
 *
 * Move to the beginning (leftmost word) of the current row by restarting
 * the page and stepping forward until the current row is reached again.
 * Returns nullptr, without moving, if there is no current row.
 *************************************************************************/
WERD_RES *PAGE_RES_IT::restart_row() {
  ROW_RES *target_row = this->row();
  if (target_row == nullptr) {
    return nullptr;
  }
  restart_page();
  while (this->row() != target_row) {
    forward();
  }
  return word();
}


/*************************************************************************
 * PAGE_RES_IT::forward_paragraph
 *
 * Move to the beginning of the next paragraph, allowing empty blocks.
 * Steps forward while the next word is in the same block and its row has
 * the same para() as the current row, then takes one more step.
 *************************************************************************/

WERD_RES *PAGE_RES_IT::forward_paragraph() {
  const auto next_word_in_same_paragraph = [this]() {
    return block_res == next_block_res && next_row_res != nullptr &&
           next_row_res->row != nullptr &&
           row_res->row->para() == next_row_res->row->para();
  };
  while (next_word_in_same_paragraph()) {
    internal_forward(false, true);
  }
  return internal_forward(false, true);
}


/*************************************************************************
 * PAGE_RES_IT::forward_block
 *
 * Move to the beginning of the next block, allowing empty blocks.
 * Steps forward until a step is taken from a position whose next word is
 * in a different block, and returns the word reached by that step.
 *************************************************************************/

WERD_RES *PAGE_RES_IT::forward_block() {
  WERD_RES *reached = nullptr;
  bool crossed_block = false;
  do {
    // Decide before stepping: the step that crosses the boundary is the
    // last one taken.
    crossed_block = block_res != next_block_res;
    reached = internal_forward(false, true);
  } while (!crossed_block);
  return reached;
}


void PAGE_RES_IT::rej_stat_word() {

  int16_t chars_in_word;

  int16_t rejects_in_word = 0;


  chars_in_word = word_res->reject_map.length();

  page_res->char_count += chars_in_word;

  block_res->char_count += chars_in_word;

  row_res->char_count += chars_in_word;


  rejects_in_word = word_res->reject_map.reject_count();


  page_res->rej_count += rejects_in_word;

  block_res->rej_count += rejects_in_word;

  row_res->rej_count += rejects_in_word;

  if (chars_in_word == rejects_in_word) {

    row_res->whole_word_rej_count += rejects_in_word;

  }

}


} // namespace tesseract

publictypes.h

errcode.h

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:54

tprintf.h

ocrblock.h

pageres.h

seam.h

boxword.h

polyblk.h

ocrrow.h

pdblock.h

blobs.h

blamer.h

stepblob.h

TBOX
@ TBOX
Definition: cleanapi_test.cc:19

i
int i
Definition: gmock-matchers_test.cc:718

ch
char ch
Definition: gmock-matchers_test.cc:4035

count
int * count
Definition: gmock_stress_test.cc:96

tesseract
Definition: baseapi.h:39

tesseract::W_BOL
@ W_BOL
start of line
Definition: werd.h:34

tesseract::W_INVERSE
@ W_INVERSE
white on black
Definition: werd.h:43

tesseract::W_FUZZY_SP
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41

tesseract::W_SCRIPT_HAS_XHEIGHT
@ W_SCRIPT_HAS_XHEIGHT
x-height concept makes sense.
Definition: werd.h:37

tesseract::W_EOL
@ W_EOL
end of line
Definition: werd.h:35

tesseract::W_SCRIPT_IS_LATIN
@ W_SCRIPT_IS_LATIN
Special case latin for y. splitting.
Definition: werd.h:38

tesseract::W_REP_CHAR
@ W_REP_CHAR
repeated character
Definition: werd.h:40

tesseract::W_FUZZY_NON
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42

tesseract::OcrEngineMode
OcrEngineMode
Definition: publictypes.h:263

tesseract::OEM_LSTM_ONLY
@ OEM_LSTM_ONLY
Definition: publictypes.h:265

tesseract::tprintf
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

tesseract::ScriptPos
ScriptPos
Definition: ratngs.h:254

tesseract::kMaxWordSizeRatio
const double kMaxWordSizeRatio
Definition: pageres.cpp:58

tesseract::kMaxLineSizeRatio
const double kMaxLineSizeRatio
Definition: pageres.cpp:60

tesseract::kWordrecMaxNumJoinChunks
const int kWordrecMaxNumJoinChunks
Definition: pageres.cpp:55

tesseract::UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:34

tesseract::UNICHAR_SPACE
@ UNICHAR_SPACE
Definition: unicharset.h:36

tesseract::FindMatchingChoice
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:177

tesseract::start_seam_list
void start_seam_list(TWERD *word, std::vector< SEAM * > *seam_array)
Definition: seam.cpp:262

tesseract::PermuterType
PermuterType
Definition: ratngs.h:235

tesseract::TOP_CHOICE_PERM
@ TOP_CHOICE_PERM
Definition: ratngs.h:238

tesseract::kMaxWordGapRatio
const double kMaxWordGapRatio
Definition: pageres.cpp:62

upload.dest
dest
Definition: upload.py:409

tesseract::GENERIC_2D_ARRAY::get
T get(ICOORD pos) const
Definition: matrix.h:268

tesseract::GENERIC_2D_ARRAY::put
void put(ICOORD pos, const T &thing)
Definition: matrix.h:260

tesseract::GENERIC_2D_ARRAY::delete_matrix_pointers
void delete_matrix_pointers()
Definition: matrix.h:515

tesseract::Tesseract
Definition: tesseractclass.h:178

tesseract::BlamerBundle
Definition: blamer.h:107

tesseract::BlamerBundle::ClearResults
void ClearResults()
Definition: blamer.h:198

tesseract::BlamerBundle::CopyTruth
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:214

tesseract::BlamerBundle::CopyResults
void CopyResults(const BlamerBundle &other)
Definition: blamer.h:220

tesseract::BlamerBundle::SetupNormTruthWord
void SetupNormTruthWord(const DENORM &denorm)
Definition: blamer.cpp:151

tesseract::TBLOB
Definition: blobs.h:291

tesseract::TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:466

tesseract::TWERD
Definition: blobs.h:421

tesseract::TWERD::ComputeBoundingBoxes
void ComputeBoundingBoxes()
Definition: blobs.cpp:857

tesseract::TWERD::PolygonalCopy
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:778

tesseract::TWERD::blobs
std::vector< TBLOB * > blobs
Definition: blobs.h:462

tesseract::TWERD::BLNormalize
void BLNormalize(const BLOCK *block, const ROW *row, Image pix, bool inverse, float x_height, float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint, const TBOX *norm_box, DENORM *word_denorm)
Definition: blobs.cpp:792

tesseract::TWERD::NumBlobs
unsigned NumBlobs() const
Definition: blobs.h:449

tesseract::TWERD::MergeBlobs
void MergeBlobs(unsigned start, unsigned end)
Definition: blobs.cpp:874

tesseract::BoxWord
Definition: boxword.h:34

tesseract::BoxWord::MergeBoxes
void MergeBoxes(unsigned start, unsigned end)
Definition: boxword.cpp:138

tesseract::BoxWord::CopyFromNormalized
static BoxWord * CopyFromNormalized(TWERD *tessword)
Definition: boxword.cpp:56

tesseract::BoxWord::length
unsigned length() const
Definition: boxword.h:81

tesseract::BoxWord::BlobBox
const TBOX & BlobBox(unsigned index) const
Definition: boxword.h:84

tesseract::BoxWord::InsertBox
void InsertBox(unsigned index, const TBOX &box)
Definition: boxword.cpp:157

tesseract::BoxWord::ClipToOriginalWord
void ClipToOriginalWord(const BLOCK *block, WERD *original_word)
Definition: boxword.cpp:92

tesseract::Image
Definition: image.h:25

tesseract::BandTriMatrix::dimension
int dimension() const
Definition: matrix.h:612

tesseract::MATRIX
Definition: matrix.h:657

tesseract::MATRIX::ConsumeAndMakeBigger
MATRIX * ConsumeAndMakeBigger(int ind)
Definition: matrix.cpp:61

tesseract::MATRIX::IncreaseBandSize
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:52

tesseract::MATRIX_COORD
Definition: matrix.h:687

tesseract::MATRIX_COORD::row
int row
Definition: matrix.h:720

tesseract::MATRIX_COORD::Valid
bool Valid(const MATRIX &m) const
Definition: matrix.h:697

tesseract::MATRIX_COORD::col
int col
Definition: matrix.h:719

tesseract::DENORM::block
const BLOCK * block() const
Definition: normalis.h:265

tesseract::BLOCK
Definition: ocrblock.h:34

tesseract::BLOCK::pdblk
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185

tesseract::BLOCK::row_list
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:111

tesseract::ROW
Definition: ocrrow.h:39

tesseract::ROW::word_list
WERD_LIST * word_list()
Definition: ocrrow.h:57

tesseract::ROW::para
PARA * para() const
Definition: ocrrow.h:120

tesseract::ROW::body_size
float body_size() const
Definition: ocrrow.h:75

tesseract::ROW::x_height
float x_height() const
Definition: ocrrow.h:66

tesseract::ROW::ascenders
float ascenders() const
Definition: ocrrow.h:84

tesseract::ROW::descenders
float descenders() const
Definition: ocrrow.h:87

tesseract::PAGE_RES::rej_count
int32_t rej_count
Definition: pageres.h:80

tesseract::PAGE_RES::block_res_list
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81

tesseract::PAGE_RES::PAGE_RES
PAGE_RES()
Definition: pageres.h:103

tesseract::PAGE_RES::char_count
int32_t char_count
Definition: pageres.h:79

tesseract::PAGE_RES::Init
void Init()
Definition: pageres.h:94

tesseract::PAGE_RES::prev_word_best_choice
WERD_CHOICE ** prev_word_best_choice
Definition: pageres.h:85

tesseract::BLOCK_RES
Definition: pageres.h:118

tesseract::BLOCK_RES::font_assigned
bool font_assigned
Definition: pageres.h:126

tesseract::BLOCK_RES::BLOCK_RES
BLOCK_RES()=default

tesseract::BLOCK_RES::row_res_list
ROW_RES_LIST row_res_list
Definition: pageres.h:129

tesseract::BLOCK_RES::block
BLOCK * block
Definition: pageres.h:120

tesseract::BLOCK_RES::x_height
float x_height
Definition: pageres.h:125

tesseract::BLOCK_RES::row_count
int16_t row_count
Definition: pageres.h:124

tesseract::BLOCK_RES::font_class
int16_t font_class
Definition: pageres.h:123

tesseract::BLOCK_RES::rej_count
int32_t rej_count
Definition: pageres.h:122

tesseract::BLOCK_RES::char_count
int32_t char_count
Definition: pageres.h:121

tesseract::ROW_RES
Definition: pageres.h:142

tesseract::ROW_RES::word_res_list
WERD_RES_LIST word_res_list
Definition: pageres.h:148

tesseract::ROW_RES::row
ROW * row
Definition: pageres.h:144

tesseract::ROW_RES::whole_word_rej_count
int32_t whole_word_rej_count
Definition: pageres.h:147

tesseract::ROW_RES::ROW_RES
ROW_RES()=default

tesseract::ROW_RES::rej_count
int32_t rej_count
Definition: pageres.h:146

tesseract::ROW_RES::char_count
int32_t char_count
Definition: pageres.h:145

tesseract::WERD_RES
Definition: pageres.h:164

tesseract::WERD_RES::copy_on
void copy_on(WERD_RES *word_res)
Definition: pageres.h:667

tesseract::WERD_RES::fontinfo2
const FontInfo * fontinfo2
Definition: pageres.h:308

tesseract::WERD_RES::CloneChoppedToRebuild
void CloneChoppedToRebuild()
Definition: pageres.cpp:865

tesseract::WERD_RES::FakeWordFromRatings
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:930

tesseract::WERD_RES::tesseract
tesseract::Tesseract * tesseract
Definition: pageres.h:278

tesseract::WERD_RES::BothQuotes
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1036

tesseract::WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:239

tesseract::WERD_RES::odd_size
bool odd_size
Definition: pageres.h:305

tesseract::WERD_RES::raw_choice
WERD_CHOICE * raw_choice
Definition: pageres.h:244

tesseract::WERD_RES::~WERD_RES
~WERD_RES()
Definition: pageres.cpp:1124

tesseract::WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:301

tesseract::WERD_RES::fontinfo_id2_count
int8_t fontinfo_id2_count
Definition: pageres.h:310

tesseract::WERD_RES::correct_text
std::vector< std::string > correct_text
Definition: pageres.h:287

tesseract::WERD_RES::GetBlobsGap
int GetBlobsGap(unsigned blob_index) const
Definition: pageres.cpp:757

tesseract::WERD_RES::ComputeAdaptionThresholds
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:570

tesseract::WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:292

tesseract::WERD_RES::fix_hyphens
void fix_hyphens()
Definition: pageres.cpp:1077

tesseract::WERD_RES::FilterWordChoices
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:518

tesseract::WERD_RES::done
bool done
Definition: pageres.h:303

tesseract::WERD_RES::FakeClassifyWord
void FakeClassifyWord(unsigned blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:908

tesseract::WERD_RES::unlv_crunch_mode
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:313

tesseract::WERD_RES::SetupWordScript
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:385

tesseract::WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:210

tesseract::WERD_RES::InsertSeam
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:419

tesseract::WERD_RES::InitForRetryRecognition
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:279

tesseract::WERD_RES::SetupForRecognition
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Image pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:304

tesseract::WERD_RES::ClearResults
void ClearResults()
Definition: pageres.cpp:1138

tesseract::WERD_RES::Clear
void Clear()
Definition: pageres.cpp:1128

tesseract::WERD_RES::ConditionalBlobMerge
bool ConditionalBlobMerge(const std::function< UNICHAR_ID(UNICHAR_ID, UNICHAR_ID)> &class_cb, const std::function< bool(const TBOX &, const TBOX &)> &box_cb)
Definition: pageres.cpp:971

tesseract::WERD_RES::small_caps
bool small_caps
Definition: pageres.h:304

tesseract::WERD_RES::ClearRatings
void ClearRatings()
Definition: pageres.cpp:1175

tesseract::WERD_RES::bln_boxes
tesseract::BoxWord * bln_boxes
Definition: pageres.h:193

tesseract::WERD_RES::LogNewCookedChoice
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:629

tesseract::WERD_RES::denorm
DENORM denorm
Definition: pageres.h:199

tesseract::WERD_RES::ConsumeWordResults
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:785

tesseract::WERD_RES::DebugTopChoice
void DebugTopChoice(const char *msg) const
Definition: pageres.cpp:503

tesseract::WERD_RES::SetScriptPositions
void SetScriptPositions()
Definition: pageres.cpp:888

tesseract::WERD_RES::BothHyphens
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1059

tesseract::WERD_RES::blamer_bundle
BlamerBundle * blamer_bundle
Definition: pageres.h:250

tesseract::WERD_RES::RebuildBestState
void RebuildBestState()
Definition: pageres.cpp:837

tesseract::WERD_RES::fontinfo_id_count
int8_t fontinfo_id_count
Definition: pageres.h:309

tesseract::WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:201

tesseract::WERD_RES::guessed_x_ht
bool guessed_x_ht
Definition: pageres.h:311

tesseract::WERD_RES::fontinfo
const FontInfo * fontinfo
Definition: pageres.h:307

tesseract::WERD_RES::word
WERD * word
Definition: pageres.h:184

tesseract::WERD_RES::GetBlobChoices
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:779

tesseract::WERD_RES::AlternativeChoiceAdjustmentsWorseThan
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:441

tesseract::WERD_RES::BestChoiceToCorrectText
void BestChoiceToCorrectText()
Definition: pageres.cpp:956

tesseract::WERD_RES::best_choices
WERD_CHOICE_LIST best_choices
Definition: pageres.h:247

tesseract::WERD_RES::blob_row
ROW * blob_row
Definition: pageres.h:195

tesseract::WERD_RES::merge_tess_fails
void merge_tess_fails()
Definition: pageres.cpp:1099

tesseract::WERD_RES::ratings
MATRIX * ratings
Definition: pageres.h:235

tesseract::WERD_RES::best_state
std::vector< int > best_state
Definition: pageres.h:283

tesseract::WERD_RES::SetupBlobWidthsAndGaps
void SetupBlobWidthsAndGaps()
Definition: pageres.cpp:401

tesseract::WERD_RES::SetAllScriptPositions
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:895

tesseract::WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:270

tesseract::WERD_RES::StatesAllValid
bool StatesAllValid()
Definition: pageres.cpp:461

tesseract::WERD_RES::reject_spaces
bool reject_spaces
Definition: pageres.h:339

tesseract::WERD_RES::ReplaceBestChoice
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:824

tesseract::WERD_RES::SetupFake
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:353

tesseract::WERD_RES::tess_would_adapt
bool tess_would_adapt
Definition: pageres.h:302

tesseract::WERD_RES::blob_widths
std::vector< int > blob_widths
Definition: pageres.h:214

tesseract::WERD_RES::SetupBoxWord
void SetupBoxWord()
Definition: pageres.cpp:879

tesseract::WERD_RES::tess_failed
bool tess_failed
Definition: pageres.h:293

tesseract::WERD_RES::DebugWordChoices
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:483

tesseract::WERD_RES::guessed_caps_ht
bool guessed_caps_ht
Definition: pageres.h:312

tesseract::WERD_RES::PrintBestChoices
void PrintBestChoices() const
Definition: pageres.cpp:731

tesseract::WERD_RES::GetBlobChoice
BLOB_CHOICE * GetBlobChoice(unsigned index) const
Definition: pageres.cpp:768

tesseract::WERD_RES::BothSpaces
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1090

tesseract::WERD_RES::IsAmbiguous
bool IsAmbiguous()
Definition: pageres.cpp:455

tesseract::WERD_RES::SetupBlamerBundle
void SetupBlamerBundle()
Definition: pageres.cpp:394

tesseract::WERD_RES::GetBlobsWidth
int GetBlobsWidth(int start_blob, int last_blob) const
Definition: pageres.cpp:746

tesseract::WERD_RES::MergeAdjacentBlobs
void MergeAdjacentBlobs(unsigned index)
Definition: pageres.cpp:1005

tesseract::WERD_RES::LogNewRawChoice
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:613

tesseract::WERD_RES::blob_gaps
std::vector< int > blob_gaps
Definition: pageres.h:217

tesseract::WERD_RES::baseline_shift
float baseline_shift
Definition: pageres.h:316

tesseract::WERD_RES::fix_quotes
void fix_quotes()
Definition: pageres.cpp:1047

tesseract::WERD_RES::HyphenBoxesOverlap
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:1071

tesseract::WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:264

tesseract::WERD_RES::ep_choice
WERD_CHOICE * ep_choice
Definition: pageres.h:291

tesseract::WERD_RES::PiecesAllNatural
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1111

tesseract::WERD_RES::part_of_combo
bool part_of_combo
Definition: pageres.h:338

tesseract::WERD_RES::CopySimpleFields
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:253

tesseract::WERD_RES::caps_height
float caps_height
Definition: pageres.h:315

tesseract::WERD_RES::operator=
WERD_RES & operator=(const WERD_RES &source)
Definition: pageres.cpp:186

tesseract::WERD_RES::SetupBasicsFromChoppedWord
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:344

tesseract::WERD_RES::x_height
float x_height
Definition: pageres.h:314

tesseract::WERD_RES::seam_array
std::vector< SEAM * > seam_array
Definition: pageres.h:212

tesseract::WERD_RES::combination
bool combination
Definition: pageres.h:337

tesseract::WERD_RES::ClearWordChoices
void ClearWordChoices()
Definition: pageres.cpp:1167

tesseract::PAGE_RES_IT
Definition: pageres.h:682

tesseract::PAGE_RES_IT::MakeCurrentWordFuzzy
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1521

tesseract::PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:684

tesseract::PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:743

tesseract::PAGE_RES_IT::start_page
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1548

tesseract::PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:763

tesseract::PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:710

tesseract::PAGE_RES_IT::forward_paragraph
WERD_RES * forward_paragraph()
Definition: pageres.cpp:1700

tesseract::PAGE_RES_IT::DeleteCurrentWord
void DeleteCurrentWord()
Definition: pageres.cpp:1488

tesseract::PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1722

tesseract::PAGE_RES_IT::restart_row
WERD_RES * restart_row()
Definition: pageres.cpp:1683

tesseract::PAGE_RES_IT::forward_block
WERD_RES * forward_block()
Definition: pageres.cpp:1715

tesseract::PAGE_RES_IT::ResetWordIterator
void ResetWordIterator()
Definition: pageres.cpp:1571

tesseract::PAGE_RES_IT::cmp
int cmp(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1183

tesseract::PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:766

tesseract::PAGE_RES_IT::ReplaceCurrentWord
void ReplaceCurrentWord(PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1378

tesseract::PAGE_RES_IT::InsertSimpleCloneWord
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1252

tesseract::PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59

tesseract::POLY_BLOCK
Definition: polyblk.h:30

tesseract::POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:52

tesseract::BLOB_CHOICE
Definition: ratngs.h:56

tesseract::BLOB_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:144

tesseract::BLOB_CHOICE::certainty
float certainty() const
Definition: ratngs.h:87

tesseract::BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:81

tesseract::BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:84

tesseract::WERD_CHOICE
Definition: ratngs.h:258

tesseract::WERD_CHOICE::TotalOfStates
unsigned TotalOfStates() const
Definition: ratngs.cpp:676

tesseract::WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:315

tesseract::WERD_CHOICE::remove_unichar_id
void remove_unichar_id(unsigned index)
Definition: ratngs.h:458

tesseract::WERD_CHOICE::MatrixCoord
MATRIX_COORD MatrixCoord(unsigned index) const
Definition: ratngs.cpp:286

tesseract::WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
Definition: ratngs.h:344

tesseract::WERD_CHOICE::string_and_lengths
void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const
Definition: ratngs.cpp:427

tesseract::WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:299

tesseract::WERD_CHOICE::empty
bool empty() const
Definition: ratngs.h:284

tesseract::WERD_CHOICE::kBadRating
static const float kBadRating
Definition: ratngs.h:260

tesseract::WERD_CHOICE::dangerous_ambig_found
bool dangerous_ambig_found() const
Definition: ratngs.h:348

tesseract::WERD_CHOICE::state
unsigned state(unsigned index) const
Definition: ratngs.h:303

tesseract::WERD_CHOICE::set_permuter
void set_permuter(uint8_t perm)
Definition: ratngs.h:360

tesseract::WERD_CHOICE::blob_choices
BLOB_CHOICE_LIST * blob_choices(unsigned index, MATRIX *ratings) const
Definition: ratngs.cpp:274

tesseract::WERD_CHOICE::length
unsigned length() const
Definition: ratngs.h:287

tesseract::WERD_CHOICE::print
void print() const
Definition: ratngs.h:561

tesseract::WERD_CHOICE::unichar_string
std::string & unichar_string()
Definition: ratngs.h:519

tesseract::WERD_CHOICE::SetAllScriptPositions
void SetAllScriptPositions(ScriptPos position)
Definition: ratngs.cpp:592

tesseract::WERD_CHOICE::UpdateStateForSplit
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:664

tesseract::WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:312

tesseract::WERD_CHOICE::SetScriptPositions
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:528

tesseract::WERD_CHOICE::adjust_factor
float adjust_factor() const
Definition: ratngs.h:290

tesseract::TBOX
Definition: rect.h:37

tesseract::TBOX::left
TDimension left() const
Definition: rect.h:82

tesseract::TBOX::height
TDimension height() const
Definition: rect.h:118

tesseract::TBOX::width
TDimension width() const
Definition: rect.h:126

tesseract::TBOX::null_box
bool null_box() const
Definition: rect.h:60

tesseract::TBOX::right
TDimension right() const
Definition: rect.h:89

tesseract::REJMAP::reject_count
int16_t reject_count() const
Definition: rejctmap.h:339

tesseract::REJMAP::remove_pos
void remove_pos(uint16_t pos)
Definition: rejctmap.cpp:100

tesseract::REJMAP::length
uint16_t length() const
Definition: rejctmap.h:333

tesseract::REJMAP::initialise
void initialise(uint16_t length)
Definition: rejctmap.cpp:67

tesseract::SEAM
Definition: seam.h:33

tesseract::SEAM::JoinPieces
static void JoinPieces(const std::vector< SEAM * > &seams, const std::vector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:204

tesseract::SEAM::HasAnySplits
bool HasAnySplits() const
Definition: seam.h:52

tesseract::SEAM::PrepareToInsertSeam
bool PrepareToInsertSeam(const std::vector< SEAM * > &seams, const std::vector< TBLOB * > &blobs, int insert_index, bool modify)
Definition: seam.cpp:54

tesseract::SEAM::BreakPieces
static void BreakPieces(const std::vector< SEAM * > &seams, const std::vector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:181

tesseract::C_BLOB::FakeBlob
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:238

tesseract::C_BLOB::SortByXMiddle
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:124

tesseract::WERD
Definition: werd.h:58

tesseract::WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128

tesseract::WERD::set_flag
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131

tesseract::WERD::set_script_id
void set_script_id(int id)
Definition: werd.h:109

tesseract::WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:155

tesseract::WERD::rej_cblob_list
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:91

tesseract::WERD::space
uint8_t space() const
Definition: werd.h:100

tesseract::WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:96

tesseract::ELIST_LINK::operator=
void operator=(const ELIST_LINK &)
Definition: elst.h:100

tesseract::GenericVector::size
unsigned size() const
Definition: genericvector.h:70

tesseract::GenericVector::back
T & back() const
Definition: genericvector.h:516

tesseract::GenericVector::empty
bool empty() const
Definition: genericvector.h:84

tesseract::PointerVector
Definition: genericvector.h:302

tesseract::PointerVector::clear
void clear()
Definition: genericvector.h:353

tesseract::UNICHARSET
Definition: unicharset.h:164

tesseract::UNICHARSET::default_sid
int default_sid() const
Definition: unicharset.h:946

tesseract::UNICHARSET::script_has_xheight
bool script_has_xheight() const
Definition: unicharset.h:958

tesseract::UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279

tesseract::UNICHARSET::contains_unichar
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695

tesseract::UNICHARSET::latin_sid
int latin_sid() const
Definition: unicharset.h:922

tesseract::UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186

tesseract::UNICHARSET::get_enabled
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:911