// Source listing: tesseract-ocr.github.io/5.3.3/a00110_source.html

/******************************************************************

 * File:        control.cpp  (Formerly control.c)

 * Description: Module-independent matcher controller.

 * Author:      Ray Smith

 *

 * (C) Copyright 1992, Hewlett-Packard Ltd.

 ** Licensed under the Apache License, Version 2.0 (the "License");

 ** you may not use this file except in compliance with the License.

 ** You may obtain a copy of the License at

 ** http://www.apache.org/licenses/LICENSE-2.0

 ** Unless required by applicable law or agreed to in writing, software

 ** distributed under the License is distributed on an "AS IS" BASIS,

 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 ** See the License for the specific language governing permissions and

 ** limitations under the License.

 *

 **********************************************************************/


// Include automatically generated configuration file if running autoconf.

#ifdef HAVE_CONFIG_H

#  include "config_auto.h"

#endif


#include <cctype>

#include <cmath>

#include <cstdint> // for int16_t, int32_t

#include <cstdio>  // for fclose, fopen, FILE

#include <ctime>   // for clock

#include "control.h"

#ifndef DISABLED_LEGACY_ENGINE

#  include "docqual.h"

#  include "drawfx.h"

#  include "fixspace.h"

#endif

#include <tesseract/ocrclass.h>

#include "lstmrecognizer.h"

#include "output.h"

#include "pageres.h" // for WERD_RES, PAGE_RES_IT, PAGE_RES, BLO...

#ifndef DISABLED_LEGACY_ENGINE

#  include "reject.h"

#endif

#include "sorthelper.h"

#include "tesseractclass.h"

#include "tessvars.h"

#include "werdit.h"


// Name of the file ProcessTargetWord writes the current params to before it
// loads a word-specific config, and reads back from to restore them.
const char *const kBackUpConfigFile = "tempconfigdata.config";
#ifndef DISABLED_LEGACY_ENGINE
// Min believable x-height for any text when refitting as a fraction of
// original x-height
const double kMinRefitXHeightFraction = 0.5;
#endif // ! DISABLED_LEGACY_ENGINE


namespace tesseract {


// Obtains an iterator from make_pseudo_word for the given selection box,
// runs recog_interactive on it, then deletes the iterator's current word and
// the iterator itself. Does nothing if make_pseudo_word returns nullptr.
void Tesseract::recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box) {
  PAGE_RES_IT *pseudo_it = make_pseudo_word(page_res, selection_box);
  if (pseudo_it == nullptr) {
    return;
  }
  recog_interactive(pseudo_it);
  pseudo_it->DeleteCurrentWord();
  delete pseudo_it;
}


// Recognizes the single word at pr_it after setting it up as for pass 2.
// With an LSTM recognizer loaded the word is classified as pass 1, otherwise
// (legacy engine builds only) as pass 2. When tessedit_debug_quality_metrics
// is set, the word's quality figures are printed. Always returns true.
bool Tesseract::recog_interactive(PAGE_RES_IT *pr_it) {
  WordData word_data(*pr_it);
  SetupWordPassN(2, &word_data);
  // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
  if (lstm_recognizer_ != nullptr) {
    classify_word_and_language(1, pr_it, &word_data);
  } else {
#ifndef DISABLED_LEGACY_ENGINE
    classify_word_and_language(2, pr_it, &word_data);
#endif // ndef DISABLED_LEGACY_ENGINE
  }
#ifndef DISABLED_LEGACY_ENGINE
  if (tessedit_debug_quality_metrics) {
    WERD_RES *word_res = pr_it->word();
    int16_t char_qual;
    int16_t good_char_qual;
    word_char_quality(word_res, &char_qual, &good_char_qual);
    tprintf(
        "\n%d chars;  word_blob_quality: %d;  outline_errs: %d; "
        "char_quality: %d; good_char_quality: %d\n",
        word_res->reject_map.length(), word_blob_quality(word_res), word_outline_errs(word_res),
        char_qual, good_char_qual);
  }
#endif // ndef DISABLED_LEGACY_ENGINE
  return true;
}


// Helper that decides whether a word should be processed given a target box,
// and swaps debug configs in and out around the target word.
// Inspired by Jetsoft's requirement to process only single words on pass2
// and beyond.
// With a non-null word_config:
//   - when word_box has a major overlap with target_word_box and no backup
//     is active, the current params are saved to the backup file and
//     word_config is loaded;
//   - when there is no such overlap and a backup is active, the backup is
//     loaded again and marked inactive;
//   - the result is always true.
// With a null word_config the result is true when pass <= 1 or the boxes
// have a major overlap, false otherwise.
// The backup lives in one fixed temporary file, so this is neither
// thread-safe nor process-safe; the assumption is that it will only be used
// for one debug window at a time.
//
// Since this function is used for debugging (and not to change OCR results)
// only debug params are set from the config files.
bool Tesseract::ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box,
                                  const char *word_config, int pass) {
  if (word_config == nullptr) {
    // No config to swap: only the pass number and the overlap matter.
    return pass <= 1 || word_box.major_overlap(target_word_box);
  }
  const bool on_target = word_box.major_overlap(target_word_box);
  if (on_target && backup_config_file_ == nullptr) {
    // Entering the target word: save the params, then apply the word config.
    backup_config_file_ = kBackUpConfigFile;
    FILE *backup_fp = fopen(backup_config_file_, "wb");
    if (backup_fp != nullptr) {
      ParamUtils::PrintParams(backup_fp, params());
      fclose(backup_fp);
    } else {
      tprintf("Error, failed to open file \"%s\"\n", backup_config_file_);
    }
    ParamUtils::ReadParamsFile(word_config, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params());
  } else if (!on_target && backup_config_file_ != nullptr) {
    // Leaving the target word: put the saved params back.
    ParamUtils::ReadParamsFile(backup_config_file_, SET_PARAM_CONSTRAINT_DEBUG_ONLY, params());
    backup_config_file_ = nullptr;
  }
  return true;
}


void Tesseract::SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config,

                                   PAGE_RES *page_res, std::vector<WordData> *words) {

  // Prepare all the words.

  PAGE_RES_IT page_res_it(page_res);

  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {

    if (target_word_box == nullptr || ProcessTargetWord(page_res_it.word()->word->bounding_box(),

                                                        *target_word_box, word_config, 1)) {

      words->push_back(WordData(page_res_it));

    }

  }

  // Setup all the words for recognition with polygonal approximation.

  for (unsigned w = 0; w < words->size(); ++w) {

    SetupWordPassN(pass_n, &(*words)[w]);

    if (w > 0) {

      (*words)[w].prev_word = &(*words)[w - 1];

    }

  }

}


// Sets up the single word ready for whichever engine is to be run.

void Tesseract::SetupWordPassN(int pass_n, WordData *word) {

  if (pass_n == 1 || !word->word->done) {

    if (pass_n == 1) {

      word->word->SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode,

                                      nullptr, classify_bln_numeric_mode, textord_use_cjk_fp_model,

                                      poly_allow_detailed_fx, word->row, word->block);

    } else if (pass_n == 2) {

      // TODO(rays) Should we do this on pass1 too?

      word->word->caps_height = 0.0;

      if (word->word->x_height == 0.0f) {

        word->word->x_height = word->row->x_height();

      }

    }

    word->lang_words.truncate(0);

    for (unsigned s = 0; s <= sub_langs_.size(); ++s) {

      // The sub_langs_.size() entry is for the master language.

      Tesseract *lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;

      auto *word_res = new WERD_RES;

      word_res->InitForRetryRecognition(*word->word);

      word->lang_words.push_back(word_res);

      // LSTM doesn't get setup for pass2.

      if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {

        word_res->SetupForRecognition(

            lang_t->unicharset, lang_t, BestPix(), lang_t->tessedit_ocr_engine_mode, nullptr,

            lang_t->classify_bln_numeric_mode, lang_t->textord_use_cjk_fp_model,

            lang_t->poly_allow_detailed_fx, word->row, word->block);

      }

    }

  }

}


// Runs word recognition on all the words.
//   pass_n  - recognition pass number handed on to the classifier.
//   monitor - optional progress/cancel descriptor; may be nullptr.
//   pr_it   - page iterator; restarted here and synced to each word.
//   words   - the words to recognize, as prepared by SetupAllWordsPassN.
// Returns false if the monitor's deadline was exceeded or its cancel
// callback asked to stop (the remaining words then get fake results),
// true otherwise.
bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it,
                                   std::vector<WordData> *words) {
  // TODO(rays) Before this loop can be parallelized (it would yield a massive
  // speed-up) all remaining member globals need to be converted to local/heap
  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
  // added. The results will be significantly different with adaption on, and
  // deterioration will need investigation.
  pr_it->restart_page();
  for (unsigned w = 0; w < words->size(); ++w) {
    WordData *word = &(*words)[w];
    if (w > 0) {
      word->prev_word = &(*words)[w - 1];
    }
    if (monitor != nullptr) {
      monitor->ocr_alive = true;
      // Pass 1 reports 0..70, any other pass reports 70..100.
      if (pass_n == 1) {
        monitor->progress = 70 * w / words->size();
      } else {
        monitor->progress = 70 + 30 * w / words->size();
      }
      if (monitor->progress_callback2 != nullptr) {
        TBOX box = pr_it->word()->word->bounding_box();
        (*monitor->progress_callback2)(monitor, box.left(), box.right(), box.top(), box.bottom());
      }
      if (monitor->deadline_exceeded() ||
          (monitor->cancel != nullptr && (*monitor->cancel)(monitor->cancel_this, words->size()))) {
        // Timeout. Fake out the rest of the words.
        for (; w < words->size(); ++w) {
          (*words)[w].word->SetupFake(unicharset);
        }
        return false;
      }
    }
    if (word->word->tess_failed) {
      // Find the first per-language result that has not failed.
      unsigned s = 0;
      while (s < word->lang_words.size() && word->lang_words[s]->tess_failed) {
        ++s;
      }
      // If all are failed, skip it. Image words are skipped by this test.
      // The scan stops at size() when nothing usable was found; it can never
      // go beyond size(), so the test has to be for equality.
      if (s == word->lang_words.size()) {
        continue;
      }
    }
    // Sync pr_it with the WordData.
    while (pr_it->word() != nullptr && pr_it->word() != word->word) {
      pr_it->forward();
    }
    ASSERT_HOST(pr_it->word() != nullptr);
    bool make_next_word_fuzzy = false;
#ifndef DISABLED_LEGACY_ENGINE
    if (!AnyLSTMLang() && ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
      // Needs to be setup again to see the new outlines in the chopped_word.
      SetupWordPassN(pass_n, word);
    }
#endif // ndef DISABLED_LEGACY_ENGINE

    classify_word_and_language(pass_n, pr_it, word);
    if (tessedit_dump_choices || debug_noise_removal) {
      tprintf("Pass%d: %s [%s]\n", pass_n, word->word->best_choice->unichar_string().c_str(),
              word->word->best_choice->debug_string().c_str());
    }
    pr_it->forward();
    if (make_next_word_fuzzy && pr_it->word() != nullptr) {
      pr_it->MakeCurrentWordFuzzy();
    }
  }
  return true;
}


// Top-level driver that runs the recognition passes over a page.
//   page_res        - page results, recognized and post-processed in place.
//   monitor         - optional progress/cancel descriptor; may be nullptr.
//   target_word_box - optional box forwarded to the per-word setup and the
//                     later passes to restrict processing to a target word.
//   word_config     - optional config file forwarded with target_word_box.
//   dopasses        - 0 or 1 runs pass 1; 1 returns straight after pass 1;
//                     any value other than 1 continues with the later
//                     passes; output_pass is only considered for 0 or 2.
// Returns false as soon as RecogAllWordsPassN returns false, true otherwise.
bool Tesseract::recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor,
                                const TBOX *target_word_box, const char *word_config,
                                int dopasses) {
  PAGE_RES_IT page_res_it(page_res);

  if (tessedit_minimal_rej_pass1) {
    tessedit_test_adaption.set_value(true);
    tessedit_minimal_rejection.set_value(true);
  }

  if (dopasses == 0 || dopasses == 1) {
    page_res_it.restart_page();
    // ****************** Pass 1 *******************

#ifndef DISABLED_LEGACY_ENGINE
    // If the adaptive classifier is full switch to one we prepared earlier,
    // ie on the previous page. If the current adaptive classifier is non-empty,
    // prepare a backup starting at this page, in case it fills up. Do all this
    // independently for each language.
    if (AdaptiveClassifierIsFull()) {
      SwitchAdaptiveClassifier();
    } else if (!AdaptiveClassifierIsEmpty()) {
      StartBackupAdaptiveClassifier();
    }
    // Now check the sub-langs as well.
    for (auto &lang : sub_langs_) {
      if (lang->AdaptiveClassifierIsFull()) {
        lang->SwitchAdaptiveClassifier();
      } else if (!lang->AdaptiveClassifierIsEmpty()) {
        lang->StartBackupAdaptiveClassifier();
      }
    }

#endif // ndef DISABLED_LEGACY_ENGINE

    // Set up all words ready for recognition, so that if parallelism is on
    // all the input and output classes are ready to run the classifier.
    std::vector<WordData> words;
    SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
#ifndef DISABLED_LEGACY_ENGINE
    if (tessedit_parallelize) {
      PrerecAllWordsPar(words);
    }
#endif // ndef DISABLED_LEGACY_ENGINE

    // Reset the per-page statistics before recognition starts.
    stats_.word_count = words.size();

    stats_.dict_words = 0;
    stats_.doc_blob_quality = 0;
    stats_.doc_outline_errs = 0;
    stats_.doc_char_quality = 0;
    stats_.good_char_count = 0;
    stats_.doc_good_char_quality = 0;

    most_recently_used_ = this;
    // Run pass 1 word recognition.
    if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) {
      return false;
    }
    // Pass 1 post-processing.
    for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
      if (page_res_it.word()->word->flag(W_REP_CHAR)) {
        fix_rep_char(&page_res_it);
        continue;
      }

      // Count dict words.
      if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) {
        ++(stats_.dict_words);
      }

      // Update misadaption log (we only need to do it on pass 1, since
      // adaption only happens on this pass).
      if (page_res_it.word()->blamer_bundle != nullptr &&
          page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
        page_res->misadaption_log.push_back(page_res_it.word()->blamer_bundle->misadaption_debug());
      }
    }
  }

  // The caller asked for pass 1 only.
  if (dopasses == 1) {
    return true;
  }

#ifndef DISABLED_LEGACY_ENGINE

  // ****************** Pass 2 *******************
  if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption && AnyTessLang()) {
    page_res_it.restart_page();
    std::vector<WordData> words;
    SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
    if (tessedit_parallelize) {
      PrerecAllWordsPar(words);
    }
    most_recently_used_ = this;
    // Run pass 2 word recognition.
    if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) {
      return false;
    }
  }

  // The next passes are only required for Tess-only.
  if (AnyTessLang() && !AnyLSTMLang()) {
    // ****************** Pass 3 *******************
    // Fix fuzzy spaces.

    if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word &&
        !right_to_left()) {
      fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
    }

    // ****************** Pass 4 *******************
    if (tessedit_enable_dict_correction) {
      dictionary_correction_pass(page_res);
    }
    if (tessedit_enable_bigram_correction) {
      bigram_correction_pass(page_res);
    }

    // ****************** Pass 5,6 *******************
    rejection_passes(page_res, monitor, target_word_box, word_config);

    // ****************** Pass 8 *******************
    font_recognition_pass(page_res);

    // ****************** Pass 9 *******************
    // Check the correctness of the final results.
    blamer_pass(page_res);
    script_pos_pass(page_res);
  }

#endif // ndef DISABLED_LEGACY_ENGINE

  // Write results pass.
  // This is now redundant, but retained to show how to obtain
  // bounding boxes and style information.

#ifndef DISABLED_LEGACY_ENGINE
  // changed by jetsoft
  // needed for dll to output memory structure
  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) {
    output_pass(page_res_it, target_word_box);
  }
// end jetsoft
#endif // ndef DISABLED_LEGACY_ENGINE

  const auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
  textord_.CleanupSingleRowResult(pageseg_mode, page_res);

  // Remove empty words, as these mess up the result iterators.
  // A word is dropped when it has no best choice, an empty best choice, or an
  // all-spaces best choice in a block that has no poly block or is text.
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
    const WERD_RES *word = page_res_it.word();
    const POLY_BLOCK *pb = page_res_it.block()->block != nullptr
                               ? page_res_it.block()->block->pdblk.poly_block()
                               : nullptr;
    if (word->best_choice == nullptr || word->best_choice->empty() ||
        (word->best_choice->IsAllSpaces() && (pb == nullptr || pb->IsText()))) {
      page_res_it.DeleteCurrentWord();
    }
  }

  if (monitor != nullptr) {
    monitor->progress = 100;
  }
  return true;
}


#ifndef DISABLED_LEGACY_ENGINE


// Walks consecutive word pairs on the page (skipping parts of combos) and,
// for pairs that share a unicharset and whose top choices are not a valid
// bigram, searches all combinations of their best_choices for valid bigrams.
// The valid combination with the lowest summed rating replaces the current
// best choices, unless it only differs from them in case and terminal
// punctuation. Output is controlled by tessedit_bigram_debug.
void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
  PAGE_RES_IT word_it(page_res);

  WERD_RES *w_prev = nullptr;
  WERD_RES *w = word_it.word();
  while (true) {
    w_prev = w;
    while (word_it.forward() != nullptr && (!word_it.word() || word_it.word()->part_of_combo)) {
      // advance word_it, skipping over parts of combos
    }
    if (!word_it.word()) {
      break;
    }
    w = word_it.word();
    // Only pairs of words using the same unicharset are considered.
    if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
      continue;
    }
    if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
      if (tessedit_bigram_debug) {
        tprintf("Skipping because one of the words is W_REP_CHAR\n");
      }
      continue;
    }
    // Two words sharing the same language model, excellent!
    // overrides_word1[i] / overrides_word2[i] together form the i-th valid
    // bigram found below; both vectors always have the same length.
    std::vector<WERD_CHOICE *> overrides_word1;
    std::vector<WERD_CHOICE *> overrides_word2;

    // Taken by value, so they still hold the original text after
    // ReplaceBestChoice is called further down.
    const auto orig_w1_str = w_prev->best_choice->unichar_string();
    const auto orig_w2_str = w->best_choice->unichar_string();
    // The bigram test is applied to the non-superscript span of each choice.
    WERD_CHOICE prev_best(w->uch_set);
    {
      int w1start, w1end;
      w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
      prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
    }
    WERD_CHOICE this_best(w->uch_set);
    {
      int w2start, w2end;
      w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
      this_best = w->best_choice->shallow_copy(w2start, w2end);
    }

    if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
      if (tessedit_bigram_debug) {
        tprintf("Top choice \"%s %s\" verified by bigram model.\n", orig_w1_str.c_str(),
                orig_w2_str.c_str());
      }
      continue;
    }
    if (tessedit_bigram_debug > 2) {
      tprintf("Examining alt choices for \"%s %s\".\n", orig_w1_str.c_str(), orig_w2_str.c_str());
    }
    if (tessedit_bigram_debug > 1) {
      if (!w_prev->best_choices.singleton()) {
        w_prev->PrintBestChoices();
      }
      if (!w->best_choices.singleton()) {
        w->PrintBestChoices();
      }
    }
    // best_idx indexes the override pair with the lowest summed rating so far.
    float best_rating = 0.0;
    int best_idx = 0;
    WERD_CHOICE_IT prev_it(&w_prev->best_choices);
    for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
      WERD_CHOICE *p1 = prev_it.data();
      WERD_CHOICE strip1(w->uch_set);
      {
        int p1start, p1end;
        p1->GetNonSuperscriptSpan(&p1start, &p1end);
        strip1 = p1->shallow_copy(p1start, p1end);
      }
      WERD_CHOICE_IT w_it(&w->best_choices);
      for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
        WERD_CHOICE *p2 = w_it.data();
        WERD_CHOICE strip2(w->uch_set);
        {
          int p2start, p2end;
          p2->GetNonSuperscriptSpan(&p2start, &p2end);
          strip2 = p2->shallow_copy(p2start, p2end);
        }
        if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
          overrides_word1.push_back(p1);
          overrides_word2.push_back(p2);
          // The first match always becomes the best; later ones need a
          // strictly lower summed rating.
          if (overrides_word1.size() == 1 || p1->rating() + p2->rating() < best_rating) {
            best_rating = p1->rating() + p2->rating();
            best_idx = overrides_word1.size() - 1;
          }
        }
      }
    }
    if (!overrides_word1.empty()) {
      // Excellent, we have some bigram matches.
      if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, *overrides_word1[best_idx]) &&
          EqualIgnoringCaseAndTerminalPunct(*w->best_choice, *overrides_word2[best_idx])) {
        if (tessedit_bigram_debug > 1) {
          tprintf(
              "Top choice \"%s %s\" verified (sans case) by bigram "
              "model.\n",
              orig_w1_str.c_str(), orig_w2_str.c_str());
        }
        continue;
      }
      const auto new_w1_str = overrides_word1[best_idx]->unichar_string();
      const auto new_w2_str = overrides_word2[best_idx]->unichar_string();
      // Only replace a word whose text actually changes.
      if (new_w1_str != orig_w1_str) {
        w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
      }
      if (new_w2_str != orig_w2_str) {
        w->ReplaceBestChoice(overrides_word2[best_idx]);
      }
      if (tessedit_bigram_debug > 0) {
        std::string choices_description;
        // NOTE(review): both vectors have the same length n, so this is n*n
        // rather than the n bigrams found; it only affects the debug text.
        int num_bigram_choices = overrides_word1.size() * overrides_word2.size();
        if (num_bigram_choices == 1) {
          choices_description = "This was the unique bigram choice.";
        } else {
          if (tessedit_bigram_debug > 1) {
            std::string bigrams_list;
            const int kMaxChoicesToPrint = 20;
            for (unsigned i = 0; i < overrides_word1.size() && i < kMaxChoicesToPrint; i++) {
              if (i > 0) {
                bigrams_list += ", ";
              }
              WERD_CHOICE *p1 = overrides_word1[i];
              WERD_CHOICE *p2 = overrides_word2[i];
              bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
            }
            choices_description = "There were many choices: {";
            choices_description += bigrams_list;
            choices_description += "}";
          } else {
            choices_description += "There were " + std::to_string(num_bigram_choices);
            choices_description += " compatible bigrams.";
          }
        }
        tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n", orig_w1_str.c_str(),
                orig_w2_str.c_str(), new_w1_str.c_str(), new_w2_str.c_str(),
                choices_description.c_str());
      }
    }
  }
}


// Passes 5 and 6: accumulates per-document quality statistics in stats_ from
// every processed word (pass 5), derives a good/bad document verdict from
// them, and hands that verdict to quality_based_rejection (pass 6).
// Both passes are skipped when tessedit_test_adaption is set.
//   monitor         - optional progress descriptor; may be nullptr.
//   target_word_box - optional; when given, words ProcessTargetWord rejects
//                     are left out of the statistics.
//   word_config     - forwarded to ProcessTargetWord.
void Tesseract::rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor,
                                 const TBOX *target_word_box, const char *word_config) {
  PAGE_RES_IT page_res_it(page_res);
  // ****************** Pass 5 *******************
  // Gather statistics on rejects.
  int word_index = 0;
  while (!tessedit_test_adaption && page_res_it.word() != nullptr) {
    WERD_RES *word = page_res_it.word();
    word_index++;
    if (monitor != nullptr) {
      monitor->ocr_alive = true;
      // This pass reports progress in the 95..100 range.
      monitor->progress = 95 + 5 * word_index / stats_.word_count;
    }
    if (word->rebuild_word == nullptr) {
      // Word was not processed by tesseract.
      page_res_it.forward();
      continue;
    }
    check_debug_pt(word, 70);

    // changed by jetsoft
    // specific to its needs to extract one word when need
    if (target_word_box &&
        !ProcessTargetWord(word->word->bounding_box(), *target_word_box, word_config, 4)) {
      page_res_it.forward();
      continue;
    }
    // end jetsoft

    page_res_it.rej_stat_word();
    const int chars_in_word = word->reject_map.length();
    const int rejects_in_word = word->reject_map.reject_count();

    const int blob_quality = word_blob_quality(word);
    stats_.doc_blob_quality += blob_quality;
    const int outline_errs = word_outline_errs(word);
    stats_.doc_outline_errs += outline_errs;
    int16_t all_char_quality;
    int16_t accepted_all_char_quality;
    word_char_quality(word, &all_char_quality, &accepted_all_char_quality);
    stats_.doc_char_quality += all_char_quality;
    // Only words found by one of the dawg permuters feed the "good char"
    // statistics.
    const uint8_t permuter_type = word->best_choice->permuter();
    if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) ||
        (permuter_type == USER_DAWG_PERM)) {
      stats_.good_char_count += chars_in_word - rejects_in_word;
      stats_.doc_good_char_quality += accepted_all_char_quality;
    }
    check_debug_pt(word, 80);
    // Reject the whole word when it has zero blob quality and at least as
    // many outline errors as characters.
    if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) {
      word->reject_map.rej_word_bad_quality();
    }
    check_debug_pt(word, 90);
    page_res_it.forward();
  }

  if (tessedit_debug_quality_metrics) {
    tprintf(
        "QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f"
        " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
        page_res->char_count, page_res->rej_count,
        page_res->rej_count / static_cast<float>(page_res->char_count), stats_.doc_blob_quality,
        stats_.doc_blob_quality / static_cast<float>(page_res->char_count), stats_.doc_outline_errs,
        stats_.doc_outline_errs / static_cast<float>(page_res->char_count), stats_.doc_char_quality,
        stats_.doc_char_quality / static_cast<float>(page_res->char_count),
        stats_.doc_good_char_quality,
        (stats_.good_char_count > 0)
            ? (stats_.doc_good_char_quality / static_cast<float>(stats_.good_char_count))
            : 0.0);
  }
  // The document is good quality when all four per-character ratios are on
  // the right side of their quality_*_pc thresholds.
  bool good_quality_doc =
      ((page_res->rej_count / static_cast<float>(page_res->char_count)) <= quality_rej_pc) &&
      (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >= quality_blob_pc) &&
      (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <= quality_outline_pc) &&
      (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >= quality_char_pc);

  // ****************** Pass 6 *******************
  // Do whole document or whole block rejection pass
  if (!tessedit_test_adaption) {
    quality_based_rejection(page_res_it, good_quality_doc);
  }
}


#endif // ndef DISABLED_LEGACY_ENGINE


// When wordrec_run_blamer is set, applies BlamerBundle::LastChanceBlame to
// every word on the page, tallies each word's incorrect-result reason in
// page_res->blame_reasons, and prints the tallies followed by the misadaption
// log (if it has any entries). Does nothing when the blamer is off.
void Tesseract::blamer_pass(PAGE_RES *page_res) {
  if (!wordrec_run_blamer) {
    return;
  }
  PAGE_RES_IT page_res_it(page_res);
  page_res_it.restart_page();
  while (page_res_it.word() != nullptr) {
    WERD_RES *current = page_res_it.word();
    BlamerBundle::LastChanceBlame(wordrec_debug_blamer, current);
    page_res->blame_reasons[current->blamer_bundle->incorrect_result_reason()]++;
    page_res_it.forward();
  }
  tprintf("Blame reasons:\n");
  for (int reason = 0; reason < IRR_NUM_REASONS; ++reason) {
    const auto irr = static_cast<IncorrectResultReason>(reason);
    tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(irr), page_res->blame_reasons[reason]);
  }
  if (page_res->misadaption_log.size() == 0) {
    return;
  }
  tprintf("Misadaption log:\n");
  for (auto &entry : page_res->misadaption_log) {
    tprintf("%s\n", entry.c_str());
  }
}


// Sets script positions and detects smallcaps on all output words.
// Words flagged W_REP_CHAR are left untouched. For every other word:
//   - the word's x-height is taken, or the midpoint of the best choice's
//     min/max x-height if the word's own value lies outside that range;
//   - the word is marked small_caps when its script has an x-height, that
//     height is close to the block x-height scaled by kXHeightCapRatio, and
//     the best choice has at least one upper case and no lower case letter;
//   - SetScriptPositions() is called on the word.
void Tesseract::script_pos_pass(PAGE_RES *page_res) {
  PAGE_RES_IT page_res_it(page_res);
  for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
    WERD_RES *word = page_res_it.word();
    if (word->word->flag(W_REP_CHAR)) {
      // The loop header advances the iterator. Advancing here as well would
      // step over the word that follows a repeated-char word.
      continue;
    }
    const float x_height = page_res_it.block()->block->x_height();
    float word_x_height = word->x_height;
    if (word_x_height < word->best_choice->min_x_height() ||
        word_x_height > word->best_choice->max_x_height()) {
      word_x_height =
          (word->best_choice->min_x_height() + word->best_choice->max_x_height()) / 2.0f;
    }
    // Test for small caps. Word capheight must be close to block xheight,
    // and word must contain no lower case letters, and at least one upper case.
    const double small_cap_xheight = x_height * kXHeightCapRatio;
    const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
    if (word->uch_set->script_has_xheight() &&
        small_cap_xheight - small_cap_delta <= word_x_height &&
        word_x_height <= small_cap_xheight + small_cap_delta) {
      // Scan for upper/lower.
      int num_upper = 0;
      int num_lower = 0;
      for (unsigned i = 0; i < word->best_choice->length(); ++i) {
        if (word->uch_set->get_isupper(word->best_choice->unichar_id(i))) {
          ++num_upper;
        } else if (word->uch_set->get_islower(word->best_choice->unichar_id(i))) {
          ++num_lower;
        }
      }
      if (num_upper > 0 && num_lower == 0) {
        word->small_caps = true;
      }
    }
    word->SetScriptPositions();
  }
}


// Helper finds the gap between the index word and the next.
// *right receives the right edge of words[index], or -INT32_MAX when index
// is out of range. *next_left receives the left edge of words[index + 1], or
// INT32_MAX when there is no such word.
static void WordGap(const PointerVector<WERD_RES> &words, unsigned index, int *right, int *next_left) {
  const bool has_word = index < words.size();
  const bool has_next = has_word && index + 1 < words.size();
  if (has_word) {
    *right = words[index]->word->bounding_box().right();
  } else {
    *right = -INT32_MAX;
  }
  if (has_next) {
    *next_left = words[index + 1]->word->bounding_box().left();
  } else {
    *next_left = INT32_MAX;
  }
}


// Factored helper computes the rating, certainty, badness and validity of

// the permuter of the words in [first_index, end_index).

static void EvaluateWordSpan(const PointerVector<WERD_RES> &words, unsigned first_index, unsigned end_index,

                             float *rating, float *certainty, bool *bad, bool *valid_permuter) {

  if (end_index <= first_index) {

    *bad = true;

    *valid_permuter = false;

  }

  for (unsigned index = first_index; index < end_index && index < words.size(); ++index) {

    WERD_CHOICE *choice = words[index]->best_choice;

    if (choice == nullptr) {

      *bad = true;

    } else {

      *rating += choice->rating();

      *certainty = std::min(*certainty, choice->certainty());

      if (!Dict::valid_word_permuter(choice->permuter(), false)) {

        *valid_permuter = false;

      }

    }

  }

}


// Helper chooses the best combination of words, transferring good ones from
// new_words to best_words. To win, a new word must have (better rating and
// certainty) or (better permuter status and rating within rating ratio and
// certainty within certainty margin) than current best.
// All the new_words are consumed (moved to best_words or deleted.)
// The return value is the number of new_words used minus the number of
// best_words that remain in the output.
//   rating_ratio     - factor applied to the best rating in the permuter test.
//   certainty_margin - slack subtracted from the best certainty in that test.
//   debug            - when true, prints one line per compared group.
static int SelectBestWords(double rating_ratio, double certainty_margin, bool debug,
                           PointerVector<WERD_RES> *new_words,
                           PointerVector<WERD_RES> *best_words) {
  // Process the smallest groups of words that have an overlapping word
  // boundary at the end.
  std::vector<WERD_RES *> out_words;
  // Index into each word vector (best, new).
  unsigned b = 0, n = 0;
  int num_best = 0, num_new = 0;
  while (b < best_words->size() || n < new_words->size()) {
    // Start of the current run in each.
    auto start_b = b, start_n = n;
    while (b < best_words->size() || n < new_words->size()) {
      int b_right = -INT32_MAX;
      int next_b_left = INT32_MAX;
      WordGap(*best_words, b, &b_right, &next_b_left);
      int n_right = -INT32_MAX;
      int next_n_left = INT32_MAX;
      WordGap(*new_words, n, &n_right, &next_n_left);
      if (std::max(b_right, n_right) < std::min(next_b_left, next_n_left)) {
        // The word breaks overlap. [start_b,b] and [start_n, n] match.
        break;
      }
      // Keep searching for the matching word break.
      // Advance whichever list ends further left, or best if new is exhausted.
      if ((b_right < n_right && b < best_words->size()) || n == new_words->size()) {
        ++b;
      } else {
        ++n;
      }
    }
    // Rating of the current run in each.
    float b_rating = 0.0f, n_rating = 0.0f;
    // Certainty of the current run in each.
    float b_certainty = 0.0f, n_certainty = 0.0f;
    // True if any word is missing its best choice.
    bool b_bad = false, n_bad = false;
    // True if all words have a valid permuter.
    bool b_valid_permuter = true, n_valid_permuter = true;
    // Exclusive end of each run: one past b/n unless that list is exhausted.
    const int end_b = b < best_words->size() ? b + 1 : b;
    const int end_n = n < new_words->size() ? n + 1 : n;
    EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty, &b_bad,
                     &b_valid_permuter);
    EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty, &n_bad,
                     &n_valid_permuter);
    bool new_better = false;
    if (!n_bad && (b_bad || (n_certainty > b_certainty && n_rating < b_rating) ||
                   (!b_valid_permuter && n_valid_permuter && n_rating < b_rating * rating_ratio &&
                    n_certainty > b_certainty - certainty_margin))) {
      // New is better.
      // Move the pointers to out_words and null the source slots.
      for (int i = start_n; i < end_n; ++i) {
        out_words.push_back((*new_words)[i]);
        (*new_words)[i] = nullptr;
        ++num_new;
      }
      new_better = true;
    } else if (!b_bad) {
      // Current best is better.
      for (int i = start_b; i < end_b; ++i) {
        out_words.push_back((*best_words)[i]);
        (*best_words)[i] = nullptr;
        ++num_best;
      }
    }
    // If both runs are bad, neither is carried over to out_words.
    if (debug) {
      tprintf(
          "%d new words %s than %d old words: r: %g v %g c: %g v %g"
          " valid dict: %d v %d\n",
          end_n - start_n, new_better ? "better" : "worse", end_b - start_b, n_rating, b_rating,
          n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
    }
    // Move on to the next group.
    b = end_b;
    n = end_n;
  }
  // Transfer from out_words to best_words.
  best_words->clear();
  for (auto &out_word : out_words) {
    best_words->push_back(out_word);
  }
  return num_new - num_best;
}


// Helper to recognize the word using the given (language-specific) tesseract.

// Returns positive if this recognizer found more new best words than the

// number kept from best_words.

int Tesseract::RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug,

                                 WERD_RES **in_word, PointerVector<WERD_RES> *best_words) {

  if (debug) {

    tprintf("Trying word using lang %s, oem %d\n", lang.c_str(),

            static_cast<int>(tessedit_ocr_engine_mode));

  }

  // Run the recognizer on the word.

  PointerVector<WERD_RES> new_words;

  (this->*recognizer)(word_data, in_word, &new_words);

  if (new_words.empty()) {

    // Transfer input word to new_words, as the classifier must have put

    // the result back in the input.

    new_words.push_back(*in_word);

    *in_word = nullptr;

  }

  if (debug) {

    for (unsigned i = 0; i < new_words.size(); ++i) {

      new_words[i]->DebugTopChoice("Lang result");

    }

  }

  // Initial version is a bit of a hack based on better certainty and rating

  // or a dictionary vs non-dictionary word.

  return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug,

                         &new_words, best_words);

}


// Helper returns true if all the words are acceptable.

static bool WordsAcceptable(const PointerVector<WERD_RES> &words) {

  for (unsigned w = 0; w < words.size(); ++w) {

    if (words[w]->tess_failed || !words[w]->tess_accepted) {

      return false;

    }

  }

  return true;

}


#ifndef DISABLED_LEGACY_ENGINE


// Moves good-looking "noise"/diacritics from the reject list to the main
// blob list on the current word.
//   pass:  recognition pass number, forwarded to the classification helpers.
//   pr_it: iterator positioned on the word to be fixed up.
//   make_next_word_fuzzy: output; set if blob(s) were added to the end of
//          the word.
// Returns true if anything was done, i.e. at least one noise outline was
// actually chosen for use, and false if the word has no rejected blobs, no
// real blobs, or more rejected blobs than noise_maxperword.
bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) {
  *make_next_word_fuzzy = false;
  WERD *real_word = pr_it->word()->word;
  if (real_word->rej_cblob_list()->empty() || real_word->cblob_list()->empty() ||
      real_word->rej_cblob_list()->length() > noise_maxperword) {
    return false;
  }
  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
  // Get the noise outlines into a vector with matching bool map.
  std::vector<C_OUTLINE *> outlines;
  real_word->GetNoiseOutlines(&outlines);
  std::vector<bool> word_wanted;
  std::vector<bool> overlapped_any_blob;
  std::vector<C_BLOB *> target_blobs;
  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, &word_wanted,
                                     &overlapped_any_blob, &target_blobs);
  // Filter the outlines that overlapped any blob and put them into the word
  // now. This simplifies the remaining task and also makes it more accurate
  // as it has more completed blobs to work on.
  std::vector<bool> wanted;
  std::vector<C_BLOB *> wanted_blobs;
  std::vector<C_OUTLINE *> wanted_outlines;
  int num_overlapped = 0;
  int num_overlapped_used = 0;
  for (unsigned i = 0; i < overlapped_any_blob.size(); ++i) {
    if (overlapped_any_blob[i]) {
      ++num_overlapped;
      if (word_wanted[i]) {
        ++num_overlapped_used;
      }
      wanted.push_back(word_wanted[i]);
      wanted_blobs.push_back(target_blobs[i]);
      wanted_outlines.push_back(outlines[i]);
      // Mark the slot as dealt with so the next stage skips it.
      outlines[i] = nullptr;
    }
  }
  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, nullptr);
  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, &target_blobs);
  // Count the remaining (non-overlapping) outlines and how many of them were
  // selected. The total and the used count are kept separate so that the
  // return value only reports outlines that were really used.
  int non_overlapped = 0;
  int non_overlapped_used = 0;
  for (unsigned i = 0; i < word_wanted.size(); ++i) {
    if (word_wanted[i]) {
      ++non_overlapped_used;
    }
    if (outlines[i] != nullptr) {
      ++non_overlapped;
    }
  }
  if (debug_noise_removal) {
    tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:", num_overlapped_used,
            num_overlapped, non_overlapped_used, non_overlapped);
    real_word->bounding_box().print();
  }
  // Now we have decided which outlines we want, put them into the real_word.
  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, make_next_word_fuzzy)) {
    pr_it->MakeCurrentWordFuzzy();
  }
  // TODO(rays) Parts of combos have a deep copy of the real word, and need
  // to have their noise outlines moved/assigned in the same way!!
  return num_overlapped_used != 0 || non_overlapped_used != 0;
}


// Tries to attach each noise/diacritic outline to a blob of real_word that
// it substantially overlaps in x.
// Input:  outlines - noisy outlines that probably belong to real_word.
// Output: word_wanted[i] is true if outline i is to be assigned to a blob,
//         target_blobs[i] is the blob it is to be assigned to, and
//         overlapped_any_blob[i] is true for every outline that overlapped
//         some blob, whether or not it ended up wanted.
// All three outputs are reset to the size of outlines on entry.
void Tesseract::AssignDiacriticsToOverlappingBlobs(const std::vector<C_OUTLINE *> &outlines,
                                                   int pass, WERD *real_word, PAGE_RES_IT *pr_it,
                                                   std::vector<bool> *word_wanted,
                                                   std::vector<bool> *overlapped_any_blob,
                                                   std::vector<C_BLOB *> *target_blobs) {
  const auto num_outlines = outlines.size();
  word_wanted->assign(num_outlines, false);
  overlapped_any_blob->assign(num_outlines, false);
  target_blobs->assign(num_outlines, static_cast<C_BLOB *>(nullptr));

  // Per-blob scratch map of the outlines under consideration for that blob.
  std::vector<bool> claimed_here;

  // For each real blob, find the outlines that seriously overlap it.
  // A single blob could be several merged characters, so there can be quite
  // a few outlines overlapping, and the full engine needs to be used to chop
  // and join to get a sensible result.
  C_BLOB_IT blob_it(real_word->cblob_list());
  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
    C_BLOB *blob = blob_it.data();
    const TBOX blob_box = blob->bounding_box();
    claimed_here.assign(num_outlines, false);

    int overlap_count = 0;
    for (unsigned idx = 0; idx < num_outlines; ++idx) {
      if (!blob_box.major_x_overlap(outlines[idx]->bounding_box())) {
        continue;
      }
      if ((*word_wanted)[idx]) {
        continue; // Already claimed by an earlier blob.
      }
      claimed_here[idx] = true;
      (*overlapped_any_blob)[idx] = true;
      ++overlap_count;
    }
    if (debug_noise_removal) {
      tprintf("%d noise outlines overlap blob at:", overlap_count);
      blob_box.print();
    }

    // If any outlines overlap the blob, and not too many, classify the blob
    // (using the full engine, languages and all), and choose the maximal
    // combination of outlines that doesn't hurt the end-result classification
    // by too much. Mark them as wanted.
    const bool count_in_range = 0 < overlap_count && overlap_count < noise_maxperblob;
    if (!count_in_range ||
        !SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, outlines,
                                     overlap_count, &claimed_here)) {
      continue;
    }
    for (unsigned idx = 0; idx < claimed_here.size(); ++idx) {
      if (claimed_here[idx]) {
        // Claim the outline and record where it is going.
        (*word_wanted)[idx] = true;
        (*target_blobs)[idx] = blob;
      }
    }
  }
}


// Handles the outlines that did not overlap any blob (the non-null entries
// of outlines): each run of adjacent non-null outlines is either attached
// to the neighbouring blob on its left or right, or made into a new blob.
// Output: word_wanted[i] is true for each outline that is to be used, and
//         target_blobs[i] is the blob to add it to, or nullptr when the
//         outline is to be fitted between blobs.
// Both outputs are reset to the size of outlines on entry.
void Tesseract::AssignDiacriticsToNewBlobs(const std::vector<C_OUTLINE *> &outlines, int pass,
                                           WERD *real_word, PAGE_RES_IT *pr_it,
                                           std::vector<bool> *word_wanted,
                                           std::vector<C_BLOB *> *target_blobs) {
  word_wanted->assign(outlines.size(), false);
  target_blobs->assign(outlines.size(), static_cast<C_BLOB *>(nullptr));

  // Scratch map of the outlines that form the current run.
  std::vector<bool> run_flags;

  // Check for outlines that need to be turned into stand-alone blobs.
  for (unsigned i = 0; i < outlines.size(); ++i) {
    if (outlines[i] == nullptr) {
      continue;
    }
    // Get a set of adjacent outlines that don't overlap any existing blob.
    // On exit from the inner loop i indexes the entry that ended the run
    // (a nullptr or the end of the vector).
    run_flags.assign(outlines.size(), false);
    int run_length = 0;
    TBOX total_ol_box(outlines[i]->bounding_box());
    for (; i < outlines.size() && outlines[i] != nullptr; ++i) {
      run_flags[i] = true;
      total_ol_box += outlines[i]->bounding_box();
      ++run_length;
    }

    // Find the insertion point.
    C_BLOB_IT blob_it(real_word->cblob_list());
    while (!blob_it.at_last() &&
           blob_it.data_relative(1)->bounding_box().left() <= total_ol_box.left()) {
      blob_it.forward();
    }
    // Choose which combination of them we actually want and where to put
    // them.
    if (debug_noise_removal) {
      tprintf("Num blobless outlines = %d\n", run_length);
    }
    C_BLOB *left_blob = blob_it.data();
    TBOX left_box = left_blob->bounding_box();
    C_BLOB *right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);

    // Try the left neighbour, then the right neighbour, then a new blob.
    // The first successful selection wins and fixes the destination.
    bool placed = false;
    C_BLOB *destination = nullptr;
    if ((left_box.x_overlap(total_ol_box) || right_blob == nullptr ||
         !right_blob->bounding_box().x_overlap(total_ol_box)) &&
        SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, outlines,
                                    run_length, &run_flags)) {
      if (debug_noise_removal) {
        tprintf("Added to left blob\n");
      }
      placed = true;
      destination = left_blob;
    } else if (right_blob != nullptr &&
               (!left_box.x_overlap(total_ol_box) ||
                right_blob->bounding_box().x_overlap(total_ol_box)) &&
               SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, right_blob, outlines,
                                           run_length, &run_flags)) {
      if (debug_noise_removal) {
        tprintf("Added to right blob\n");
      }
      placed = true;
      destination = right_blob;
    } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, nullptr, outlines,
                                           run_length, &run_flags)) {
      if (debug_noise_removal) {
        tprintf("Fitted between blobs\n");
      }
      placed = true;
      destination = nullptr;
    }
    if (!placed) {
      continue;
    }
    // Record the selected outlines and where they are going.
    for (unsigned j = 0; j < run_flags.size(); ++j) {
      if (run_flags[j]) {
        (*word_wanted)[j] = true;
        (*target_blobs)[j] = destination;
      }
    }
  }
}


// Starting with ok_outlines set to indicate which outlines overlap the blob,
// chooses the optimal set (approximately) and returns true if any outlines
// are desired, in which case ok_outlines indicates which ones.
//   pass:                recognition pass number, forwarded to the classifier.
//   certainty_threshold: certainty target; used directly when blob is
//                        nullptr, otherwise blended with the certainty of the
//                        blob classified on its own.
//   pr_it:               iterator on the word that owns the blob.
//   blob:                blob to which the outlines would be added, or
//                        nullptr to classify the outlines by themselves.
//   outlines:            all candidate outlines, indexed like ok_outlines.
//   num_outlines:        number of true entries in ok_outlines on entry
//                        (as counted by the callers in this file).
//   ok_outlines:         in/out selection map; only overwritten when the
//                        function returns true.
bool Tesseract::SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it,
                                            C_BLOB *blob,
                                            const std::vector<C_OUTLINE *> &outlines,
                                            int num_outlines, std::vector<bool> *ok_outlines) {
  std::string best_str;
  float target_cert = certainty_threshold;
  if (blob != nullptr) {
    // Classify the blob without any noise outlines to get a baseline.
    float target_c2;
    target_cert = ClassifyBlobAsWord(pass, pr_it, blob, best_str, &target_c2);
    if (debug_noise_removal) {
      tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.c_str(), target_cert,
              target_c2);
      blob->bounding_box().print();
    }
    // Move the target from the baseline certainty towards the threshold by
    // the fraction noise_cert_factor.
    target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
  }
  std::vector<bool> test_outlines = *ok_outlines;
  // Start with all the outlines in.
  std::string all_str;
  std::vector<bool> best_outlines = *ok_outlines;
  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, all_str);
  if (debug_noise_removal) {
    TBOX ol_box;
    for (unsigned i = 0; i < test_outlines.size(); ++i) {
      if (test_outlines[i]) {
        ol_box += outlines[i]->bounding_box();
      }
    }
    tprintf("All Noise blob classified as %s=%g, delta=%g at:", all_str.c_str(), best_cert,
            best_cert - target_cert);
    ol_box.print();
  }
  // Iteratively zero out the bit that improves the certainty the most, until
  // we get past the threshold, have zero bits, or fail to improve.
  // NOTE(review): the parenthesised clause below contains both
  // "blob == nullptr" and "blob != nullptr", so it is always true and the
  // "best_cert < target_cert" test never stops the loop. In practice the
  // loop ends only when one outline is left or no removal improved the
  // certainty. Confirm the intended stopping rule before changing it.
  int best_index = 0; // To zero out.
  while (num_outlines > 1 && best_index >= 0 &&
         (blob == nullptr || best_cert < target_cert || blob != nullptr)) {
    // Find the best bit to zero out.
    best_index = -1;
    for (unsigned i = 0; i < outlines.size(); ++i) {
      if (test_outlines[i]) {
        // Trial removal of outline i; restored at the end of this iteration.
        test_outlines[i] = false;
        std::string str;
        float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, pr_it, blob, str);
        if (debug_noise_removal) {
          TBOX ol_box;
          for (unsigned j = 0; j < outlines.size(); ++j) {
            if (test_outlines[j]) {
              ol_box += outlines[j]->bounding_box();
            }
            tprintf("%c", test_outlines[j] ? 'T' : 'F');
          }
          tprintf(" blob classified as %s=%g, delta=%g) at:", str.c_str(), cert,
                  cert - target_cert);
          ol_box.print();
        }
        // best_cert is the best certainty seen so far over all rounds, so a
        // removal is only recorded if it beats every earlier combination.
        if (cert > best_cert) {
          best_cert = cert;
          best_index = i;
          best_outlines = test_outlines;
        }
        test_outlines[i] = true;
      }
    }
    if (best_index >= 0) {
      // Make the most helpful removal permanent for the next round.
      test_outlines[best_index] = false;
      --num_outlines;
    }
  }
  if (best_cert >= target_cert) {
    // Save the best combination.
    *ok_outlines = best_outlines;
    if (debug_noise_removal) {
      tprintf("%s noise combination ", blob ? "Adding" : "New");
      for (auto &&best_outline : best_outlines) {
        tprintf("%c", best_outline ? 'T' : 'F');
      }
      tprintf(" yields certainty %g, beating target of %g\n", best_cert, target_cert);
    }
    return true;
  }

  // The best combination found did not reach the target; ok_outlines is
  // left as it was on entry.
  return false;
}


// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
// the inclusion of the outlines, and returns the certainty of the raw choice.
//   ok_outlines: map of which entries of outlines to include.
//   outlines:    candidate outlines, indexed like ok_outlines.
//   pass_n:      recognition pass number, forwarded to ClassifyBlobAsWord.
//   pr_it:       iterator on the word that owns the blob.
//   blob:        blob to extend temporarily, or nullptr to build a temporary
//                blob from the flagged outlines alone.
//   best_str:    output; the text returned by ClassifyBlobAsWord.
// When blob is nullptr on entry the return value is -c2 (the negated second
// output of ClassifyBlobAsWord) instead of the plain certainty.
float Tesseract::ClassifyBlobPlusOutlines(const std::vector<bool> &ok_outlines,
                                          const std::vector<C_OUTLINE *> &outlines, int pass_n,
                                          PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str) {
  C_OUTLINE_IT ol_it;
  // Outline at the iterator's starting position in the caller's blob; it
  // marks where the blob's own outlines begin once extras have been inserted
  // in front of it. Stays nullptr when no blob was supplied.
  C_OUTLINE *first_to_keep = nullptr;
  // Temporary blob created (and deleted) here when no blob was supplied.
  C_BLOB *local_blob = nullptr;
  if (blob != nullptr) {
    // Add the required outlines to the blob.
    ol_it.set_to_list(blob->out_list());
    first_to_keep = ol_it.data();
  }
  for (unsigned i = 0; i < ok_outlines.size(); ++i) {
    if (ok_outlines[i]) {
      // This outline is to be added.
      if (blob == nullptr) {
        // The first flagged outline seeds the temporary blob; later ones
        // take the else branch because blob is no longer null.
        local_blob = new C_BLOB(outlines[i]);
        blob = local_blob;
        ol_it.set_to_list(blob->out_list());
      } else {
        ol_it.add_before_stay_put(outlines[i]);
      }
    }
  }
  // NOTE(review): if blob was nullptr and no entry of ok_outlines is set,
  // blob is still nullptr here and ol_it has no list. The callers in this
  // file appear to always flag at least one outline - confirm.
  float c2;
  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
  ol_it.move_to_first();
  if (first_to_keep == nullptr) {
    // We created blob. Empty its outlines and delete it.
    // The outlines are extracted first, presumably so that deleting the
    // temporary blob does not also delete the caller's outlines.
    for (; !ol_it.empty(); ol_it.forward()) {
      ol_it.extract();
    }
    delete local_blob;
    cert = -c2;
  } else {
    // Remove the outlines that we put in.
    for (; ol_it.data() != first_to_keep; ol_it.forward()) {
      ol_it.extract();
    }
  }
  return cert;
}


// Classifies the given blob (part of word_data->word->word) as an individual

// word, using languages, chopper etc, returning only the certainty of the

// best raw choice, and undoing all the work done to fake out the word.

float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str,

                                    float *c2) {

  WERD *real_word = pr_it->word()->word;

  WERD *word = real_word->ConstructFromSingleBlob(real_word->flag(W_BOL), real_word->flag(W_EOL),

                                                  C_BLOB::deep_copy(blob));

  WERD_RES *word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);

  // Get a new iterator that points to the new word.

  PAGE_RES_IT it(pr_it->page_res);

  while (it.word() != word_res && it.word() != nullptr) {

    it.forward();

  }

  ASSERT_HOST(it.word() == word_res);

  WordData wd(it);

  // Force full initialization.

  SetupWordPassN(1, &wd);

  classify_word_and_language(pass_n, &it, &wd);

  if (debug_noise_removal) {

    if (wd.word->raw_choice != nullptr) {

      tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, wd.row->x_height(),

              wd.word->raw_choice->min_x_height(), wd.word->raw_choice->max_x_height());

    } else {

      tprintf("Got word with null raw choice xheight=%g, row=%g\n", word_res->x_height,

              wd.row->x_height());

    }

  }

  float cert = 0.0f;

  if (wd.word->raw_choice != nullptr) { // This probably shouldn't happen, but...

    cert = wd.word->raw_choice->certainty();

    float rat = wd.word->raw_choice->rating();

    *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;

    best_str = wd.word->raw_choice->unichar_string();

  } else {

    *c2 = 0.0f;

    best_str.clear();

  }

  it.DeleteCurrentWord();

  pr_it->ResetWordIterator();

  return cert;

}


#endif // ndef DISABLED_LEGACY_ENGINE


// Generic function for classifying a word. Can be used either for pass1 or
// pass2 according to pass_n (only the pass1 recognizer is available when
// the legacy engine is disabled).
// word_data holds the word to be recognized, and its block and row, and
// pr_it points to the word as well, in case we are running LSTM and it wants
// to output multiple words.
// Recognizes in the most recently used language, and if successful that is
// all. If recognition was not successful, tries all available languages
// until it gets a successful result or runs out of languages. Keeps the best
// result, and remembers the language that produced it in
// most_recently_used_.
void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) {
#ifdef DISABLED_LEGACY_ENGINE
  WordRecognizer recognizer = &Tesseract::classify_word_pass1;
#else
  WordRecognizer recognizer =
      pass_n == 1 ? &Tesseract::classify_word_pass1 : &Tesseract::classify_word_pass2;
#endif // def DISABLED_LEGACY_ENGINE

  // Best result so far.
  PointerVector<WERD_RES> best_words;
  // Points to the best result. May be word or in lang_words.
  const WERD_RES *word = word_data->word;
  clock_t start_t = clock();
  const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
  if (debug) {
    tprintf("%s word with lang %s at:", word->done ? "Already done" : "Processing",
            most_recently_used_->lang.c_str());
    word->word->bounding_box().print();
  }
  if (word->done) {
    // If done on pass1, leave it as-is.
    if (!word->tess_failed) {
      most_recently_used_ = word->tesseract;
    }
    return;
  }
  // Index into lang_words for the most recently used language: the entry at
  // sub_langs_.size() belongs to this (the main language), the others match
  // the entries of sub_langs_.
  auto sub = sub_langs_.size();
  if (most_recently_used_ != this) {
    // Get the index of the most_recently_used_.
    for (sub = 0; sub < sub_langs_.size() && most_recently_used_ != sub_langs_[sub]; ++sub) {
    }
  }
  most_recently_used_->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub],
                                         &best_words);
  Tesseract *best_lang_tess = most_recently_used_;
  if (!WordsAcceptable(best_words)) {
    // Try all the other languages to see if they are any better.
    if (most_recently_used_ != this &&
        this->RetryWithLanguage(*word_data, recognizer, debug,
                                &word_data->lang_words[sub_langs_.size()], &best_words) > 0) {
      best_lang_tess = this;
    }
    for (unsigned i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) {
      if (most_recently_used_ != sub_langs_[i] &&
          sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i],
                                           &best_words) > 0) {
        best_lang_tess = sub_langs_[i];
      }
    }
  }
  most_recently_used_ = best_lang_tess;
  if (!best_words.empty()) {
    if (best_words.size() == 1 && !best_words[0]->combination) {
      // Move the best single result to the main word.
      word_data->word->ConsumeWordResults(best_words[0]);
    } else {
      // Words came from LSTM, and must be moved to the PAGE_RES properly.
      word_data->word = best_words.back();
      pr_it->ReplaceCurrentWord(&best_words);
    }
    ASSERT_HOST(word_data->word->box_word != nullptr);
  } else {
    tprintf("no best words!!\n");
  }
  clock_t ocr_t = clock();
  if (tessedit_timing_debug) {
    // When no best words were found, no result was moved into the word, so
    // best_choice may be null. Print an empty string rather than
    // dereferencing a null pointer.
    auto *timed_choice = word_data->word->best_choice;
    tprintf("%s (ocr took %.2f sec)\n",
            timed_choice != nullptr ? timed_choice->unichar_string().c_str() : "",
            static_cast<double>(ocr_t - start_t) / CLOCKS_PER_SEC);
  }
}


// Pass 1 word recognizer.
//   word_data: supplies the row, block and previous word for context.
//   in_word:   the word to recognize; results of the legacy engine, or the
//              fake result in LSTM-only mode, are written into *in_word.
//   out_words: receives the words produced by LSTMRecognizeWord, if any.
// In the LSTM-only and combined engine modes the LSTM recognizer is tried
// first (in combined mode only for words that are not odd_size). If it
// yields nothing, LSTM-only mode sets up a fake result, and combined mode
// falls back to the legacy engine. When the legacy engine is compiled in,
// a successfully matched non-repeated-character word may also be used for
// adaptive training and added to the document dictionary.
void Tesseract::classify_word_pass1(const WordData &word_data, WERD_RES **in_word,
                                    PointerVector<WERD_RES> *out_words) {
  ROW *row = word_data.row;
  BLOCK *block = word_data.block;
  // Remember the previous word's best choice (or nullptr if there is no
  // previous word) in the member used as context during recognition.
  prev_word_best_choice_ =
      word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;
  // The opening of the if statement depends on whether the legacy engine is
  // compiled in; both variants share the body below.
#ifdef DISABLED_LEGACY_ENGINE
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
#else
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
      tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
#endif // def DISABLED_LEGACY_ENGINE
    // Odd-sized words skip the LSTM unless it is the only engine allowed.
    if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
      LSTMRecognizeWord(*block, row, *in_word, out_words);
      if (!out_words->empty()) {
        return; // Successful lstm recognition.
      }
    }
    if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
      // No fallback allowed, so use a fake.
      (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
      return;
    }

#ifndef DISABLED_LEGACY_ENGINE
    // Fall back to tesseract for failed words or odd words.
    (*in_word)->SetupForRecognition(unicharset, this, BestPix(), OEM_TESSERACT_ONLY, nullptr,
                                    classify_bln_numeric_mode, textord_use_cjk_fp_model,
                                    poly_allow_detailed_fx, row, block);
#endif // ndef DISABLED_LEGACY_ENGINE
  }

#ifndef DISABLED_LEGACY_ENGINE
  // Legacy engine recognition, followed by optional adaption.
  WERD_RES *word = *in_word;
  match_word_pass_n(1, word, row, block);
  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
    word->tess_would_adapt = AdaptableWord(word);
    bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);

    if (adapt_ok) {
      // Send word to adaptive classifier for training.
      word->BestChoiceToCorrectText();
      LearnWord(nullptr, word);
      // Mark misadaptions if running blamer.
      if (word->blamer_bundle != nullptr) {
        word->blamer_bundle->SetMisAdaptionDebug(word->best_choice, wordrec_debug_blamer);
      }
    }

    // Add unambiguous words to the document dictionary when enabled.
    if (tessedit_enable_doc_dict && !word->IsAmbiguous()) {
      tess_add_doc_word(word->best_choice);
    }
  }
#endif // ndef DISABLED_LEGACY_ENGINE
}


// Debug helper that prints the outcome of an x-height fix attempt: the old
// and new best choices with their reject maps, whether each x-height was
// guessed, whether new_x_ht still leaves doubt (is above 0.1), and whether
// the new word was accepted.
void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word,
                                   WERD_RES *new_word) {
  // Result before the fix.
  auto *old_choice = word->best_choice;
  tprintf("New XHT Match:%s = %s ", old_choice->unichar_string().c_str(),
          old_choice->debug_string().c_str());
  word->reject_map.print(debug_fp);

  // Result after the fix.
  auto *fixed_choice = new_word->best_choice;
  tprintf(" -> %s = %s ", fixed_choice->unichar_string().c_str(),
          fixed_choice->debug_string().c_str());
  new_word->reject_map.print(debug_fp);

  // Summary line.
  const char *old_source = word->guessed_x_ht ? "GUESS" : "CERT";
  const char *new_source = new_word->guessed_x_ht ? "GUESS" : "CERT";
  const char *doubt = new_x_ht > 0.1 ? "STILL DOUBT" : "OK";
  const char *verdict = accept_new_word ? "ACCEPTED" : "";
  tprintf(" %s->%s %s %s\n", old_source, new_source, doubt, verdict);
}


#ifndef DISABLED_LEGACY_ENGINE


// Run the x-height fix-up, based on min/max top/bottom information in
// unicharset.
// Returns true if the word was changed: either a baseline shift was
// accepted, or (with no shift suggested) a new x-height was accepted.
// A new x-height is only tried if it is at least kMinRefitXHeightFraction
// of the word's current x-height.
// See the comment in fixxht.cpp for a description of the overall process.
bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row) {
  int misfit_count = CountMisfitTops(word);
  if (misfit_count == 0) {
    return false; // Nothing to fix.
  }
  float baseline_shift = 0.0f;
  float candidate_x_ht = ComputeCompatibleXheight(word, &baseline_shift);

  if (baseline_shift == 0.0f) {
    // No shift suggested: just try the new x-height, if it is believable.
    const bool believable = candidate_x_ht >= kMinRefitXHeightFraction * word->x_height;
    return believable &&
           TestNewNormalization(misfit_count, 0.0f, candidate_x_ht, word, block, row);
  }

  // Try the shift on its own first.
  if (!TestNewNormalization(misfit_count, baseline_shift, word->x_height, word, block, row)) {
    return false;
  }
  misfit_count = CountMisfitTops(word);
  if (misfit_count > 0) {
    // Now recompute the new x_height. The shift reported by this second
    // computation is not used.
    float ignored_shift = 0.0f;
    candidate_x_ht = ComputeCompatibleXheight(word, &ignored_shift);
    if (candidate_x_ht >= kMinRefitXHeightFraction * word->x_height) {
      // No test of return value here, as we are definitely making a change
      // to the word by shifting the baseline.
      TestNewNormalization(misfit_count, baseline_shift, candidate_x_ht, word, block, row);
    }
  }
  return true;
}


// Runs recognition with the test baseline shift and x-height and returns true

// if there was an improvement in recognition result.

bool Tesseract::TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht,

                                     WERD_RES *word, BLOCK *block, ROW *row) {

  bool accept_new_x_ht = false;

  WERD_RES new_x_ht_word(word->word);

  if (word->blamer_bundle != nullptr) {

    new_x_ht_word.blamer_bundle = new BlamerBundle();

    new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));

  }

  new_x_ht_word.x_height = new_x_ht;

  new_x_ht_word.baseline_shift = baseline_shift;

  new_x_ht_word.caps_height = 0.0;

  new_x_ht_word.SetupForRecognition(unicharset, this, BestPix(), tessedit_ocr_engine_mode, nullptr,

                                    classify_bln_numeric_mode, textord_use_cjk_fp_model,

                                    poly_allow_detailed_fx, row, block);

  match_word_pass_n(2, &new_x_ht_word, row, block);

  if (!new_x_ht_word.tess_failed) {

    int new_misfits = CountMisfitTops(&new_x_ht_word);

    if (debug_x_ht_level >= 1) {

      tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", original_misfits,

              word->x_height, new_misfits, new_x_ht);

      tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", word->best_choice->rating(),

              word->best_choice->certainty(), new_x_ht_word.best_choice->rating(),

              new_x_ht_word.best_choice->certainty());

    }

    // The misfits must improve and either the rating or certainty.

    accept_new_x_ht = new_misfits < original_misfits &&

                      (new_x_ht_word.best_choice->certainty() > word->best_choice->certainty() ||

                       new_x_ht_word.best_choice->rating() < word->best_choice->rating());

    if (debug_x_ht_level >= 1) {

      ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);

    }

  }

  if (accept_new_x_ht) {

    word->ConsumeWordResults(&new_x_ht_word);

    return true;

  }

  return false;

}


#endif // ndef DISABLED_LEGACY_ENGINE


// Pass 2 word recognizer (legacy engine only).
//   word_data: supplies the row, block and previous word for context.
//   in_word:   the word to recognize; it is updated in place.
//   out_words: not used by this function.
// Does nothing in LSTM-only mode, or when the legacy engine is compiled
// out. Otherwise it re-matches words that are not yet done, runs the sub-
// and superscript fix, and, where the unicharset and block orientation
// allow it, the trained x-height fix.
void Tesseract::classify_word_pass2(const WordData &word_data, WERD_RES **in_word,
                                    PointerVector<WERD_RES> *out_words) {
  // Return if we do not want to run Tesseract.
  if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
    return;
  }
#ifndef DISABLED_LEGACY_ENGINE
  ROW *row = word_data.row;
  BLOCK *block = word_data.block;
  WERD_RES *word = *in_word;
  // Remember the previous word's best choice (or nullptr if there is no
  // previous word) in the member used as context during recognition.
  prev_word_best_choice_ =
      word_data.prev_word != nullptr ? word_data.prev_word->word->best_choice : nullptr;

  check_debug_pt(word, 30);
  if (!word->done) {
    // Reset the caps height and, if the word has no x-height of its own,
    // use the row's x-height before matching again.
    word->caps_height = 0.0;
    if (word->x_height == 0.0f) {
      word->x_height = row->x_height();
    }
    match_word_pass_n(2, word, row, block);
    check_debug_pt(word, 40);
  }

  SubAndSuperscriptFix(word);

  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
    // The x-height fix needs useful top/bottom data, a script with an
    // x-height, and a block whose classify rotation has no y component.
    if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
        block->classify_rotation().y() == 0.0f) {
      // Use the tops and bottoms since they are available.
      TrainedXheightFix(word, block, row);
    }
  }
#  ifndef GRAPHICS_DISABLED
  // Optionally show the rebuilt word in the debug window.
  if (tessedit_display_outwords) {
    if (fx_win == nullptr) {
      create_fx_win();
    }
    clear_fx_win();
    word->rebuild_word->plot(fx_win);
    TBOX wbox = word->rebuild_word->bounding_box();
    fx_win->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());
    ScrollView::Update();
  }
#  endif
  check_debug_pt(word, 50);
#endif // ndef DISABLED_LEGACY_ENGINE
}


#ifndef DISABLED_LEGACY_ENGINE

// Runs the segmentation/classification for the given pass on the word and
// applies the post-processing that follows it.
// Returns immediately if the word is already marked tess_failed. For words
// that succeed and are not repeated-character words it fixes quotes and
// (if enabled) hyphens, warns when the best choice length and the blob count
// disagree, sets tess_accepted and builds the reject map. Fonts are set for
// every word that was segmented, and a raw choice is asserted to exist.
void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block) {
  if (word->tess_failed) {
    return;
  }
  tess_segment_pass_n(pass_n, word);

  const bool needs_postprocessing = !word->tess_failed && !word->word->flag(W_REP_CHAR);
  if (needs_postprocessing) {
    word->fix_quotes();
    if (tessedit_fix_hyphens) {
      word->fix_hyphens();
    }
    /* Don't trust fix_quotes! - though I think I've fixed the bug */
    const auto choice_length = word->best_choice->length();
    const auto blob_count = word->box_word->length();
    if (static_cast<unsigned>(choice_length) != blob_count) {
      tprintf(
          "POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
          " #Blobs=%u\n",
          word->best_choice->debug_string().c_str(), choice_length, blob_count);
    }
    word->tess_accepted = tess_acceptable_word(word);

    // Also sets word->done flag
    make_reject_map(word, row, pass_n);
  }
  set_word_fonts(word);

  ASSERT_HOST(word->raw_choice != nullptr);
}

#endif // ndef DISABLED_LEGACY_ENGINE


// Searches every position of the word's best choice for a BLOB_CHOICE that
// matches char_id and returns the one with the lowest rating, or nullptr if
// no position has a matching choice. On equal ratings the earliest position
// wins.
static BLOB_CHOICE *FindBestMatchingChoice(UNICHAR_ID char_id, WERD_RES *word_res) {
  BLOB_CHOICE *winner = nullptr;
  for (unsigned pos = 0; pos < word_res->best_choice->length(); ++pos) {
    BLOB_CHOICE *candidate = FindMatchingChoice(char_id, word_res->GetBlobChoices(pos));
    if (candidate == nullptr) {
      continue; // No match at this position.
    }
    const bool improves = winner == nullptr || candidate->rating() < winner->rating();
    if (improves) {
      winner = candidate;
    }
  }
  return winner;
}


// Makes every position of the leader word agree with blob_choice:
// first, a copy of blob_choice is inserted into the choice list of each
// position that has no choice with the same unichar id; then every unichar
// id in the best choice that differs from blob_choice's id is overwritten.
static void CorrectRepcharChoices(BLOB_CHOICE *blob_choice, WERD_RES *word_res) {
  WERD_CHOICE *leader = word_res->best_choice;

  // Step 1: make sure each position offers the repeated character.
  for (unsigned pos = 0; pos < word_res->best_choice->length(); ++pos) {
    BLOB_CHOICE *existing =
        FindMatchingChoice(blob_choice->unichar_id(), word_res->GetBlobChoices(pos));
    if (existing != nullptr) {
      continue;
    }
    BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(pos));
    choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
  }

  // Step 2: correct any incorrect results in the best choice.
  for (unsigned pos = 0; pos < leader->length(); ++pos) {
    const bool already_correct = leader->unichar_id(pos) == blob_choice->unichar_id();
    if (!already_correct) {
      leader->set_unichar_id(blob_choice->unichar_id(), pos);
    }
  }
}


// Rewrites the current word of page_res_it so that every character of its
// best choice is the character that occurs most often in it.
//
// The best-rated BLOB_CHOICE for that character anywhere in the word is used
// as the exemplar. When one is found the word is marked done, its choices are
// corrected and its reject map is re-initialised to the word length. When
// none is found a message is printed and the word is left untouched.
void Tesseract::fix_rep_char(PAGE_RES_IT *page_res_it) {
  WERD_RES *rep_word = page_res_it->word();
  const WERD_CHOICE &best = *(rep_word->best_choice);

  // Tally how often each unichar id occurs in the best choice.
  SortHelper<UNICHAR_ID> tally(best.length());
  for (unsigned pos = 0; pos < best.length(); ++pos) {
    tally.Add(best.unichar_id(pos), 1);
  }

  // Pick the most frequent id and look for its best classifier result.
  UNICHAR_ID dominant_id = INVALID_UNICHAR_ID;
  const int dominant_count = tally.MaxCount(&dominant_id);
  BLOB_CHOICE *exemplar = FindBestMatchingChoice(dominant_id, rep_word);

  if (exemplar != nullptr) {
    rep_word->done = true;
    // Only the existing classification is corrected; nothing is re-recognized.
    CorrectRepcharChoices(exemplar, rep_word);
    rep_word->reject_map.initialise(best.length());
    return;
  }

  tprintf("Failed to find a choice for %s, occurring %d times\n",
          rep_word->uch_set->debug_str(dominant_id).c_str(), dominant_count);
}


// Classifies the string s as one of the ACCEPTABLE_WERD_TYPE shapes.
//
//   char_set - unicharset used for the upper/lower case tests.
//   s        - the word text; scanned until its terminating '\0'.
//   lengths  - '\0'-terminated array read in parallel with s: lengths[i] is
//              used as the number of bytes of s taken by the i-th character.
//
// Returns
//   AC_UPPER_CASE   - more than one leading upper-case character,
//   AC_INITIAL_CAP  - exactly one leading upper-case character followed by
//                     lower case,
//   AC_LOWER_CASE   - lower case only,
//   AC_UC_ABBREV /
//   AC_LC_ABBREV    - alternating letter and '.' ("A.B.C." / "a.b.c."),
//   AC_UNACCEPTABLE - anything else, including any word for which
//                     strlen(lengths) exceeds 20.
// The word forms may carry one leading character from chs_leading_punct and
// up to two trailing characters from chs_trailing_punct1/2. The abbreviation
// patterns are only tried when the word patterns fail.
ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(const UNICHARSET &char_set, const char *s,
                                                       const char *lengths) {
  int i = 0;      // index of the current character (into lengths)
  int offset = 0; // byte offset of the current character (into s)
  int leading_punct_count;
  int upper_count = 0;
  int hyphen_pos = -1;
  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;

  // Over-long words are rejected outright, without the abbreviation check.
  if (strlen(lengths) > 20) {
    return word_type;
  }

  /* Single Leading punctuation char*/

  if (s[offset] != '\0' && chs_leading_punct.contains(s[offset])) {
    offset += lengths[i++];
  }
  leading_punct_count = i;

  /* Initial cap */
  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
    offset += lengths[i++];
    upper_count++;
  }
  if (upper_count > 1) {
    word_type = AC_UPPER_CASE;
  } else {
    /* Lower case word, possibly with an initial cap */
    while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
      offset += lengths[i++];
    }
    // Too few alphabetic characters after any leading punctuation: give up
    // on the word patterns and fall through to the abbreviation check.
    if (i - leading_punct_count < quality_min_initial_alphas_reqd) {
      goto not_a_word;
    }
    /*
     * Allow a single hyphen in a lower case word
     * - don't trust upper case - I've seen several cases of "H" -> "I-I"
     */
    if (lengths[i] == 1 && s[offset] == '-') {
      hyphen_pos = i;
      offset += lengths[i++];
      // A hyphen at the very end of the string is accepted as is.
      if (s[offset] != '\0') {
        while ((s[offset] != '\0') && char_set.get_islower(s + offset, lengths[i])) {
          offset += lengths[i++];
        }
        // Require at least two lower-case characters after the hyphen.
        if (i < hyphen_pos + 3) {
          goto not_a_word;
        }
      }
    } else {
      /* Allow "'s" in NON hyphenated lower case words */
      if (lengths[i] == 1 && (s[offset] == '\'') && lengths[i + 1] == 1 &&
          (s[offset + lengths[i]] == 's')) {
        // Skip both the apostrophe and the 's'.
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    if (upper_count > 0) {
      word_type = AC_INITIAL_CAP;
    } else {
      word_type = AC_LOWER_CASE;
    }
  }

  /* Up to two different, constrained trailing punctuation chars */
  if (lengths[i] == 1 && s[offset] != '\0' && chs_trailing_punct1.contains(s[offset])) {
    offset += lengths[i++];
  }
  // The second trailing character must differ from the first byte of the
  // character before it.
  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 && s[offset - lengths[i - 1]] != s[offset] &&
      chs_trailing_punct2.contains(s[offset])) {
    offset += lengths[i++];
  }

  // Anything left unconsumed means the string did not fit a word pattern.
  if (s[offset] != '\0') {
    word_type = AC_UNACCEPTABLE;
  }

not_a_word:

  if (word_type == AC_UNACCEPTABLE) {
    /* Look for abbreviation string */
    // Restart from the beginning of the string.
    i = 0;
    offset = 0;
    if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
      word_type = AC_UC_ABBREV;
      // Consume (upper-case character, '.') pairs.
      while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i]) &&
             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    } else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
      word_type = AC_LC_ABBREV;
      // Consume (lower-case character, '.') pairs.
      while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i]) &&
             lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
        offset += lengths[i++];
        offset += lengths[i++];
      }
    }
    // The pairs must account for the whole string.
    if (s[offset] != '\0') {
      word_type = AC_UNACCEPTABLE;
    }
  }

  return word_type;
}


// Debug hook that reports the state of a word lying on the test point.
//
//   word     - the word to inspect.
//   location - identifies the call site (0, 10, ..., 120, see the switch
//              below); a negative value asks only for the hit test.
//
// Returns false when test_pt is off or when the bounding box of word->word
// does not contain (test_pt_x, test_pt_y); returns true otherwise.
//
// Side effects: whenever test_pt is on, tessedit_rejection_debug and
// debug_x_ht_level are first reset to false/0. For a hit with a non-negative
// location they are set to true/2 and the word's best choice, reject map and
// accepted/done flags are printed.
bool Tesseract::check_debug_pt(WERD_RES *word, int location) {
  if (!test_pt) {
    return false;
  }

  tessedit_rejection_debug.set_value(false);
  debug_x_ht_level.set_value(0);

  if (!word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y))) {
    return false;
  }
  if (location < 0) {
    return true; // For breakpoint use
  }

  tessedit_rejection_debug.set_value(true);
  debug_x_ht_level.set_value(2);
  tprintf("\n\nTESTWD::");

  bool show_map_detail = false;
  switch (location) {
    case 0:
      tprintf("classify_word_pass1 start\n");
      word->word->print();
      break;
    case 10:
      tprintf("make_reject_map: initial map");
      break;
    case 20:
      tprintf("make_reject_map: after NN");
      break;
    case 30:
      tprintf("classify_word_pass2 - START");
      break;
    case 40:
      tprintf("classify_word_pass2 - Pre Xht");
      break;
    case 50:
      tprintf("classify_word_pass2 - END");
      show_map_detail = true;
      break;
    case 60:
      tprintf("fixspace");
      break;
    case 70:
      tprintf("MM pass START");
      break;
    case 80:
      tprintf("MM pass END");
      break;
    case 90:
      tprintf("After Poor quality rejection");
      break;
    case 100:
      tprintf("unrej_good_quality_words - START");
      break;
    case 110:
      tprintf("unrej_good_quality_words - END");
      break;
    case 120:
      tprintf("Write results pass");
      show_map_detail = true;
      break;
  }

  if (word->best_choice != nullptr) {
    // Fetch the text once instead of on every use.
    const auto &text = word->best_choice->unichar_string();
    tprintf(" \"%s\" ", text.c_str());
    word->reject_map.print(debug_fp);
    tprintf("\n");
    if (show_map_detail) {
      tprintf("\"%s\"\n", text.c_str());
      // The reject map is indexed per character (fix_rep_char initialises it
      // from the word length), whereas text is indexed per byte. Walk the
      // characters via their byte lengths so that a multi-byte character is
      // printed whole and the map is never indexed past its end.
      const auto &lengths = word->best_choice->unichar_lengths();
      int byte_offset = 0;
      for (unsigned i = 0; i < lengths.size() && i < word->reject_map.length(); ++i) {
        const int char_bytes = lengths[i];
        tprintf("**** \"%.*s\" ****\n", char_bytes, text.c_str() + byte_offset);
        word->reject_map[i].full_print(debug_fp);
        byte_offset += char_bytes;
      }
    }
  } else {
    tprintf("null best choice\n");
  }
  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
  return true;
}


#ifndef DISABLED_LEGACY_ENGINE

// Reports the modal entry of *fonts and subtracts the reported count from
// that entry, so the statistics are modified by the call.
//
//   fonts      - font statistics to query and update.
//   font_out   - receives the modal font index, or -1 when the total is not
//                positive.
//   font_count - receives the modal pile count saturated at INT8_MAX, or 0
//                when the total is not positive.
static void find_modal_font(STATS *fonts, int16_t *font_out, int8_t *font_count) {
  if (!(fonts->get_total() > 0)) {
    // No samples at all: report "no font".
    *font_out = -1;
    *font_count = 0;
    return;
  }

  const auto modal_font = static_cast<int16_t>(fonts->mode());
  *font_out = modal_font;

  const int32_t pile = fonts->pile_count(modal_font);
  if (pile < INT8_MAX) {
    *font_count = static_cast<int8_t>(pile);
  } else {
    *font_count = INT8_MAX; // Saturate to fit the int8_t output.
  }

  // Take the reported (possibly saturated) count back out of the pile.
  fonts->add(modal_font, -*font_count);
}

#endif // ! DISABLED_LEGACY_ENGINE


// Chooses the first and second choice fonts for a word and stores them in
// word->fontinfo / word->fontinfo2 together with their vote counts.
//
// Nothing is done when the word has no chopped_word (an lstm word), when the
// legacy engine is compiled out, or when fontinfo_table_ is empty. A valid
// positive tessedit_font_id overrides detection and is assigned with the
// maximum count. Otherwise the per-blob font scores of the best choice are
// summed per font and the two highest totals win.
void Tesseract::set_word_fonts(WERD_RES *word) {
  // Don't try to set the word fonts for an lstm word, as the configs
  // will be meaningless.
  if (word->chopped_word == nullptr) {
    return;
  }
  ASSERT_HOST(word->best_choice != nullptr);

#ifndef DISABLED_LEGACY_ENGINE
  const int fontinfo_size = fontinfo_table_.size();
  if (fontinfo_size == 0) {
    return;
  }
  if (tessedit_font_id > 0) {
    if (tessedit_font_id >= fontinfo_size) {
      tprintf("Error, invalid font ID provided: must be below %d.\n"
              "Falling back to font auto-detection.\n", fontinfo_size);
    } else {
      // Forced font: no second choice, maximum confidence.
      word->fontinfo = &fontinfo_table_.at(tessedit_font_id);
      word->fontinfo2 = nullptr;
      word->fontinfo_id_count = INT8_MAX;
      word->fontinfo_id2_count = 0;
      return;
    }
  }
  std::vector<int> font_total_score(fontinfo_size);

  // Compute the font scores for the word
  if (tessedit_debug_fonts) {
    tprintf("Examining fonts in %s\n", word->best_choice->debug_string().c_str());
  }
  for (unsigned b = 0; b < word->best_choice->length(); ++b) {
    const BLOB_CHOICE *choice = word->GetBlobChoice(b);
    if (choice == nullptr) {
      continue;
    }
    for (const auto &f : choice->fonts()) {
      const int fontinfo_id = f.fontinfo_id;
      // Ignore ids that do not index the font table.
      if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
        font_total_score[fontinfo_id] += f.score;
      }
    }
  }

  // Find the top and 2nd choice for the word.
  int score1 = 0, score2 = 0;
  int16_t font_id1 = -1, font_id2 = -1;
  for (int f = 0; f < fontinfo_size; ++f) {
    if (tessedit_debug_fonts && font_total_score[f] > 0) {
      tprintf("Font %s, total score = %d\n", fontinfo_table_.at(f).name, font_total_score[f]);
    }
    if (font_total_score[f] > score1) {
      // New leader: the previous leader becomes the runner-up.
      score2 = score1;
      font_id2 = font_id1;
      score1 = font_total_score[f];
      font_id1 = f;
    } else if (font_total_score[f] > score2) {
      score2 = font_total_score[f];
      font_id2 = f;
    }
  }
  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.at(font_id1) : nullptr;
  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.at(font_id2) : nullptr;
  // Each score has a limit of UINT16_MAX, so divide by that to get the number
  // of "votes" for that font, ie number of perfect scores.
  word->fontinfo_id_count = ClipToRange<int>(score1 / UINT16_MAX, 1, INT8_MAX);
  word->fontinfo_id2_count = ClipToRange<int>(score2 / UINT16_MAX, 0, INT8_MAX);

  // score1 > 0 implies font_id1 >= 0, so word->fontinfo is non-null here.
  // Using the stored pointers avoids copying a FontInfo for every word.
  if (tessedit_debug_fonts && score1 > 0) {
    if (word->fontinfo_id2_count > 0 && font_id2 >= 0) {
      tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", word->fontinfo->name,
              word->fontinfo_id_count, word->fontinfo2->name,
              word->fontinfo_id2_count);
    } else {
      tprintf("Word modal font=%s, score=%d. No 2nd choice\n", word->fontinfo->name,
              word->fontinfo_id_count);
    }
  }
#endif // ndef DISABLED_LEGACY_ENGINE
}


#ifndef DISABLED_LEGACY_ENGINE

// Page-level font smoothing.
//
// The first and second choice fonts of every word vote, weighted by their
// counts, for their universal_id. The modal id is then looked up again among
// the words to recover a FontInfo pointer. Finally every word whose own font
// count is weak compared with its length is given that modal font with a
// count of 1. Returns early when the modal count is 0.
void Tesseract::font_recognition_pass(PAGE_RES *page_res) {
  PAGE_RES_IT page_res_it(page_res);
  STATS doc_fonts(0, font_table_size_ - 1); // font counters

  // Gather font id statistics.
  page_res_it.restart_page();
  for (WERD_RES *w = page_res_it.word(); w != nullptr; w = page_res_it.forward()) {
    if (w->fontinfo != nullptr) {
      doc_fonts.add(w->fontinfo->universal_id, w->fontinfo_id_count);
    }
    if (w->fontinfo2 != nullptr) {
      doc_fonts.add(w->fontinfo2->universal_id, w->fontinfo_id2_count);
    }
  }

  int16_t doc_font;      // modal font id
  int8_t doc_font_count; // votes for the modal font
  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
  if (doc_font_count == 0) {
    return;
  }

  // Get the modal font pointer: first word referring to that id wins.
  const FontInfo *modal_font = nullptr;
  page_res_it.restart_page();
  for (WERD_RES *w = page_res_it.word(); w != nullptr; w = page_res_it.forward()) {
    const FontInfo *first = w->fontinfo;
    if (first != nullptr && first->universal_id == doc_font) {
      modal_font = first;
      break;
    }
    const FontInfo *second = w->fontinfo2;
    if (second != nullptr && second->universal_id == doc_font) {
      modal_font = second;
      break;
    }
  }
  ASSERT_HOST(modal_font != nullptr);

  // Assign modal font to weak words.
  page_res_it.restart_page();
  for (WERD_RES *w = page_res_it.word(); w != nullptr; w = page_res_it.forward()) {
    const int length = w->best_choice->length();
    const int count = w->fontinfo_id_count;

    // A word keeps its font when every character voted for it, or when it
    // has more than 3 characters and at least 3/4 of them did.
    const bool unanimous = count == length;
    const bool strong_majority = length > 3 && count >= length * 3 / 4;
    if (unanimous || strong_majority) {
      continue;
    }
    w->fontinfo = modal_font;
    // Counts only get 1 as it came from the doc.
    w->fontinfo_id_count = 1;
  }
}

#endif // ndef DISABLED_LEGACY_ENGINE


// For every word that has alternates and whose best choice is not a valid
// dictionary word, promotes the first alternate in best_choices that is
// valid to become the best choice. Words without alternates, or whose best
// choice is already valid, are left alone.
void Tesseract::dictionary_correction_pass(PAGE_RES *page_res) {
  PAGE_RES_IT word_it(page_res);
  for (WERD_RES *current = word_it.word(); current != nullptr; current = word_it.forward()) {
    if (current->best_choices.singleton()) {
      continue; // There are no alternates.
    }

    auto &dict = current->tesseract->getDict();
    const WERD_CHOICE *top = current->best_choice;
    if (dict.valid_word(*top) != 0) {
      continue; // The best choice is in the dictionary.
    }

    WERD_CHOICE_IT alt_it(&current->best_choices);
    for (alt_it.mark_cycle_pt(); !alt_it.cycled_list(); alt_it.forward()) {
      WERD_CHOICE *candidate = alt_it.data();
      if (!dict.valid_word(*candidate)) {
        continue; // Not a dictionary word either; try the next one.
      }
      if (tessedit_bigram_debug) {
        tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
                top->unichar_string().c_str(), candidate->unichar_string().c_str());
      }
      // Promote the dictionary word and stop looking.
      current->ReplaceBestChoice(candidate);
      break;
    }
  }
}


} // namespace tesseract

ocrclass.h

fixspace.h

reject.h

docqual.h

control.h

ACCEPTABLE_WERD_TYPE
ACCEPTABLE_WERD_TYPE
Definition: control.h:28

AC_UC_ABBREV
@ AC_UC_ABBREV
A.B.C.
Definition: control.h:34

AC_INITIAL_CAP
@ AC_INITIAL_CAP
ALL but initial lc.
Definition: control.h:32

AC_LC_ABBREV
@ AC_LC_ABBREV
a.b.c.
Definition: control.h:33

AC_UNACCEPTABLE
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:29

AC_UPPER_CASE
@ AC_UPPER_CASE
ALL upper case.
Definition: control.h:31

AC_LOWER_CASE
@ AC_LOWER_CASE
ALL lower case.
Definition: control.h:30

werdit.h

kBackUpConfigFile
const char *const kBackUpConfigFile
Definition: control.cpp:47

kMinRefitXHeightFraction
const double kMinRefitXHeightFraction
Definition: control.cpp:51

output.h

tessvars.h

debug_fp
FILE * debug_fp
Definition: tessvars.cpp:24

tesseractclass.h

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:54

sorthelper.h

drawfx.h

pageres.h

lstmrecognizer.h

i
int i
Definition: gmock-matchers_test.cc:718

count
int * count
Definition: gmock_stress_test.cc:96

tesseract
Definition: baseapi.h:39

tesseract::W_BOL
@ W_BOL
start of line
Definition: werd.h:34

tesseract::W_EOL
@ W_EOL
end of line
Definition: werd.h:35

tesseract::W_REP_CHAR
@ W_REP_CHAR
repeated character
Definition: werd.h:40

tesseract::OEM_TESSERACT_LSTM_COMBINED
@ OEM_TESSERACT_LSTM_COMBINED
Definition: publictypes.h:266

tesseract::OEM_LSTM_ONLY
@ OEM_LSTM_ONLY
Definition: publictypes.h:265

tesseract::OEM_TESSERACT_ONLY
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:264

tesseract::SET_PARAM_CONSTRAINT_DEBUG_ONLY
@ SET_PARAM_CONSTRAINT_DEBUG_ONLY
Definition: params.h:41

tesseract::make_pseudo_word
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:38

tesseract::clear_fx_win
void clear_fx_win()
Definition: drawfx.cpp:61

tesseract::PageSegMode
PageSegMode
Definition: publictypes.h:157

tesseract::tprintf
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

tesseract::IncorrectResultReason
IncorrectResultReason
Definition: blamer.h:56

tesseract::IRR_NUM_REASONS
@ IRR_NUM_REASONS
Definition: blamer.h:103

tesseract::create_fx_win
void create_fx_win()
Definition: drawfx.cpp:50

tesseract::UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:34

tesseract::WordRecognizer
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
Definition: tesseractclass.h:176

tesseract::FindMatchingChoice
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:177

tesseract::SYSTEM_DAWG_PERM
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:244

tesseract::USER_DAWG_PERM
@ USER_DAWG_PERM
Definition: ratngs.h:246

tesseract::FREQ_DAWG_PERM
@ FREQ_DAWG_PERM
Definition: ratngs.h:247

tesseract::fx_win
ScrollView * fx_win
Definition: drawfx.cpp:42

tesseract::EqualIgnoringCaseAndTerminalPunct
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:773

tesseract::ETEXT_DESC
Definition: ocrclass.h:102

tesseract::ETEXT_DESC::ocr_alive
volatile int8_t ocr_alive
true if not last
Definition: ocrclass.h:110

tesseract::ETEXT_DESC::deadline_exceeded
bool deadline_exceeded() const
Definition: ocrclass.h:136

tesseract::ETEXT_DESC::cancel_this
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:116

tesseract::ETEXT_DESC::progress_callback2
PROGRESS_FUNC2 progress_callback2
called whenever progress increases
Definition: ocrclass.h:115

tesseract::ETEXT_DESC::progress
int16_t progress
chars in this buffer(0)
Definition: ocrclass.h:105

tesseract::ETEXT_DESC::cancel
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:112

tesseract::TesseractStats::dict_words
int32_t dict_words
Definition: tesseractclass.h:141

tesseract::TesseractStats::doc_outline_errs
int16_t doc_outline_errs
Definition: tesseractclass.h:136

tesseract::TesseractStats::word_count
int32_t word_count
Definition: tesseractclass.h:140

tesseract::TesseractStats::doc_blob_quality
int16_t doc_blob_quality
Definition: tesseractclass.h:135

tesseract::TesseractStats::doc_good_char_quality
int16_t doc_good_char_quality
Definition: tesseractclass.h:139

tesseract::TesseractStats::doc_char_quality
int16_t doc_char_quality
Definition: tesseractclass.h:137

tesseract::TesseractStats::good_char_count
int16_t good_char_count
Definition: tesseractclass.h:138

tesseract::WordData
Definition: tesseractclass.h:151

tesseract::WordData::row
ROW * row
Definition: tesseractclass.h:162

tesseract::WordData::prev_word
WordData * prev_word
Definition: tesseractclass.h:164

tesseract::WordData::word
WERD_RES * word
Definition: tesseractclass.h:161

tesseract::WordData::block
BLOCK * block
Definition: tesseractclass.h:163

tesseract::WordData::lang_words
PointerVector< WERD_RES > lang_words
Definition: tesseractclass.h:165

tesseract::Tesseract
Definition: tesseractclass.h:178

tesseract::Tesseract::AssignDiacriticsToNewBlobs
void AssignDiacriticsToNewBlobs(const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< C_BLOB * > *target_blobs)
Definition: control.cpp:1036

tesseract::Tesseract::LSTMRecognizeWord
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:230

tesseract::Tesseract::word_char_quality
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:81

tesseract::Tesseract::recog_interactive
bool recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:76

tesseract::Tesseract::classify_word_and_language
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1302

tesseract::Tesseract::match_word_pass_n
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1589

tesseract::Tesseract::bigram_correction_pass
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:456

tesseract::Tesseract::word_blob_quality
int16_t word_blob_quality(WERD_RES *word)
Definition: docqual.cpp:51

tesseract::Tesseract::ComputeCompatibleXheight
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:105

tesseract::Tesseract::fix_rep_char
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1665

tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs
void AssignDiacriticsToOverlappingBlobs(const std::vector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, std::vector< bool > *word_wanted, std::vector< bool > *overlapped_any_blob, std::vector< C_BLOB * > *target_blobs)
Definition: control.cpp:981

tesseract::Tesseract::SetupWordPassN
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:166

tesseract::Tesseract::recog_pseudo_word
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:62

tesseract::Tesseract::PrerecAllWordsPar
void PrerecAllWordsPar(const std::vector< WordData > &words)
Definition: par_control.cpp:38

tesseract::Tesseract::word_outline_errs
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:62

tesseract::Tesseract::rejection_passes
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:599

tesseract::Tesseract::tess_segment_pass_n
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:32

tesseract::Tesseract::acceptable_word_string
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1692

tesseract::Tesseract::output_pass
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:39

tesseract::Tesseract::getDict
Dict & getDict() override
Definition: tesseractclass.cpp:480

tesseract::Tesseract::TestNewNormalization
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1488

tesseract::Tesseract::quality_based_rejection
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:120

tesseract::Tesseract::classify_word_pass1
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1379

tesseract::Tesseract::BestPix
Image BestPix() const
Definition: tesseractclass.h:238

tesseract::Tesseract::RecogAllWordsPassN
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, std::vector< WordData > *words)
Definition: control.cpp:198

tesseract::Tesseract::SubAndSuperscriptFix
bool SubAndSuperscriptFix(WERD_RES *word_res)
Definition: superscript.cpp:108

tesseract::Tesseract::RetryWithLanguage
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:873

tesseract::Tesseract::dictionary_correction_pass
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2057

tesseract::Tesseract::check_debug_pt
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1799

tesseract::Tesseract::tess_add_doc_word
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:73

tesseract::Tesseract::font_recognition_pass
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:2003

tesseract::Tesseract::ReassignDiacritics
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:914

tesseract::Tesseract::AnyTessLang
bool AnyTessLang() const
Definition: tesseractclass.h:290

tesseract::Tesseract::ProcessTargetWord
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:118

tesseract::Tesseract::CountMisfitTops
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:72

tesseract::Tesseract::TrainedXheightFix
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1455

tesseract::Tesseract::word_adaptable
bool word_adaptable(WERD_RES *word, uint16_t mode)
Definition: adaptions.cpp:34

tesseract::Tesseract::set_word_fonts
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1915

tesseract::Tesseract::SelectGoodDiacriticOutlines
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const std::vector< C_OUTLINE * > &outlines, int num_outlines, std::vector< bool > *ok_outlines)
Definition: control.cpp:1120

tesseract::Tesseract::fix_fuzzy_spaces
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:77

tesseract::Tesseract::AnyLSTMLang
bool AnyLSTMLang() const
Definition: tesseractclass.h:302

tesseract::Tesseract::tess_acceptable_word
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:64

tesseract::Tesseract::right_to_left
bool right_to_left() const
Definition: tesseractclass.h:280

tesseract::Tesseract::script_pos_pass
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:707

tesseract::Tesseract::ClassifyBlobAsWord
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str, float *c2)
Definition: control.cpp:1252

tesseract::Tesseract::ClassifyBlobPlusOutlines
float ClassifyBlobPlusOutlines(const std::vector< bool > &ok_outlines, const std::vector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, std::string &best_str)
Definition: control.cpp:1207

tesseract::Tesseract::blamer_pass
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:683

tesseract::Tesseract::classify_word_pass2
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1535

tesseract::Tesseract::make_reject_map
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
Definition: reject.cpp:96

tesseract::Tesseract::recog_all_words
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:287

tesseract::Tesseract::SetupAllWordsPassN
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, std::vector< WordData > *words)
Definition: control.cpp:146

tesseract::Tesseract::ReportXhtFixResult
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1436

tesseract::BlamerBundle
Definition: blamer.h:107

tesseract::BlamerBundle::IncorrectReasonName
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:56

tesseract::BlamerBundle::LastChanceBlame
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:540

tesseract::BlamerBundle::misadaption_debug
const std::string & misadaption_debug() const
Definition: blamer.h:143

tesseract::BlamerBundle::SetMisAdaptionDebug
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:564

tesseract::BlamerBundle::CopyTruth
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:214

tesseract::BlamerBundle::incorrect_result_reason
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:131

tesseract::TWERD::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:863

tesseract::TWERD::plot
void plot(ScrollView *window)
Definition: blobs.cpp:907

tesseract::BoxWord::length
unsigned length() const
Definition: boxword.h:81

tesseract::CCStruct::kXHeightCapRatio
static const double kXHeightCapRatio
Definition: ccstruct.h:35

tesseract::C_OUTLINE
Definition: coutln.h:75

tesseract::FontInfo
Definition: fontinfo.h:64

tesseract::FontInfo::universal_id
int32_t universal_id
Definition: fontinfo.h:140

tesseract::FontInfo::name
char * name
Definition: fontinfo.h:134

tesseract::BLOCK
Definition: ocrblock.h:34

tesseract::BLOCK::classify_rotation
FCOORD classify_rotation() const
Definition: ocrblock.h:135

tesseract::BLOCK::pdblk
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185

tesseract::BLOCK::x_height
int32_t x_height() const
return xheight
Definition: ocrblock.h:101

tesseract::ROW
Definition: ocrrow.h:39

tesseract::ROW::x_height
float x_height() const
Definition: ocrrow.h:66

tesseract::PAGE_RES
Definition: pageres.h:77

tesseract::PAGE_RES::rej_count
int32_t rej_count
Definition: pageres.h:80

tesseract::PAGE_RES::char_count
int32_t char_count
Definition: pageres.h:79

tesseract::PAGE_RES::misadaption_log
std::vector< std::string > misadaption_log
Definition: pageres.h:92

tesseract::PAGE_RES::blame_reasons
std::vector< int > blame_reasons
Definition: pageres.h:87

tesseract::BLOCK_RES::block
BLOCK * block
Definition: pageres.h:120

tesseract::WERD_RES
Definition: pageres.h:164

tesseract::WERD_RES::fontinfo2
const FontInfo * fontinfo2
Definition: pageres.h:308

tesseract::WERD_RES::tesseract
tesseract::Tesseract * tesseract
Definition: pageres.h:278

tesseract::WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:239

tesseract::WERD_RES::raw_choice
WERD_CHOICE * raw_choice
Definition: pageres.h:244

tesseract::WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:301

tesseract::WERD_RES::fontinfo_id2_count
int8_t fontinfo_id2_count
Definition: pageres.h:310

tesseract::WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:292

tesseract::WERD_RES::fix_hyphens
void fix_hyphens()
Definition: pageres.cpp:1077

tesseract::WERD_RES::done
bool done
Definition: pageres.h:303

tesseract::WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:210

tesseract::WERD_RES::InitForRetryRecognition
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:279

tesseract::WERD_RES::SetupForRecognition
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Image pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:304

tesseract::WERD_RES::small_caps
bool small_caps
Definition: pageres.h:304

tesseract::WERD_RES::ConsumeWordResults
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:785

tesseract::WERD_RES::SetScriptPositions
void SetScriptPositions()
Definition: pageres.cpp:888

tesseract::WERD_RES::blamer_bundle
BlamerBundle * blamer_bundle
Definition: pageres.h:250

tesseract::WERD_RES::fontinfo_id_count
int8_t fontinfo_id_count
Definition: pageres.h:309

tesseract::WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:201

tesseract::WERD_RES::guessed_x_ht
bool guessed_x_ht
Definition: pageres.h:311

tesseract::WERD_RES::fontinfo
const FontInfo * fontinfo
Definition: pageres.h:307

tesseract::WERD_RES::word
WERD * word
Definition: pageres.h:184

tesseract::WERD_RES::GetBlobChoices
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:779

tesseract::WERD_RES::BestChoiceToCorrectText
void BestChoiceToCorrectText()
Definition: pageres.cpp:956

tesseract::WERD_RES::best_choices
WERD_CHOICE_LIST best_choices
Definition: pageres.h:247

tesseract::WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:270

tesseract::WERD_RES::ReplaceBestChoice
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:824

tesseract::WERD_RES::tess_would_adapt
bool tess_would_adapt
Definition: pageres.h:302

tesseract::WERD_RES::tess_failed
bool tess_failed
Definition: pageres.h:293

tesseract::WERD_RES::PrintBestChoices
void PrintBestChoices() const
Definition: pageres.cpp:731

tesseract::WERD_RES::GetBlobChoice
BLOB_CHOICE * GetBlobChoice(unsigned index) const
Definition: pageres.cpp:768

tesseract::WERD_RES::IsAmbiguous
bool IsAmbiguous()
Definition: pageres.cpp:455

tesseract::WERD_RES::baseline_shift
float baseline_shift
Definition: pageres.h:316

tesseract::WERD_RES::fix_quotes
void fix_quotes()
Definition: pageres.cpp:1047

tesseract::WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:264

tesseract::WERD_RES::part_of_combo
bool part_of_combo
Definition: pageres.h:338

tesseract::WERD_RES::caps_height
float caps_height
Definition: pageres.h:315

tesseract::WERD_RES::x_height
float x_height
Definition: pageres.h:314

tesseract::PAGE_RES_IT
Definition: pageres.h:682

tesseract::PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:769

tesseract::PAGE_RES_IT::MakeCurrentWordFuzzy
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1521

tesseract::PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:684

tesseract::PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:743

tesseract::PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:763

tesseract::PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:710

tesseract::PAGE_RES_IT::DeleteCurrentWord
void DeleteCurrentWord()
Definition: pageres.cpp:1488

tesseract::PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1722

tesseract::PAGE_RES_IT::ResetWordIterator
void ResetWordIterator()
Definition: pageres.cpp:1571

tesseract::PAGE_RES_IT::ReplaceCurrentWord
void ReplaceCurrentWord(PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1378

tesseract::PAGE_RES_IT::InsertSimpleCloneWord
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1252

tesseract::PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59

tesseract::FCOORD
Definition: points.h:189

tesseract::FCOORD::y
float y() const
Definition: points.h:209

tesseract::POLY_BLOCK
Definition: polyblk.h:30

tesseract::POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:52

tesseract::BLOB_CHOICE
Definition: ratngs.h:56

tesseract::BLOB_CHOICE::fonts
const std::vector< ScoredFont > & fonts() const
Definition: ratngs.h:97

tesseract::BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:84

tesseract::WERD_CHOICE
Definition: ratngs.h:258

tesseract::WERD_CHOICE::max_x_height
float max_x_height() const
Definition: ratngs.h:324

tesseract::WERD_CHOICE::debug_string
std::string debug_string() const
Definition: ratngs.h:479

tesseract::WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:315

tesseract::WERD_CHOICE::shallow_copy
WERD_CHOICE shallow_copy(unsigned start, unsigned end) const
Definition: ratngs.cpp:393

tesseract::WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:299

tesseract::WERD_CHOICE::empty
bool empty() const
Definition: ratngs.h:284

tesseract::WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:331

tesseract::WERD_CHOICE::GetNonSuperscriptSpan
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:378

tesseract::WERD_CHOICE::min_x_height
float min_x_height() const
Definition: ratngs.h:321

tesseract::WERD_CHOICE::length
unsigned length() const
Definition: ratngs.h:287

tesseract::WERD_CHOICE::unichar_string
std::string & unichar_string()
Definition: ratngs.h:519

tesseract::WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:312

tesseract::WERD_CHOICE::IsAllSpaces
bool IsAllSpaces() const
Definition: ratngs.h:497

tesseract::TBOX
Definition: rect.h:37

tesseract::TBOX::left
TDimension left() const
Definition: rect.h:82

tesseract::TBOX::top
TDimension top() const
Definition: rect.h:68

tesseract::TBOX::major_x_overlap
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:419

tesseract::TBOX::print
void print() const
Definition: rect.h:289

tesseract::TBOX::right
TDimension right() const
Definition: rect.h:89

tesseract::TBOX::bottom
TDimension bottom() const
Definition: rect.h:75

tesseract::TBOX::contains
bool contains(const FCOORD pt) const
Definition: rect.h:344

tesseract::TBOX::major_overlap
bool major_overlap(const TBOX &box) const
Definition: rect.h:374

tesseract::TBOX::x_overlap
bool x_overlap(const TBOX &box) const
Definition: rect.h:409

tesseract::REJMAP::print
void print(FILE *fp) const
Definition: rejctmap.cpp:112

tesseract::REJMAP::reject_count
int16_t reject_count() const
Definition: rejctmap.h:339

tesseract::REJMAP::rej_word_bad_quality
void rej_word_bad_quality()
Definition: rejctmap.cpp:187

tesseract::REJMAP::length
uint16_t length() const
Definition: rejctmap.h:333

tesseract::REJMAP::initialise
void initialise(uint16_t length)
Definition: rejctmap.cpp:67

tesseract::REJMAP::full_print
void full_print(FILE *fp) const
Definition: rejctmap.cpp:120

tesseract::STATS
Definition: statistc.h:30

tesseract::STATS::add
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99

tesseract::STATS::pile_count
int32_t pile_count(int32_t value) const
Definition: statistc.h:72

tesseract::STATS::get_total
int32_t get_total() const
Definition: statistc.h:85

tesseract::STATS::mode
int32_t mode() const
Definition: statistc.cpp:112

tesseract::C_BLOB
Definition: stepblob.h:40

tesseract::C_BLOB::out_list
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:70

tesseract::C_BLOB::bounding_box
TBOX bounding_box() const
Definition: stepblob.cpp:250

tesseract::C_BLOB::SortByXMiddle
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:124

tesseract::C_BLOB::deep_copy
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:118

tesseract::WERD
Definition: werd.h:58

tesseract::WERD::flag
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128

tesseract::WERD::GetNoiseOutlines
void GetNoiseOutlines(std::vector< C_OUTLINE * > *outlines)
Definition: werd.cpp:508

tesseract::WERD::ConstructFromSingleBlob
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:132

tesseract::WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:155

tesseract::WERD::AddSelectedOutlines
bool AddSelectedOutlines(const std::vector< bool > &wanted, const std::vector< C_BLOB * > &target_blobs, const std::vector< C_OUTLINE * > &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:526

tesseract::WERD::rej_cblob_list
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:91

tesseract::WERD::print
void print() const
Definition: werd.cpp:262

tesseract::WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:96

tesseract::CCUtil::params
ParamsVectors * params()
Definition: ccutil.h:53

tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:61

tesseract::CCUtil::lang
std::string lang
Definition: ccutil.h:59

tesseract::GenericVector::size
unsigned size() const
Definition: genericvector.h:70

tesseract::GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:574

tesseract::GenericVector::back
T & back() const
Definition: genericvector.h:516

tesseract::GenericVector::empty
bool empty() const
Definition: genericvector.h:84

tesseract::PointerVector
Definition: genericvector.h:302

tesseract::ParamUtils::ReadParamsFile
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41

tesseract::ParamUtils::PrintParams
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:164

tesseract::SortHelper
Definition: sorthelper.h:37

tesseract::SortHelper::MaxCount
int MaxCount(T *max_value) const
Definition: sorthelper.h:86

tesseract::SortHelper::Add
void Add(T value, int count)
Definition: sorthelper.h:71

tesseract::UNICHARSET
Definition: unicharset.h:164

tesseract::UNICHARSET::script_has_xheight
bool script_has_xheight() const
Definition: unicharset.h:958

tesseract::UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:506

tesseract::UNICHARSET::top_bottom_useful
bool top_bottom_useful() const
Definition: unicharset.h:555

tesseract::UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515

tesseract::UNICHARSET::debug_str
std::string debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:331

tesseract::Classify::AdaptiveClassifierIsEmpty
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:268

tesseract::Classify::AdaptableWord
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:811

tesseract::Classify::LearnWord
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:262

tesseract::Classify::StartBackupAdaptiveClassifier
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:625

tesseract::Classify::SwitchAdaptiveClassifier
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:609

tesseract::Classify::AdaptiveClassifierIsFull
bool AdaptiveClassifierIsFull() const
Definition: classify.h:265

tesseract::Classify::fontinfo_table_
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:434

tesseract::Dict::valid_word_permuter
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:437

tesseract::Dict::valid_bigram
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:836

tesseract::LSTMRecognizer::GetUnicharset
const UNICHARSET & GetUnicharset() const
Definition: lstmrecognizer.h:189

tesseract::Textord::CleanupSingleRowResult
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:264

tesseract::ScrollView::ZoomToRectangle
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:742

tesseract::ScrollView::Update
static void Update()
Definition: scrollview.cpp:700

tesseract::Wordrec::prev_word_best_choice_
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:387