// Source: tesseract-ocr.github.io/3.x/a00690_source.html

 /******************************************************************

  * File:        control.cpp  (Formerly control.c)

  * Description: Module-independent matcher controller.

  * Author:                                      Ray Smith

  * Created:                                     Thu Apr 23 11:09:58 BST 1992

  * ReHacked:    Tue Sep 22 08:42:49 BST 1992 Phil Cheatle

  *

  * (C) Copyright 1992, Hewlett-Packard Ltd.

  ** Licensed under the Apache License, Version 2.0 (the "License");

  ** you may not use this file except in compliance with the License.

  ** You may obtain a copy of the License at

  ** http://www.apache.org/licenses/LICENSE-2.0

  ** Unless required by applicable law or agreed to in writing, software

  ** distributed under the License is distributed on an "AS IS" BASIS,

  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

  ** See the License for the specific language governing permissions and

  ** limitations under the License.

  *

  **********************************************************************/


 #include <string.h>

 #include <math.h>

 #ifdef __UNIX__

 #include <assert.h>

 #include <unistd.h>

 #include <errno.h>

 #endif

 #include <ctype.h>

 #include "ocrclass.h"

 #include "werdit.h"

 #include "drawfx.h"

 #include "tessbox.h"

 #include "tessvars.h"

 #include "pgedit.h"

 #include "reject.h"

 #include "fixspace.h"

 #include "docqual.h"

 #include "control.h"

 #include "output.h"

 #include "callcpp.h"

 #include "globals.h"

 #include "sorthelper.h"

 #include "tesseractclass.h"


 // Include automatically generated configuration file if running autoconf.

 #ifdef HAVE_CONFIG_H

 #include "config_auto.h"

 #endif


 #define MIN_FONT_ROW_COUNT  8

 #define MAX_XHEIGHT_DIFF  3


 const char* const kBackUpConfigFile = "tempconfigdata.config";

 // Multiple of x-height to make a repeated word have spaces in it.

 const double kRepcharGapThreshold = 0.5;

 // Min believable x-height for any text when refitting as a fraction of

 // original x-height

 const double kMinRefitXHeightFraction = 0.5;


 namespace tesseract {

 // Asks make_pseudo_word() for an iterator built from page_res and
 // selection_box. If one is returned, runs recog_interactive() on it, deletes
 // the iterator's current word and then frees the iterator itself.
 void Tesseract::recog_pseudo_word(PAGE_RES* page_res,
                                   TBOX &selection_box) {
   PAGE_RES_IT* pseudo_it = make_pseudo_word(page_res, selection_box);
   if (pseudo_it == NULL) return;  // No pseudo word available: nothing to do.

   recog_interactive(pseudo_it);
   pseudo_it->DeleteCurrentWord();
   delete pseudo_it;
 }


 // Sets up the word at pr_it with SetupWordPassN(2, ...) and classifies it
 // with classify_word_and_language(2, ...). When
 // tessedit_debug_quality_metrics is set, the word's quality figures are
 // printed as well. Always returns TRUE.
 BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) {
   WordData word_data(*pr_it);
   SetupWordPassN(2, &word_data);
   classify_word_and_language(2, pr_it, &word_data);

   if (!tessedit_debug_quality_metrics) return TRUE;

   // Debug output only: gather and print the quality metrics of the word.
   inT16 char_qual;
   inT16 good_char_qual;
   WERD_RES* word_res = pr_it->word();
   word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
   tprintf("\n%d chars;  word_blob_quality: %d;  outline_errs: %d; "
           "char_quality: %d; good_char_quality: %d\n",
           word_res->reject_map.length(),
           word_blob_quality(word_res, pr_it->row()->row),
           word_outline_errs(word_res), char_qual, good_char_qual);
   return TRUE;
 }


 // Helper function to check for a target word and handle it appropriately.

 // Inspired by Jetsoft's requirement to process only single words on pass2

 // and beyond.

 // If word_config is not null:

 //   If the word_box and target_word_box overlap, read the word_config file

 //   else reset to previous config data.

 //   return true.

 // else

 //   If the word_box and target_word_box overlap or pass <= 1, return true.

 // Note that this function uses a fixed temporary file for storing the previous

 // configs, so it is neither thread-safe, nor process-safe, but the assumption

 // is that it will only be used for one debug window at a time.

 //

 // Since this function is used for debugging (and not to change OCR results)

 // set only debug params from the word config file.

 bool Tesseract::ProcessTargetWord(const TBOX& word_box,

                                   const TBOX& target_word_box,

                                   const char* word_config,

                                   int pass) {

   if (word_config != NULL) {

     if (word_box.major_overlap(target_word_box)) {

       if (backup_config_file_ == NULL) {

         backup_config_file_ = kBackUpConfigFile;

         FILE* config_fp = fopen(backup_config_file_, "wb");

         ParamUtils::PrintParams(config_fp, params());

         fclose(config_fp);

         ParamUtils::ReadParamsFile(word_config,

                                    SET_PARAM_CONSTRAINT_DEBUG_ONLY,

                                    params());

       }

     } else {

       if (backup_config_file_ != NULL) {

         ParamUtils::ReadParamsFile(backup_config_file_,

                                    SET_PARAM_CONSTRAINT_DEBUG_ONLY,

                                    params());

         backup_config_file_ = NULL;

       }

     }

   } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {

     return false;

   }

   return true;

 }


 void Tesseract::SetupAllWordsPassN(int pass_n,

                                    const TBOX* target_word_box,

                                    const char* word_config,

                                    PAGE_RES* page_res,

                                    GenericVector<WordData>* words) {

   // Prepare all the words.

   PAGE_RES_IT page_res_it(page_res);

   for (page_res_it.restart_page(); page_res_it.word() != NULL;

        page_res_it.forward()) {

     if (target_word_box == NULL ||

         ProcessTargetWord(page_res_it.word()->word->bounding_box(),

                           *target_word_box, word_config, 1)) {

       words->push_back(WordData(page_res_it));

     }

   }

   // Setup all the words for recognition with polygonal approximation.

   for (int w = 0; w < words->size(); ++w) {

     SetupWordPassN(pass_n, &(*words)[w]);

     if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];

   }

 }


 // Sets up the single word ready for whichever engine is to be run.

 void Tesseract::SetupWordPassN(int pass_n, WordData* word) {

   if (pass_n == 1 || !word->word->done) {

     if (pass_n == 1) {

       word->word->SetupForRecognition(unicharset, this, BestPix(),

                                       tessedit_ocr_engine_mode, NULL,

                                       classify_bln_numeric_mode,

                                       textord_use_cjk_fp_model,

                                       poly_allow_detailed_fx,

                                       word->row, word->block);

     } else if (pass_n == 2) {

       // TODO(rays) Should we do this on pass1 too?

       word->word->caps_height = 0.0;

       if (word->word->x_height == 0.0f)

         word->word->x_height = word->row->x_height();

     }

     word->lang_words.truncate(0);

     for (int s = 0; s <= sub_langs_.size(); ++s) {

       // The sub_langs_.size() entry is for the master language.

       Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;

       WERD_RES* word_res = new WERD_RES;

       word_res->InitForRetryRecognition(*word->word);

       word->lang_words.push_back(word_res);

       // Cube doesn't get setup for pass2.

       if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY) {

         word_res->SetupForRecognition(

               lang_t->unicharset, lang_t, BestPix(),

               lang_t->tessedit_ocr_engine_mode, NULL,

               lang_t->classify_bln_numeric_mode,

               lang_t->textord_use_cjk_fp_model,

               lang_t->poly_allow_detailed_fx, word->row, word->block);

       }

     }

   }

 }


 // Runs word recognition on all the words.
 // For each entry of words, in order: updates the monitor (if any), syncs
 // pr_it to the entry's word, gives ReassignDiacritics() a chance to modify
 // the word (re-running SetupWordPassN if it did) and then calls
 // classify_word_and_language() for pass pass_n.
 // A word marked tess_failed whose per-language results in lang_words are all
 // marked tess_failed as well is skipped.
 // Returns false if the monitor's deadline is exceeded or its cancel callback
 // returns true; the current and all remaining words are then set up as
 // fakes. Returns true otherwise.
 bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
                                    PAGE_RES_IT* pr_it,
                                    GenericVector<WordData>* words) {
   // TODO(rays) Before this loop can be parallelized (it would yield a massive
   // speed-up) all remaining member globals need to be converted to local/heap
   // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
   // added. The results will be significantly different with adaption on, and
   // deterioration will need investigation.
   pr_it->restart_page();
   for (int w = 0; w < words->size(); ++w) {
     WordData* word = &(*words)[w];
     if (w > 0) word->prev_word = &(*words)[w - 1];
     if (monitor != NULL) {
       monitor->ocr_alive = TRUE;
       // Pass 1 reports progress in [30, 80), later passes in [80, 90).
       if (pass_n == 1)
         monitor->progress = 30 + 50 * w / words->size();
       else
         monitor->progress = 80 + 10 * w / words->size();
       if (monitor->deadline_exceeded() ||
           (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
                                                          words->size()))) {
         // Timeout. Fake out the rest of the words.
         for (; w < words->size(); ++w) {
           (*words)[w].word->SetupFake(unicharset);
         }
         return false;
       }
     }
     if (word->word->tess_failed) {
       int s;
       for (s = 0; s < word->lang_words.size() &&
            word->lang_words[s]->tess_failed; ++s) {}
       // If all are failed, skip it. Image words are skipped by this test.
       // The scan above stops at s == size() exactly when no entry succeeded
       // (s can never exceed size()), so equality is the "all failed" test.
       if (s == word->lang_words.size()) continue;
     }
     // Sync pr_it with the wth WordData.
     while (pr_it->word() != NULL && pr_it->word() != word->word)
       pr_it->forward();
     ASSERT_HOST(pr_it->word() != NULL);
     bool make_next_word_fuzzy = false;
     if (ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
       // Needs to be setup again to see the new outlines in the chopped_word.
       SetupWordPassN(pass_n, word);
     }

     classify_word_and_language(pass_n, pr_it, word);
     if (tessedit_dump_choices || debug_noise_removal) {
       tprintf("Pass%d: %s [%s]\n", pass_n,
               word->word->best_choice->unichar_string().string(),
               word->word->best_choice->debug_string().string());
     }
     pr_it->forward();
     if (make_next_word_fuzzy && pr_it->word() != NULL) {
       pr_it->MakeCurrentWordFuzzy();
     }
   }
   return true;
 }


 // Runs the recognition passes over the words of page_res.
 //   monitor         - optional progress/cancel descriptor; may be NULL.
 //   target_word_box - optional box, forwarded together with word_config to
 //                     the word setup, rejection and output passes.
 //   dopasses        - 0 runs pass 1 and everything after it, 1 returns right
 //                     after pass 1, any other value skips pass 1.
 // Returns false as soon as RecogAllWordsPassN() fails on pass 1 or pass 2,
 // otherwise true.
 bool Tesseract::recog_all_words(PAGE_RES* page_res,
                                 ETEXT_DESC* monitor,
                                 const TBOX* target_word_box,
                                 const char* word_config,
                                 int dopasses) {
   PAGE_RES_IT page_res_it(page_res);

   if (tessedit_minimal_rej_pass1) {
     tessedit_test_adaption.set_value(TRUE);
     tessedit_minimal_rejection.set_value(TRUE);
   }

   const bool run_pass1 = (dopasses == 0 || dopasses == 1);
   if (run_pass1) {
     page_res_it.restart_page();
     // ****************** Pass 1 *******************

     // A full adaptive classifier is swapped for the one prepared earlier
     // (ie on the previous page). A non-empty one gets a backup started at
     // this page, in case it fills up. Each language is handled on its own:
     // first this one...
     if (AdaptiveClassifierIsFull()) {
       SwitchAdaptiveClassifier();
     } else if (!AdaptiveClassifierIsEmpty()) {
       StartBackupAdaptiveClassifier();
     }
     // ...then every sub-language.
     for (int lang = 0; lang < sub_langs_.size(); ++lang) {
       Tesseract* sub_lang = sub_langs_[lang];
       if (sub_lang->AdaptiveClassifierIsFull()) {
         sub_lang->SwitchAdaptiveClassifier();
       } else if (!sub_lang->AdaptiveClassifierIsEmpty()) {
         sub_lang->StartBackupAdaptiveClassifier();
       }
     }

     // Prepare every word up front so that, if parallelism is on, all the
     // input and output classes are ready to run the classifier.
     GenericVector<WordData> pass1_words;
     SetupAllWordsPassN(1, target_word_box, word_config, page_res,
                        &pass1_words);
     if (tessedit_parallelize) {
       PrerecAllWordsPar(pass1_words);
     }

     // Reset the per-page statistics.
     stats_.word_count = pass1_words.size();
     stats_.dict_words = 0;
     stats_.doc_blob_quality = 0;
     stats_.doc_outline_errs = 0;
     stats_.doc_char_quality = 0;
     stats_.good_char_count = 0;
     stats_.doc_good_char_quality = 0;

     most_recently_used_ = this;
     // Run pass 1 word recognition.
     if (!RecogAllWordsPassN(1, monitor, &page_res_it, &pass1_words)) {
       return false;
     }

     // Pass 1 post-processing.
     page_res_it.restart_page();
     while (page_res_it.word() != NULL) {
       WERD_RES* word_res = page_res_it.word();
       if (word_res->word->flag(W_REP_CHAR)) {
         fix_rep_char(&page_res_it);
       } else {
         // Count dict words.
         if (word_res->best_choice->permuter() == USER_DAWG_PERM) {
           ++stats_.dict_words;
         }
         // Update misadaption log (we only need to do it on pass 1, since
         // adaption only happens on this pass).
         if (word_res->blamer_bundle != NULL &&
             word_res->blamer_bundle->misadaption_debug().length() > 0) {
           page_res->misadaption_log.push_back(
               word_res->blamer_bundle->misadaption_debug());
         }
       }
       page_res_it.forward();
     }
   }

   if (dopasses == 1) return true;

   // ****************** Pass 2 *******************
   if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
       AnyTessLang()) {
     page_res_it.restart_page();
     GenericVector<WordData> pass2_words;
     SetupAllWordsPassN(2, target_word_box, word_config, page_res,
                        &pass2_words);
     if (tessedit_parallelize) {
       PrerecAllWordsPar(pass2_words);
     }
     most_recently_used_ = this;
     // Run pass 2 word recognition.
     if (!RecogAllWordsPassN(2, monitor, &page_res_it, &pass2_words)) {
       return false;
     }
   }

   // The next passes can only be run if tesseract has been used, as cube
   // doesn't set all the necessary outputs in WERD_RES.
   if (AnyTessLang()) {
     // ****************** Pass 3 *******************
     // Fix fuzzy spaces.
     set_global_loc_code(LOC_FUZZY_SPACE);

     if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
         && !tessedit_word_for_word && !right_to_left()) {
       fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
     }

     // ****************** Pass 4 *******************
     if (tessedit_enable_dict_correction) {
       dictionary_correction_pass(page_res);
     }
     if (tessedit_enable_bigram_correction) {
       bigram_correction_pass(page_res);
     }

     // ****************** Pass 5,6 *******************
     rejection_passes(page_res, monitor, target_word_box, word_config);

 #ifndef ANDROID_BUILD
     // ****************** Pass 7 *******************
     // Cube combiner: only run in the combined tesseract+cube engine mode.
     if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) {
       run_cube_combiner(page_res);
     }
 #endif

     // ****************** Pass 8 *******************
     font_recognition_pass(page_res);

     // ****************** Pass 9 *******************
     // Check the correctness of the final results.
     blamer_pass(page_res);
     script_pos_pass(page_res);
   }

   // Write results pass.
   set_global_loc_code(LOC_WRITE_RESULTS);

   // changed by jetsoft
   // needed for dll to output memory structure
   const bool output_requested = (dopasses == 0 || dopasses == 2);
   if (output_requested && (monitor != NULL || tessedit_write_unlv)) {
     output_pass(page_res_it, target_word_box);
   }
   // end jetsoft
   PageSegMode pageseg_mode = static_cast<PageSegMode>(
       static_cast<int>(tessedit_pageseg_mode));
   textord_.CleanupSingleRowResult(pageseg_mode, page_res);

   // Remove empty words, as these mess up the result iterators.
   for (page_res_it.restart_page(); page_res_it.word() != NULL;
        page_res_it.forward()) {
     WERD_RES* word_res = page_res_it.word();
     const bool has_text = word_res->best_choice != NULL &&
                           word_res->best_choice->length() != 0;
     if (!has_text) {
       page_res_it.DeleteCurrentWord();
     }
   }

   if (monitor != NULL) {
     monitor->progress = 100;
   }
   return true;
 }


 // Walks consecutive pairs of words (skipping parts of combos) that share the
 // same unicharset and are not W_REP_CHAR words. If the pair of top choices
 // (restricted to their non-superscript spans) is not a valid bigram in the
 // second word's dictionary, every combination of the two words' best_choices
 // is tried; among the valid bigrams the pair with the lowest summed rating
 // is selected and replaces the current best choices, unless it only differs
 // from them in case and terminal punctuation.
 // tessedit_bigram_debug controls how much of this is printed.
 void Tesseract::bigram_correction_pass(PAGE_RES *page_res) {
   PAGE_RES_IT word_it(page_res);

   WERD_RES *w_prev = NULL;
   WERD_RES *w = word_it.word();
   while (1) {
     w_prev = w;
     while (word_it.forward() != NULL &&
            (!word_it.word() || word_it.word()->part_of_combo)) {
       // advance word_it, skipping over parts of combos
     }
     if (!word_it.word()) break;
     w = word_it.word();
     if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
       continue;
     }
     if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
       if (tessedit_bigram_debug) {
         tprintf("Skipping because one of the words is W_REP_CHAR\n");
       }
       continue;
     }
     // Two words sharing the same language model, excellent!
     // overrides_word1[i] / overrides_word2[i] are parallel: together they
     // form the i-th valid bigram found below.
     GenericVector<WERD_CHOICE *> overrides_word1;
     GenericVector<WERD_CHOICE *> overrides_word2;

     STRING orig_w1_str = w_prev->best_choice->unichar_string();
     STRING orig_w2_str = w->best_choice->unichar_string();
     WERD_CHOICE prev_best(w->uch_set);
     {
       int w1start, w1end;
       w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
       prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
     }
     WERD_CHOICE this_best(w->uch_set);
     {
       int w2start, w2end;
       w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
       this_best = w->best_choice->shallow_copy(w2start, w2end);
     }

     if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
       if (tessedit_bigram_debug) {
         tprintf("Top choice \"%s %s\" verified by bigram model.\n",
                 orig_w1_str.string(), orig_w2_str.string());
       }
       continue;
     }
     if (tessedit_bigram_debug > 2) {
       tprintf("Examining alt choices for \"%s %s\".\n",
               orig_w1_str.string(), orig_w2_str.string());
     }
     if (tessedit_bigram_debug > 1) {
       if (!w_prev->best_choices.singleton()) {
         w_prev->PrintBestChoices();
       }
       if (!w->best_choices.singleton()) {
         w->PrintBestChoices();
       }
     }
     // Lowest summed rating seen so far and the index of that pair in the
     // override vectors.
     float best_rating = 0.0;
     int best_idx = 0;
     WERD_CHOICE_IT prev_it(&w_prev->best_choices);
     for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
       WERD_CHOICE *p1 = prev_it.data();
       WERD_CHOICE strip1(w->uch_set);
       {
         int p1start, p1end;
         p1->GetNonSuperscriptSpan(&p1start, &p1end);
         strip1 = p1->shallow_copy(p1start, p1end);
       }
       WERD_CHOICE_IT w_it(&w->best_choices);
       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
         WERD_CHOICE *p2 = w_it.data();
         WERD_CHOICE strip2(w->uch_set);
         {
           int p2start, p2end;
           p2->GetNonSuperscriptSpan(&p2start, &p2end);
           strip2 = p2->shallow_copy(p2start, p2end);
         }
         if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
           overrides_word1.push_back(p1);
           overrides_word2.push_back(p2);
           if (overrides_word1.size() == 1 ||
               p1->rating() + p2->rating() < best_rating) {
             best_rating = p1->rating() + p2->rating();
             best_idx = overrides_word1.size() - 1;
           }
         }
       }
     }
     if (overrides_word1.size() >= 1) {
       // Excellent, we have some bigram matches.
       if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice,
                                             *overrides_word1[best_idx]) &&
           EqualIgnoringCaseAndTerminalPunct(*w->best_choice,
                                             *overrides_word2[best_idx])) {
         if (tessedit_bigram_debug > 1) {
           tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
                   "model.\n", orig_w1_str.string(), orig_w2_str.string());
         }
         continue;
       }
       STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
       STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
       if (new_w1_str != orig_w1_str) {
         w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
       }
       if (new_w2_str != orig_w2_str) {
         w->ReplaceBestChoice(overrides_word2[best_idx]);
       }
       if (tessedit_bigram_debug > 0) {
         STRING choices_description;
         // The override vectors are parallel (one entry each per valid
         // bigram), so the number of bigrams is the size of either one.
         int num_bigram_choices = overrides_word1.size();
         if (num_bigram_choices == 1) {
           choices_description = "This was the unique bigram choice.";
         } else {
           if (tessedit_bigram_debug > 1) {
             STRING bigrams_list;
             const int kMaxChoicesToPrint = 20;
             for (int i = 0; i < overrides_word1.size() &&
                  i < kMaxChoicesToPrint; i++) {
               if (i > 0) { bigrams_list += ", "; }
               WERD_CHOICE *p1 = overrides_word1[i];
               WERD_CHOICE *p2 = overrides_word2[i];
               bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
             }
             // Mark the list as truncated when not every bigram was printed.
             if (overrides_word1.size() > kMaxChoicesToPrint) {
               bigrams_list += " ...";
             }
             choices_description = "There were many choices: {";
             choices_description += bigrams_list;
             choices_description += "}";
           } else {
             choices_description.add_str_int("There were ", num_bigram_choices);
             choices_description += " compatible bigrams.";
           }
         }
         tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
                 orig_w1_str.string(), orig_w2_str.string(),
                 new_w1_str.string(), new_w2_str.string(),
                 choices_description.string());
       }
     }
   }
 }


 // Pass 5: unless tessedit_test_adaption is set, walks every word of
 // page_res, accumulating blob quality, outline errors and character quality
 // into stats_, and rejects words of bad quality when
 // tessedit_reject_bad_qual_wds is set. Words without a rebuild_word, and
 // words filtered out by ProcessTargetWord() (when target_word_box is given),
 // are skipped.
 // Pass 6: unless tessedit_test_adaption is set, runs
 // quality_based_rejection() with a flag telling whether the per-character
 // page statistics meet the quality_*_pc thresholds.
 // monitor (may be NULL) gets its ocr_alive flag and progress updated.
 void Tesseract::rejection_passes(PAGE_RES* page_res,
                                  ETEXT_DESC* monitor,
                                  const TBOX* target_word_box,
                                  const char* word_config) {
   PAGE_RES_IT page_res_it(page_res);
   // ****************** Pass 5 *******************
   // Gather statistics on rejects.
   int word_index = 0;
   while (!tessedit_test_adaption && page_res_it.word() != NULL) {
     set_global_loc_code(LOC_MM_ADAPT);
     WERD_RES* word = page_res_it.word();
     word_index++;
     if (monitor != NULL) {
       monitor->ocr_alive = TRUE;
       // stats_.word_count is filled in by pass 1 of recog_all_words; it may
       // be 0 here if that pass did not run, so avoid dividing by zero.
       if (stats_.word_count > 0) {
         monitor->progress = 95 + 5 * word_index / stats_.word_count;
       } else {
         monitor->progress = 95;
       }
     }
     if (word->rebuild_word == NULL) {
       // Word was not processed by tesseract.
       page_res_it.forward();
       continue;
     }
     check_debug_pt(word, 70);

     // changed by jetsoft
     // specific to its needs to extract one word when need
     if (target_word_box &&
         !ProcessTargetWord(word->word->bounding_box(),
                            *target_word_box, word_config, 4)) {
       page_res_it.forward();
       continue;
     }
     // end jetsoft

     page_res_it.rej_stat_word();
     int chars_in_word = word->reject_map.length();
     int rejects_in_word = word->reject_map.reject_count();

     int blob_quality = word_blob_quality(word, page_res_it.row()->row);
     stats_.doc_blob_quality += blob_quality;
     int outline_errs = word_outline_errs(word);
     stats_.doc_outline_errs += outline_errs;
     inT16 all_char_quality;
     inT16 accepted_all_char_quality;
     word_char_quality(word, page_res_it.row()->row,
                       &all_char_quality, &accepted_all_char_quality);
     stats_.doc_char_quality += all_char_quality;
     // Only words found through one of the dawg permuters contribute to the
     // "good character" statistics.
     uinT8 permuter_type = word->best_choice->permuter();
     if ((permuter_type == SYSTEM_DAWG_PERM) ||
         (permuter_type == FREQ_DAWG_PERM) ||
         (permuter_type == USER_DAWG_PERM)) {
       stats_.good_char_count += chars_in_word - rejects_in_word;
       stats_.doc_good_char_quality += accepted_all_char_quality;
     }
     check_debug_pt(word, 80);
     if (tessedit_reject_bad_qual_wds &&
         (blob_quality == 0) && (outline_errs >= chars_in_word))
       word->reject_map.rej_word_bad_quality();
     check_debug_pt(word, 90);
     page_res_it.forward();
   }

   if (tessedit_debug_quality_metrics) {
     tprintf
       ("QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f"
        " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
       page_res->char_count, page_res->rej_count,
       page_res->rej_count / static_cast<float>(page_res->char_count),
       stats_.doc_blob_quality,
       stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
       stats_.doc_outline_errs,
       stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
       stats_.doc_char_quality,
       stats_.doc_char_quality / static_cast<float>(page_res->char_count),
       stats_.doc_good_char_quality,
       (stats_.good_char_count > 0) ?
       (stats_.doc_good_char_quality /
        static_cast<float>(stats_.good_char_count)) : 0.0);
   }
   // The document counts as good quality only if all four per-character
   // ratios are within their thresholds.
   BOOL8 good_quality_doc =
     ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
      quality_rej_pc) &&
     (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
      quality_blob_pc) &&
     (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
      quality_outline_pc) &&
     (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
      quality_char_pc);

   // ****************** Pass 6 *******************
   // Do whole document or whole block rejection pass
   if (!tessedit_test_adaption) {
     set_global_loc_code(LOC_DOC_BLK_REJ);
     quality_based_rejection(page_res_it, good_quality_doc);
   }
 }


 // Does nothing unless wordrec_run_blamer is set. Otherwise runs
 // BlamerBundle::LastChanceBlame() on every word of the page, tallies each
 // word's incorrect_result_reason() in page_res->blame_reasons, and prints
 // the tallies followed by the misadaption log (when it is non-empty).
 void Tesseract::blamer_pass(PAGE_RES* page_res) {
   if (!wordrec_run_blamer) return;

   PAGE_RES_IT word_it(page_res);
   word_it.restart_page();
   while (word_it.word() != NULL) {
     WERD_RES* word_res = word_it.word();
     BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word_res);
     page_res->blame_reasons[
         word_res->blamer_bundle->incorrect_result_reason()]++;
     word_it.forward();
   }

   // Report one line per blame reason.
   tprintf("Blame reasons:\n");
   for (int reason = 0; reason < IRR_NUM_REASONS; ++reason) {
     tprintf("%s %d\n",
             BlamerBundle::IncorrectReasonName(
                 static_cast<IncorrectResultReason>(reason)),
             page_res->blame_reasons[reason]);
   }

   if (page_res->misadaption_log.length() <= 0) return;
   tprintf("Misadaption log:\n");
   for (int entry = 0; entry < page_res->misadaption_log.length(); ++entry) {
     tprintf("%s\n", page_res->misadaption_log[entry].string());
   }
 }


 // Sets script positions and detects smallcaps on all output words.
 // Words flagged W_REP_CHAR are left untouched. For every other word,
 // small_caps is set when the script has an x-height, the word's x-height
 // (replaced by the midpoint of the best choice's min/max x-height if it lies
 // outside that range) is close to the block x-height scaled by
 // kXHeightCapRatio, and the best choice contains at least one upper case and
 // no lower case character. SetScriptPositions() is then called on the word.
 void Tesseract::script_pos_pass(PAGE_RES* page_res) {
   PAGE_RES_IT page_res_it(page_res);
   for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
     WERD_RES* word = page_res_it.word();
     if (word->word->flag(W_REP_CHAR)) {
       // The loop increment already advances the iterator; advancing here as
       // well would skip the word that follows a repeated-char word.
       continue;
     }
     float x_height = page_res_it.block()->block->x_height();
     float word_x_height = word->x_height;
     if (word_x_height < word->best_choice->min_x_height() ||
         word_x_height > word->best_choice->max_x_height()) {
       word_x_height = (word->best_choice->min_x_height() +
           word->best_choice->max_x_height()) / 2.0f;
     }
     // Test for small caps. Word capheight must be close to block xheight,
     // and word must contain no lower case letters, and at least one upper case.
     double small_cap_xheight = x_height * kXHeightCapRatio;
     double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
     if (word->uch_set->script_has_xheight() &&
         small_cap_xheight - small_cap_delta <= word_x_height &&
         word_x_height <= small_cap_xheight + small_cap_delta) {
       // Scan for upper/lower.
       int num_upper = 0;
       int num_lower = 0;
       for (int i = 0; i < word->best_choice->length(); ++i) {
         if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
           ++num_upper;
         else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
           ++num_lower;
       }
       if (num_upper > 0 && num_lower == 0)
         word->small_caps = true;
     }
     word->SetScriptPositions();
   }
 }


 // Factored helper considers the indexed word and updates all the pointed

 // values.

 static void EvaluateWord(const PointerVector<WERD_RES>& words, int index,

                          float* rating, float* certainty, bool* bad,

                          bool* valid_permuter, int* right, int* next_left) {

   *right = -MAX_INT32;

   *next_left = MAX_INT32;

   if (index < words.size()) {

     WERD_CHOICE* choice = words[index]->best_choice;

     if (choice == NULL) {

       *bad = true;

     } else {

       *rating += choice->rating();

       *certainty = MIN(*certainty, choice->certainty());

       if (!Dict::valid_word_permuter(choice->permuter(), false))

         *valid_permuter = false;

     }

     *right = words[index]->word->bounding_box().right();

     if (index + 1 < words.size())

       *next_left = words[index + 1]->word->bounding_box().left();

   } else {

     *valid_permuter = false;

     *bad = true;

   }

 }


 // Helper chooses the best combination of words, transferring good ones from

 // new_words to best_words. To win, a new word must have (better rating and

 // certainty) or (better permuter status and rating within rating ratio and

 // certainty within certainty margin) than current best.

 // All the new_words are consumed (moved to best_words or deleted.)

 // The return value is the number of new_words used minus the number of

 // best_words that remain in the output.

 static int SelectBestWords(double rating_ratio,
                            double certainty_margin,
                            bool debug,
                            PointerVector<WERD_RES>* new_words,
                            PointerVector<WERD_RES>* best_words) {
   // Process the smallest groups of words that have an overlapping word
   // boundary at the end.
   // The winners of each group are accumulated here, in the order the groups
   // are processed, and only copied into best_words after the main loop.
   GenericVector<WERD_RES*> out_words;
   // Index into each word vector (best, new).
   int b = 0, n = 0;
   // Number of words taken from best_words and from new_words respectively.
   int num_best = 0, num_new = 0;
   while (b < best_words->size() || n < new_words->size()) {
     // Start of the current run in each.
     int start_b = b, start_n = n;
     // Rating of the current run in each.
     float b_rating = 0.0f, n_rating = 0.0f;
     // Certainty of the current run in each.
     float b_certainty = 0.0f, n_certainty = 0.0f;
     // True if any word is missing its best choice.
     bool b_bad = false, n_bad = false;
     // True if all words have a valid permuter.
     bool b_valid_permuter = true, n_valid_permuter = true;

     // Extend the run in one vector or the other until both runs end at a
     // common word break.
     // NOTE(review): EvaluateWord is defined outside this block; it is
     // presumably responsible for accumulating rating/certainty, setting the
     // bad/valid_permuter flags and reporting the box edges, including for an
     // index that is past the end of its vector - confirm there.
     while (b < best_words->size() || n < new_words->size()) {
       int b_right = -MAX_INT32;
       int next_b_left = MAX_INT32;
       EvaluateWord(*best_words, b, &b_rating, &b_certainty, &b_bad,
                    &b_valid_permuter, &b_right, &next_b_left);
       int n_right = -MAX_INT32;
       int next_n_left = MAX_INT32;
       EvaluateWord(*new_words, n, &n_rating, &n_certainty, &n_bad,
                    &n_valid_permuter, &n_right, &next_n_left);
       if (MAX(b_right, n_right) < MIN(next_b_left, next_n_left)) {
         // The word breaks overlap. [start_b,b] and [start_n, n] match.
         break;
       }
       // Keep searching for the matching word break.
       // Advance whichever run currently ends further left; if new_words is
       // exhausted only best can advance.
       if ((b_right < n_right && b < best_words->size()) ||
           n == new_words->size())
         ++b;
       else
         ++n;
     }
     bool new_better = false;
     // The new run wins if it is not bad and one of the following holds:
     //  - the best run is bad,
     //  - new has both a higher certainty and a lower rating, or
     //  - best lacks a valid permuter while new has one, and new is within
     //    rating_ratio / certainty_margin of best.
     if (!n_bad && (b_bad || (n_certainty > b_certainty &&
                              n_rating < b_rating) ||
                             (!b_valid_permuter && n_valid_permuter &&
                              n_rating < b_rating * rating_ratio &&
                              n_certainty > b_certainty - certainty_margin))) {
       // New is better.
       for (int i = start_n; i <= n; ++i) {
         out_words.push_back((*new_words)[i]);
         // Null the source slot now that out_words holds the pointer.
         (*new_words)[i] = NULL;
         ++num_new;
       }
       new_better = true;
     } else if (!b_bad) {
       // Current best is better.
       for (int i = start_b; i <= b; ++i) {
         out_words.push_back((*best_words)[i]);
         // Null the source slot now that out_words holds the pointer.
         (*best_words)[i] = NULL;
         ++num_best;
       }
     }
     // If both runs are bad, nothing is copied to out_words for this group.
     // One past the last word of each run, clamped to the vector size.
     int end_b = b < best_words->size() ? b + 1 : b;
     int end_n = n < new_words->size() ? n + 1 : n;
     if (debug) {
       tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
               " valid dict: %d v %d\n",
               end_n - start_n, new_better ? "better" : "worse",
               end_b - start_b, n_rating, b_rating,
               n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
     }
     // Move on to the next group.
     b = end_b;
     n = end_n;
   }
   // Transfer from out_words to best_words.
   // NOTE(review): slots whose words were moved to out_words were set to NULL
   // above, presumably so that clear() does not destroy them - confirm against
   // PointerVector::clear().
   best_words->clear();
   for (int i = 0; i < out_words.size(); ++i)
     best_words->push_back(out_words[i]);
   return num_new - num_best;
 }


 // Helper to recognize the word using the given (language-specific) tesseract.

 // Returns positive if this recognizer found more new best words than the

 // number kept from best_words.

 int Tesseract::RetryWithLanguage(const WordData& word_data,

                                  WordRecognizer recognizer,

                                  WERD_RES** in_word,

                                  PointerVector<WERD_RES>* best_words) {

   bool debug = classify_debug_level || cube_debug_level;

   if (debug) {

     tprintf("Trying word using lang %s, oem %d\n",

             lang.string(), static_cast<int>(tessedit_ocr_engine_mode));

   }

   // Run the recognizer on the word.

   PointerVector<WERD_RES> new_words;

   (this->*recognizer)(word_data, in_word, &new_words);

   if (new_words.empty()) {

     // Transfer input word to new_words, as the classifier must have put

     // the result back in the input.

     new_words.push_back(*in_word);

     *in_word = NULL;

   }

   if (debug) {

     for (int i = 0; i < new_words.size(); ++i)

       new_words[i]->DebugTopChoice("Lang result");

   }

   // Initial version is a bit of a hack based on better certainty and rating

   // (to reduce false positives from cube) or a dictionary vs non-dictionary

   // word.

   return SelectBestWords(classify_max_rating_ratio,

                          classify_max_certainty_margin,

                          debug, &new_words, best_words);

 }


 // Helper returns true if all the words are acceptable.

 static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {

   for (int w = 0; w < words.size(); ++w) {

     if (words[w]->tess_failed || !words[w]->tess_accepted) return false;

   }

   return true;

 }


 // Moves good-looking "noise"/diacritics from the reject list to the main
 // blob list on the current word. Returns true if anything was done, and
 // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
 //   pass:  recognition pass number, forwarded to the classification helpers.
 //   pr_it: iterator positioned on the word to be processed.
 // Does nothing (and returns false) if the word has no rejected blobs, no
 // real blobs, or more than noise_maxperword rejected blobs.
 bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
                                    bool* make_next_word_fuzzy) {
   *make_next_word_fuzzy = false;
   WERD* real_word = pr_it->word()->word;
   if (real_word->rej_cblob_list()->empty() ||
       real_word->cblob_list()->empty() ||
       real_word->rej_cblob_list()->length() > noise_maxperword)
     return false;
   real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
   // Get the noise outlines into a vector with matching bool map.
   GenericVector<C_OUTLINE*> outlines;
   real_word->GetNoiseOutlines(&outlines);
   GenericVector<bool> word_wanted;
   GenericVector<bool> overlapped_any_blob;
   GenericVector<C_BLOB*> target_blobs;
   AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
                                      &word_wanted, &overlapped_any_blob,
                                      &target_blobs);
   // Filter the outlines that overlapped any blob and put them into the word
   // now. This simplifies the remaining task and also makes it more accurate
   // as it has more completed blobs to work on.
   GenericVector<bool> wanted;
   GenericVector<C_BLOB*> wanted_blobs;
   GenericVector<C_OUTLINE*> wanted_outlines;
   int num_overlapped = 0;
   int num_overlapped_used = 0;
   for (int i = 0; i < overlapped_any_blob.size(); ++i) {
     if (overlapped_any_blob[i]) {
       ++num_overlapped;
       if (word_wanted[i]) ++num_overlapped_used;
       wanted.push_back(word_wanted[i]);
       wanted_blobs.push_back(target_blobs[i]);
       wanted_outlines.push_back(outlines[i]);
       // Handled in this first stage; NULL marks it as not available to the
       // non-overlapping stage below.
       outlines[i] = NULL;
     }
   }
   real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL);
   // Second stage: the outlines that did not overlap any blob. This call
   // re-initializes word_wanted and target_blobs to match outlines.
   AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
                              &target_blobs);
   // non_overlapped counts the outlines left for the second stage, and
   // non_overlapped_used counts those of them that were actually wanted.
   // (Previously both conditions incremented non_overlapped_used, so the
   // total was always reported as 0 and the return value was true whenever
   // any non-overlapping outline existed, used or not.)
   int non_overlapped = 0;
   int non_overlapped_used = 0;
   for (int i = 0; i < word_wanted.size(); ++i) {
     if (word_wanted[i]) ++non_overlapped_used;
     if (outlines[i] != NULL) ++non_overlapped;
   }
   if (debug_noise_removal) {
     tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
             num_overlapped_used, num_overlapped, non_overlapped_used,
             non_overlapped);
     real_word->bounding_box().print();
   }
   // Now we have decided which outlines we want, put them into the real_word.
   if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
                                      make_next_word_fuzzy)) {
     pr_it->MakeCurrentWordFuzzy();
   }
   // TODO(rays) Parts of combos have a deep copy of the real word, and need
   // to have their noise outlines moved/assigned in the same way!!
   return num_overlapped_used != 0 || non_overlapped_used != 0;
 }


 // Attempts to put noise/diacritic outlines into the blobs that they overlap.

 // Input: a set of noisy outlines that probably belong to the real_word.

 // Output: word_wanted indicates which outlines are to be assigned to a blob,

 //   target_blobs indicates which to assign to, and overlapped_any_blob is

 //   true for all outlines that overlapped a blob.

 void Tesseract::AssignDiacriticsToOverlappingBlobs(

     const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,

     PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,

     GenericVector<bool>* overlapped_any_blob,

     GenericVector<C_BLOB*>* target_blobs) {

   GenericVector<bool> blob_wanted;

   word_wanted->init_to_size(outlines.size(), false);

   overlapped_any_blob->init_to_size(outlines.size(), false);

   target_blobs->init_to_size(outlines.size(), NULL);

   // For each real blob, find the outlines that seriously overlap it.

   // A single blob could be several merged characters, so there can be quite

   // a few outlines overlapping, and the full engine needs to be used to chop

   // and join to get a sensible result.

   C_BLOB_IT blob_it(real_word->cblob_list());

   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {

     C_BLOB* blob = blob_it.data();

     TBOX blob_box = blob->bounding_box();

     blob_wanted.init_to_size(outlines.size(), false);

     int num_blob_outlines = 0;

     for (int i = 0; i < outlines.size(); ++i) {

       if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&

           !(*word_wanted)[i]) {

         blob_wanted[i] = true;

         (*overlapped_any_blob)[i] = true;

         ++num_blob_outlines;

       }

     }

     if (debug_noise_removal) {

       tprintf("%d noise outlines overlap blob at:", num_blob_outlines);

       blob_box.print();

     }

     // If any outlines overlap the blob, and not too many, classify the blob

     // (using the full engine, languages and all), and choose the maximal

     // combination of outlines that doesn't hurt the end-result classification

     // by too much. Mark them as wanted.

     if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {

       if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,

                                       outlines, num_blob_outlines,

                                       &blob_wanted)) {

         for (int i = 0; i < blob_wanted.size(); ++i) {

           if (blob_wanted[i]) {

             // Claim the outline and record where it is going.

             (*word_wanted)[i] = true;

             (*target_blobs)[i] = blob;

           }

         }

       }

     }

   }

 }


 // Attempts to assign non-overlapping outlines to their nearest blobs or

 // make new blobs out of them.

 void Tesseract::AssignDiacriticsToNewBlobs(

     const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,

     PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,

     GenericVector<C_BLOB*>* target_blobs) {

   GenericVector<bool> blob_wanted;

   word_wanted->init_to_size(outlines.size(), false);

   target_blobs->init_to_size(outlines.size(), NULL);

   // Check for outlines that need to be turned into stand-alone blobs.

   for (int i = 0; i < outlines.size(); ++i) {

     if (outlines[i] == NULL) continue;

     // Get a set of adjacent outlines that don't overlap any existing blob.

     blob_wanted.init_to_size(outlines.size(), false);

     int num_blob_outlines = 0;

     TBOX total_ol_box(outlines[i]->bounding_box());

     while (i < outlines.size() && outlines[i] != NULL) {

       blob_wanted[i] = true;

       total_ol_box += outlines[i]->bounding_box();

       ++i;

       ++num_blob_outlines;

     }

     // Find the insertion point.

     C_BLOB_IT blob_it(real_word->cblob_list());

     while (!blob_it.at_last() &&

            blob_it.data_relative(1)->bounding_box().left() <=

                total_ol_box.left()) {

       blob_it.forward();

     }

     // Choose which combination of them we actually want and where to put

     // them.

     if (debug_noise_removal)

       tprintf("Num blobless outlines = %d\n", num_blob_outlines);

     C_BLOB* left_blob = blob_it.data();

     TBOX left_box = left_blob->bounding_box();

     C_BLOB* right_blob = blob_it.at_last() ? NULL : blob_it.data_relative(1);

     if ((left_box.x_overlap(total_ol_box) || right_blob == NULL ||

          !right_blob->bounding_box().x_overlap(total_ol_box)) &&

         SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,

                                     outlines, num_blob_outlines,

                                     &blob_wanted)) {

       if (debug_noise_removal) tprintf("Added to left blob\n");

       for (int j = 0; j < blob_wanted.size(); ++j) {

         if (blob_wanted[j]) {

           (*word_wanted)[j] = true;

           (*target_blobs)[j] = left_blob;

         }

       }

     } else if (right_blob != NULL &&

                (!left_box.x_overlap(total_ol_box) ||

                 right_blob->bounding_box().x_overlap(total_ol_box)) &&

                SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it,

                                            right_blob, outlines,

                                            num_blob_outlines, &blob_wanted)) {

       if (debug_noise_removal) tprintf("Added to right blob\n");

       for (int j = 0; j < blob_wanted.size(); ++j) {

         if (blob_wanted[j]) {

           (*word_wanted)[j] = true;

           (*target_blobs)[j] = right_blob;

         }

       }

     } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL,

                                            outlines, num_blob_outlines,

                                            &blob_wanted)) {

       if (debug_noise_removal) tprintf("Fitted between blobs\n");

       for (int j = 0; j < blob_wanted.size(); ++j) {

         if (blob_wanted[j]) {

           (*word_wanted)[j] = true;

           (*target_blobs)[j] = NULL;

         }

       }

     }

   }

 }


 // Starting with ok_outlines set to indicate which outlines overlap the blob,
 // chooses the optimal set (approximately) and returns true if any outlines
 // are desired, in which case ok_outlines indicates which ones.
 //   pass, pr_it:         forwarded to the classification helpers.
 //   certainty_threshold: with blob == NULL this is the target certainty. With
 //                        a blob, the target is the certainty of the blob
 //                        classified on its own, moved towards this threshold
 //                        by the fraction noise_cert_factor.
 //   blob:                blob to add the outlines to, or NULL to classify the
 //                        outlines on their own.
 //   outlines:            all candidate outlines; only those flagged in
 //                        ok_outlines are used.
 //   num_outlines:        number of true entries in ok_outlines on entry.
 // ok_outlines is left unchanged when false is returned.
 bool Tesseract::SelectGoodDiacriticOutlines(
     int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob,
     const GenericVector<C_OUTLINE*>& outlines, int num_outlines,
     GenericVector<bool>* ok_outlines) {
   STRING best_str;
   float target_cert = certainty_threshold;
   if (blob != NULL) {
     float target_c2;
     target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
     if (debug_noise_removal) {
       tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
               target_cert, target_c2);
       blob->bounding_box().print();
     }
     target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
   }
   GenericVector<bool> test_outlines = *ok_outlines;
   // Start with all the outlines in.
   STRING all_str;
   GenericVector<bool> best_outlines = *ok_outlines;
   float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
                                              pr_it, blob, &all_str);
   if (debug_noise_removal) {
     TBOX ol_box;
     for (int i = 0; i < test_outlines.size(); ++i) {
       if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
     }
     tprintf("All Noise blob classified as %s=%g, delta=%g at:",
             all_str.string(), best_cert, best_cert - target_cert);
     ol_box.print();
   }
   // Iteratively zero out the bit that improves the certainty the most, until
   // only one bit is left or no removal improves the certainty.
   // NOTE(review): this loop used to carry the extra clause
   // (blob == NULL || best_cert < target_cert || blob != NULL), which is
   // always true, so the search never stopped early on reaching target_cert.
   // The clause has been dropped without changing behavior. If stopping at
   // the target was intended for blob != NULL, that needs a separate,
   // deliberate change.
   int best_index = 0;  // To zero out.
   while (num_outlines > 1 && best_index >= 0) {
     // Find the best bit to zero out.
     best_index = -1;
     for (int i = 0; i < outlines.size(); ++i) {
       if (test_outlines[i]) {
         // Try the combination without outline i, then restore the bit.
         test_outlines[i] = false;
         STRING str;
         float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
                                               pr_it, blob, &str);
         if (debug_noise_removal) {
           TBOX ol_box;
           for (int j = 0; j < outlines.size(); ++j) {
             if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
             tprintf("%d", test_outlines[j]);
           }
           tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
                   cert, cert - target_cert);
           ol_box.print();
         }
         if (cert > best_cert) {
           best_cert = cert;
           best_index = i;
           best_outlines = test_outlines;
         }
         test_outlines[i] = true;
       }
     }
     if (best_index >= 0) {
       // Make the best removal permanent and look for another.
       test_outlines[best_index] = false;
       --num_outlines;
     }
   }
   if (best_cert >= target_cert) {
     // Save the best combination.
     *ok_outlines = best_outlines;
     if (debug_noise_removal) {
       tprintf("%s noise combination ", blob ? "Adding" : "New");
       for (int i = 0; i < best_outlines.size(); ++i) {
         tprintf("%d", best_outlines[i]);
       }
       tprintf(" yields certainty %g, beating target of %g\n", best_cert,
               target_cert);
     }
     return true;
   }
   return false;
 }


 // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
 // the inclusion of the outlines, and returns the certainty of the raw choice.
 // If blob is NULL, a temporary blob is built from the flagged outlines,
 // classified, emptied and deleted, and the value returned is -c2 (the
 // negated second output of ClassifyBlobAsWord) instead of the certainty.
 // best_str receives the string output by ClassifyBlobAsWord.
 // NOTE(review): with blob == NULL and no flagged outline, a NULL blob would
 // be passed on to ClassifyBlobAsWord - callers presumably always flag at
 // least one outline; confirm.
 float Tesseract::ClassifyBlobPlusOutlines(
     const GenericVector<bool>& ok_outlines,
     const GenericVector<C_OUTLINE*>& outlines, int pass_n, PAGE_RES_IT* pr_it,
     C_BLOB* blob, STRING* best_str) {
   C_OUTLINE_IT ol_it;
   // First pre-existing outline of the caller's blob. It stays NULL when the
   // blob is created here, which is how the two cases are told apart below.
   C_OUTLINE* first_to_keep = NULL;
   if (blob != NULL) {
     // Add the required outlines to the blob.
     ol_it.set_to_list(blob->out_list());
     first_to_keep = ol_it.data();
   }
   for (int i = 0; i < ok_outlines.size(); ++i) {
     if (ok_outlines[i]) {
       // This outline is to be added.
       if (blob == NULL) {
         // The first flagged outline seeds a new temporary blob.
         blob = new C_BLOB(outlines[i]);
         ol_it.set_to_list(blob->out_list());
       } else {
         ol_it.add_before_stay_put(outlines[i]);
       }
     }
   }
   float c2;
   float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
   ol_it.move_to_first();
   if (first_to_keep == NULL) {
     // We created blob. Empty its outlines and delete it.
     // The outlines are extracted (not deleted) before the blob is deleted.
     for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
     delete blob;
     cert = -c2;
   } else {
     // Remove the outlines that we put in.
     // Everything in front of first_to_keep was added above.
     for (; ol_it.data() != first_to_keep; ol_it.forward()) {
       ol_it.extract();
     }
   }
   return cert;
 }


 // Classifies the given blob (part of word_data->word->word) as an individual
 // word, using languages, chopper etc, returning only the certainty of the
 // best raw choice, and undoing all the work done to fake out the word.
 // Outputs:
 //   *best_str: unichar string of the raw choice.
 //   *c2:       certainty * certainty / rating of the raw choice, or 0 if the
 //              rating is not positive.
 // The temporary word is inserted via pr_it and deleted again before
 // returning, after which pr_it's word iterator is reset.
 float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it,
                                     C_BLOB* blob, STRING* best_str, float* c2) {
   WERD* real_word = pr_it->word()->word;
   // Build a one-blob word from a deep copy of blob, carrying over the
   // W_BOL/W_EOL flags of the real word.
   WERD* word = real_word->ConstructFromSingleBlob(
       real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
   WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
   // Get a new iterator that points to the new word.
   PAGE_RES_IT it(pr_it->page_res);
   while (it.word() != word_res && it.word() != NULL) it.forward();
   ASSERT_HOST(it.word() == word_res);
   WordData wd(it);
   // Force full initialization.
   SetupWordPassN(1, &wd);
   classify_word_and_language(pass_n, &it, &wd);
   if (debug_noise_removal) {
     tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
             wd.row->x_height(), wd.word->raw_choice->min_x_height(),
             wd.word->raw_choice->max_x_height());
   }
   float cert = wd.word->raw_choice->certainty();
   float rat = wd.word->raw_choice->rating();
   // Guard against dividing by a zero or negative rating.
   *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
   *best_str = wd.word->raw_choice->unichar_string();
   // Undo the insertion of the temporary word. All results needed by the
   // caller have been read out of wd.word above, before the deletion.
   it.DeleteCurrentWord();
   pr_it->ResetWordIterator();
   return cert;
 }


 // Generic function for classifying a word. Can be used either for pass1 or

 // pass2 according to the function passed to recognizer.

 // word_data holds the word to be recognized, and its block and row, and

 // pr_it points to the word as well, in case we are running LSTM and it wants

 // to output multiple words.

 // Recognizes in the current language, and if successful that is all.

 // If recognition was not successful, tries all available languages until

 // it gets a successful result or runs out of languages. Keeps the best result.

 void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,

                                            WordData* word_data) {

   WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1

                                           : &Tesseract::classify_word_pass2;

   // Best result so far.

   PointerVector<WERD_RES> best_words;

   // Points to the best result. May be word or in lang_words.

   WERD_RES* word = word_data->word;

   clock_t start_t = clock();

   if (classify_debug_level || cube_debug_level) {

     tprintf("%s word with lang %s at:",

             word->done ? "Already done" : "Processing",

             most_recently_used_->lang.string());

     word->word->bounding_box().print();

   }

   if (word->done) {

     // If done on pass1, leave it as-is.

     if (!word->tess_failed)

       most_recently_used_ = word->tesseract;

     return;

   }

   int sub = sub_langs_.size();

   if (most_recently_used_ != this) {

     // Get the index of the most_recently_used_.

     for (sub = 0; sub < sub_langs_.size() &&

          most_recently_used_ != sub_langs_[sub]; ++sub) {}

   }

   most_recently_used_->RetryWithLanguage(

       *word_data, recognizer, &word_data->lang_words[sub], &best_words);

   Tesseract* best_lang_tess = most_recently_used_;

   if (!WordsAcceptable(best_words)) {

     // Try all the other languages to see if they are any better.

     if (most_recently_used_ != this &&

         this->RetryWithLanguage(*word_data, recognizer,

                                 &word_data->lang_words[sub_langs_.size()],

                                 &best_words) > 0) {

       best_lang_tess = this;

     }

     for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();

          ++i) {

       if (most_recently_used_ != sub_langs_[i] &&

           sub_langs_[i]->RetryWithLanguage(*word_data, recognizer,

                                            &word_data->lang_words[i],

                                            &best_words) > 0) {

         best_lang_tess = sub_langs_[i];

       }

     }

   }

   most_recently_used_ = best_lang_tess;

   if (!best_words.empty()) {

     if (best_words.size() == 1 && !best_words[0]->combination) {

       // Move the best single result to the main word.

       word_data->word->ConsumeWordResults(best_words[0]);

     } else {

       // Words came from LSTM, and must be moved to the PAGE_RES properly.

       word_data->word = best_words.back();

       pr_it->ReplaceCurrentWord(&best_words);

     }

     ASSERT_HOST(word_data->word->box_word != NULL);

   } else {

     tprintf("no best words!!\n");

   }

   clock_t ocr_t = clock();

   if (tessedit_timing_debug) {

     tprintf("%s (ocr took %.2f sec)\n",

             word->best_choice->unichar_string().string(),

             static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);

   }

 }


 // Pass 1 word recognizer. Matches *in_word and, unless recognition failed
 // or the word is a repeated-character word, optionally feeds it to the
 // adaptive classifier and to the document dictionary.
 // out_words is not used by this function.
 void Tesseract::classify_word_pass1(const WordData& word_data,
                                     WERD_RES** in_word,
                                     PointerVector<WERD_RES>* out_words) {
   BLOCK* block = word_data.block;
   ROW* row = word_data.row;
   // Remember the previous word's best choice (NULL if there is none).
   if (word_data.prev_word != NULL) {
     prev_word_best_choice_ = word_data.prev_word->word->best_choice;
   } else {
     prev_word_best_choice_ = NULL;
   }
 #ifndef ANDROID_BUILD
   // If we only intend to run cube - run it and return.
   if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
     cube_word_pass1(block, row, *in_word);
     return;
   }
 #endif
   WERD_RES* word = *in_word;
   match_word_pass_n(1, word, row, block);
   // Nothing more to do for failed or repeated-character words.
   if (word->tess_failed || word->word->flag(W_REP_CHAR)) return;

   word->tess_would_adapt = AdaptableWord(word);
   if (word_adaptable(word, tessedit_tess_adaption_mode)) {
     // Send word to adaptive classifier for training.
     word->BestChoiceToCorrectText();
     LearnWord(NULL, word);
     // Mark misadaptions if running blamer.
     if (word->blamer_bundle != NULL) {
       word->blamer_bundle->SetMisAdaptionDebug(word->best_choice,
                                                wordrec_debug_blamer);
     }
   }

   if (tessedit_enable_doc_dict && !word->IsAmbiguous()) {
     tess_add_doc_word(word->best_choice);
   }
 }


 // Debug helper: prints a one-line report of an x-height fix attempt, showing
 // the best choice and reject map of the old word and of new_word, whether
 // each x-height was guessed, whether new_x_ht is still above 0.1, and
 // whether the new word was accepted.
 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht,
                                    WERD_RES* word, WERD_RES* new_word) {
   // Old word.
   tprintf("New XHT Match:%s = %s ",
           word->best_choice->unichar_string().string(),
           word->best_choice->debug_string().string());
   word->reject_map.print(debug_fp);

   // Replacement candidate.
   tprintf(" -> %s = %s ",
           new_word->best_choice->unichar_string().string(),
           new_word->best_choice->debug_string().string());
   new_word->reject_map.print(debug_fp);

   // Summary flags.
   const char* old_origin = word->guessed_x_ht ? "GUESS" : "CERT";
   const char* new_origin = new_word->guessed_x_ht ? "GUESS" : "CERT";
   const char* doubt = new_x_ht > 0.1 ? "STILL DOUBT" : "OK";
   const char* verdict = accept_new_word ? "ACCEPTED" : "";
   tprintf(" %s->%s %s %s\n", old_origin, new_origin, doubt, verdict);
 }


 // Run the x-height fix-up, based on min/max top/bottom information in
 // unicharset.
 // Returns true if the word was changed.
 // See the comment in fixxht.cpp for a description of the overall process.
 // If a non-zero baseline shift is suggested, the shift is tested on its own
 // first, and only if that is accepted is a new x-height tried on top of it.
 // Otherwise only the new x-height is tested. A new x-height is only tried if
 // it is at least kMinRefitXHeightFraction of the word's current x-height.
 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
   int original_misfits = CountMisfitTops(word);
   if (original_misfits == 0)
     return false;
   float baseline_shift = 0.0f;
   float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
   if (baseline_shift != 0.0f) {
     // Try the shift on its own first.
     if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
                               word, block, row))
       return false;
     original_misfits = CountMisfitTops(word);
     if (original_misfits > 0) {
       // Receives the shift computed for the already-shifted word. It is not
       // used afterwards, but is initialized so it never holds an
       // indeterminate value.
       float new_baseline_shift = 0.0f;
       // Now recompute the new x_height.
       new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
       if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
         // No test of return value here, as we are definitely making a change
         // to the word by shifting the baseline.
         TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
                              word, block, row);
       }
     }
     return true;
   } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
     return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
                                 word, block, row);
   } else {
     return false;
   }
 }


 // Runs recognition with the test baseline shift and x-height and returns true

 // if there was an improvement in recognition result.

 bool Tesseract::TestNewNormalization(int original_misfits,

                                      float baseline_shift, float new_x_ht,

                                      WERD_RES *word, BLOCK* block, ROW *row) {

   bool accept_new_x_ht = false;

   WERD_RES new_x_ht_word(word->word);

   if (word->blamer_bundle != NULL) {

     new_x_ht_word.blamer_bundle = new BlamerBundle();

     new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));

   }

   new_x_ht_word.x_height = new_x_ht;

   new_x_ht_word.baseline_shift = baseline_shift;

   new_x_ht_word.caps_height = 0.0;

   new_x_ht_word.SetupForRecognition(

         unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,

         classify_bln_numeric_mode, textord_use_cjk_fp_model,

       poly_allow_detailed_fx, row, block);

   match_word_pass_n(2, &new_x_ht_word, row, block);

   if (!new_x_ht_word.tess_failed) {

     int new_misfits = CountMisfitTops(&new_x_ht_word);

     if (debug_x_ht_level >= 1) {

       tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",

               original_misfits, word->x_height,

               new_misfits, new_x_ht);

       tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",

               word->best_choice->rating(), word->best_choice->certainty(),

               new_x_ht_word.best_choice->rating(),

               new_x_ht_word.best_choice->certainty());

     }

     // The misfits must improve and either the rating or certainty.

     accept_new_x_ht = new_misfits < original_misfits &&

                       (new_x_ht_word.best_choice->certainty() >

                           word->best_choice->certainty() ||

                        new_x_ht_word.best_choice->rating() <

                           word->best_choice->rating());

     if (debug_x_ht_level >= 1) {

       ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);

     }

   }

   if (accept_new_x_ht) {

     word->ConsumeWordResults(&new_x_ht_word);

     return true;

   }

   return false;

 }


 // Pass 2 word recognizer. Re-matches *in_word unless it is already done,
 // applies the sub/superscript fix, and for words that did not fail and are
 // not repeated-character words runs the trained x-height fix when the
 // unicharset and block rotation allow it.
 // Does nothing when the engine mode is neither OEM_TESSERACT_ONLY nor
 // OEM_TESSERACT_CUBE_COMBINED and word_data.word already has a best_choice,
 // or when the engine mode is OEM_CUBE_ONLY.
 // out_words is not used by this function.
 void Tesseract::classify_word_pass2(const WordData& word_data,
                                     WERD_RES** in_word,
                                     PointerVector<WERD_RES>* out_words) {
   // Return if we do not want to run Tesseract.
   if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY &&
       tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED &&
       word_data.word->best_choice != NULL)
     return;
   if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) {
     return;
   }
   ROW* row = word_data.row;
   BLOCK* block = word_data.block;
   WERD_RES* word = *in_word;
   // Remember the previous word's best choice (NULL if there is none).
   prev_word_best_choice_ = word_data.prev_word != NULL
       ? word_data.prev_word->word->best_choice : NULL;

   set_global_subloc_code(SUBLOC_NORM);
   check_debug_pt(word, 30);
   if (!word->done) {
     word->caps_height = 0.0;
     // Fall back to the row's x-height if the word has none of its own.
     if (word->x_height == 0.0f)
       word->x_height = row->x_height();
     match_word_pass_n(2, word, row, block);
     check_debug_pt(word, 40);
   }

   SubAndSuperscriptFix(word);

   if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
     if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
         block->classify_rotation().y() == 0.0f) {
       // Use the tops and bottoms since they are available.
       // The return value (whether the word changed) is not needed here.
       TrainedXheightFix(word, block, row);
     }

     set_global_subloc_code(SUBLOC_NORM);
   }
 #ifndef GRAPHICS_DISABLED
   // Optionally plot the rebuilt word in the fx window, creating the window
   // on first use.
   if (tessedit_display_outwords) {
     if (fx_win == NULL)
       create_fx_win();
     clear_fx_win();
     word->rebuild_word->plot(fx_win);
     TBOX wbox = word->rebuild_word->bounding_box();
     fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
                             wbox.right(), wbox.bottom());
     ScrollView::Update();
   }
 #endif
   set_global_subloc_code(SUBLOC_NORM);
   check_debug_pt(word, 50);
 }


 // Runs segmentation/recognition for the given pass on word, unless the word
 // has already failed. For words that did not fail and are not
 // repeated-character words it then fixes quotes (and hyphens if enabled),
 // sets tess_accepted and builds the reject map. The word fonts are set last.
 // block is not used by this function.
 void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word,
                                   ROW *row, BLOCK* block) {
   if (word->tess_failed) return;
   tess_segment_pass_n(pass_n, word);

   const bool post_process =
       !word->tess_failed && !word->word->flag(W_REP_CHAR);
   if (post_process) {
     word->fix_quotes();
     if (tessedit_fix_hyphens) {
       word->fix_hyphens();
     }
     /* Dont trust fix_quotes! - though I think I've fixed the bug */
     const bool length_mismatch =
         word->best_choice->length() != word->box_word->length();
     if (length_mismatch) {
       tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
               " #Blobs=%d\n",
               word->best_choice->debug_string().string(),
               word->best_choice->length(),
               word->box_word->length());
     }
     word->tess_accepted = tess_acceptable_word(word);

     // Also sets word->done flag
     make_reject_map(word, row, pass_n);
   }
   set_word_fonts(word);

   ASSERT_HOST(word->raw_choice != NULL);
 }


 // Helper that scans every position of word_res's best choice for a
 // BLOB_CHOICE matching char_id and returns the one with the lowest rating,
 // or NULL if no position has a match. On equal ratings the earliest
 // position wins.
 static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
                                            WERD_RES* word_res) {
   BLOB_CHOICE* winner = NULL;
   for (int pos = 0; pos < word_res->best_choice->length(); ++pos) {
     BLOB_CHOICE* candidate =
         FindMatchingChoice(char_id, word_res->GetBlobChoices(pos));
     if (candidate == NULL) continue;
     // Keep the current winner unless the candidate is strictly better rated.
     if (winner != NULL && !(candidate->rating() < winner->rating())) continue;
     winner = candidate;
   }
   return winner;
 }


 // Makes every position of word_res agree with blob_choice.
 //
 // Step 1: any position whose blob choice list has no entry matching
 // blob_choice's unichar id gets a copy of blob_choice added to its list.
 // Step 2: every unichar id in the best choice that differs from
 // blob_choice's id is overwritten with it.
 static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
                                   WERD_RES* word_res) {
   WERD_CHOICE* best = word_res->best_choice;

   // Step 1: make the wanted character available at every position.
   for (int pos = 0; pos < word_res->best_choice->length(); ++pos) {
     if (FindMatchingChoice(blob_choice->unichar_id(),
                            word_res->GetBlobChoices(pos)) != NULL) {
       continue;  // Already offered here.
     }
     BLOB_CHOICE_IT inserter(word_res->GetBlobChoices(pos));
     inserter.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
   }

   // Step 2: overwrite any result in the best choice that disagrees.
   for (int pos = 0; pos < best->length(); ++pos) {
     if (best->unichar_id(pos) == blob_choice->unichar_id()) {
       continue;
     }
     best->set_unichar_id(blob_choice->unichar_id(), pos);
   }
 }


 // Rewrites the current word of page_res_it so that every character is the
 // word's most frequent character.
 //
 // The unichar ids of the best choice are counted, the most frequent id is
 // picked, and FindBestMatchingChoice() supplies the best-rated BLOB_CHOICE
 // for it. If none exists a message is printed and the word is left
 // unchanged. Otherwise the word is marked done, CorrectRepcharChoices()
 // applies that choice to every position, and the reject map is
 // re-initialised to the word length.
 //
 //   page_res_it - iterator whose current word() is corrected in place.
 void Tesseract::fix_rep_char(PAGE_RES_IT* page_res_it) {
   WERD_RES *word_res = page_res_it->word();
   const WERD_CHOICE &word = *(word_res->best_choice);

   // Find the frequency of each unique character in the word.
   SortHelper<UNICHAR_ID> rep_ch(word.length());
   for (int i = 0; i < word.length(); ++i) {
     rep_ch.Add(word.unichar_id(i), 1);
   }

   // Find the most frequent result.
   UNICHAR_ID maxch_id = INVALID_UNICHAR_ID;  // most common char
   int max_count = rep_ch.MaxCount(&maxch_id);
   // Find the best exemplar of a classifier result for maxch_id.
   BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
   if (best_choice == NULL) {
     tprintf("Failed to find a choice for %s, occurring %d times\n",
             word_res->uch_set->debug_str(maxch_id).string(), max_count);
     return;
   }
   word_res->done = TRUE;

   // Just correct existing classification. (A former loop here measured the
   // gaps between blobs but its results were never read, so it was removed.)
   CorrectRepcharChoices(best_choice, word_res);
   word_res->reject_map.initialise(word.length());
 }


 // Classifies the string s as one of the ACCEPTABLE_WERD_TYPE word shapes.
 //
 //   char_set - unicharset used to test characters for upper/lower case.
 //   s        - NUL-terminated text to classify.
 //   lengths  - NUL-terminated array holding, for each character of s in
 //              order, the number of bytes that character occupies in s.
 //
 // Returns AC_UNACCEPTABLE immediately if lengths has more than 20 entries.
 // Otherwise s is matched, left to right, against
 //   [leading punct] upper* lower* ['-' lower* | "'s"] [punct1] [punct2]
 // where more than one upper case char gives AC_UPPER_CASE (the lower case,
 // hyphen and "'s" parts are then not tried), exactly one gives
 // AC_INITIAL_CAP and none gives AC_LOWER_CASE. If that match fails, or
 // leaves text unconsumed, s is re-tested as a sequence of letter + '.'
 // pairs, giving AC_UC_ABBREV or AC_LC_ABBREV; anything else is
 // AC_UNACCEPTABLE.
 ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(

     const UNICHARSET& char_set, const char *s, const char *lengths) {

   // i indexes lengths (characters); offset is the matching byte position in s.
   int i = 0;

   int offset = 0;

   int leading_punct_count;

   int upper_count = 0;

   int hyphen_pos = -1;

   ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;


   if (strlen (lengths) > 20)

     return word_type;


   /* Optional single leading punctuation char (one of chs_leading_punct) */


   if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))

     offset += lengths[i++];

   leading_punct_count = i;


   /* Leading run of upper case chars: one is an initial cap, more is all caps */

   while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {

     offset += lengths[i++];

     upper_count++;

   }

   if (upper_count > 1) {

     word_type = AC_UPPER_CASE;

   } else {

     /* Lower case word, possibly with an initial cap */

     while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {

       offset += lengths[i++];

     }

     /* Too few alphas after any leading punctuation: word_type is still
        AC_UNACCEPTABLE, so the jump falls into the abbreviation test. */
     if (i - leading_punct_count < quality_min_initial_alphas_reqd)

       goto not_a_word;

     /*

     Allow a single hyphen in a lower case word

     - don't trust upper case - I've seen several cases of "H" -> "I-I"

     */

     if (lengths[i] == 1 && s[offset] == '-') {

       hyphen_pos = i;

       offset += lengths[i++];

       if (s[offset] != '\0') {

         while ((s[offset] != '\0') &&

                char_set.get_islower(s + offset, lengths[i])) {

           offset += lengths[i++];

         }

         /* Text follows the hyphen but fewer than two chars of it are lower case */
         if (i < hyphen_pos + 3)

           goto not_a_word;

       }

     } else {

       /* Allow "'s" in NON hyphenated lower case words */

       if (lengths[i] == 1 && (s[offset] == '\'') &&

           lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {

         offset += lengths[i++];

         offset += lengths[i++];

       }

     }

     if (upper_count > 0)

       word_type = AC_INITIAL_CAP;

     else

       word_type = AC_LOWER_CASE;

   }


   /* Up to two different, constrained trailing punctuation chars:
      one from chs_trailing_punct1, then one from chs_trailing_punct2 whose
      byte differs from the first byte of the character before it. */

   if (lengths[i] == 1 && s[offset] != '\0' &&

       STRING(chs_trailing_punct1).contains(s[offset]))

     offset += lengths[i++];

   if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&

       s[offset - lengths[i - 1]] != s[offset] &&

       STRING(chs_trailing_punct2).contains (s[offset]))

     offset += lengths[i++];


   /* Anything left unconsumed means the word pattern did not match */
   if (s[offset] != '\0')

     word_type = AC_UNACCEPTABLE;


   not_a_word:


   if (word_type == AC_UNACCEPTABLE) {

     /* Look for abbreviation string: letter + '.' pairs, restarting from the
        beginning of s. The case of the first char picks the abbreviation type. */

     i = 0;

     offset = 0;

     if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {

       word_type = AC_UC_ABBREV;

       while (s[offset] != '\0' &&

              char_set.get_isupper(s + offset, lengths[i]) &&

              lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {

         offset += lengths[i++];

         offset += lengths[i++];

       }

     }

     else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {

       word_type = AC_LC_ABBREV;

       while (s[offset] != '\0' &&

              char_set.get_islower(s + offset, lengths[i]) &&

              lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {

         offset += lengths[i++];

         offset += lengths[i++];

       }

     }

     /* The whole string must be consumed by the letter + '.' pairs */
     if (s[offset] != '\0')

       word_type = AC_UNACCEPTABLE;

   }


   return word_type;

 }


 // Debug hook that reports on word when its bounding box contains the test
 // point (test_pt_x, test_pt_y).
 //
 //   word     - word to test and, on a hit, describe.
 //   location - code identifying the calling stage; selects the label
 //              printed. A negative value requests a silent hit test.
 //
 // Returns FALSE if test_pt is not set or the word does not contain the test
 // point, TRUE otherwise. Whenever test_pt is set, tessedit_rejection_debug
 // and debug_x_ht_level are first switched off; on a hit with a non-negative
 // location they are switched on (TRUE / 2) and the word's best choice,
 // reject map, accepted flag and done flag are printed. Locations 50 and 120
 // additionally print the reject map entry for each character.
 BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) {
   if (!test_pt)
     return FALSE;

   tessedit_rejection_debug.set_value(FALSE);
   debug_x_ht_level.set_value(0);

   if (!word->word->bounding_box().contains(FCOORD(test_pt_x, test_pt_y)))
     return FALSE;
   if (location < 0)
     return TRUE;               // For breakpoint use

   tessedit_rejection_debug.set_value(TRUE);
   debug_x_ht_level.set_value(2);
   tprintf("\n\nTESTWD::");

   // Pick the label for the calling stage; unknown locations print no label.
   const char* stage_label = NULL;
   switch (location) {
     case 0:   stage_label = "classify_word_pass1 start\n";       break;
     case 10:  stage_label = "make_reject_map: initial map";      break;
     case 20:  stage_label = "make_reject_map: after NN";         break;
     case 30:  stage_label = "classify_word_pass2 - START";       break;
     case 40:  stage_label = "classify_word_pass2 - Pre Xht";     break;
     case 50:  stage_label = "classify_word_pass2 - END";         break;
     case 60:  stage_label = "fixspace";                          break;
     case 70:  stage_label = "MM pass START";                     break;
     case 80:  stage_label = "MM pass END";                       break;
     case 90:  stage_label = "After Poor quality rejection";      break;
     case 100: stage_label = "unrej_good_quality_words - START";  break;
     case 110: stage_label = "unrej_good_quality_words - END";    break;
     case 120: stage_label = "Write results pass";                break;
   }
   if (stage_label != NULL)
     tprintf("%s", stage_label);
   if (location == 0)
     word->word->print();

   // Only the end of pass 2 and the results pass show per-character detail.
   const BOOL8 show_map_detail =
       (location == 50 || location == 120) ? TRUE : FALSE;

   if (word->best_choice == NULL) {
     tprintf("null best choice\n");
   } else {
     tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
     word->reject_map.print(debug_fp);
     tprintf("\n");
     if (show_map_detail) {
       tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
       // NOTE(review): idx steps through the bytes of unichar_string() and is
       // also used to index reject_map; if the string can contain multi-byte
       // characters the two may not line up - confirm.
       for (inT16 idx = 0;
            word->best_choice->unichar_string()[idx] != '\0'; idx++) {
         tprintf("**** \"%c\" ****\n",
                 word->best_choice->unichar_string()[idx]);
         word->reject_map[idx].full_print(debug_fp);
       }
     }
   }

   tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
   tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
   return TRUE;
 }


 // Reports the modal (most frequent) font held in fonts.
 //
 //   fonts      - font statistics. When non-empty, the reported count is
 //                subtracted from the modal font's pile before returning.
 //   font_out   - receives the modal font index, or -1 if fonts is empty.
 //   font_count - receives the modal font's pile count, capped at MAX_INT8,
 //                or 0 if fonts is empty.
 static void find_modal_font(STATS *fonts,
                             inT16 *font_out,
                             inT8 *font_count) {
   if (fonts->get_total() <= 0) {
     // Nothing recorded: report "no font".
     *font_out = -1;
     *font_count = 0;
     return;
   }

   const inT16 modal_font = static_cast<inT16>(fonts->mode());
   *font_out = modal_font;

   const inT32 pile = fonts->pile_count(modal_font);
   *font_count = pile < MAX_INT8 ? pile : MAX_INT8;

   // Take the reported votes back out of the stats.
   fonts->add(modal_font, -*font_count);
 }


 // Works out the most likely fonts of a word from its per-blob font scores.
 //
 // For each blob of word->best_choice, the scores of the fonts listed by its
 // BLOB_CHOICE are summed per fontinfo id (ids outside the font table are
 // ignored). The ids with the highest and second highest positive totals
 // become word->fontinfo and word->fontinfo2 (NULL when there is none).
 // fontinfo_id_count / fontinfo_id2_count receive the totals divided by
 // MAX_UINT16, clipped to [1, MAX_INT8] and [0, MAX_INT8] respectively, so
 // fontinfo_id_count is 1 even when no font scored. When a top font exists,
 // word->italic and word->bold are set to +fontinfo_id_count or
 // -fontinfo_id_count according to that font's style; otherwise they are 0.
 //
 // Returns without touching the word when word->chopped_word is NULL or the
 // font table is empty.
 void Tesseract::set_word_fonts(WERD_RES *word) {
   // Don't try to set the word fonts for a cube word, as the configs
   // will be meaningless.
   if (word->chopped_word == NULL) return;
   ASSERT_HOST(word->best_choice != NULL);

   int fontinfo_size = get_fontinfo_table().size();
   if (fontinfo_size == 0) return;
   GenericVector<int> font_total_score;
   font_total_score.init_to_size(fontinfo_size, 0);

   word->italic = 0;
   word->bold = 0;
   // Compute the font scores for the word
   if (tessedit_debug_fonts) {
     tprintf("Examining fonts in %s\n",
             word->best_choice->debug_string().string());
   }
   for (int b = 0; b < word->best_choice->length(); ++b) {
     BLOB_CHOICE* choice = word->GetBlobChoice(b);
     if (choice == NULL) continue;
     const GenericVector<ScoredFont>& fonts = choice->fonts();
     for (int f = 0; f < fonts.size(); ++f) {
       int fontinfo_id = fonts[f].fontinfo_id;
       if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
         font_total_score[fontinfo_id] += fonts[f].score;
       }
     }
   }
   // Find the top and 2nd choice for the word.
   int score1 = 0, score2 = 0;
   inT16 font_id1 = -1, font_id2 = -1;
   for (int f = 0; f < fontinfo_size; ++f) {
     if (tessedit_debug_fonts && font_total_score[f] > 0) {
       tprintf("Font %s, total score = %d\n",
               fontinfo_table_.get(f).name, font_total_score[f]);
     }
     if (font_total_score[f] > score1) {
       // New leader: the previous leader drops to second place.
       score2 = score1;
       font_id2 = font_id1;
       score1 = font_total_score[f];
       font_id1 = f;
     } else if (font_total_score[f] > score2) {
       score2 = font_total_score[f];
       font_id2 = f;
     }
   }
   word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL;
   word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL;
   // Each score has a limit of MAX_UINT16, so divide by that to get the number
   // of "votes" for that font, ie number of perfect scores.
   word->fontinfo_id_count = ClipToRange(score1 / MAX_UINT16, 1, MAX_INT8);
   word->fontinfo_id2_count = ClipToRange(score2 / MAX_UINT16, 0, MAX_INT8);
   if (score1 > 0) {
     // Bind by reference rather than copying: get() yields an lvalue (its
     // address is taken above) and the entry is only read here.
     const FontInfo& fi = fontinfo_table_.get(font_id1);
     if (tessedit_debug_fonts) {
       if (word->fontinfo_id2_count > 0) {
         tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
                 fi.name, word->fontinfo_id_count,
                 fontinfo_table_.get(font_id2).name,
                 word->fontinfo_id2_count);
       } else {
         tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
                 fi.name, word->fontinfo_id_count);
       }
     }
     word->italic = (fi.is_italic() ? 1 : -1) * word->fontinfo_id_count;
     word->bold = (fi.is_bold() ? 1 : -1) * word->fontinfo_id_count;
   }
 }


 // Finds the modal font of the page and assigns it to weakly-fonted words.
 //
 // Three sweeps over the words of page_res:
 //   1. Each word's fontinfo / fontinfo2 votes are added to a histogram keyed
 //      by universal_id. If find_modal_font() reports a zero count, the
 //      function returns here.
 //   2. The first FontInfo whose universal_id equals the modal id is located
 //      (asserted to exist).
 //   3. Every word whose fontinfo_id_count is neither equal to its length
 //      nor, for words longer than 3, at least 3/4 of its length, gets the
 //      modal font with a count of 1 and italic/bold of +1 or -1.
 void Tesseract::font_recognition_pass(PAGE_RES* page_res) {
   PAGE_RES_IT page_res_it(page_res);
   STATS doc_fonts(0, font_table_size_);           // font counters

   // Sweep 1: gather font id statistics.
   page_res_it.restart_page();
   while (page_res_it.word() != NULL) {
     WERD_RES* voter = page_res_it.word();
     if (voter->fontinfo != NULL) {
       doc_fonts.add(voter->fontinfo->universal_id, voter->fontinfo_id_count);
     }
     if (voter->fontinfo2 != NULL) {
       doc_fonts.add(voter->fontinfo2->universal_id,
                     voter->fontinfo_id2_count);
     }
     page_res_it.forward();
   }

   inT16 doc_font;                 // modal font id
   inT8 doc_font_count;            // votes for the modal font
   find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
   if (doc_font_count == 0) {
     return;
   }

   // Sweep 2: get the modal font pointer.
   const FontInfo* modal_font = NULL;
   page_res_it.restart_page();
   while (page_res_it.word() != NULL) {
     WERD_RES* holder = page_res_it.word();
     if (holder->fontinfo != NULL &&
         holder->fontinfo->universal_id == doc_font) {
       modal_font = holder->fontinfo;
     } else if (holder->fontinfo2 != NULL &&
                holder->fontinfo2->universal_id == doc_font) {
       modal_font = holder->fontinfo2;
     }
     if (modal_font != NULL) {
       break;
     }
     page_res_it.forward();
   }
   ASSERT_HOST(modal_font != NULL);

   // Sweep 3: assign modal font to weak words.
   for (page_res_it.restart_page(); page_res_it.word() != NULL;
        page_res_it.forward()) {
     WERD_RES* target = page_res_it.word();
     int length = target->best_choice->length();
     int count = target->fontinfo_id_count;

     // A word is weak unless all of it, or (for words longer than 3) at
     // least three quarters of it, voted for its own font.
     const bool weak =
         count != length && (length <= 3 || count < length * 3 / 4);
     if (!weak) {
       continue;
     }
     target->fontinfo = modal_font;
     // Counts only get 1 as it came from the doc.
     target->fontinfo_id_count = 1;
     target->italic = modal_font->is_italic() ? 1 : -1;
     target->bold = modal_font->is_bold() ? 1 : -1;
   }
 }


 // For every word of page_res that has more than one entry in best_choices
 // and whose best choice is not a valid dictionary word, walks best_choices
 // in order and replaces the best choice with the first entry the dictionary
 // accepts. Words with no accepted entry are left unchanged. The replacement
 // is logged when tessedit_bigram_debug is set.
 void Tesseract::dictionary_correction_pass(PAGE_RES *page_res) {
   PAGE_RES_IT word_it(page_res);
   for (WERD_RES* word = word_it.word(); word != NULL;
        word = word_it.forward()) {
     // Skip words with no alternates.
     if (word->best_choices.singleton()) {
       continue;
     }
     // Skip words whose best choice is already in the dictionary.
     WERD_CHOICE* best = word->best_choice;
     if (word->tesseract->getDict().valid_word(*best) != 0) {
       continue;
     }

     WERD_CHOICE_IT choice_it(&word->best_choices);
     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
          choice_it.forward()) {
       WERD_CHOICE* alternate = choice_it.data();
       if (!word->tesseract->getDict().valid_word(*alternate)) {
         continue;  // Not a dictionary word either; try the next one.
       }
       if (tessedit_bigram_debug) {
         tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
                 best->unichar_string().string(),
                 alternate->unichar_string().string());
       }
       // Replace the 'best' choice with a better choice.
       word->ReplaceBestChoice(alternate);
       break;
     }
   }
 }


 }  // namespace tesseract

tesseract::FontInfo::is_bold
bool is_bold() const
Definition: fontinfo.h:112

WERD_RES::tess_accepted
BOOL8 tess_accepted
Definition: pageres.h:280

set_global_loc_code
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:79

WERD_RES::SetScriptPositions
void SetScriptPositions()
Definition: pageres.cpp:853

tesseract::Tesseract::test_pt_x
double test_pt_x
Definition: tesseractclass.h:887

WERD_RES::ConsumeWordResults
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:757

globals.h

tesseract::Tesseract::run_cube_combiner
void run_cube_combiner(PAGE_RES *page_res)
Definition: cube_control.cpp:193

tesseract::Tesseract
Definition: tesseractclass.h:170

STATS::get_total
inT32 get_total() const
Definition: statistc.h:86

WERD::ConstructFromSingleBlob
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:137

UNICHARSET::debug_str
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318

tesseract::Tesseract::quality_blob_pc
double quality_blob_pc
Definition: tesseractclass.h:873

tesseract::Tesseract::TrainedXheightFix
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1402

WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:356

C_BLOB::deep_copy
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:113

pgedit.h

AC_LC_ABBREV
a.b.c.
Definition: control.h:40

tesseract::Tesseract::AssignDiacriticsToNewBlobs
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:1029

PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1673

GenericVector::size
int size() const
Definition: genericvector.h:72

WERD_RES::best_choices
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227

WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:250

kRepcharGapThreshold
const double kRepcharGapThreshold
Definition: control.cpp:55

tesseract::Tesseract::right_to_left
bool right_to_left() const
Definition: tesseractclass.h:248

tesseract::Tesseract::match_word_pass_n
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1549

tesseract::Classify::classify_bln_numeric_mode
bool classify_bln_numeric_mode
Definition: classify.h:500

output.h

tesseract::Tesseract::recog_interactive
BOOL8 recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:84

tesseract::Tesseract::word_outline_errs
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:77

WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:324

tesseract::Tesseract::RetryWithLanguage
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:869

WERD_RES
Definition: pageres.h:155

tesseract::Tesseract::tessedit_display_outwords
bool tessedit_display_outwords
Definition: tesseractclass.h:830

REJMAP::length
inT32 length() const
Definition: rejctmap.h:237

GenericVector::length
int length() const
Definition: genericvector.h:79

MAX
#define MAX(x, y)
Definition: ndminx.h:24

WERD_RES::ReplaceBestChoice
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:787

control.h

tesseract::Tesseract::word_adaptable
BOOL8 word_adaptable(WERD_RES *word, uinT16 mode)
Definition: adaptions.cpp:45

tesseract::Tesseract::classify_word_pass2
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1488

tesseract::CCStruct::kXHeightCapRatio
static const double kXHeightCapRatio
Definition: ccstruct.h:37

tesseract::Tesseract::noise_maxperword
int noise_maxperword
Definition: tesseractclass.h:866

MAX_UINT16
#define MAX_UINT16
Definition: host.h:122

tesseract::Tesseract::make_reject_map
void make_reject_map(WERD_RES *word, ROW *row, inT16 pass)

tesseract::TesseractStats::dict_words
inT32 dict_words
Definition: tesseractclass.h:134

tesseract::Tesseract::ClassifyBlobPlusOutlines
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1190

WERD_CHOICE::length
int length() const
Definition: ratngs.h:300

WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:219

tesseract::Tesseract::SelectGoodDiacriticOutlines
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1105

PAGE_RES_IT::ResetWordIterator
void ResetWordIterator()
Definition: pageres.cpp:1532

GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:737

tesseract::Tesseract::rejection_passes
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:590

tesseract::Tesseract::SubAndSuperscriptFix
bool SubAndSuperscriptFix(WERD_RES *word_res)
Definition: superscript.cpp:101

WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:271

tesseract::PointerVector::truncate
void truncate(int size)
Definition: genericvector.h:464

USER_DAWG_PERM
Definition: ratngs.h:251

WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:201

tesseract::Tesseract::noise_cert_basechar
double noise_cert_basechar
Definition: tesseractclass.h:855

PAGE_RES_IT::ReplaceCurrentWord
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1321

tessbox.h

PAGE_RES_IT
Definition: pageres.h:656

ScrollView::Update
static void Update()
Definition: scrollview.cpp:715

tesseract::Classify::AdaptiveClassifierIsFull
bool AdaptiveClassifierIsFull() const
Definition: classify.h:284

tesseract::Tesseract::tessedit_test_adaption
bool tessedit_test_adaption
Definition: tesseractclass.h:882

PAGE_RES::char_count
inT32 char_count
Definition: pageres.h:60

tesseract::Tesseract::test_pt_y
double test_pt_y
Definition: tesseractclass.h:888

ETEXT_DESC::ocr_alive
volatile inT8 ocr_alive
Definition: ocrclass.h:117

WERD_RES::bold
inT8 bold
Definition: pageres.h:286

tprintf
#define tprintf(...)
Definition: tprintf.h:31

MIN
#define MIN(x, y)
Definition: ndminx.h:28

UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463

tesseract::TesseractStats::doc_char_quality
inT16 doc_char_quality
Definition: tesseractclass.h:130

tesseract::Tesseract::tessedit_fix_hyphens
bool tessedit_fix_hyphens
Definition: tesseractclass.h:837

tesseract::FontInfo::name
char * name
Definition: fontinfo.h:117

make_pseudo_word
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:31

tesseract::WordData::prev_word
WordData * prev_word
Definition: tesseractclass.h:155

STATS
Definition: statistc.h:33

tesseract::Tesseract::ClassifyBlobAsWord
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1232

GenericVector::back
T & back() const
Definition: genericvector.h:668

tesseract::Tesseract::noise_cert_disjoint
double noise_cert_disjoint
Definition: tesseractclass.h:858

ETEXT_DESC::cancel_this
void * cancel_this
Definition: ocrclass.h:120

tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:72

tesseract::Tesseract::quality_min_initial_alphas_reqd
int quality_min_initial_alphas_reqd
Definition: tesseractclass.h:877

tesseract::WordData::lang_words
PointerVector< WERD_RES > lang_words
Definition: tesseractclass.h:156

LOC_DOC_BLK_REJ
#define LOC_DOC_BLK_REJ
Definition: errcode.h:53

tesseract::Tesseract::ReassignDiacritics
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:910

ocrclass.h

kMinRefitXHeightFraction
const double kMinRefitXHeightFraction
Definition: control.cpp:58

TBOX::print
void print() const
Definition: rect.h:270

WERD::AddSelectedOutlines
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB * > &target_blobs, const GenericVector< C_OUTLINE * > &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:548

LOC_MM_ADAPT
#define LOC_MM_ADAPT
Definition: errcode.h:52

tesseract::Tesseract::blamer_pass
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:686

tesseract::Tesseract::tessedit_minimal_rej_pass1
bool tessedit_minimal_rej_pass1
Definition: tesseractclass.h:881

tesseract::ParamUtils::PrintParams
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:180

clear_fx_win
void clear_fx_win()
Definition: drawfx.cpp:73

tesseract::Tesseract::tessedit_write_unlv
bool tessedit_write_unlv
Definition: tesseractclass.h:1003

tesseract::TesseractStats::doc_good_char_quality
inT16 doc_good_char_quality
Definition: tesseractclass.h:132

AC_UC_ABBREV
A.B.C.
Definition: control.h:41

WERD_RES::caps_height
float caps_height
Definition: pageres.h:296

STATS::mode
inT32 mode() const
Definition: statistc.cpp:118

tesseract::Tesseract::output_pass
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:68

STATS::add
void add(inT32 value, inT32 count)
Definition: statistc.cpp:104

PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:658

tesseract::PointerVector< WERD_RES >

BOOL8
unsigned char BOOL8
Definition: host.h:113

ROW::x_height
float x_height() const
Definition: ocrrow.h:61

WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:160

WERD_RES::InitForRetryRecognition
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:269

STRING::length
inT32 length() const
Definition: strngs.cpp:188

BLOB_CHOICE
Definition: ratngs.h:48

tesseract::PageSegMode
PageSegMode
Definition: publictypes.h:151

tesseract::WordData::word
WERD_RES * word
Definition: tesseractclass.h:152

tesseract::Tesseract::tessedit_timing_debug
bool tessedit_timing_debug
Definition: tesseractclass.h:832

WERD_RES::fontinfo
const FontInfo * fontinfo
Definition: pageres.h:288

C_BLOB::SortByXMiddle
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:119

tesseract::Tesseract::ComputeCompatibleXheight
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:101

tesseract::Tesseract::chs_leading_punct
char * chs_leading_punct
Definition: tesseractclass.h:869

tesseract::TesseractStats::doc_outline_errs
inT16 doc_outline_errs
Definition: tesseractclass.h:129

WERD_RES::fix_quotes
void fix_quotes()
Definition: pageres.cpp:1012

WERD_CHOICE
Definition: ratngs.h:271

tesseract::Tesseract::ReportXhtFixResult
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1381

tesseract::TesseractStats::doc_blob_quality
inT16 doc_blob_quality
Definition: tesseractclass.h:128

ETEXT_DESC
Definition: ocrclass.h:112

PAGE_RES::blame_reasons
GenericVector< int > blame_reasons
Definition: pageres.h:68

tesseract::Tesseract::quality_rej_pc
double quality_rej_pc
Definition: tesseractclass.h:872

WERD_RES::small_caps
bool small_caps
Definition: pageres.h:283

tesseract::Classify::fontinfo_table_
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:488

PAGE_RES_IT::DeleteCurrentWord
void DeleteCurrentWord()
Definition: pageres.cpp:1449

TBOX::right
inT16 right() const
Definition: rect.h:75

FindMatchingChoice
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:160

tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
Definition: control.cpp:976

BlamerBundle::incorrect_result_reason
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106

WERD_RES::x_height
float x_height
Definition: pageres.h:295

tesseract::Tesseract::tessedit_word_for_word
bool tessedit_word_for_word
Definition: tesseractclass.h:1020

tesseract::Tesseract::recog_all_words
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:287

werdit.h

tesseract::Tesseract::tessedit_pageseg_mode
int tessedit_pageseg_mode
Definition: tesseractclass.h:791

tesseract::Tesseract::tessedit_rejection_debug
bool tessedit_rejection_debug
Definition: tesseractclass.h:1025

BLOCK_RES::block
BLOCK * block
Definition: pageres.h:99

WERD_CHOICE::shallow_copy
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:392

WERD_CHOICE::min_x_height
float min_x_height() const
Definition: ratngs.h:333

tesseract::Tesseract::CountMisfitTops
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:69

ClipToRange
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115

WERD_RES::tess_would_adapt
BOOL8 tess_would_adapt
Definition: pageres.h:281

tesseract::WordData
Definition: tesseractclass.h:144

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:84

SortHelper
Definition: sorthelper.h:36

ROW
Definition: ocrrow.h:32

tesseract::Tesseract::script_pos_pass
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:710

tesseract::FontInfo
Definition: fontinfo.h:62

tesseract::TesseractStats::good_char_count
inT16 good_char_count
Definition: tesseractclass.h:131

tesseract::Wordrec::prev_word_best_choice_
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:416

tesseract::Tesseract::debug_noise_removal
int debug_noise_removal
Definition: tesseractclass.h:852

WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:524

W_BOL
Definition: werd.h:35

tesseract::Tesseract::tessedit_reject_bad_qual_wds
bool tessedit_reject_bad_qual_wds
Definition: tesseractclass.h:925

tesseract::Tesseract::tessedit_enable_dict_correction
bool tessedit_enable_dict_correction
Definition: tesseractclass.h:846

AC_UPPER_CASE
ALL upper case.
Definition: control.h:38

tesseract::Wordrec::wordrec_debug_blamer
bool wordrec_debug_blamer
Definition: wordrec.h:167

PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:739

WERD_RES::part_of_combo
BOOL8 part_of_combo
Definition: pageres.h:316

C_BLOB::out_list
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64

tesseract::Tesseract::quality_char_pc
double quality_char_pc
Definition: tesseractclass.h:876

TWERD::plot
void plot(ScrollView *window)
Definition: blobs.cpp:918

PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:713

tesseract::Classify::classify_debug_level
int classify_debug_level
Definition: classify.h:390

tesseract::WordData::block
BLOCK * block
Definition: tesseractclass.h:154

tessvars.h

tesseract::Tesseract::noise_cert_punc
double noise_cert_punc
Definition: tesseractclass.h:861

BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:79

WERD_CHOICE::GetNonSuperscriptSpan
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:375

tesseract::Tesseract::quality_based_rejection
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:140

REJMAP::full_print
void full_print(FILE *fp)
Definition: rejctmap.cpp:406

tesseract::Wordrec::wordrec_run_blamer
bool wordrec_run_blamer
Definition: wordrec.h:168

BLOCK::classify_rotation
FCOORD classify_rotation() const
Definition: ocrblock.h:144

PAGE_RES::misadaption_log
GenericVector< STRING > misadaption_log
Definition: pageres.h:73

WERD_RES::fontinfo_id_count
inT8 fontinfo_id_count
Definition: pageres.h:290

ETEXT_DESC::cancel
CANCEL_FUNC cancel
Definition: ocrclass.h:119

PAGE_RES_IT::MakeCurrentWordFuzzy
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1482

drawfx.h

tesseract::WordData::row
ROW * row
Definition: tesseractclass.h:153

PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:680

tesseract::Dict::valid_word_permuter
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:447

tesseract::Tesseract::noise_cert_factor
double noise_cert_factor
Definition: tesseractclass.h:864

W_EOL
Definition: werd.h:36

tesseract::TesseractStats::word_count
inT32 word_count
Definition: tesseractclass.h:133

tesseract::Tesseract::check_debug_pt
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1767

BLOCK::x_height
inT32 x_height() const
return xheight
Definition: ocrblock.h:110

tesseract::Tesseract::tessedit_fix_fuzzy_spaces
bool tessedit_fix_fuzzy_spaces
Definition: tesseractclass.h:834

tesseract::Tesseract::word_char_quality
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97

tesseract::OEM_TESSERACT_ONLY
Definition: publictypes.h:257

TBOX::left
inT16 left() const
Definition: rect.h:68

tesseract::Tesseract::ProcessTargetWord
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:118

WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:327

C_OUTLINE
Definition: coutln.h:69

WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:244

ScrollView::ZoomToRectangle
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:765

LOC_WRITE_RESULTS
#define LOC_WRITE_RESULTS
Definition: errcode.h:54

tesseract::Tesseract::tessedit_enable_bigram_correction
bool tessedit_enable_bigram_correction
Definition: tesseractclass.h:844

UNICHARSET::script_has_xheight
bool script_has_xheight() const
Definition: unicharset.h:849

WERD_CHOICE::unichar_id
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312

ETEXT_DESC::deadline_exceeded
bool deadline_exceeded() const
Definition: ocrclass.h:144

tesseract::Tesseract::tessedit_tess_adaption_mode
int tessedit_tess_adaption_mode
Definition: tesseractclass.h:879

PAGE_RES
Definition: pageres.h:58

tesseract::FontInfo::universal_id
inT32 universal_id
Definition: fontinfo.h:123

BLOCK
Definition: ocrblock.h:30

WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:192

W_REP_CHAR
Definition: werd.h:41

ACCEPTABLE_WERD_TYPE
ACCEPTABLE_WERD_TYPE
Definition: control.h:34

GenericVector::init_to_size
void init_to_size(int size, T t)
Definition: genericvector.h:646

FREQ_DAWG_PERM
Definition: ratngs.h:252

PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:736

SYSTEM_DAWG_PERM
Definition: ratngs.h:249

tesseract::Tesseract::tessedit_ocr_engine_mode
int tessedit_ocr_engine_mode
Definition: tesseractclass.h:795

tesseract::FontInfo::is_italic
bool is_italic() const
Definition: fontinfo.h:111

tesseract::Tesseract::poly_allow_detailed_fx
bool poly_allow_detailed_fx
Definition: tesseractclass.h:1065

BlamerBundle::IncorrectReasonName
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:56

WERD_CHOICE::permuter
uinT8 permuter() const
Definition: ratngs.h:343

tesseract::OEM_CUBE_ONLY
Definition: publictypes.h:258

WERD::print
void print()
Definition: werd.cpp:266

tesseract::Tesseract::classify_word_pass1
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1344

tesseract::Tesseract::RecogAllWordsPassN
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:207

tesseract::Tesseract::dictionary_correction_pass
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2015

WERD_CHOICE::debug_string
const STRING debug_string() const
Definition: ratngs.h:502

WERD_RES::raw_choice
WERD_CHOICE * raw_choice
Definition: pageres.h:224

PAGE_RES::rej_count
inT32 rej_count
Definition: pageres.h:61

WERD_RES::tesseract
tesseract::Tesseract * tesseract
Definition: pageres.h:266

tesseract::Classify::getDict
Dict & getDict()
Definition: classify.h:65

WERD_RES::PrintBestChoices
void PrintBestChoices() const
Definition: pageres.cpp:709

kBackUpConfigFile
const char *const kBackUpConfigFile
Definition: control.cpp:53

tesseract::Tesseract::font_recognition_pass
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:1958

tesseract::Tesseract::tessedit_enable_doc_dict
bool tessedit_enable_doc_dict
Definition: tesseractclass.h:840

BlamerBundle::SetMisAdaptionDebug
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:574

tesseract::OEM_TESSERACT_CUBE_COMBINED
Definition: publictypes.h:259

tesseract::Tesseract::acceptable_word_string
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1663

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:33

WERD_RES::baseline_shift
float baseline_shift
Definition: pageres.h:297

tesseract::Tesseract::cube_debug_level
int cube_debug_level
Definition: tesseractclass.h:893

WERD_RES::guessed_x_ht
BOOL8 guessed_x_ht
Definition: pageres.h:292

tesseract::Dict::valid_bigram
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:738

fx_win
EXTERN ScrollView * fx_win
Definition: drawfx.cpp:51

WERD_RES::fontinfo2
const FontInfo * fontinfo2
Definition: pageres.h:289

MAX_INT32
#define MAX_INT32
Definition: host.h:120

tesseract::Tesseract::bigram_correction_pass
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:442

tesseract::Classify::SwitchAdaptiveClassifier
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:628

tesseract::Tesseract::tessedit_debug_fonts
bool tessedit_debug_fonts
Definition: tesseractclass.h:841

BlamerBundle::misadaption_debug
const STRING & misadaption_debug() const
Definition: blamer.h:119

UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456

WERD
Definition: werd.h:60

tesseract::Tesseract::SetupWordPassN
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:171

tesseract::Tesseract::debug_x_ht_level
int debug_x_ht_level
Definition: tesseractclass.h:867

tesseract::Tesseract::BestPix
Pix * BestPix() const
Definition: tesseractclass.h:212

tesseract::Tesseract::tess_segment_pass_n
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:39

tesseract::Tesseract::cube_word_pass1
void cube_word_pass1(BLOCK *block, ROW *row, WERD_RES *word)
Definition: cube_control.cpp:235

REJMAP::rej_word_bad_quality
void rej_word_bad_quality()
Definition: rejctmap.cpp:488

tesseract::Tesseract::word_blob_quality
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:65

TBOX::major_x_overlap
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402

reject.h

tesseract::Classify::get_fontinfo_table
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345

TBOX::bottom
inT16 bottom() const
Definition: rect.h:61

TWERD::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:881

WERD_RES::done
BOOL8 done
Definition: pageres.h:282

tesseract::Classify::AdaptiveClassifierIsEmpty
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:285

tesseract::Tesseract::SetupAllWordsPassN
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:148

TBOX::major_overlap
bool major_overlap(const TBOX &box) const
Definition: rect.h:358

WERD_RES::fontinfo_id2_count
inT8 fontinfo_id2_count
Definition: pageres.h:291

WERD_RES::word
WERD * word
Definition: pageres.h:175

MAX_INT8
#define MAX_INT8
Definition: host.h:118

TBOX::x_overlap
bool x_overlap(const TBOX &box) const
Definition: rect.h:391

GenericVector::empty
bool empty() const
Definition: genericvector.h:84

tesseract::CCUtil::params
ParamsVectors * params()
Definition: ccutil.h:65

tesseract::Classify::classify_max_rating_ratio
double classify_max_rating_ratio
Definition: classify.h:402

tesseract::Tesseract::AnyTessLang
bool AnyTessLang() const
Definition: tesseractclass.h:258

AC_LOWER_CASE
ALL lower case.
Definition: control.h:37

IRR_NUM_REASONS
Definition: blamer.h:84

tesseract::Tesseract::fix_rep_char
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1624

STRING::add_str_int
void add_str_int(const char *str, int number)
Definition: strngs.cpp:376

WERD_CHOICE::max_x_height
float max_x_height() const
Definition: ratngs.h:336

PAGE_RES_IT::InsertSimpleCloneWord
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1268

WERD::GetNoiseOutlines
void GetNoiseOutlines(GenericVector< C_OUTLINE * > *outlines)
Definition: werd.cpp:530

tesseract::Tesseract::classify_word_and_language
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1268

tesseract::Tesseract::recog_pseudo_word
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:68

docqual.h

AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:36

BlamerBundle::CopyTruth
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:187

UNICHARSET::top_bottom_useful
bool top_bottom_useful() const
Definition: unicharset.h:495

tesseract::BoxWord::length
const int length() const
Definition: boxword.h:85

tesseract::Tesseract::set_word_fonts
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1880

FALSE
#define FALSE
Definition: capi.h:29

tesseract
Definition: baseapi.cpp:83

WERD_RES::tess_failed
BOOL8 tess_failed
Definition: pageres.h:272

count
int count(LIST var_list)
Definition: oldlist.cpp:108

ETEXT_DESC::progress
inT16 progress
Definition: ocrclass.h:115

SortHelper::Add
void Add(T value, int count)
Definition: sorthelper.h:65

sorthelper.h

callcpp.h

WERD_RES::fix_hyphens
void fix_hyphens()
Definition: pageres.cpp:1041

tesseract::Tesseract::TestNewNormalization
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1437

tesseract::Classify::classify_max_certainty_margin
double classify_max_certainty_margin
Definition: classify.h:404

ROW_RES::row
ROW * row
Definition: pageres.h:127

REJMAP::reject_count
inT16 reject_count()
Definition: rejctmap.h:243

TBOX
Definition: rect.h:30

TRUE
#define TRUE
Definition: capi.h:28

tesseract::WordRecognizer
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: tesseractclass.h:166

tesseract::Tesseract::tess_acceptable_word
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:69

UNICHARSET
Definition: unicharset.h:139

FCOORD::y
float y() const
Definition: points.h:212

WERD_RES::SetupForRecognition
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294

debug_fp
FILE * debug_fp
Definition: tessvars.cpp:24

tesseract::CCUtil::lang
STRING lang
Definition: ccutil.h:69

WERD::flag
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128

tesseract::Tesseract::PrerecAllWordsPar
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:36

TBOX::contains
bool contains(const FCOORD pt) const
Definition: rect.h:323

tesseract::Classify::AdaptableWord
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:850

C_BLOB::bounding_box
TBOX bounding_box() const
Definition: stepblob.cpp:250

STATS::pile_count
inT32 pile_count(inT32 value) const
Definition: statistc.h:78

BlamerBundle
Definition: blamer.h:88

AC_INITIAL_CAP
Initial capital; all other letters lower case.
Definition: control.h:39

tesseract::Textord::CleanupSingleRowResult
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:359

tesseract::ParamUtils::ReadParamsFile
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41

STRING
Definition: strngs.h:44

BLOB_CHOICE::fonts
const GenericVector< tesseract::ScoredFont > & fonts() const
Definition: ratngs.h:91

set_global_subloc_code
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:85

REJMAP::initialise
void initialise(inT16 length)
Definition: rejctmap.cpp:318

C_BLOB
Definition: stepblob.h:30

NULL
#define NULL
Definition: host.h:144

LOC_FUZZY_SPACE
#define LOC_FUZZY_SPACE
Definition: errcode.h:50

WERD_RES::IsAmbiguous
bool IsAmbiguous()
Definition: pageres.cpp:443

inT8
SIGNED char inT8
Definition: host.h:98

WERD_RES::GetBlobChoice
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:742

tesseract::Tesseract::tessedit_dump_choices
bool tessedit_dump_choices
Definition: tesseractclass.h:831

tesseract::Classify::LearnWord
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244

tesseract::Tesseract::chs_trailing_punct1
char * chs_trailing_punct1
Definition: tesseractclass.h:870

tesseract::Tesseract::test_pt
bool test_pt
Definition: tesseractclass.h:886

WERD_RES::italic
inT8 italic
Definition: pageres.h:285

GenericVector
Definition: baseapi.h:41

tesseract::Tesseract::quality_outline_pc
double quality_outline_pc
Definition: tesseractclass.h:875

tesseract::Tesseract::tessedit_minimal_rejection
bool tessedit_minimal_rejection
Definition: tesseractclass.h:1017

REJMAP::print
void print(FILE *fp)
Definition: rejctmap.cpp:394

tesseract::Classify::StartBackupAdaptiveClassifier
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:644

tesseract::Tesseract::tessedit_bigram_debug
int tessedit_bigram_debug
Definition: tesseractclass.h:848

STRING::string
const char * string() const
Definition: strngs.cpp:193

TBOX::top
inT16 top() const
Definition: rect.h:54

SUBLOC_NORM
#define SUBLOC_NORM
Definition: errcode.h:59

tesseract::Tesseract::tessedit_debug_quality_metrics
bool tessedit_debug_quality_metrics
Definition: tesseractclass.h:928

BlamerBundle::LastChanceBlame
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:547

tesseract::SET_PARAM_CONSTRAINT_DEBUG_ONLY
Definition: params.h:38

tesseract::Tesseract::noise_maxperblob
int noise_maxperblob
Definition: tesseractclass.h:865

WERD_RES::BestChoiceToCorrectText
void BestChoiceToCorrectText()
Definition: pageres.cpp:917

fixspace.h

tesseract::Tesseract::chs_trailing_punct2
char * chs_trailing_punct2
Definition: tesseractclass.h:871

BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76

FCOORD
Definition: points.h:189

tesseract::Tesseract::textord_use_cjk_fp_model
bool textord_use_cjk_fp_model
Definition: tesseractclass.h:1063

WERD_RES::GetBlobChoices
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:751

WERD_RES::blamer_bundle
BlamerBundle * blamer_bundle
Definition: pageres.h:230

tesseract::Tesseract::fix_fuzzy_spaces
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:48

tesseract::Tesseract::tessedit_parallelize
int tessedit_parallelize
Definition: tesseractclass.h:1078

EqualIgnoringCaseAndTerminalPunct
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:791

WERD::rej_cblob_list
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95

WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:100

PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:733

create_fx_win
void create_fx_win()
Definition: drawfx.cpp:60

STRING::contains
BOOL8 contains(const char c) const
Definition: strngs.cpp:184

inT16
short inT16
Definition: host.h:100

inT32
int inT32
Definition: host.h:102

tesseract::Tesseract::tess_add_doc_word
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:79

uinT8
unsigned char uinT8
Definition: host.h:99

tesseractclass.h