tesseract-ocr.github.io/3.x/a01063_source.html

 // File:        dict.cpp

 // Description: dict class.

 // Author:      Samuel Charron

 //

 // (C) Copyright 2006, Google Inc.

 // Licensed under the Apache License, Version 2.0 (the "License");

 // you may not use this file except in compliance with the License.

 // You may obtain a copy of the License at

 // http://www.apache.org/licenses/LICENSE-2.0

 // Unless required by applicable law or agreed to in writing, software

 // distributed under the License is distributed on an "AS IS" BASIS,

 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

 // See the License for the specific language governing permissions and

 // limitations under the License.

 //


 #include <stdio.h>


 #include "dict.h"

 #include "unicodes.h"


 #ifdef _MSC_VER

 #pragma warning(disable:4244)  // Conversion warnings

 #endif

 #include "tprintf.h"


 namespace tesseract {


 class Image;


 // Constructs a Dict that takes its configuration from ccutil.
 // Every *_MEMBER / *_INIT_MEMBER entry below passes a default value, a help
 // string and getCCUtil()->params() to the corresponding parameter macro.
 // The constructor body only resets internal pointers and state; no dawgs are
 // loaded until Load() is called.
 Dict::Dict(CCUtil* ccutil)
     : letter_is_okay_(&tesseract::Dict::def_letter_is_okay),
       probability_in_context_(&tesseract::Dict::def_probability_in_context),
       params_model_classify_(NULL),
       ccutil_(ccutil),
       // User supplied word and pattern lists.
       STRING_MEMBER(user_words_file, "",
                     "A filename of user-provided words.",
                     getCCUtil()->params()),
       STRING_INIT_MEMBER(user_words_suffix, "",
                          "A suffix of user-provided words located in tessdata.",
                          getCCUtil()->params()),
       STRING_MEMBER(user_patterns_file, "",
                     "A filename of user-provided patterns.",
                     getCCUtil()->params()),
       STRING_INIT_MEMBER(user_patterns_suffix, "",
                          "A suffix of user-provided patterns located in "
                          "tessdata.",
                          getCCUtil()->params()),
       // Switches selecting which dawgs Load() reads.
       BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.",
                        getCCUtil()->params()),
       BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.",
                        getCCUtil()->params()),
       BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
                        getCCUtil()->params()),
       BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation"
                        " patterns.", getCCUtil()->params()),
       BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number"
                        " patterns.", getCCUtil()->params()),
       BOOL_INIT_MEMBER(load_bigram_dawg, true, "Load dawg with special word "
                        "bigrams.", getCCUtil()->params()),
       // Penalties used by adjust_word().
       double_MEMBER(xheight_penalty_subscripts, 0.125,
                     "Score penalty (0.1 = 10%) added if there are subscripts "
                     "or superscripts in a word, but it is otherwise OK.",
                     getCCUtil()->params()),
       double_MEMBER(xheight_penalty_inconsistent, 0.25,
                     "Score penalty (0.1 = 10%) added if an xheight is "
                     "inconsistent.", getCCUtil()->params()),
       // Fixed: the two literals used to concatenate into "andare".
       double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
                     "Score multiplier for word matches which have good case and "
                     "are frequent in the given language (lower is better).",
                     getCCUtil()->params()),
       double_MEMBER(segment_penalty_dict_case_ok, 1.1,
                     "Score multiplier for word matches that have good case "
                     "(lower is better).", getCCUtil()->params()),
       double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
                     "Default score multiplier for word matches, which may have "
                     "case issues (lower is better).",
                     getCCUtil()->params()),
       double_MEMBER(segment_penalty_ngram_best_choice, 1.24,
                    "Multipler to for the best choice from the ngram model.",
                    getCCUtil()->params()),
       double_MEMBER(segment_penalty_dict_nonword, 1.25,
                     "Score multiplier for glyph fragment segmentations which "
                     "do not match a dictionary word (lower is better).",
                     getCCUtil()->params()),
       double_MEMBER(segment_penalty_garbage, 1.50,
                     "Score multiplier for poorly cased strings that are not in"
                     " the dictionary and generally look like garbage (lower is"
                     " better).", getCCUtil()->params()),
       STRING_MEMBER(output_ambig_words_file, "",
                     "Output file for ambiguities found in the dictionary",
                     getCCUtil()->params()),
       // Debugging and search limits.
       INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info"
                  ", to 2 for more details, to 3 to see all the debug messages",
                  getCCUtil()->params()),
       INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.",
                  getCCUtil()->params()),
       INT_MEMBER(max_viterbi_list_size, 10, "Maximum size of viterbi list.",
                  getCCUtil()->params()),
       BOOL_MEMBER(use_only_first_uft8_step, false,
                   "Use only the first UTF8 step of the given string"
                   " when computing log probabilities.",
                   getCCUtil()->params()),
       double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor",
                     getCCUtil()->params()),
       // Stopper parameters.
       double_MEMBER(stopper_nondict_certainty_base, -2.50,
                     "Certainty threshold for non-dict words",
                     getCCUtil()->params()),
       double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0,
                     "Reject certainty offset",
                     getCCUtil()->params()),
       INT_MEMBER(stopper_smallword_size, 2,
                  "Size of dict word to be treated as non-dict word",
                  getCCUtil()->params()),
       double_MEMBER(stopper_certainty_per_char, -0.50, "Certainty to add"
                     " for each dict char above small word size.",
                     getCCUtil()->params()),
       double_MEMBER(stopper_allowable_character_badness, 3.0,
                     "Max certaintly variation allowed in a word (in sigma)",
                     getCCUtil()->params()),
       INT_MEMBER(stopper_debug_level, 0, "Stopper debug level",
                  getCCUtil()->params()),
       BOOL_MEMBER(stopper_no_acceptable_choices, false,
                   "Make AcceptableChoice() always return false. Useful"
                   " when there is a need to explore all segmentations",
                   getCCUtil()->params()),
       BOOL_MEMBER(save_raw_choices, false,
                   "Deprecated- backward compatablity only",
                   getCCUtil()->params()),
       INT_MEMBER(tessedit_truncate_wordchoice_log, 10,
                  "Max words to keep in list",
                  getCCUtil()->params()),
       STRING_MEMBER(word_to_debug, "", "Word for which stopper debug"
                     " information should be printed to stdout",
                     getCCUtil()->params()),
       STRING_MEMBER(word_to_debug_lengths, "",
                     "Lengths of unichars in word_to_debug",
                     getCCUtil()->params()),
       INT_MEMBER(fragments_debug, 0, "Debug character fragments",
                  getCCUtil()->params()),
       // Fixed: the first two literals used to concatenate into "tricks.Set".
       BOOL_MEMBER(segment_nonalphabetic_script, false,
                  "Don't use any alphabetic-specific tricks. "
                  "Set to true in the traineddata config file for"
                  " scripts that are cursive or inherently fixed-pitch",
                  getCCUtil()->params()),
       // Document dictionary parameters used by add_document_word().
       // Fixed: the default of this boolean parameter was written as 0.
       BOOL_MEMBER(save_doc_words, false, "Save Document Words",
                   getCCUtil()->params()),
       double_MEMBER(doc_dict_pending_threshold, 0.0,
                     "Worst certainty for using pending dictionary",
                     getCCUtil()->params()),
       // Fixed: the two literals used to concatenate into "thedocument".
       double_MEMBER(doc_dict_certainty_threshold, -2.25,
                     "Worst certainty for words that can be inserted into the "
                     "document dictionary", getCCUtil()->params()),
       INT_MEMBER(max_permuter_attempts, 10000, "Maximum number of different"
                  " character choices to consider during permutation."
                  " This limit is especially useful when user patterns"
                  " are specified, since overly generic patterns can result in"
                  " dawg search exploring an overly large number of options.",
                  getCCUtil()->params()) {
   // Nothing is loaded yet: every dawg, table and cache pointer starts NULL.
   dang_ambigs_table_ = NULL;
   replace_ambigs_table_ = NULL;
   reject_offset_ = 0.0;
   go_deeper_fxn_ = NULL;
   hyphen_word_ = NULL;
   last_word_on_line_ = false;
   hyphen_unichar_id_ = INVALID_UNICHAR_ID;
   document_words_ = NULL;
   dawg_cache_ = NULL;
   dawg_cache_is_ours_ = false;
   pending_words_ = NULL;
   bigram_dawg_ = NULL;
   freq_dawg_ = NULL;
   punc_dawg_ = NULL;
   unambig_dawg_ = NULL;
   wordseg_rating_adjust_factor_ = -1.0f;
   output_ambig_words_file_ = NULL;
 }


 // Deletes the stored hyphenated word and closes the ambiguity output file
 // if one is open.
 Dict::~Dict() {
   // Deleting a NULL pointer is a no-op, so no guard is required here.
   delete hyphen_word_;
   if (output_ambig_words_file_) {
     fclose(output_ambig_words_file_);
   }
 }


 // Returns the shared DawgCache singleton, creating it on the first call.
 DawgCache *Dict::GlobalDawgCache() {
   // The cache is allocated on the heap and never deleted on purpose: that
   // way it outlives every Tesseract instance, including instances that
   // other code may declare as global statics.
   static DawgCache *const shared_cache = new DawgCache();
   return shared_cache;
 }


 // Loads the dawgs selected by the load_*_dawg parameters, the optional user
 // word/pattern lists, and creates the (initially empty) document and pending
 // word tries. Finally builds successors_, the per-dawg list of dawgs that may
 // follow it.
 //   dawg_cache: cache used to obtain the squished dawgs. When NULL, a
 //               private DawgCache is allocated and flagged as owned by this
 //               Dict (dawg_cache_is_ours_).
 // The order in which dawgs are appended to dawgs_ defines their indices.
 void Dict::Load(DawgCache *dawg_cache) {
   STRING &lang = getCCUtil()->lang;

   // Reloading on top of an already loaded dictionary starts from scratch.
   if (dawgs_.length() != 0) this->End();

   // Cache the ids of the symbols that get special treatment.
   apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol);
   question_unichar_id_ = getUnicharset().unichar_to_id(kQuestionSymbol);
   slash_unichar_id_ = getUnicharset().unichar_to_id(kSlashSymbol);
   hyphen_unichar_id_ = getUnicharset().unichar_to_id(kHyphenSymbol);

   // Use the caller's cache when given, otherwise create our own.
   dawg_cache_is_ours_ = (dawg_cache == NULL);
   dawg_cache_ = dawg_cache_is_ours_ ? new DawgCache() : dawg_cache;

   TessdataManager &tessdata_manager = getCCUtil()->tessdata_manager;
   const char *data_file_name = tessdata_manager.GetDataFileName().string();

   // Squished dawgs stored in the traineddata file. A dawg that could not be
   // obtained (NULL) is simply not appended to dawgs_.
   if (load_punc_dawg) {
     punc_dawg_ = dawg_cache_->GetSquishedDawg(
         lang, data_file_name, TESSDATA_PUNC_DAWG, dawg_debug_level);
     if (punc_dawg_) {
       dawgs_ += punc_dawg_;
     }
   }
   if (load_system_dawg) {
     Dawg *loaded_system_dawg = dawg_cache_->GetSquishedDawg(
         lang, data_file_name, TESSDATA_SYSTEM_DAWG, dawg_debug_level);
     if (loaded_system_dawg) {
       dawgs_ += loaded_system_dawg;
     }
   }
   if (load_number_dawg) {
     Dawg *loaded_number_dawg = dawg_cache_->GetSquishedDawg(
         lang, data_file_name, TESSDATA_NUMBER_DAWG, dawg_debug_level);
     if (loaded_number_dawg) {
       dawgs_ += loaded_number_dawg;
     }
   }
   if (load_bigram_dawg) {
     // The bigram dawg is kept separately and is not part of dawgs_.
     bigram_dawg_ = dawg_cache_->GetSquishedDawg(
         lang, data_file_name, TESSDATA_BIGRAM_DAWG, dawg_debug_level);
   }
   if (load_freq_dawg) {
     freq_dawg_ = dawg_cache_->GetSquishedDawg(
         lang, data_file_name, TESSDATA_FREQ_DAWG, dawg_debug_level);
     if (freq_dawg_) {
       dawgs_ += freq_dawg_;
     }
   }
   if (load_unambig_dawg) {
     unambig_dawg_ = dawg_cache_->GetSquishedDawg(
         lang, data_file_name, TESSDATA_UNAMBIG_DAWG, dawg_debug_level);
     if (unambig_dawg_) {
       dawgs_ += unambig_dawg_;
     }
   }

   // User word list: an explicit file name wins over the tessdata suffix.
   const bool has_user_words_file = ((STRING &)user_words_file).length() > 0;
   if (has_user_words_file || ((STRING &)user_words_suffix).length() > 0) {
     Trie *user_words = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM,
                                 getUnicharset().size(), dawg_debug_level);
     STRING word_list_path;
     if (has_user_words_file) {
       word_list_path = user_words_file;
     } else {
       word_list_path = getCCUtil()->language_data_path_prefix;
       word_list_path += user_words_suffix;
     }
     if (user_words->read_and_add_word_list(word_list_path.string(),
                                            getUnicharset(),
                                            Trie::RRP_REVERSE_IF_HAS_RTL)) {
       dawgs_ += user_words;
     } else {
       tprintf("Error: failed to load %s\n", word_list_path.string());
       delete user_words;
     }
   }

   // User pattern list: same file-over-suffix rule as for the word list.
   const bool has_user_patterns_file =
       ((STRING &)user_patterns_file).length() > 0;
   if (has_user_patterns_file ||
       ((STRING &)user_patterns_suffix).length() > 0) {
     Trie *user_patterns = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
                                    getUnicharset().size(), dawg_debug_level);
     user_patterns->initialize_patterns(&(getUnicharset()));
     STRING pattern_list_path;
     if (has_user_patterns_file) {
       pattern_list_path = user_patterns_file;
     } else {
       pattern_list_path = getCCUtil()->language_data_path_prefix;
       pattern_list_path += user_patterns_suffix;
     }
     if (user_patterns->read_pattern_list(pattern_list_path.string(),
                                          getUnicharset())) {
       dawgs_ += user_patterns;
     } else {
       tprintf("Error: failed to load %s\n", pattern_list_path.string());
       delete user_patterns;
     }
   }

   // The document dictionary starts empty and is searched like any other dawg.
   document_words_ = new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM,
                              getUnicharset().size(), dawg_debug_level);
   dawgs_ += document_words_;

   // The pending dawg is temporary and deliberately left out of dawgs_, so
   // letter_is_okay never searches it.
   pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM,
                             getUnicharset().size(), dawg_debug_level);

   // Build successors_: entry i lists the indices (into dawgs_) of every dawg
   // of the same language that kDawgSuccessors allows to follow dawg i.
   const int num_dawgs = dawgs_.length();
   successors_.reserve(num_dawgs);
   for (int from = 0; from < num_dawgs; ++from) {
     const Dawg *source = dawgs_[from];
     SuccessorList *reachable = new SuccessorList();
     for (int to = 0; to < num_dawgs; ++to) {
       const Dawg *target = dawgs_[to];
       if (source == NULL || target == NULL) continue;
       if (!(source->lang() == target->lang())) continue;
       if (kDawgSuccessors[source->type()][target->type()]) *reachable += to;
     }
     successors_ += reachable;
   }
 }


 void Dict::End() {

   if (dawgs_.length() == 0)

     return;  // Not safe to call twice.

   for (int i = 0; i < dawgs_.size(); i++) {

     if (!dawg_cache_->FreeDawg(dawgs_[i])) {

       delete dawgs_[i];

     }

   }

   dawg_cache_->FreeDawg(bigram_dawg_);

   if (dawg_cache_is_ours_) {

     delete dawg_cache_;

     dawg_cache_ = NULL;

   }

   successors_.delete_data_pointers();

   dawgs_.clear();

   successors_.clear();

   document_words_ = NULL;

   if (pending_words_ != NULL) {

     delete pending_words_;

     pending_words_ = NULL;

   }

 }


 // Checks whether, in light of the current state, unichar_id is allowed
 // according to at least one of the dawgs in the dawgs_ vector.
 // See more extensive comments in dict.h where this function is declared.
 //   void_dawg_args: pointer to a DawgArgs. Its active_dawgs are read, its
 //                   updated_dawgs are cleared and refilled, and its permuter
 //                   is updated.
 //   unichar_id:     the character to be appended to the word so far.
 //   word_end:       true if unichar_id is the last character of the word.
 // Returns the resulting dawg_args->permuter, which is NO_PERM when the
 // character was not accepted by any active dawg.
 int Dict::def_letter_is_okay(void* void_dawg_args,
                              UNICHAR_ID unichar_id,
                              bool word_end) const {
   DawgArgs *dawg_args = reinterpret_cast<DawgArgs*>(void_dawg_args);

   if (dawg_debug_level >= 3) {
     tprintf("def_letter_is_okay: current unichar=%s word_end=%d"
             " num active dawgs=%d\n",
             getUnicharset().debug_str(unichar_id).string(), word_end,
             dawg_args->active_dawgs->length());
   }

   // Do not accept words that contain kPatternUnicharID.
   // (otherwise pattern dawgs would not function correctly).
   // Do not accept words containing INVALID_UNICHAR_IDs.
   if (unichar_id == Dawg::kPatternUnicharID ||
       unichar_id == INVALID_UNICHAR_ID) {
     dawg_args->permuter = NO_PERM;
     return NO_PERM;
   }

   // Initialization.
   PermuterType curr_perm = NO_PERM;
   dawg_args->updated_dawgs->clear();

   // Go over the active_dawgs vector and insert DawgPosition records
   // with the updated ref (an edge with the corresponding unichar id) into
   // dawg_args->updated_dawgs.
   for (int a = 0; a < dawg_args->active_dawgs->length(); ++a) {
     const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
     // A negative index means "no dawg of that kind chosen for this position".
     const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : NULL;
     const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : NULL;

     if (!dawg && !punc_dawg) {
       // shouldn't happen.
       tprintf("Received DawgPosition with no dawg or punc_dawg.  wth?\n");
       continue;
     }
     if (!dawg) {
       // We're in the punctuation dawg.  A core dawg has not been chosen.
       NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
       // A pattern edge in the punctuation dawg marks the place where the
       // core word may start.
       EDGE_REF punc_transition_edge = punc_dawg->edge_char_of(
           punc_node, Dawg::kPatternUnicharID, word_end);
       if (punc_transition_edge != NO_EDGE) {
         // Find all successors, and see which can transition.
         const SuccessorList &slist = *(successors_[pos.punc_index]);
         for (int s = 0; s < slist.length(); ++s) {
           int sdawg_index = slist[s];
           const Dawg *sdawg = dawgs_[sdawg_index];
           UNICHAR_ID ch = char_for_dawg(unichar_id, sdawg);
           EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
           if (dawg_edge != NO_EDGE) {
             if (dawg_debug_level >=3) {
               tprintf("Letter found in dawg %d\n", sdawg_index);
             }
             dawg_args->updated_dawgs->add_unique(
                 DawgPosition(sdawg_index, dawg_edge,
                              pos.punc_index, punc_transition_edge, false),
                 dawg_debug_level > 0,
                 "Append transition from punc dawg to current dawgs: ");
             if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter();
           }
         }
       }
       // The character may also simply continue the punctuation pattern.
       EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id,
                                                    word_end);
       if (punc_edge != NO_EDGE) {
         if (dawg_debug_level >=3) {
           tprintf("Letter found in punctuation dawg\n");
         }
         dawg_args->updated_dawgs->add_unique(
             DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false),
             dawg_debug_level > 0,
             "Extend punctuation dawg: ");
         if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM;
       }
       continue;
     }

     if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
       // We can end the main word here.
       //  If we can continue on the punc ref, add that possibility.
       NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
       EDGE_REF punc_edge = punc_node == NO_EDGE ? NO_EDGE
           : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
       if (punc_edge != NO_EDGE) {
         dawg_args->updated_dawgs->add_unique(
             DawgPosition(pos.dawg_index, pos.dawg_ref,
                          pos.punc_index, punc_edge, true),
             dawg_debug_level > 0,
             "Return to punctuation dawg: ");
         if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
       }
     }

     // Positions that already went back to the punctuation dawg do not extend
     // the core word any further.
     if (pos.back_to_punc) continue;

     // If we are dealing with the pattern dawg, look up all the
     // possible edges, not only for the exact unichar_id, but also
     // for all its character classes (alpha, digit, etc).
     if (dawg->type() == DAWG_TYPE_PATTERN) {
       ProcessPatternEdges(dawg, pos, unichar_id, word_end,
                           dawg_args->updated_dawgs, &curr_perm);
       // There can't be any successors to dawg that is of type
       // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
       continue;
     }

     // Find the edge out of the node for the unichar_id.
     NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
     EDGE_REF edge = (node == NO_EDGE) ? NO_EDGE
         : dawg->edge_char_of(node, char_for_dawg(unichar_id, dawg), word_end);

     if (dawg_debug_level >= 3) {
       tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",
               pos.dawg_index, node, edge);
     }

     if (edge != NO_EDGE) {  // the unichar was found in the current dawg
       if (dawg_debug_level >=3) {
         tprintf("Letter found in dawg %d\n", pos.dawg_index);
       }
       // At the end of the word the punctuation pattern must be complete too.
       if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
         if (dawg_debug_level >= 3) {
           tprintf("Punctuation constraint not satisfied at end of word.\n");
         }
         continue;
       }
       if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter();
       dawg_args->updated_dawgs->add_unique(
           DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,
                        false),
           dawg_debug_level > 0,
           "Append current dawg to updated active dawgs: ");
     }
   }  // end for
   // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
   // or if we found the current letter in a non-punctuation dawg. This
   // allows preserving information on which dawg the "core" word came from.
   // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
   if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
       (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
     dawg_args->permuter = curr_perm;
   }
   if (dawg_debug_level >= 2) {
     // Fixed: the %d conversion previously had no matching argument.
     tprintf("Returning %d for permuter code for this character.\n",
             dawg_args->permuter);
   }
   return dawg_args->permuter;
 }


 void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos,

                                UNICHAR_ID unichar_id, bool word_end,

                                DawgPositionVector *updated_dawgs,

                                PermuterType *curr_perm) const {

   NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);

   // Try to find the edge corresponding to the exact unichar_id and to all the

   // edges corresponding to the character class of unichar_id.

   GenericVector<UNICHAR_ID> unichar_id_patterns;

   unichar_id_patterns.push_back(unichar_id);

   dawg->unichar_id_to_patterns(unichar_id, getUnicharset(),

                                &unichar_id_patterns);

   for (int i = 0; i < unichar_id_patterns.size(); ++i) {

     // On the first iteration check all the outgoing edges.

     // On the second iteration check all self-loops.

     for (int k = 0; k < 2; ++k) {

       EDGE_REF edge = (k == 0)

       ? dawg->edge_char_of(node, unichar_id_patterns[i], word_end)

       : dawg->pattern_loop_edge(pos.dawg_ref, unichar_id_patterns[i], word_end);

       if (edge == NO_EDGE) continue;

       if (dawg_debug_level >= 3) {

         tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n",

                 pos.dawg_index, node, edge);

         tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);

       }

       if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter();

       updated_dawgs->add_unique(

           DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref,

                        pos.back_to_punc),

           dawg_debug_level > 0,

           "Append current dawg to updated active dawgs: ");

     }

   }

 }


 // Fills active_dawgs with the positions from which a new word may start.
 // While hyphenated() is true the positions saved in hyphen_active_dawgs_ are
 // copied instead; otherwise default_dawgs() provides them, with ambigs_mode
 // forwarded as its second argument.
 void Dict::init_active_dawgs(DawgPositionVector *active_dawgs,
                              bool ambigs_mode) const {
   if (!hyphenated()) {
     default_dawgs(active_dawgs, ambigs_mode);
     return;
   }

   *active_dawgs = hyphen_active_dawgs_;
   if (dawg_debug_level < 3) return;

   // Verbose debugging: list every position that was carried over.
   for (int d = 0; d < hyphen_active_dawgs_.size(); ++d) {
     tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
             hyphen_active_dawgs_[d].dawg_index,
             hyphen_active_dawgs_[d].dawg_ref);
   }
 }


 // Appends to dawg_pos_vec a starting DawgPosition for each loaded dawg.
 //   suppress_patterns: when true, dawgs of type DAWG_TYPE_PATTERN are
 //                      skipped.
 // Punctuation dawgs start with only the punctuation index set. Any other
 // dawg gets its own start position unless a usable punctuation dawg exists
 // and kDawgSuccessors lists that dawg type as a successor of punctuation.
 void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec,
                          bool suppress_patterns) const {
   // The punctuation dawg is usable only if it has a pattern edge at its root.
   bool punc_dawg_available = false;
   if (punc_dawg_ != NULL) {
     punc_dawg_available =
         punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;
   }

   const int num_dawgs = dawgs_.length();
   for (int d = 0; d < num_dawgs; ++d) {
     const Dawg *candidate = dawgs_[d];
     if (candidate == NULL) continue;

     const int dawg_ty = candidate->type();
     if (suppress_patterns && dawg_ty == DAWG_TYPE_PATTERN) continue;

     if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
       *dawg_pos_vec += DawgPosition(-1, NO_EDGE, d, NO_EDGE, false);
       if (dawg_debug_level >= 3) {
         tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", d,
                 NO_EDGE);
       }
       continue;
     }

     const bool subsumed_by_punc =
         kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
     if (punc_dawg_available && subsumed_by_punc) continue;

     *dawg_pos_vec += DawgPosition(d, NO_EDGE, -1, NO_EDGE, false);
     if (dawg_debug_level >= 3) {
       tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", d, NO_EDGE);
     }
   }
 }


 void Dict::add_document_word(const WERD_CHOICE &best_choice) {

   // Do not add hyphenated word parts to the document dawg.

   // hyphen_word_ will be non-NULL after the set_hyphen_word() is

   // called when the first part of the hyphenated word is

   // discovered and while the second part of the word is recognized.

   // hyphen_word_ is cleared in cc_recg() before the next word on

   // the line is recognized.

   if (hyphen_word_) return;


   char filename[CHARS_PER_LINE];

   FILE *doc_word_file;

   int stringlen = best_choice.length();


   if (valid_word(best_choice) || stringlen < 2)

     return;


   // Discard words that contain >= kDocDictMaxRepChars repeating unichars.

   if (best_choice.length() >= kDocDictMaxRepChars) {

     int num_rep_chars = 1;

     UNICHAR_ID uch_id = best_choice.unichar_id(0);

     for (int i = 1; i < best_choice.length(); ++i) {

       if (best_choice.unichar_id(i) != uch_id) {

         num_rep_chars = 1;

         uch_id = best_choice.unichar_id(i);

       } else {

         ++num_rep_chars;

         if (num_rep_chars == kDocDictMaxRepChars) return;

       }

     }

   }


   if (best_choice.certainty() < doc_dict_certainty_threshold ||

       stringlen == 2) {

     if (best_choice.certainty() < doc_dict_pending_threshold)

       return;


     if (!pending_words_->word_in_dawg(best_choice)) {

       if (stringlen > 2 ||

           (stringlen == 2 &&

            getUnicharset().get_isupper(best_choice.unichar_id(0)) &&

            getUnicharset().get_isupper(best_choice.unichar_id(1)))) {

         pending_words_->add_word_to_dawg(best_choice);

       }

       return;

     }

   }


   if (save_doc_words) {

     strcpy(filename, getCCUtil()->imagefile.string());

     strcat(filename, ".doc");

     doc_word_file = open_file (filename, "a");

     fprintf(doc_word_file, "%s\n",

             best_choice.debug_string().string());

     fclose(doc_word_file);

   }

   document_words_->add_word_to_dawg(best_choice);

 }


 // Computes the adjust factor for word and, optionally, its adjusted rating.
 //   word:                the choice to adjust; its adjust factor is always
 //                        stored, and its permuter is set to FREQ_DAWG_PERM
 //                        when it is found in the frequent-word dawg.
 //   nonword:             true if the word is not a dictionary word.
 //   xheight_consistency: adds an x-height penalty for words longer than one
 //                        unichar.
 //   additional_adjust:   starting value of the adjust factor.
 //   modify_rating:       when true the new rating is written back to word.
 //   debug:               when true a one-line trace is printed via tprintf.
 // The rating is padded with kRatingPad before it is multiplied by the adjust
 // factor, and the pad is subtracted again afterwards.
 void Dict::adjust_word(WERD_CHOICE *word,
                        bool nonword,
                        XHeightConsistencyEnum xheight_consistency,
                        float additional_adjust,
                        bool modify_rating,
                        bool debug) {
   // Han words are exempt from the case and punctuation checks.
   const int han_script_id = getUnicharset().han_sid();
   const bool is_han = han_script_id != getUnicharset().null_sid() &&
                       word->GetTopScriptID() == han_script_id;
   const bool case_is_ok = is_han || case_ok(*word, getUnicharset());
   const bool punc_is_ok = is_han || !nonword || valid_punctuation(*word);

   float adjust_factor = additional_adjust;
   float new_rating = word->rating();
   new_rating += kRatingPad;

   // x-height and y-offset consistency penalties.
   const char *xheight_triggered = "";
   if (word->length() > 1) {
     if (xheight_consistency == XH_INCONSISTENT) {
       adjust_factor += xheight_penalty_inconsistent;
       xheight_triggered = ", xhtBAD";
     } else if (xheight_consistency == XH_SUBNORMAL) {
       adjust_factor += xheight_penalty_subscripts;
       xheight_triggered = ", xhtSUB";
     }
     // XH_GOOD: nothing to add.
     // TODO(eger): if nonword is true, but there is a "core" that's a dict
     // word, negate nonword status.
   } else if (debug) {
     tprintf("Consistency could not be calculated.\n");
   }
   if (debug) {
     tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "",
             word->unichar_string().string(), word->rating(),
             xheight_triggered);
   }

   // Choose the segmentation penalty for this kind of word; it is applied
   // once, after the selection below.
   double segment_penalty;
   if (nonword) {
     if (case_is_ok && punc_is_ok) {
       // Plausible looking non-dictionary word.
       segment_penalty = segment_penalty_dict_nonword;
       if (debug) tprintf(", W");
     } else {
       // Garbage: bad case and/or bad punctuation.
       segment_penalty = segment_penalty_garbage;
       if (debug) {
         if (!case_is_ok) tprintf(", C");
         if (!punc_is_ok) tprintf(", P");
       }
     }
   } else if (!case_is_ok) {
     // Dictionary word with case problems.
     segment_penalty = segment_penalty_dict_case_bad;
     if (debug) tprintf(", C");
   } else if (!is_han && freq_dawg_ != NULL &&
              freq_dawg_->word_in_dawg(*word)) {
     // Well-cased, frequent dictionary word.
     word->set_permuter(FREQ_DAWG_PERM);
     segment_penalty = segment_penalty_dict_frequent_word;
     if (debug) tprintf(", F");
   } else {
     // Well-cased dictionary word.
     segment_penalty = segment_penalty_dict_case_ok;
     if (debug) tprintf(", ");
   }
   adjust_factor += segment_penalty;
   new_rating *= adjust_factor;
   new_rating -= kRatingPad;

   if (modify_rating) {
     word->set_rating(new_rating);
   }
   if (debug) {
     tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
   }
   word->set_adjust_factor(adjust_factor);
 }


 int Dict::valid_word(const WERD_CHOICE &word, bool numbers_ok) const {

   const WERD_CHOICE *word_ptr = &word;

   WERD_CHOICE temp_word(word.unicharset());

   if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {

     copy_hyphen_info(&temp_word);

     temp_word += word;

     word_ptr = &temp_word;

   }

   if (word_ptr->length() == 0) return NO_PERM;

   // Allocate vectors for holding current and updated

   // active_dawgs and initialize them.

   DawgPositionVector *active_dawgs = new DawgPositionVector[2];

   init_active_dawgs(&(active_dawgs[0]), false);

   DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);

   int last_index = word_ptr->length() - 1;

   // Call leter_is_okay for each letter in the word.

   for (int i = hyphen_base_size(); i <= last_index; ++i) {

     if (!((this->*letter_is_okay_)(&dawg_args, word_ptr->unichar_id(i),

                                    i == last_index))) break;

     // Swap active_dawgs, constraints with the corresponding updated vector.

     if (dawg_args.updated_dawgs == &(active_dawgs[1])) {

       dawg_args.updated_dawgs = &(active_dawgs[0]);

       ++(dawg_args.active_dawgs);

     } else {

       ++(dawg_args.updated_dawgs);

       dawg_args.active_dawgs = &(active_dawgs[0]);

     }

   }

   delete[] active_dawgs;

   return valid_word_permuter(dawg_args.permuter, numbers_ok) ?

     dawg_args.permuter : NO_PERM;

 }


 bool Dict::valid_bigram(const WERD_CHOICE &word1,

                         const WERD_CHOICE &word2) const {

   if (bigram_dawg_ == NULL) return false;


   // Extract the core word from the middle of each word with any digits

   //         replaced with question marks.

   int w1start, w1end, w2start, w2end;

   word1.punct_stripped(&w1start, &w1end);

   word2.punct_stripped(&w2start, &w2end);


   // We don't want to penalize a single guillemet, hyphen, etc.

   // But our bigram list doesn't have any information about punctuation.

   if (w1start >= w1end) return word1.length() < 3;

   if (w2start >= w2end) return word2.length() < 3;


   const UNICHARSET& uchset = getUnicharset();

   GenericVector<UNICHAR_ID> bigram_string;

   bigram_string.reserve(w1end + w2end + 1);

   for (int i = w1start; i < w1end; i++) {

     const GenericVector<UNICHAR_ID>& normed_ids =

         getUnicharset().normed_ids(word1.unichar_id(i));

     if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))

       bigram_string.push_back(question_unichar_id_);

     else

       bigram_string += normed_ids;

   }

   bigram_string.push_back(UNICHAR_SPACE);

   for (int i = w2start; i < w2end; i++) {

     const GenericVector<UNICHAR_ID>& normed_ids =

         getUnicharset().normed_ids(word2.unichar_id(i));

     if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))

       bigram_string.push_back(question_unichar_id_);

     else

       bigram_string += normed_ids;

   }

   WERD_CHOICE normalized_word(&uchset, bigram_string.size());

   for (int i = 0; i < bigram_string.size(); ++i) {

     normalized_word.append_unichar_id_space_allocated(bigram_string[i], 1,

                                                       0.0f, 0.0f);

   }

   return bigram_dawg_->word_in_dawg(normalized_word);

 }


 // Returns true if the punctuation pattern of word is accepted by one of the
 // loaded punctuation dawgs.
 // The pattern is built by keeping every punctuation unichar and collapsing
 // each run of alphabetic/digit unichars into a single
 // Dawg::kPatternUnicharID. A word containing a unichar that is neither
 // punctuation, alphabetic nor a digit is rejected, as is an empty word.
 bool Dict::valid_punctuation(const WERD_CHOICE &word) {
   // Fixed: this bool function used to return the PermuterType NO_PERM here.
   if (word.length() == 0) return false;

   WERD_CHOICE new_word(word.unicharset());
   const int last_index = word.length() - 1;
   for (int i = 0; i <= last_index; ++i) {
     UNICHAR_ID unichar_id = (word.unichar_id(i));
     if (getUnicharset().get_ispunctuation(unichar_id)) {
       new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
     } else if (!getUnicharset().get_isalpha(unichar_id) &&
                !getUnicharset().get_isdigit(unichar_id)) {
       return false;  // neither punc, nor alpha, nor digit
     } else {
       // Alphabetic or digit: add one pattern marker per run, i.e. only if
       // the pattern does not already end with one.
       const int new_len = new_word.length();
       if (new_len == 0 ||
           new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
         new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
       }
     }
   }
   for (int i = 0; i < dawgs_.size(); ++i) {
     if (dawgs_[i] != NULL &&
         dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
         dawgs_[i]->word_in_dawg(new_word)) return true;
   }
   return false;
 }


 }  // namespace tesseract

tesseract::Dict::Dict
Dict(CCUtil *image_ptr)
Definition: dict.cpp:33

tesseract::Dawg::lang
const STRING & lang() const
Definition: dawg.h:128

tesseract::DawgArgs::active_dawgs
DawgPositionVector * active_dawgs
Definition: dict.h:81

COMPOUND_PERM
Definition: ratngs.h:253

STRING_MEMBER
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:307

GenericVector::size
int size() const
Definition: genericvector.h:72

PUNC_PERM
Definition: ratngs.h:242

WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:324

UNICHARSET::unichar_to_id
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

GenericVector::length
int length() const
Definition: genericvector.h:79

tesseract::DawgCache
Definition: dawg_cache.h:30

unicodes.h

tesseract::Dawg::end_of_word
virtual bool end_of_word(EDGE_REF edge_ref) const =0

tesseract::Dict::getCCUtil
const CCUtil * getCCUtil() const
Definition: dict.h:90

tesseract::DawgPosition::dawg_ref
EDGE_REF dawg_ref
Definition: dawg.h:362

WERD_CHOICE::punct_stripped
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:361

WERD_CHOICE::length
int length() const
Definition: ratngs.h:300

tesseract::DawgPosition::punc_index
inT8 punc_index
Definition: dawg.h:363

GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:737

USER_DAWG_PERM
Definition: ratngs.h:251

WERD_CHOICE::append_unichar_id_space_allocated
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:449

tesseract::Dict::load_bigram_dawg
bool load_bigram_dawg
Definition: dict.h:561

tesseract::Dict::valid_word
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:705

tesseract::DawgPositionVector
Definition: dawg.h:369

UNICHARSET::null_sid
int null_sid() const
Definition: unicharset.h:831

tprintf
#define tprintf(...)
Definition: tprintf.h:31

UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463

tesseract::Dict::ProcessPatternEdges
void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgPositionVector *updated_dawgs, PermuterType *current_permuter) const
Definition: dict.cpp:486

UNICHARSET::han_sid
int han_sid() const
Definition: unicharset.h:836

PermuterType
PermuterType
Definition: ratngs.h:240

tesseract::Dict::segment_penalty_dict_case_ok
double segment_penalty_dict_case_ok
Definition: dict.h:574

WERD_CHOICE::set_permuter
void set_permuter(uinT8 perm)
Definition: ratngs.h:372

tesseract::XHeightConsistencyEnum
XHeightConsistencyEnum
Definition: dict.h:75

tesseract::TESSDATA_NUMBER_DAWG
Definition: tessdatamanager.h:62

NO_PERM
Definition: ratngs.h:241

tesseract::Dict
Definition: dict.h:86

tesseract::TessdataManager
Definition: tessdatamanager.h:133

tesseract::Dict::def_letter_is_okay
int def_letter_is_okay(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.cpp:336

tesseract::Dict::xheight_penalty_inconsistent
double xheight_penalty_inconsistent
Definition: dict.h:567

tesseract::Trie
Definition: trie.h:62

tesseract::Dict::init_active_dawgs
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:523

USER_PATTERN_PERM
Definition: ratngs.h:248

BOOL_MEMBER
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:304

tesseract::Dict::segment_penalty_dict_case_bad
double segment_penalty_dict_case_bad
Definition: dict.h:578

tesseract::TESSDATA_FREQ_DAWG
Definition: tessdatamanager.h:63

tesseract::CCUtil::tessdata_manager
TessdataManager tessdata_manager
Definition: ccutil.h:71

UNICHARSET::normed_ids
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:783

tesseract::TESSDATA_UNAMBIG_DAWG
Definition: tessdatamanager.h:69

WERD_CHOICE
Definition: ratngs.h:271

DOC_DAWG_PERM
Definition: ratngs.h:250

tesseract::Dict::default_dawgs
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
Definition: dict.cpp:540

tesseract::DawgArgs
Definition: dict.h:77

tesseract::Dict::save_doc_words
bool save_doc_words
Definition: dict.h:628

tesseract::Dict::go_deeper_fxn_
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:203

dict.h

tprintf.h

tesseract::DAWG_TYPE_PUNCTUATION
Definition: dawg.h:72

UNICHAR_SPACE
Definition: unicharset.h:35

tesseract::Dict::user_patterns_file
char * user_patterns_file
Definition: dict.h:551

tesseract::TESSDATA_PUNC_DAWG
Definition: tessdatamanager.h:60

tesseract-c_api-demo.filename
string filename
Definition: tesseract-c_api-demo.py:29

tesseract::DawgPosition::punc_ref
EDGE_REF punc_ref
Definition: dawg.h:364

WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:524

tesseract::Dawg::kPatternUnicharID
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:125

WERD_CHOICE::GetTopScriptID
int GetTopScriptID() const
Definition: ratngs.cpp:653

tesseract::Dawg::word_in_dawg
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:70

tesseract::Dict::End
void End()
Definition: dict.cpp:310

tesseract::Trie::add_word_to_dawg
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:178

tesseract::Dawg::edge_char_of
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.

WERD_CHOICE::unicharset
const UNICHARSET * unicharset() const
Definition: ratngs.h:297

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470

tesseract::XH_INCONSISTENT
Definition: dict.h:75

tesseract::Dict::doc_dict_pending_threshold
double doc_dict_pending_threshold
Definition: dict.h:630

BOOL_INIT_MEMBER
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:316

tesseract::Dict::GlobalDawgCache
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:186

tesseract::Dict::copy_hyphen_info
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:135

tesseract::Dict::valid_word_permuter
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:447

tesseract::Dict::valid_punctuation
bool valid_punctuation(const WERD_CHOICE &word)
Definition: dict.cpp:781

tesseract::Dict::case_ok
int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:58

tesseract::Dict::xheight_penalty_subscripts
double xheight_penalty_subscripts
Definition: dict.h:564

tesseract::DawgCache::GetSquishedDawg
Dawg * GetSquishedDawg(const STRING &lang, const char *data_file_name, TessdataType tessdata_dawg_type, int debug_level)
Definition: dawg_cache.cpp:47

WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:327

tesseract::Dict::hyphen_base_size
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:129

tesseract::Trie::initialize_patterns
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:352

tesseract::CCUtil
Definition: ccutil.h:54

WERD_CHOICE::unichar_id
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312

GenericVector::delete_data_pointers
void delete_data_pointers()
Definition: genericvector.h:827

name
name_table name
Definition: GlyphLessFont.c:308

tesseract::Dawg::unichar_id_to_patterns
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
Definition: dawg.h:184

tesseract::DawgPosition
Definition: dawg.h:342

FREQ_DAWG_PERM
Definition: ratngs.h:252

tesseract::XH_GOOD
Definition: dict.h:75

tesseract::Dict::char_for_dawg
UNICHAR_ID char_for_dawg(UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:422

tesseract::Dict::segment_penalty_dict_frequent_word
double segment_penalty_dict_frequent_word
Definition: dict.h:570

WERD_CHOICE::debug_string
const STRING debug_string() const
Definition: ratngs.h:502

tesseract::Dict::hyphenated
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:125

INT_MEMBER
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:301

tesseract::TessdataManager::GetDataFileName
const STRING & GetDataFileName() const
Definition: tessdatamanager.h:152

tesseract::Dict::user_patterns_suffix
char * user_patterns_suffix
Definition: dict.h:553

tesseract::Dict::load_system_dawg
bool load_system_dawg
Definition: dict.h:554

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:33

tesseract::DAWG_TYPE_PATTERN
Definition: dawg.h:75

tesseract::Dict::valid_bigram
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:738

tesseract::Dict::~Dict
~Dict()
Definition: dict.cpp:181

tesseract::Dict::doc_dict_certainty_threshold
double doc_dict_certainty_threshold
Definition: dict.h:632

tesseract::DawgPosition::dawg_index
inT8 dawg_index
Definition: dawg.h:361

UNICHARSET::get_ispunctuation
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477

tesseract::CCUtil::language_data_path_prefix
STRING language_data_path_prefix
Definition: ccutil.h:70

tesseract::Dawg
Definition: dawg.h:118

GenericVector::clear
void clear()
Definition: genericvector.h:806

tesseract::Dawg::pattern_loop_edge
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: dawg.h:191

tesseract::Dict::dawg_debug_level
int dawg_debug_level
Definition: dict.h:595

tesseract::Dict::load_unambig_dawg
bool load_unambig_dawg
Definition: dict.h:556

tesseract::DawgPosition::back_to_punc
bool back_to_punc
Definition: dawg.h:366

tesseract::Dict::load_freq_dawg
bool load_freq_dawg
Definition: dict.h:555

REFFORMAT
#define REFFORMAT
Definition: dawg.h:92

tesseract::Dict::Load
void Load(DawgCache *dawg_cache)
Definition: dict.cpp:194

tesseract::Trie::read_and_add_word_list
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:291

tesseract::SuccessorList
GenericVector< int > SuccessorList
Definition: dawg.h:68

tesseract::Dict::letter_is_okay_
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:347

tesseract::Dict::user_words_suffix
char * user_words_suffix
Definition: dict.h:549

tesseract
Definition: baseapi.cpp:83

tesseract::Dict::getUnicharset
const UNICHARSET & getUnicharset() const
Definition: dict.h:96

GenericVector::reserve
void reserve(int size)
Definition: genericvector.h:623

tesseract::Dawg::type
DawgType type() const
Definition: dawg.h:127

tesseract::Dict::segment_penalty_dict_nonword
double segment_penalty_dict_nonword
Definition: dict.h:586

STRING_INIT_MEMBER
#define STRING_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:319

tesseract-c_api-demo.lang
string lang
Definition: tesseract-c_api-demo.py:28

tesseract::TESSDATA_SYSTEM_DAWG
Definition: tessdatamanager.h:61

open_file
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82

UNICHARSET
Definition: unicharset.h:139

tesseract::Dict::segment_penalty_garbage
double segment_penalty_garbage
Definition: dict.h:591

tesseract::DawgPositionVector::clear
void clear()
Definition: dawg.h:381

tesseract::DawgPositionVector::add_unique
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
Definition: dawg.h:385

tesseract::CCUtil::lang
STRING lang
Definition: ccutil.h:69

CHARS_PER_LINE
#define CHARS_PER_LINE
Definition: cutil.h:57

double_MEMBER
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:310

tesseract::DawgArgs::updated_dawgs
DawgPositionVector * updated_dawgs
Definition: dict.h:82

EDGE_REF
inT64 EDGE_REF
Definition: dawg.h:54

tesseract::Dict::user_words_file
char * user_words_file
Definition: dict.h:547

STRING
Definition: strngs.h:44

NULL
#define NULL
Definition: host.h:144

tesseract::DAWG_TYPE_WORD
Definition: dawg.h:73

NODE_REF
inT64 NODE_REF
Definition: dawg.h:55

tesseract::Dict::add_document_word
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:567

GenericVector< int >

tesseract::TESSDATA_BIGRAM_DAWG
Definition: tessdatamanager.h:68

tesseract::XH_SUBNORMAL
Definition: dict.h:75

STRING::string
const char * string() const
Definition: strngs.cpp:193

tesseract::Dawg::permuter
PermuterType permuter() const
Definition: dawg.h:129

tesseract::DawgCache::FreeDawg
bool FreeDawg(Dawg *dawg)
Definition: dawg_cache.h:41

WERD_CHOICE::set_adjust_factor
void set_adjust_factor(float factor)
Definition: ratngs.h:306

ccutil
CCUtil ccutil
Definition: commontraining.cpp:53

tesseract::Trie::RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:66

tesseract::Dict::load_number_dawg
bool load_number_dawg
Definition: dict.h:559

tesseract::Dict::GetStartingNode
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:412

tesseract::Dict::adjust_word
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
Definition: dict.cpp:625

tesseract::Trie::read_pattern_list
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:409

WERD_CHOICE::set_rating
void set_rating(float new_val)
Definition: ratngs.h:366

tesseract::DawgArgs::permuter
PermuterType permuter
Definition: dict.h:83

tesseract::Dict::load_punc_dawg
bool load_punc_dawg
Definition: dict.h:558