tesseract-ocr.github.io/5.3.3/a00707_source.html

// File:        dict.h

// Description: dict class.

// Author:      Samuel Charron

//

// (C) Copyright 2006, Google Inc.

// Licensed under the Apache License, Version 2.0 (the "License");

// you may not use this file except in compliance with the License.

// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software

// distributed under the License is distributed on an "AS IS" BASIS,

// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

// See the License for the specific language governing permissions and

// limitations under the License.

//


#ifndef TESSERACT_DICT_DICT_H_

#define TESSERACT_DICT_DICT_H_


#ifdef HAVE_CONFIG_H

#  include "config_auto.h" // DISABLED_LEGACY_ENGINE

#endif


#ifndef DISABLED_LEGACY_ENGINE

#  include "ambigs.h"

#endif

#include "dawg.h"

#include "dawg_cache.h"

#include "ratngs.h"

#include "stopper.h"

#include "trie.h"

#include "unicharset.h"

#ifndef DISABLED_LEGACY_ENGINE

#  include "params_training_featdef.h"

#endif // ndef DISABLED_LEGACY_ENGINE


namespace tesseract {


// Forward declarations. Within this header both types are only named through
// pointers (MATRIX* in NoDangerousAmbig()/ReplaceAmbig(), WERD_RES* in
// AcceptableResult()), so their full definitions are not needed here.
class MATRIX;
class WERD_RES;


// Numeric limits / sentinels shared by the dictionary code.
// Every non-trivial replacement list is fully parenthesized so that the macro
// expands to a single primary expression and composes safely with adjacent
// operators (e.g. `sizeof MAX_WERD_LENGTH` or `a NO_RATING`-adjacent unary
// contexts). The values themselves are unchanged.
#define CHARS_PER_LINE 500
// 128, typed as int64_t.
#define MAX_WERD_LENGTH (static_cast<int64_t>(128))
// Sentinel value -1. NOTE(review): presumably means "no rating assigned" —
// confirm against the users of this macro.
#define NO_RATING (-1)


// Plain record describing the fragment state of one character position.
// Dict's permuter functions receive a pointer to the previous position's
// record (`prev_char_frag_info`), and fragment_state_okay() additionally takes
// a mutable record as its last parameter (presumably an output — confirm in
// permdawg.cpp).
// NOTE(review): no member has a default initializer, so a default-constructed
// instance holds indeterminate values until every field is assigned.
struct CHAR_FRAGMENT_INFO {
  UNICHAR_ID unichar_id;
  // NOTE(review): ownership is not visible here — presumably non-owning.
  const CHAR_FRAGMENT *fragment;
  int num_fragments;
  float rating;
  float certainty;
};


// Vector of raw Dawg pointers; the type of Dict::dawgs_.
// NOTE(review): the alias itself says nothing about ownership of the pointees.
using DawgVector = std::vector<Dawg *>;


//

// Constants

//

static constexpr int kRatingPad = 4;
// Upper bound on the number of wildcards allowed in a single word.
static constexpr int kDictMaxWildcards = 2;
// TODO(daria): If hyphens are different in different languages and can be
// inferred from training data we should load their values dynamically.
// Symbols whose UNICHAR_IDs Dict caches (see the *_unichar_id_ members).
static constexpr char kHyphenSymbol[] = "-";
static constexpr char kSlashSymbol[] = "/";
static constexpr char kQuestionSymbol[] = "?";
static constexpr char kApostropheSymbol[] = "'";
// Similarity matcher: scale and offset applied to certainties.
static constexpr float kSimCertaintyScale = -10.0f;
static constexpr float kSimCertaintyOffset = -10.0f;
// Worst E*L product to stop on.
static constexpr float kSimilarityFloor = 100.0f;
static constexpr int kDocDictMaxRepChars = 4;


// Enum for describing whether the x-height for the word is consistent:
//  0 - everything is good.
//  1 - there are one or two secondary (but consistent) baselines
//      [think subscript and superscript], or there is an oversized
//      first character.
//  2 - the word is inconsistent.
// The enumerators carry no explicit values, so they take 0, 1 and 2 in
// declaration order, matching the numbering above.
enum XHeightConsistencyEnum { XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT };


struct DawgArgs {

  DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)

      : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {}


  DawgPositionVector *active_dawgs;

  DawgPositionVector *updated_dawgs;

  PermuterType permuter;

  // True if the current position is a valid word end.

  bool valid_end;

};


class TESS_API Dict {

public:

  // Constructs a dictionary from a CCUtil pointer. Defined out of line.
  // NOTE(review): presumably image_ptr is stored in ccutil_ (what getCCUtil()
  // returns) — confirm in dict.cpp.
  // NOTE(review): this single-argument constructor is not `explicit`, so a
  // CCUtil* converts implicitly to a Dict.
  Dict(CCUtil *image_ptr);
  // Defined out of line.
  ~Dict();

  // Returns the CCUtil pointer held in ccutil_ (read-only view for const
  // dictionaries).
  const CCUtil *getCCUtil() const {
    return ccutil_;
  }
  // Mutable overload: returns the same ccutil_ pointer.
  CCUtil *getCCUtil() {
    return ccutil_;
  }

  // Read-only access to the unicharset stored in the associated CCUtil.
  const UNICHARSET &getUnicharset() const {
    return ccutil_->unicharset;
  }
  // Mutable access to the same unicharset.
  UNICHARSET &getUnicharset() {
    return ccutil_->unicharset;
  }

#ifndef DISABLED_LEGACY_ENGINE
  // Read-only access to the ambiguity tables stored in the associated CCUtil.
  // Only available when the legacy engine is compiled in.
  const UnicharAmbigs &getUnicharAmbigs() const {
    return ccutil_->unichar_ambigs;
  }
#endif

  // Returns true if unichar_id is a word compounding character like - or /.
  // The id must be contained in the dictionary's unicharset (enforced with
  // ASSERT_HOST). It counts as a compound marker when it normalizes to exactly
  // one id and that id equals the cached hyphen or slash id.
  // Declared const: the function only reads state, so it is also callable on
  // a const Dict (it resolves to the const getUnicharset() overload).
  inline bool compound_marker(UNICHAR_ID unichar_id) const {
    const UNICHARSET &unicharset = getUnicharset();
    ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
    const auto &normed_ids = unicharset.normed_ids(unichar_id);
    return normed_ids.size() == 1 &&
           (normed_ids[0] == hyphen_unichar_id_ || normed_ids[0] == slash_unichar_id_);
  }

  // Returns true if unichar_id is an apostrophe-like character that may
  // separate prefix/suffix words from a main body word.
  // The id must be contained in the dictionary's unicharset (enforced with
  // ASSERT_HOST). It matches when it normalizes to exactly one id and that id
  // equals the cached apostrophe id.
  // Declared const: the function only reads state, so it is also callable on
  // a const Dict (it resolves to the const getUnicharset() overload).
  inline bool is_apostrophe(UNICHAR_ID unichar_id) const {
    const UNICHARSET &unicharset = getUnicharset();
    ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
    const auto &normed_ids = unicharset.normed_ids(unichar_id);
    return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
  }


  /* hyphen.cpp ************************************************************/


  // Returns true if the beginning of a hyphenated word has been recorded,
  // i.e. hyphen_word_ is set and the current word is not flagged as the last
  // one on its line.
  inline bool hyphenated() const {
    return hyphen_word_ != nullptr && !last_word_on_line_;
  }

  inline int hyphen_base_size() const {

    return this->hyphenated() ? hyphen_word_->length() : 0;

  }

  // If a hyphenated word is being tracked, overwrites *word with the recorded
  // base word; otherwise leaves *word untouched. Prints the copied word when
  // hyphen_debug_level is non-zero.
  inline void copy_hyphen_info(WERD_CHOICE *word) const {
    if (!hyphenated()) {
      return;
    }
    *word = *hyphen_word_;
    if (hyphen_debug_level) {
      word->print("copy_hyphen_info: ");
    }
  }

  // Check whether the word has a hyphen at the end.
  // Always false when the character sits in the first position of the word or
  // when the current word is not the last one on its line. Otherwise
  // unichar_id (which must belong to *unicharset) matches if it normalizes to
  // exactly one id equal to the cached hyphen id.
  inline bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id,
                             bool first_pos) const {
    if (first_pos || !last_word_on_line_) {
      return false;
    }
    ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
    const auto &ids = unicharset->normed_ids(unichar_id);
    if (ids.size() != 1) {
      return false;
    }
    return ids[0] == hyphen_unichar_id_;
  }

  // Same check as the three-argument overload, applied to the last unichar of
  // `word`.
  // An empty word has no last unichar and therefore never ends in a hyphen.
  // The explicit guard matters: length() is unsigned, so `length() - 1` would
  // wrap for an empty word and unichar_id() would be evaluated with an
  // out-of-range index before the callee gets a chance to bail out.
  inline bool has_hyphen_end(const WERD_CHOICE &word) const {
    if (word.length() == 0) {
      return false;
    }
    const unsigned last_index = word.length() - 1;
    return has_hyphen_end(word.unicharset(), word.unichar_id(last_index), last_index == 0);
  }

  // Hyphen-state mutators, defined out of line (hyphen.cpp per the section
  // header).
  // NOTE(review): presumably reset_hyphen_vars() updates last_word_on_line_
  // and clears the recorded hyphen word, and set_hyphen_word() records `word`
  // and `active_dawgs` into hyphen_word_ / hyphen_active_dawgs_ — confirm in
  // hyphen.cpp.
  void reset_hyphen_vars(bool last_word_on_line);
  void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs);


  /* permdawg.cpp ************************************************************/

  // Note: Functions in permdawg.cpp are only used by NoDangerousAmbig().

  // When this function is refactored, permdawg.cpp can be removed.


  // Replaces *best_choice with a copy of `word` when word's rating is strictly
  // lower than the rating currently held by *best_choice; otherwise does
  // nothing.
  inline void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice) {
    const bool word_is_better = word.rating() < best_choice->rating();
    if (word_is_better) {
      *best_choice = word;
    }
  }

  // NOTE(review): presumably fills *active_dawgs with the starting positions
  // for a new word; the effect of ambigs_mode is not visible in this header.
  void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const;
  // Fill the given vector with the default collection of any-length dawgs
  void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const;

  // Defined out of line (permdawg.cpp per the section header).
  // NOTE(review): returns a raw WERD_CHOICE*; whether the caller owns the
  // result is not visible here — confirm before relying on it.
  WERD_CHOICE *dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices,
                                       float rating_limit);

  // Member function whose signature matches the go_deeper_fxn_ pointer below,
  // so it can be installed there. Defined out of line.
  // NOTE(review): void_more_args is type-erased; its real type is not visible
  // in this header.
  void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
                          int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                          bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
                          WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args);

  // Public pointer-to-member data member selecting the "go deeper" step.
  // NOTE(review): it has no default initializer here; presumably assigned
  // before use in the .cpp files — confirm.
  void (Dict::*go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
                               int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                               bool word_ending, WERD_CHOICE *word, float certainties[],
                               float *limit, WERD_CHOICE *best_choice, int *attempts_left,
                               void *void_more_args);

  //
  // Helper functions for dawg_permute_and_select().
  // All three are defined out of line.
  //
  void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
                       int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
                       WERD_CHOICE *word, float certainties[], float *limit,
                       WERD_CHOICE *best_choice, int *attempts_left, void *more_args);

  // Same parameter set as permute_choices() plus the single blob_choice being
  // considered.
  void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
                      const BLOB_CHOICE &blob_choice, int char_choice_index,
                      const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word,
                      float certainties[], float *limit, WERD_CHOICE *best_choice,
                      int *attempts_left, void *more_args);

  // NOTE(review): char_frag_info is the only mutable CHAR_FRAGMENT_INFO
  // parameter, presumably the output — confirm in permdawg.cpp.
  // NOTE(review): word_ending is declared `int` here but `bool` in
  // go_deeper_dawg_fxn(); changing it requires touching the definition too.
  bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty,
                           const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug,
                           int word_ending, CHAR_FRAGMENT_INFO *char_frag_info);


  /* stopper.cpp *************************************************************/

#if !defined(DISABLED_LEGACY_ENGINE)
  // Legacy-engine only; defined out of line (stopper.cpp per the section
  // header). According to the note in the permdawg section, this is the only
  // user of the permdawg.cpp functions.
  // NOTE(review): the meaning of the return value and of fixpt /
  // fix_replaceable is not visible in this header — see stopper.cpp.
  bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable,
                        MATRIX *ratings);
#endif // !defined(DISABLED_LEGACY_ENGINE)

  // Replaces the corresponding wrong ngram in werd_choice with the correct
  // one. The whole correct n-gram is inserted into the ratings matrix and
  // the werd_choice: no more fragments! Rating and certainty of new entries
  // in matrix and werd_choice are the sum and mean of the wrong ngram
  // respectively.
  // E.g. for werd_choice mystring'' and ambiguity ''->": werd_choice becomes
  // mystring", with a new entry in the ratings matrix for ".
  // Both werd_choice and ratings are modified in place.
  void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id,
                    WERD_CHOICE *werd_choice, MATRIX *ratings);


  // Remaining stopper declarations; all are defined out of line, so their
  // exact semantics are not visible in this header.
  int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const;
  int UniformCertainties(const WERD_CHOICE &word);
  bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency);
  bool AcceptableResult(WERD_RES *word) const;
#if !defined(DISABLED_LEGACY_ENGINE)
  // Legacy-engine only.
  void EndDangerousAmbigs();
#endif // !defined(DISABLED_LEGACY_ENGINE)
  void DebugWordChoices();
  // NOTE(review): "Settup" is the spelling of the public names; correcting it
  // would be an API change.
  void SettupStopperPass1();
  void SettupStopperPass2();

  /* context.cpp *************************************************************/

  // Defined out of line (context.cpp per the section header).
  // NOTE(review): case_ok() returns int rather than bool; the meaning of the
  // value is not visible here.
  int case_ok(const WERD_CHOICE &word) const;
  // NOTE(review): takes the unicharset explicitly instead of using
  // getUnicharset(); the reason is not visible in this header.
  bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);


  /* dict.cpp ****************************************************************/


  // Returns a DawgCache pointer; static, so no Dict instance is needed.
  // NOTE(review): presumably a process-wide shared cache — confirm in dict.cpp.
  static DawgCache *GlobalDawgCache();
  // Sets up ready for a Load or LoadLSTM.
  void SetupForLoad(DawgCache *dawg_cache);
  // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
  void Load(const std::string &lang, TessdataManager *data_file);
  // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
  void LoadLSTM(const std::string &lang, TessdataManager *data_file);
  // Completes the loading process after Load() and/or LoadLSTM().
  // Returns false if no dictionaries were loaded.
  bool FinishLoad();
  // NOTE(review): presumably releases what the load functions acquired —
  // confirm in dict.cpp.
  void End();


  // Resets the document dictionary analogous to ResetAdaptiveClassifier.

  void ResetDocumentDictionary() {

    if (pending_words_ != nullptr) {

      pending_words_->clear();

    }

    if (document_words_ != nullptr) {

      document_words_->clear();

    }

  }


  // Member function whose signature matches the letter_is_okay_ pointer
  // below. Defined out of line.
  // NOTE(review): the "def_" prefix suggests it is the default target of
  // letter_is_okay_, and void_dawg_args is presumably a DawgArgs* — confirm
  // both in dict.cpp.
  int def_letter_is_okay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id,
                         bool word_end) const;

  // Public pointer-to-member data member; LetterIsOkay() dispatches through
  // it.
  int (Dict::*letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset,
                               UNICHAR_ID unichar_id, bool word_end) const;

  // Calls letter_is_okay_ member function, forwarding every argument
  // unchanged and returning its result.
  int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id,
                   bool word_end) const {
    const auto checker = letter_is_okay_;
    return (this->*checker)(void_dawg_args, unicharset, unichar_id, word_end);
  }


  // Public pointer-to-member data member; ProbabilityInContext() dispatches
  // through it. def_probability_in_context() has a matching signature.
  double (Dict::*probability_in_context_)(const char *lang, const char *context, int context_bytes,
                                          const char *character, int character_bytes);

  // Calls probability_in_context_ member function. The language argument is
  // taken from the associated CCUtil; the remaining arguments are forwarded
  // unchanged.
  double ProbabilityInContext(const char *context, int context_bytes, const char *character,
                              int character_bytes) {
    const char *lang = getCCUtil()->lang.c_str();
    return (this->*probability_in_context_)(lang, context, context_bytes, character,
                                            character_bytes);
  }


  // Default (no-op) implementation of probability in context function:
  // ignores every argument and always returns 0.0.
  double def_probability_in_context(const char *lang, const char *context, int context_bytes,
                                    const char *character, int character_bytes) {
    // Mark the parameters as intentionally unused.
    static_cast<void>(lang);
    static_cast<void>(context);
    static_cast<void>(context_bytes);
    static_cast<void>(character);
    static_cast<void>(character_bytes);
    return 0.0;
  }


  // Stores `id` as the cached wildcard unichar id.
  inline void SetWildcardID(UNICHAR_ID id) {
    wildcard_unichar_id_ = id;
  }
  // Returns the cached wildcard unichar id.
  inline UNICHAR_ID WildcardID() const {
    return wildcard_unichar_id_;
  }

  // Return the number of dawgs in the dawgs_ vector (narrowed to int).
  inline int NumDawgs() const {
    return static_cast<int>(dawgs_.size());
  }

  // Return i-th dawg pointer recorded in the dawgs_ vector.
  // The index is not range-checked here; callers must keep it within
  // [0, NumDawgs()).
  inline const Dawg *GetDawg(int index) const {
    return dawgs_[index];
  }
  // Return the pointer to the punctuation dawg (punc_dawg_).
  inline const Dawg *GetPuncDawg() const {
    return punc_dawg_;
  }
  // Return the pointer to the unambiguous words dawg (unambig_dawg_).
  inline const Dawg *GetUnambigDawg() const {
    return unambig_dawg_;
  }

  // Returns the appropriate next node given the EDGE_REF.
  //  - edge_ref == NO_EDGE: node 0, i.e. the search starts at the beginning of
  //    the dawg (dawg is not dereferenced in this case).
  //  - otherwise: dawg->next_node(edge_ref), except that a result of 0 is
  //    reported as NO_EDGE to signal the end of a word.
  static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
    if (edge_ref == NO_EDGE) {
      return 0;
    }
    const NODE_REF next = dawg->next_node(edge_ref);
    if (next != 0) {
      return next;
    }
    return NO_EDGE;
  }


  // Maps a unichar taken from a string to the unichar that should be matched
  // in the given dawg. In a number dawg every digit is replaced by
  // Dawg::kPatternUnicharID; in every other case (including a null dawg) the
  // unichar is returned unchanged.
  UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const {
    if (dawg == nullptr || dawg->type() != DAWG_TYPE_NUMBER) {
      return ch;
    }
    if (unicharset.get_isdigit(ch)) {
      return Dawg::kPatternUnicharID;
    }
    return ch;
  }


  // Defined out of line.
  // NOTE(review): dawg_args and current_permuter are the only mutable
  // parameters, presumably the outputs — confirm in dict.cpp.
  void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id,
                           bool word_end, DawgArgs *dawg_args,
                           PermuterType *current_permuter) const;


  // Returns true if `perm` is one of the dictionary permuters (system, freq,
  // doc, user, user-pattern or compound). NUMBER_PERM is accepted only when
  // numbers_ok is set; every other value yields false.
  inline static bool valid_word_permuter(uint8_t perm, bool numbers_ok) {
    switch (perm) {
      case SYSTEM_DAWG_PERM:
      case FREQ_DAWG_PERM:
      case DOC_DAWG_PERM:
      case USER_DAWG_PERM:
      case USER_PATTERN_PERM:
      case COMPOUND_PERM:
        return true;
      case NUMBER_PERM:
        return numbers_ok;
      default:
        return false;
    }
  }

  // Core lookup, defined out of line. The two inline overloads below forward
  // to it with numbers_ok fixed to false / true respectively.
  // NOTE(review): the int result appears to be a permuter code (see the
  // NO_PERM / NUMBER_PERM remarks below) — confirm in dict.cpp.
  int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
  // Word-only lookup: numbers are not accepted.
  int valid_word(const WERD_CHOICE &word) const {
    return valid_word(word, false); // return NO_PERM for words with digits
  }
  // Lookup that also accepts numbers.
  int valid_word_or_number(const WERD_CHOICE &word) const {
    return valid_word(word, true); // return NUMBER_PERM for valid numbers
  }

  int valid_word(const char *string) const {

    WERD_CHOICE word(string, getUnicharset());

    return valid_word(word);

  }

  // Do the two WERD_CHOICEs form a meaningful bigram?
  bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
  // Defined out of line.
  bool valid_punctuation(const WERD_CHOICE &word);
  // Returns true if a good answer is found for the unknown blob rating.
  // NOTE(review): declared as int although it is described as a true/false
  // answer.
  int good_choice(const WERD_CHOICE &choice);
  // NOTE(review): presumably feeds the document dictionary (document_words_ /
  // pending_words_) — confirm in dict.cpp.
  void add_document_word(const WERD_CHOICE &best_choice);
  // Defined out of line; `word` is modified in place.
  // NOTE(review): the precise adjustment formula is not visible here.
  void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency,
                   float additional_adjust, bool modify_rating, bool debug);
  // Set wordseg_rating_adjust_factor_ to the given value.
  inline void SetWordsegRatingAdjustFactor(float f) {
    wordseg_rating_adjust_factor_ = f;
  }
  // Defined out of line.
  bool IsSpaceDelimitedLang() const;


private:

  // NOTE(review): apart from the two ambiguity tables, none of the members
  // below has an in-class initializer; presumably they are all set by the
  // constructor in dict.cpp — confirm.

  // Pointer returned by getCCUtil(); also the source of the unicharset.
  CCUtil *ccutil_;
#ifndef DISABLED_LEGACY_ENGINE
  // Legacy-engine only ambiguity tables.
  UnicharAmbigs *dang_ambigs_table_ = nullptr;
  UnicharAmbigs *replace_ambigs_table_ = nullptr;
#endif
  float reject_offset_;
  // Cached UNICHAR_IDs:
  UNICHAR_ID wildcard_unichar_id_;   // kDictWildcard.
  UNICHAR_ID apostrophe_unichar_id_; // kApostropheSymbol.
  UNICHAR_ID question_unichar_id_;   // kQuestionSymbol.
  UNICHAR_ID slash_unichar_id_;      // kSlashSymbol.
  UNICHAR_ID hyphen_unichar_id_;     // kHyphenSymbol.
  // Hyphen-related variables.
  // hyphen_word_ and last_word_on_line_ together drive hyphenated().
  WERD_CHOICE *hyphen_word_;
  DawgPositionVector hyphen_active_dawgs_;
  bool last_word_on_line_;
  // List of lists of "equivalent" UNICHAR_IDs for the purposes of dictionary
  // matching.  The first member of each list is taken as canonical.  For
  // example, the first list contains hyphens and dashes with the first symbol
  // being the ASCII hyphen minus.
  std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_;
  // Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
  DawgCache *dawg_cache_;
  bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
  // Dawgs.
  // dawgs_ is what NumDawgs()/GetDawg() expose.
  DawgVector dawgs_;
  SuccessorListsVector successors_;
  // Cleared (when non-null) by ResetDocumentDictionary().
  Trie *pending_words_;
  // bigram_dawg_ points to a dawg of two-word bigrams which always supersede if
  // any of them are present on the best choices list for a word pair.
  // the bigrams are stored as space-separated words where:
  // (1) leading and trailing punctuation has been removed from each word and
  // (2) any digits have been replaced with '?' marks.
  Dawg *bigram_dawg_;
  // TODO(daria): need to support multiple languages in the future,
  // so maybe will need to maintain a list of dawgs of each kind.
  Dawg *freq_dawg_;
  Dawg *unambig_dawg_; // returned by GetUnambigDawg()
  Dawg *punc_dawg_;    // returned by GetPuncDawg()
  // Cleared (when non-null) by ResetDocumentDictionary().
  Trie *document_words_;
  // Written by SetWordsegRatingAdjustFactor().
  float wordseg_rating_adjust_factor_;
  // File for recording ambiguities discovered during dictionary search.
  FILE *output_ambig_words_file_;


public:

  // Public tunable parameters, declared through the STRING_/BOOL_/INT_/
  // double_VAR_H macros. The macro definitions are not part of this header.
  // NOTE(review): presumably each expands to a typed parameter member whose
  // default value and help text live in dict.cpp — confirm.

  // User word / pattern list locations.
  STRING_VAR_H(user_words_file);
  STRING_VAR_H(user_words_suffix);
  STRING_VAR_H(user_patterns_file);
  STRING_VAR_H(user_patterns_suffix);
  // Switches for loading the individual dawgs.
  BOOL_VAR_H(load_system_dawg);
  BOOL_VAR_H(load_freq_dawg);
  BOOL_VAR_H(load_unambig_dawg);
  BOOL_VAR_H(load_punc_dawg);
  BOOL_VAR_H(load_number_dawg);
  BOOL_VAR_H(load_bigram_dawg);
  // Rating penalties.
  double_VAR_H(xheight_penalty_subscripts);
  double_VAR_H(xheight_penalty_inconsistent);
  double_VAR_H(segment_penalty_dict_frequent_word);
  double_VAR_H(segment_penalty_dict_case_ok);
  double_VAR_H(segment_penalty_dict_case_bad);
  double_VAR_H(segment_penalty_dict_nonword);
  double_VAR_H(segment_penalty_garbage);
  STRING_VAR_H(output_ambig_words_file);
  // Debug levels; hyphen_debug_level gates the print in copy_hyphen_info().
  INT_VAR_H(dawg_debug_level);
  INT_VAR_H(hyphen_debug_level);
  // NOTE(review): "uft8" is the spelling of the parameter name; renaming it
  // would change the identifier users refer to.
  BOOL_VAR_H(use_only_first_uft8_step);
  double_VAR_H(certainty_scale);
  // Stopper settings.
  double_VAR_H(stopper_nondict_certainty_base);
  double_VAR_H(stopper_phase2_certainty_rejection_offset);
  INT_VAR_H(stopper_smallword_size);
  double_VAR_H(stopper_certainty_per_char);
  double_VAR_H(stopper_allowable_character_badness);
  INT_VAR_H(stopper_debug_level);
  BOOL_VAR_H(stopper_no_acceptable_choices);
  INT_VAR_H(tessedit_truncate_wordchoice_log);
  STRING_VAR_H(word_to_debug);
  BOOL_VAR_H(segment_nonalphabetic_script);
  // Document dictionary settings.
  BOOL_VAR_H(save_doc_words);
  double_VAR_H(doc_dict_pending_threshold);
  double_VAR_H(doc_dict_certainty_threshold);
  INT_VAR_H(max_permuter_attempts);

};


} // namespace tesseract


#endif // TESSERACT_DICT_DICT_H_

unicharset.h

ambigs.h

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:54

ratngs.h

params_training_featdef.h

dawg_cache.h

trie.h

stopper.h

dawg.h

p
const char * p
Definition: gmock-matchers_test.cc:4030

ch
char ch
Definition: gmock-matchers_test.cc:4035

tesseract
Definition: baseapi.h:39

tesseract::DAWG_TYPE_NUMBER
@ DAWG_TYPE_NUMBER
Definition: dawg.h:67

tesseract::EDGE_REF
int64_t EDGE_REF
Definition: dawg.h:49

tesseract::XHeightConsistencyEnum
XHeightConsistencyEnum
Definition: dict.h:81

tesseract::XH_GOOD
@ XH_GOOD
Definition: dict.h:81

tesseract::XH_SUBNORMAL
@ XH_SUBNORMAL
Definition: dict.h:81

tesseract::XH_INCONSISTENT
@ XH_INCONSISTENT
Definition: dict.h:81

tesseract::SuccessorListsVector
std::vector< SuccessorList * > SuccessorListsVector
Definition: dawg.h:62

tesseract::NODE_REF
int64_t NODE_REF
Definition: dawg.h:50

tesseract::character
@ character
Definition: mfoutline.h:53

tesseract::DawgVector
std::vector< Dawg * > DawgVector
Definition: dict.h:57

tesseract::UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:34

tesseract::DANGERR
std::vector< DANGERR_INFO > DANGERR
Definition: stopper.h:47

tesseract::PermuterType
PermuterType
Definition: ratngs.h:235

tesseract::SYSTEM_DAWG_PERM
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:244

tesseract::NUMBER_PERM
@ NUMBER_PERM
Definition: ratngs.h:242

tesseract::COMPOUND_PERM
@ COMPOUND_PERM
Definition: ratngs.h:248

tesseract::USER_DAWG_PERM
@ USER_DAWG_PERM
Definition: ratngs.h:246

tesseract::USER_PATTERN_PERM
@ USER_PATTERN_PERM
Definition: ratngs.h:243

tesseract::DOC_DAWG_PERM
@ DOC_DAWG_PERM
Definition: ratngs.h:245

tesseract::FREQ_DAWG_PERM
@ FREQ_DAWG_PERM
Definition: ratngs.h:247

tesseract::BLOB_CHOICE_LIST_VECTOR
std::vector< BLOB_CHOICE_LIST * > BLOB_CHOICE_LIST_VECTOR
Definition: ratngs.h:627

tesseract::MATRIX
Definition: matrix.h:657

tesseract::WERD_RES
Definition: pageres.h:164

tesseract::BLOB_CHOICE
Definition: ratngs.h:56

tesseract::WERD_CHOICE
Definition: ratngs.h:258

tesseract::WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:299

tesseract::WERD_CHOICE::unicharset
const UNICHARSET * unicharset() const
Definition: ratngs.h:281

tesseract::WERD_CHOICE::length
unsigned length() const
Definition: ratngs.h:287

tesseract::WERD_CHOICE::print
void print() const
Definition: ratngs.h:561

tesseract::WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:312

tesseract::UnicharAmbigs
Definition: ambigs.h:142

tesseract::CCUtil
Definition: ccutil.h:43

tesseract::TessdataManager
Definition: tessdatamanager.h:127

tesseract::CHAR_FRAGMENT
Definition: unicharset.h:50

tesseract::UNICHARSET
Definition: unicharset.h:164

tesseract::UNICHARSET::normed_ids
const std::vector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
Definition: unicharset.h:868

tesseract::UNICHARSET::contains_unichar_id
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: unicharset.h:303

tesseract::UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524

tesseract::Dawg
Definition: dawg.h:110

tesseract::Dawg::type
DawgType type() const
Definition: dawg.h:119

tesseract::Dawg::next_node
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0

tesseract::Dawg::kPatternUnicharID
static const UNICHAR_ID kPatternUnicharID
Definition: dawg.h:117

tesseract::DawgPosition
Definition: dawg.h:355

tesseract::DawgPositionVector
Definition: dawg.h:378

tesseract::DawgCache
Definition: dawg_cache.h:29

tesseract::CHAR_FRAGMENT_INFO
Definition: dict.h:49

tesseract::CHAR_FRAGMENT_INFO::certainty
float certainty
Definition: dict.h:54

tesseract::CHAR_FRAGMENT_INFO::num_fragments
int num_fragments
Definition: dict.h:52

tesseract::CHAR_FRAGMENT_INFO::fragment
const CHAR_FRAGMENT * fragment
Definition: dict.h:51

tesseract::CHAR_FRAGMENT_INFO::rating
float rating
Definition: dict.h:53

tesseract::CHAR_FRAGMENT_INFO::unichar_id
UNICHAR_ID unichar_id
Definition: dict.h:50

tesseract::DawgArgs
Definition: dict.h:83

tesseract::DawgArgs::DawgArgs
DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p)
Definition: dict.h:84

tesseract::DawgArgs::updated_dawgs
DawgPositionVector * updated_dawgs
Definition: dict.h:88

tesseract::DawgArgs::active_dawgs
DawgPositionVector * active_dawgs
Definition: dict.h:87

tesseract::DawgArgs::permuter
PermuterType permuter
Definition: dict.h:89

tesseract::DawgArgs::valid_end
bool valid_end
Definition: dict.h:91

tesseract::Dict
Definition: dict.h:94

tesseract::Dict::BOOL_VAR_H
BOOL_VAR_H(use_only_first_uft8_step)

tesseract::Dict::copy_hyphen_info
void copy_hyphen_info(WERD_CHOICE *word) const
Definition: dict.h:145

tesseract::Dict::double_VAR_H
double_VAR_H(segment_penalty_dict_case_ok)

tesseract::Dict::WildcardID
UNICHAR_ID WildcardID() const
Definition: dict.h:377

tesseract::Dict::SetWordsegRatingAdjustFactor
void SetWordsegRatingAdjustFactor(float f)
Set wordseg_rating_adjust_factor_ to the given value.
Definition: dict.h:469

tesseract::Dict::STRING_VAR_H
STRING_VAR_H(user_words_file)

tesseract::Dict::double_VAR_H
double_VAR_H(doc_dict_certainty_threshold)

tesseract::Dict::BOOL_VAR_H
BOOL_VAR_H(load_punc_dawg)

tesseract::Dict::double_VAR_H
double_VAR_H(stopper_nondict_certainty_base)

tesseract::Dict::getCCUtil
const CCUtil * getCCUtil() const
Definition: dict.h:98

tesseract::Dict::hyphen_base_size
int hyphen_base_size() const
Size of the base word (the part on the line before) of a hyphenated word.
Definition: dict.h:139

tesseract::Dict::valid_word
int valid_word(const WERD_CHOICE &word) const
Definition: dict.h:443

tesseract::Dict::BOOL_VAR_H
BOOL_VAR_H(load_bigram_dawg)

tesseract::Dict::STRING_VAR_H
STRING_VAR_H(word_to_debug)

tesseract::Dict::INT_VAR_H
INT_VAR_H(dawg_debug_level)

tesseract::Dict::STRING_VAR_H
STRING_VAR_H(user_patterns_suffix)

tesseract::Dict::LetterIsOkay
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
Definition: dict.h:348

tesseract::Dict::valid_word_permuter
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:437

tesseract::Dict::double_VAR_H
double_VAR_H(segment_penalty_dict_case_bad)

tesseract::Dict::STRING_VAR_H
STRING_VAR_H(user_words_suffix)

tesseract::Dict::char_for_dawg
UNICHAR_ID char_for_dawg(const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const
Definition: dict.h:411

tesseract::Dict::INT_VAR_H
INT_VAR_H(hyphen_debug_level)

tesseract::Dict::getUnicharset
UNICHARSET & getUnicharset()
Definition: dict.h:107

tesseract::Dict::double_VAR_H
double_VAR_H(certainty_scale)

tesseract::Dict::NumDawgs
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:381

tesseract::Dict::valid_word
int valid_word(const char *string) const
This function is used by api/tesseract_cube_combiner.cpp.
Definition: dict.h:450

tesseract::Dict::hyphenated
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
Definition: dict.h:135

tesseract::Dict::update_best_choice
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:182

tesseract::Dict::double_VAR_H
double_VAR_H(xheight_penalty_subscripts)

tesseract::Dict::def_probability_in_context
double def_probability_in_context(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Default (no-op) implementation of probability in context function.
Definition: dict.h:364

tesseract::Dict::has_hyphen_end
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:154

tesseract::Dict::GetDawg
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:385

tesseract::Dict::BOOL_VAR_H
BOOL_VAR_H(load_unambig_dawg)

tesseract::Dict::INT_VAR_H
INT_VAR_H(tessedit_truncate_wordchoice_log)

tesseract::Dict::compound_marker
bool compound_marker(UNICHAR_ID unichar_id)
Definition: dict.h:116

tesseract::Dict::double_VAR_H
double_VAR_H(stopper_phase2_certainty_rejection_offset)

tesseract::Dict::STRING_VAR_H
STRING_VAR_H(output_ambig_words_file)

tesseract::Dict::double_VAR_H
double_VAR_H(doc_dict_pending_threshold)

tesseract::Dict::double_VAR_H
double_VAR_H(segment_penalty_dict_nonword)

tesseract::Dict::getUnicharAmbigs
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:111

tesseract::Dict::GetStartingNode
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
Definition: dict.h:397

tesseract::Dict::GetUnambigDawg
const Dawg * GetUnambigDawg() const
Return the pointer to the unambiguous words dawg.
Definition: dict.h:393

tesseract::Dict::is_apostrophe
bool is_apostrophe(UNICHAR_ID unichar_id)
Definition: dict.h:125

tesseract::Dict::ResetDocumentDictionary
void ResetDocumentDictionary()
Definition: dict.h:297

tesseract::Dict::GetPuncDawg
const Dawg * GetPuncDawg() const
Return the pointer to the punctuation dawg.
Definition: dict.h:389

tesseract::Dict::BOOL_VAR_H
BOOL_VAR_H(load_system_dawg)

tesseract::Dict::BOOL_VAR_H
BOOL_VAR_H(load_number_dawg)

tesseract::Dict::ProbabilityInContext
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
Definition: dict.h:357

tesseract::Dict::good_choice
int good_choice(const WERD_CHOICE &choice)
Returns true if a good answer is found for the unknown blob rating.

tesseract::Dict::BOOL_VAR_H
BOOL_VAR_H(segment_nonalphabetic_script)

tesseract::Dict::double_VAR_H
double_VAR_H(segment_penalty_garbage)

tesseract::Dict::INT_VAR_H
INT_VAR_H(stopper_debug_level)

tesseract::Dict::double_VAR_H
double_VAR_H(segment_penalty_dict_frequent_word)

tesseract::Dict::STRING_VAR_H
STRING_VAR_H(user_patterns_file)

tesseract::Dict::BOOL_VAR_H
BOOL_VAR_H(stopper_no_acceptable_choices)

tesseract::Dict::has_hyphen_end
bool has_hyphen_end(const WERD_CHOICE &word) const
Same as above, but check the unichar at the end of the word.
Definition: dict.h:164

tesseract::Dict::INT_VAR_H
INT_VAR_H(stopper_smallword_size)

tesseract::Dict::double_VAR_H
double_VAR_H(stopper_certainty_per_char)

tesseract::Dict::BOOL_VAR_H
BOOL_VAR_H(save_doc_words)

tesseract::Dict::double_VAR_H
double_VAR_H(stopper_allowable_character_badness)

tesseract::Dict::getCCUtil
CCUtil * getCCUtil()
Definition: dict.h:101

tesseract::Dict::BOOL_VAR_H
BOOL_VAR_H(load_freq_dawg)

tesseract::Dict::getUnicharset
const UNICHARSET & getUnicharset() const
Definition: dict.h:104

tesseract::Dict::INT_VAR_H
INT_VAR_H(max_permuter_attempts)

tesseract::Dict::SetWildcardID
void SetWildcardID(UNICHAR_ID id)
Definition: dict.h:374

tesseract::Dict::double_VAR_H
double_VAR_H(xheight_penalty_inconsistent)

tesseract::Dict::valid_word_or_number
int valid_word_or_number(const WERD_CHOICE &word) const
Definition: dict.h:446

tesseract::Trie
Definition: trie.h:53

TESS_API
#define TESS_API
Definition: export.h:32