#include <dict.h>

Public Member Functions
	Dict (CCUtil *image_ptr)

	~Dict ()

const CCUtil *	getCCUtil () const

CCUtil *	getCCUtil ()

const UNICHARSET &	getUnicharset () const

UNICHARSET &	getUnicharset ()

const UnicharAmbigs &	getUnicharAmbigs () const

bool	compound_marker (UNICHAR_ID unichar_id)

bool	is_apostrophe (UNICHAR_ID unichar_id)

bool	hyphenated () const
	Returns true if we've recorded the beginning of a hyphenated word. More...

int	hyphen_base_size () const
	Size of the base word (the part on the line before) of a hyphenated word. More...

void	copy_hyphen_info (WERD_CHOICE *word) const

bool	has_hyphen_end (const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
	Check whether the word has a hyphen at the end. More...

bool	has_hyphen_end (const WERD_CHOICE &word) const
	Same as above, but check the unichar at the end of the word. More...

void	reset_hyphen_vars (bool last_word_on_line)

void	set_hyphen_word (const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)

void	update_best_choice (const WERD_CHOICE &word, WERD_CHOICE *best_choice)

void	init_active_dawgs (DawgPositionVector *active_dawgs, bool ambigs_mode) const

void	default_dawgs (DawgPositionVector *anylength_dawgs, bool suppress_patterns) const

bool	NoDangerousAmbig (WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)

void	ReplaceAmbig (int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)

int	LengthOfShortestAlphaRun (const WERD_CHOICE &WordChoice) const
	Returns the length of the shortest alpha run in WordChoice. More...

int	UniformCertainties (const WERD_CHOICE &word)

bool	AcceptableChoice (const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
	Returns true if the given best_choice is good enough to stop. More...

bool	AcceptableResult (WERD_RES *word) const

void	EndDangerousAmbigs ()

void	DebugWordChoices ()
	Prints the current choices for this word to stdout. More...

void	SettupStopperPass1 ()
	Sets up stopper variables in preparation for the first pass. More...

void	SettupStopperPass2 ()
	Sets up stopper variables in preparation for the second pass. More...

int	case_ok (const WERD_CHOICE &word) const
	Check a string to see if it matches a set of lexical rules. More...

bool	absolute_garbage (const WERD_CHOICE &word, const UNICHARSET &unicharset)

void	SetupForLoad (DawgCache *dawg_cache)

void	Load (const std::string &lang, TessdataManager *data_file)

void	LoadLSTM (const std::string &lang, TessdataManager *data_file)

bool	FinishLoad ()

void	End ()

void	ResetDocumentDictionary ()

int	def_letter_is_okay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const

int	LetterIsOkay (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
	Calls letter_is_okay_ member function. More...

double	ProbabilityInContext (const char *context, int context_bytes, const char *character, int character_bytes)
	Calls probability_in_context_ member function. More...

double	def_probability_in_context (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
	Default (no-op) implementation of probability in context function. More...

void	SetWildcardID (UNICHAR_ID id)

UNICHAR_ID	WildcardID () const

int	NumDawgs () const
	Return the number of dawgs in the dawgs_ vector. More...

const Dawg *	GetDawg (int index) const
	Return i-th dawg pointer recorded in the dawgs_ vector. More...

const Dawg *	GetPuncDawg () const
	Return the pointer to the punctuation dawg. More...

const Dawg *	GetUnambigDawg () const
	Return the pointer to the unambiguous words dawg. More...

UNICHAR_ID	char_for_dawg (const UNICHARSET &unicharset, UNICHAR_ID ch, const Dawg *dawg) const

void	ProcessPatternEdges (const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, DawgArgs *dawg_args, PermuterType *current_permuter) const

int	valid_word (const WERD_CHOICE &word, bool numbers_ok) const

int	valid_word (const WERD_CHOICE &word) const

int	valid_word_or_number (const WERD_CHOICE &word) const

int	valid_word (const char *string) const
	This function is used by api/tesseract_cube_combiner.cpp. More...

bool	valid_bigram (const WERD_CHOICE &word1, const WERD_CHOICE &word2) const

bool	valid_punctuation (const WERD_CHOICE &word)

int	good_choice (const WERD_CHOICE &choice)
	Returns true if a good answer is found for the unknown blob rating. More...

void	add_document_word (const WERD_CHOICE &best_choice)
	Adds a word found on this document to the document specific dictionary. More...

void	adjust_word (WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
	Adjusts the rating of the given word. More...

void	SetWordsegRatingAdjustFactor (float f)
	Set wordseg_rating_adjust_factor_ to the given value. More...

bool	IsSpaceDelimitedLang () const
	Returns true if the language is space-delimited (not CJ, or T). More...

	STRING_VAR_H (user_words_file)

	STRING_VAR_H (user_words_suffix)

	STRING_VAR_H (user_patterns_file)

	STRING_VAR_H (user_patterns_suffix)

	BOOL_VAR_H (load_system_dawg)

	BOOL_VAR_H (load_freq_dawg)

	BOOL_VAR_H (load_unambig_dawg)

	BOOL_VAR_H (load_punc_dawg)

	BOOL_VAR_H (load_number_dawg)

	BOOL_VAR_H (load_bigram_dawg)

	double_VAR_H (xheight_penalty_subscripts)

	double_VAR_H (xheight_penalty_inconsistent)

	double_VAR_H (segment_penalty_dict_frequent_word)

	double_VAR_H (segment_penalty_dict_case_ok)

	double_VAR_H (segment_penalty_dict_case_bad)

	double_VAR_H (segment_penalty_dict_nonword)

	double_VAR_H (segment_penalty_garbage)

	STRING_VAR_H (output_ambig_words_file)

	INT_VAR_H (dawg_debug_level)

	INT_VAR_H (hyphen_debug_level)

	BOOL_VAR_H (use_only_first_uft8_step)

	double_VAR_H (certainty_scale)

	double_VAR_H (stopper_nondict_certainty_base)

	double_VAR_H (stopper_phase2_certainty_rejection_offset)

	INT_VAR_H (stopper_smallword_size)

	double_VAR_H (stopper_certainty_per_char)

	double_VAR_H (stopper_allowable_character_badness)

	INT_VAR_H (stopper_debug_level)

	BOOL_VAR_H (stopper_no_acceptable_choices)

	INT_VAR_H (tessedit_truncate_wordchoice_log)

	STRING_VAR_H (word_to_debug)

	BOOL_VAR_H (segment_nonalphabetic_script)

	BOOL_VAR_H (save_doc_words)

	double_VAR_H (doc_dict_pending_threshold)

	double_VAR_H (doc_dict_certainty_threshold)

	INT_VAR_H (max_permuter_attempts)

go_deeper_dawg_fxn
If the choice being composed so far could be a dictionary word keep exploring choices.
WERD_CHOICE *	dawg_permute_and_select (const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)

void	go_deeper_dawg_fxn (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)

void	permute_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)

void	append_choices (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)

fragment_state
Given the current char choice and information about previously seen fragments, determines whether adjacent character fragments are present and whether they can be concatenated. The given prev_char_frag_info contains: (1) fragment: if not nullptr, information about the immediately preceding fragmented character choice; (2) num_fragments: the number of fragments that have been used so far to construct a character; (3) certainty: the certainty of the current choice, or the minimum certainty of all fragments concatenated so far; (4) rating: the rating of the current choice, or the sum of the fragment ratings concatenated so far. The output char_frag_info is filled in as follows: character is set to nullptr if the choice is a non-matching or non-ending fragment piece, and to the unichar of the given choice if it represents a regular character or a matching ending fragment; fragment, num_fragments, certainty and rating are set as described above. Returns false if a non-matching fragment is discovered, true otherwise.
bool	fragment_state_okay (UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)

Static Public Member Functions
static DawgCache *	GlobalDawgCache ()

static NODE_REF	GetStartingNode (const Dawg *dawg, EDGE_REF edge_ref)
	Returns the appropriate next node given the EDGE_REF. More...

static bool	valid_word_permuter (uint8_t perm, bool numbers_ok)
	Check all the DAWGs to see if this word is in any of them. More...

Public Attributes
void(Dict::*	go_deeper_fxn_ )(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
	Pointer to go_deeper function. More...

int(Dict::*	letter_is_okay_ )(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const

double(Dict::*	probability_in_context_ )(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
	Probability in context function used by the ngram permuter. More...

Detailed Description

Definition at line 94 of file dict.h.

Constructor & Destructor Documentation

◆ Dict()

tesseract::Dict::Dict ( CCUtil * image_ptr )

Definition at line 29 of file dict.cpp.

    : letter_is_okay_(&tesseract::Dict::def_letter_is_okay)
    // Both member-function callbacks start out pointing at the def_* implementations.
    , probability_in_context_(&tesseract::Dict::def_probability_in_context)
    // NOTE(review): the documented signature names the constructor parameter
    // image_ptr, while this definition reads it as ccutil - confirm which is current.
    , ccutil_(ccutil)
    // The special unichar ids are all initialised to INVALID_UNICHAR_ID.
    , wildcard_unichar_id_(INVALID_UNICHAR_ID)
    , apostrophe_unichar_id_(INVALID_UNICHAR_ID)
    , question_unichar_id_(INVALID_UNICHAR_ID)
    , slash_unichar_id_(INVALID_UNICHAR_ID)
    , hyphen_unichar_id_(INVALID_UNICHAR_ID)
    // Parameter members: every *_MEMBER / *_INIT_MEMBER entry below passes the
    // parameter name, an initial value, a description string and
    // getCCUtil()->params().
    , STRING_MEMBER(user_words_file, "", "A filename of user-provided words.",
                    getCCUtil()->params())
    , STRING_INIT_MEMBER(user_words_suffix, "",
                         "A suffix of user-provided words located in tessdata.",
                         getCCUtil()->params())
    , STRING_MEMBER(user_patterns_file, "", "A filename of user-provided patterns.",
                    getCCUtil()->params())
    , STRING_INIT_MEMBER(user_patterns_suffix, "",
                         "A suffix of user-provided patterns located in "
                         "tessdata.",
                         getCCUtil()->params())
    , BOOL_INIT_MEMBER(load_system_dawg, true, "Load system word dawg.", getCCUtil()->params())
    , BOOL_INIT_MEMBER(load_freq_dawg, true, "Load frequent word dawg.", getCCUtil()->params())
    , BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.",
                       getCCUtil()->params())
    , BOOL_INIT_MEMBER(load_punc_dawg, true,
                       "Load dawg with punctuation"
                       " patterns.",
                       getCCUtil()->params())
    , BOOL_INIT_MEMBER(load_number_dawg, true,
                       "Load dawg with number"
                       " patterns.",
                       getCCUtil()->params())
    , BOOL_INIT_MEMBER(load_bigram_dawg, true,
                       "Load dawg with special word "
                       "bigrams.",
                       getCCUtil()->params())
    , double_MEMBER(xheight_penalty_subscripts, 0.125,
                    "Score penalty (0.1 = 10%) added if there are subscripts "
                    "or superscripts in a word, but it is otherwise OK.",
                    getCCUtil()->params())
    , double_MEMBER(xheight_penalty_inconsistent, 0.25,
                    "Score penalty (0.1 = 10%) added if an xheight is "
                    "inconsistent.",
                    getCCUtil()->params())
    , double_MEMBER(segment_penalty_dict_frequent_word, 1.0,
                    "Score multiplier for word matches which have good case and"
                    " are frequent in the given language (lower is better).",
                    getCCUtil()->params())
    , double_MEMBER(segment_penalty_dict_case_ok, 1.1,
                    "Score multiplier for word matches that have good case "
                    "(lower is better).",
                    getCCUtil()->params())
    , double_MEMBER(segment_penalty_dict_case_bad, 1.3125,
                    "Default score multiplier for word matches, which may have "
                    "case issues (lower is better).",
                    getCCUtil()->params())
    , double_MEMBER(segment_penalty_dict_nonword, 1.25,
                    "Score multiplier for glyph fragment segmentations which "
                    "do not match a dictionary word (lower is better).",
                    getCCUtil()->params())
    , double_MEMBER(segment_penalty_garbage, 1.50,
                    "Score multiplier for poorly cased strings that are not in"
                    " the dictionary and generally look like garbage (lower is"
                    " better).",
                    getCCUtil()->params())
    , STRING_MEMBER(output_ambig_words_file, "",
                    "Output file for ambiguities found in the dictionary", getCCUtil()->params())
    , INT_MEMBER(dawg_debug_level, 0,
                 "Set to 1 for general debug info"
                 ", to 2 for more details, to 3 to see all the debug messages",
                 getCCUtil()->params())
    , INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", getCCUtil()->params())
    , BOOL_MEMBER(use_only_first_uft8_step, false,
                  "Use only the first UTF8 step of the given string"
                  " when computing log probabilities.",
                  getCCUtil()->params())
    , double_MEMBER(certainty_scale, 20.0, "Certainty scaling factor", getCCUtil()->params())
    , double_MEMBER(stopper_nondict_certainty_base, -2.50, "Certainty threshold for non-dict words",
                    getCCUtil()->params())
    , double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, "Reject certainty offset",
                    getCCUtil()->params())
    , INT_MEMBER(stopper_smallword_size, 2, "Size of dict word to be treated as non-dict word",
                 getCCUtil()->params())
    , double_MEMBER(stopper_certainty_per_char, -0.50,
                    "Certainty to add"
                    " for each dict char above small word size.",
                    getCCUtil()->params())
    , double_MEMBER(stopper_allowable_character_badness, 3.0,
                    "Max certainty variation allowed in a word (in sigma)", getCCUtil()->params())
    , INT_MEMBER(stopper_debug_level, 0, "Stopper debug level", getCCUtil()->params())
    , BOOL_MEMBER(stopper_no_acceptable_choices, false,
                  "Make AcceptableChoice() always return false. Useful"
                  " when there is a need to explore all segmentations",
                  getCCUtil()->params())
    , INT_MEMBER(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list",
                 getCCUtil()->params())
    , STRING_MEMBER(word_to_debug, "",
                    "Word for which stopper debug"
                    " information should be printed to stdout",
                    getCCUtil()->params())
    , BOOL_MEMBER(segment_nonalphabetic_script, false,
                  "Don't use any alphabetic-specific tricks."
                  " Set to true in the traineddata config file for"
                  " scripts that are cursive or inherently fixed-pitch",
                  getCCUtil()->params())
    , BOOL_MEMBER(save_doc_words, 0, "Save Document Words", getCCUtil()->params())
    , double_MEMBER(doc_dict_pending_threshold, 0.0, "Worst certainty for using pending dictionary",
                    getCCUtil()->params())
    , double_MEMBER(doc_dict_certainty_threshold, -2.25,
                    "Worst certainty for words that can be inserted into the"
                    " document dictionary",
                    getCCUtil()->params())
    , INT_MEMBER(max_permuter_attempts, 10000,
                 "Maximum number of different"
                 " character choices to consider during permutation."
                 " This limit is especially useful when user patterns"
                 " are specified, since overly generic patterns can result in"
                 " dawg search exploring an overly large number of options.",
                 getCCUtil()->params()) {
  // The remaining members are assigned here rather than in the initialiser
  // list: pointers are set to nullptr and flags to false.
  reject_offset_ = 0.0;
  go_deeper_fxn_ = nullptr;
  hyphen_word_ = nullptr;
  last_word_on_line_ = false;
  document_words_ = nullptr;
  dawg_cache_ = nullptr;
  dawg_cache_is_ours_ = false;
  pending_words_ = nullptr;
  bigram_dawg_ = nullptr;
  freq_dawg_ = nullptr;
  punc_dawg_ = nullptr;
  unambig_dawg_ = nullptr;
  // NOTE(review): -1.0f looks like a "not set" sentinel - confirm against the users
  // of wordseg_rating_adjust_factor_.
  wordseg_rating_adjust_factor_ = -1.0f;
  // The destructor closes this handle only when it is non-null.
  output_ambig_words_file_ = nullptr;
}

◆ ~Dict()

tesseract::Dict::~Dict ( )

Definition at line 164 of file dict.cpp.

            {
  // End() runs first; afterwards the hyphen word is deleted and the
  // ambiguity output file is closed when its handle is non-null.
  End();
  delete hyphen_word_;
  if (auto *ambig_out = output_ambig_words_file_) {
    fclose(ambig_out);
  }
}

Member Function Documentation

◆ absolute_garbage()

bool tesseract::Dict::absolute_garbage	(	const WERD_CHOICE &	word,
		const UNICHARSET &	unicharset
	)

Returns true if the word looks like absolute garbage (e.g. an image mistakenly recognized as text).

Definition at line 66 of file context.cpp.

                                                                                 {
  // Words shorter than kMinAbsoluteGarbageWordLength are never classed as garbage.
  const auto num_unichars = word.length();
  if (num_unichars < kMinAbsoluteGarbageWordLength) {
    return false;
  }
  // Count the unichars that the unicharset reports as alphabetic or digit.
  int alnum_count = 0;
  for (unsigned pos = 0; pos < num_unichars; ++pos) {
    const auto id = word.unichar_id(pos);
    if (unicharset.get_isalpha(id) || unicharset.get_isdigit(id)) {
      ++alnum_count;
    }
  }
  // Garbage when the alphanumeric fraction is below the configured minimum.
  return (static_cast<float>(alnum_count) / static_cast<float>(num_unichars) <
          kMinAbsoluteGarbageAlphanumFrac);
}

◆ AcceptableChoice()

bool tesseract::Dict::AcceptableChoice	(	const WERD_CHOICE &	best_choice,
		XHeightConsistencyEnum	xheight_consistency
	)

Returns true if the given best_choice is good enough to stop.

Definition at line 42 of file stopper.cpp.

                                                                        {
  // Nothing is acceptable when stopping is disabled or there is no choice at all.
  if (stopper_no_acceptable_choices) {
    return false;
  }
  if (best_choice.empty()) {
    return false;
  }

  const bool no_dang_ambigs = !best_choice.dangerous_ambig_found();
  const bool is_valid_word = valid_word_permuter(best_choice.permuter(), false);
  const bool is_case_ok = case_ok(best_choice);

  if (stopper_debug_level >= 1) {
    // Human-readable label for the x-height consistency value.
    const char *xht_label;
    if (xheight_consistency == XH_GOOD) {
      xht_label = "NORMAL";
    } else if (xheight_consistency == XH_SUBNORMAL) {
      xht_label = "SUBNORMAL";
    } else if (xheight_consistency == XH_INCONSISTENT) {
      xht_label = "INCONSISTENT";
    } else {
      xht_label = "UNKNOWN";
    }
    tprintf("\nStopper:  %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
            best_choice.unichar_string().c_str(), (is_valid_word ? 'y' : 'n'),
            (is_case_ok ? 'y' : 'n'), xht_label, best_choice.min_x_height(),
            best_choice.max_x_height());
  }

  // Do not accept invalid words in PASS1.
  if (reject_offset_ <= 0.0f && !is_valid_word) {
    return false;
  }

  // Start from the non-dictionary base; well-cased valid words get an extra
  // per-character term for every alpha character beyond the small-word size.
  float threshold = stopper_nondict_certainty_base;
  if (is_valid_word && is_case_ok) {
    int extra_chars = LengthOfShortestAlphaRun(best_choice);
    extra_chars -= stopper_smallword_size;
    extra_chars = (extra_chars < 0) ? 0 : extra_chars;
    threshold += extra_chars * stopper_certainty_per_char;
  }

  if (stopper_debug_level >= 1) {
    tprintf("Stopper:  Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
            best_choice.rating(), best_choice.certainty(), threshold);
  }

  // Accept only when every criterion holds; UniformCertainties() is evaluated
  // last, and only if the cheaper checks have already passed.
  if (no_dang_ambigs && best_choice.certainty() > threshold &&
      xheight_consistency < XH_INCONSISTENT && UniformCertainties(best_choice)) {
    return true;
  }

  if (stopper_debug_level >= 1) {
    tprintf(
        "AcceptableChoice() returned false"
        " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
        no_dang_ambigs, best_choice.certainty(), threshold, UniformCertainties(best_choice));
  }
  return false;
}

◆ AcceptableResult()

bool tesseract::Dict::AcceptableResult ( WERD_RES * word ) const

Returns false if the best choice for the current word is questionable and should be tried again on the second pass or should be flagged to the user.

Definition at line 111 of file stopper.cpp.

                                                {
  // Without a best choice there is nothing to accept.
  if (word->best_choice == nullptr) {
    return false;
  }
  const auto &choice = *word->best_choice;
  const bool debugging = stopper_debug_level >= 1;

  if (debugging) {
    tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
            choice.debug_string().c_str(), (valid_word(choice) ? 'y' : 'n'),
            (case_ok(choice) ? 'y' : 'n'), choice.dangerous_ambig_found() ? 'n' : 'y',
            word->best_choices.singleton() ? 'n' : 'y');
  }

  // An empty choice, or more than one surviving choice, is always rejected.
  if (choice.empty() || !word->best_choices.singleton()) {
    return false;
  }

  // Base threshold shifted by the reject offset; valid, well-cased words get an
  // extra per-character term for alpha characters beyond the small-word size.
  float threshold = stopper_nondict_certainty_base - reject_offset_;
  if (valid_word(choice) && case_ok(choice)) {
    int extra_chars = LengthOfShortestAlphaRun(choice);
    extra_chars -= stopper_smallword_size;
    extra_chars = (extra_chars < 0) ? 0 : extra_chars;
    threshold += extra_chars * stopper_certainty_per_char;
  }

  if (debugging) {
    tprintf("Rejecter: Certainty = %4.1f, Threshold = %4.1f   ", choice.certainty(), threshold);
  }

  const bool accepted = choice.certainty() > threshold && !stopper_no_acceptable_choices;
  if (debugging) {
    if (accepted) {
      tprintf("ACCEPTED\n");
    } else {
      tprintf("REJECTED\n");
    }
  }
  return accepted;
}

◆ add_document_word()

void tesseract::Dict::add_document_word ( const WERD_CHOICE & best_choice )

Adds a word found on this document to the document specific dictionary.

Definition at line 647 of file dict.cpp.

                                                           {
  // Pieces of a hyphenated word never go into the document dawg.
  // hyphen_word_ becomes non-nullptr once set_hyphen_word() has been called
  // for the first part of such a word and stays set while the second part
  // is recognized; cc_recg() clears it before the next word on the line.
  if (hyphen_word_) {
    return;
  }

  const int stringlen = best_choice.length();

  // Skip words the dictionaries already accept, and single-unichar words.
  if (valid_word(best_choice) || stringlen < 2) {
    return;
  }

  // Reject words containing a run of kDocDictMaxRepChars identical unichars.
  if (best_choice.length() >= kDocDictMaxRepChars) {
    UNICHAR_ID run_id = best_choice.unichar_id(0);
    int run_length = 1;
    for (unsigned pos = 1; pos < best_choice.length(); ++pos) {
      const UNICHAR_ID current_id = best_choice.unichar_id(pos);
      if (current_id == run_id) {
        if (++run_length == kDocDictMaxRepChars) {
          return;
        }
      } else {
        run_id = current_id;
        run_length = 1;
      }
    }
  }

  // Low-certainty and two-unichar words must first be seen via the pending dawg.
  const bool goes_through_pending =
      best_choice.certainty() < doc_dict_certainty_threshold || stringlen == 2;
  if (goes_through_pending) {
    if (best_choice.certainty() < doc_dict_pending_threshold) {
      return;
    }

    if (!pending_words_->word_in_dawg(best_choice)) {
      // First sighting: remember it as pending, but two-unichar words only
      // qualify when both unichars are upper case.
      const bool two_uppercase = stringlen == 2 &&
                                 getUnicharset().get_isupper(best_choice.unichar_id(0)) &&
                                 getUnicharset().get_isupper(best_choice.unichar_id(1));
      if (stringlen > 2 || two_uppercase) {
        pending_words_->add_word_to_dawg(best_choice);
      }
      return;
    }
  }

  // Optionally append the word to "<imagefile>.doc".
  if (save_doc_words) {
    std::string doc_path(getCCUtil()->imagefile);
    doc_path += ".doc";
    FILE *doc_word_file = fopen(doc_path.c_str(), "a");
    if (doc_word_file == nullptr) {
      tprintf("Error: Could not open file %s\n", doc_path.c_str());
      ASSERT_HOST(doc_word_file);
    }
    fprintf(doc_word_file, "%s\n", best_choice.debug_string().c_str());
    fclose(doc_word_file);
  }
  document_words_->add_word_to_dawg(best_choice);
}

◆ adjust_word()

void tesseract::Dict::adjust_word	(	WERD_CHOICE *	word,
		bool	nonword,
		XHeightConsistencyEnum	xheight_consistency,
		float	additional_adjust,
		bool	modify_rating,
		bool	debug
	)

Adjusts the rating of the given word.

Definition at line 710 of file dict.cpp.

                                                                                {
  // Builds an adjustment factor from additional_adjust, an optional x-height
  // penalty and one dictionary/case/punctuation penalty, multiplies the padded
  // rating by it, stores the new rating only when modify_rating is set, and
  // always records the factor on the word.

  // Words whose top script is Han skip the case and punctuation checks.
  bool is_han = (getUnicharset().han_sid() != getUnicharset().null_sid() &&
                 word->GetTopScriptID() == getUnicharset().han_sid());
  bool case_is_ok = (is_han || case_ok(*word));
  bool punc_is_ok = (is_han || !nonword || valid_punctuation(*word));
 
  float adjust_factor = additional_adjust;
  float new_rating = word->rating();
  // The pad is added before scaling and subtracted again afterwards.
  new_rating += kRatingPad;
  // Suffix for the debug line naming which x-height penalty (if any) applied.
  const char *xheight_triggered = "";
  if (word->length() > 1) {
    // Calculate x-height and y-offset consistency penalties.
    switch (xheight_consistency) {
      case XH_INCONSISTENT:
        adjust_factor += xheight_penalty_inconsistent;
        xheight_triggered = ", xhtBAD";
        break;
      case XH_SUBNORMAL:
        adjust_factor += xheight_penalty_subscripts;
        xheight_triggered = ", xhtSUB";
        break;
      case XH_GOOD:
        // leave the factor alone - all good!
        break;
    }
    // TODO(eger): if nonword is true, but there is a "core" that is a dict
    // word, negate nonword status.
  } else {
    if (debug) {
      tprintf("Consistency could not be calculated.\n");
    }
  }
  if (debug) {
    tprintf("%sWord: %s %4.2f%s", nonword ? "Non-" : "", word->unichar_string().c_str(),
            word->rating(), xheight_triggered);
  }
 
  // Exactly one of the segment_penalty_* values below is added to the factor;
  // the debug tags are W (non-word), C (bad case), P (bad punctuation) and
  // F (frequent word).
  if (nonword) { // non-dictionary word
    if (case_is_ok && punc_is_ok) {
      adjust_factor += segment_penalty_dict_nonword;
      new_rating *= adjust_factor;
      if (debug) {
        tprintf(", W");
      }
    } else {
      adjust_factor += segment_penalty_garbage;
      new_rating *= adjust_factor;
      if (debug) {
        if (!case_is_ok) {
          tprintf(", C");
        }
        if (!punc_is_ok) {
          tprintf(", P");
        }
      }
    }
  } else { // dictionary word
    if (case_is_ok) {
      // Side effect: a non-Han word found in freq_dawg_ has its permuter
      // changed to FREQ_DAWG_PERM.
      if (!is_han && freq_dawg_ != nullptr && freq_dawg_->word_in_dawg(*word)) {
        word->set_permuter(FREQ_DAWG_PERM);
        adjust_factor += segment_penalty_dict_frequent_word;
        new_rating *= adjust_factor;
        if (debug) {
          tprintf(", F");
        }
      } else {
        adjust_factor += segment_penalty_dict_case_ok;
        new_rating *= adjust_factor;
        if (debug) {
          tprintf(", ");
        }
      }
    } else {
      adjust_factor += segment_penalty_dict_case_bad;
      new_rating *= adjust_factor;
      if (debug) {
        tprintf(", C");
      }
    }
  }
  new_rating -= kRatingPad;
  // The rating itself is only overwritten on request.
  if (modify_rating) {
    word->set_rating(new_rating);
  }
  if (debug) {
    tprintf(" %4.2f --> %4.2f\n", adjust_factor, new_rating);
  }
  // The adjust factor is stored regardless of modify_rating.
  word->set_adjust_factor(adjust_factor);
}

◆ append_choices()

void tesseract::Dict::append_choices	(	const char *	debug,
		const BLOB_CHOICE_LIST_VECTOR &	char_choices,
		const BLOB_CHOICE &	blob_choice,
		int	char_choice_index,
		const CHAR_FRAGMENT_INFO *	prev_char_frag_info,
		WERD_CHOICE *	word,
		float	certainties[],
		float *	limit,
		WERD_CHOICE *	best_choice,
		int *	attempts_left,
		void *	more_args
	)

append_choices

Checks to see whether or not the next choice is worth appending to the word being generated. If so then keeps going deeper into the word.

This function assumes that Dict::go_deeper_fxn_ is set.

Definition at line 224 of file permdawg.cpp.

                                                               {
  // True when this is the last position in char_choices.
  const bool word_ending = static_cast<unsigned>(char_choice_index) == char_choices.size() - 1;

  // Resolve fragment state; bail out on an invalid fragment.
  CHAR_FRAGMENT_INFO frag_info;
  const bool fragment_ok =
      fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(), blob_choice.certainty(),
                          prev_char_frag_info, debug, word_ending, &frag_info);
  if (!fragment_ok) {
    return; // blob_choice must be an invalid fragment
  }

  // No unichar id yet: this choice is a fragment, so move on to the next
  // position without appending anything.
  if (frag_info.unichar_id == INVALID_UNICHAR_ID) {
    permute_choices(debug, char_choices, char_choice_index + 1, &frag_info, word, certainties,
                    limit, best_choice, attempts_left, more_args);
    return;
  }

  // Remember the word's state so it can be restored after the exploration.
  const float saved_rating = word->rating();
  const float saved_certainty = word->certainty();
  const uint8_t saved_permuter = word->permuter();

  // Append the unichar; its certainty is recorded at the pre-append length.
  certainties[word->length()] = frag_info.certainty;
  word->append_unichar_id_space_allocated(frag_info.unichar_id, frag_info.num_fragments,
                                          frag_info.rating, frag_info.certainty);

  // Descend through the configured go-deeper callback.
  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index, &frag_info, word_ending, word,
                          certainties, limit, best_choice, attempts_left, more_args);

  // Undo the append so other choices can be explored in its place.
  word->remove_last_unichar_id();
  word->set_rating(saved_rating);
  word->set_certainty(saved_certainty);
  word->set_permuter(saved_permuter);
}

◆ BOOL_VAR_H() [1/10]

tesseract::Dict::BOOL_VAR_H ( load_bigram_dawg )

◆ BOOL_VAR_H() [2/10]

tesseract::Dict::BOOL_VAR_H ( load_freq_dawg )

◆ BOOL_VAR_H() [3/10]

tesseract::Dict::BOOL_VAR_H ( load_number_dawg )

◆ BOOL_VAR_H() [4/10]

tesseract::Dict::BOOL_VAR_H ( load_punc_dawg )

◆ BOOL_VAR_H() [5/10]

tesseract::Dict::BOOL_VAR_H ( load_system_dawg )

◆ BOOL_VAR_H() [6/10]

tesseract::Dict::BOOL_VAR_H ( load_unambig_dawg )

◆ BOOL_VAR_H() [7/10]

tesseract::Dict::BOOL_VAR_H ( save_doc_words )

◆ BOOL_VAR_H() [8/10]

tesseract::Dict::BOOL_VAR_H ( segment_nonalphabetic_script )

◆ BOOL_VAR_H() [9/10]

tesseract::Dict::BOOL_VAR_H ( stopper_no_acceptable_choices )

◆ BOOL_VAR_H() [10/10]

tesseract::Dict::BOOL_VAR_H ( use_only_first_uft8_step )

◆ case_ok()

int tesseract::Dict::case_ok ( const WERD_CHOICE & word ) const

Check a string to see if it matches a set of lexical rules.

Definition at line 45 of file context.cpp.

                                               {
  const UNICHARSET *charset = word.unicharset();
  int state = 0;
  for (unsigned pos = 0; pos < word.length(); ++pos) {
    const UNICHAR_ID id = word.unichar_id(pos);
    // Column of case_state_table: 1 = upper, 2 = lower, 3 = digit, 0 = other.
    int column = 0;
    if (charset->get_isupper(id)) {
      column = 1;
    } else if (charset->get_islower(id)) {
      column = 2;
    } else if (charset->get_isdigit(id)) {
      column = 3;
    }
    state = case_state_table[state][column];
    // -1 marks a transition the table does not allow.
    if (state == -1) {
      return false;
    }
  }
  // State 5 is rejected: single lower is bad.
  return state != 5;
}

◆ char_for_dawg()

UNICHAR_ID tesseract::Dict::char_for_dawg	(	const UNICHARSET &	unicharset,
		UNICHAR_ID	ch,
		const Dawg *	dawg
	)		const

inline

Definition at line 411 of file dict.h.

                                                                                                {
    // Only a number dawg remaps anything: digits become the pattern unichar id.
    // With no dawg, or any other dawg type, ch is returned unchanged.
    if (dawg && dawg->type() == DAWG_TYPE_NUMBER && unicharset.get_isdigit(ch)) {
      return Dawg::kPatternUnicharID;
    }
    return ch;
  }

◆ compound_marker()

bool tesseract::Dict::compound_marker ( UNICHAR_ID unichar_id )

inline

Definition at line 116 of file dict.h.

                                                     {
    const UNICHARSET &unicharset = getUnicharset();
    ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
    // A compound marker normalizes to exactly one id, and that id is either
    // the hyphen or the slash.
    const auto &normalized = unicharset.normed_ids(unichar_id);
    if (normalized.size() != 1) {
      return false;
    }
    const auto only_id = normalized[0];
    return only_id == hyphen_unichar_id_ || only_id == slash_unichar_id_;
  }

◆ copy_hyphen_info()

void tesseract::Dict::copy_hyphen_info ( WERD_CHOICE * word ) const

inline

If this word is hyphenated copy the base word (the part on the line before) of a hyphenated word into the given word. This function assumes that word is not nullptr.

Definition at line 145 of file dict.h.

                                                        {
    // Nothing to copy unless the start of a hyphenated word was recorded.
    if (!this->hyphenated()) {
      return;
    }
    *word = *hyphen_word_;
    if (hyphen_debug_level) {
      word->print("copy_hyphen_info: ");
    }
  }

◆ dawg_permute_and_select()

WERD_CHOICE * tesseract::Dict::dawg_permute_and_select	(	const BLOB_CHOICE_LIST_VECTOR &	char_choices,
		float	rating_limit
	)

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to explore all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

dawg_permute_and_select

Recursively explore all the possible character combinations in the given char_choices. Use go_deeper_dawg_fxn() to search all the dawgs in the dawgs_ vector in parallel and discard invalid words.

Allocate and return a WERD_CHOICE with the best valid word found.

Definition at line 159 of file permdawg.cpp.

                                                               {
  // The result starts out as a "bad" choice rated at rating_limit; it is only
  // replaced if the search below finds a valid word. The caller owns the
  // returned object.
  auto *best_choice = new WERD_CHOICE(&getUnicharset());
  best_choice->make_bad();
  best_choice->set_rating(rating_limit);
  if (char_choices.empty() || char_choices.size() > MAX_WERD_LENGTH) {
    return best_choice;
  }

  // One DawgPositionVector per character position plus one extra: entry i
  // holds the active dawgs before character i, entry i + 1 receives the
  // updated dawgs after it. go_deeper_dawg_fxn() steps through the entries by
  // pointer arithmetic, which std::vector's contiguous storage supports. The
  // vector releases the storage automatically on every exit path, replacing
  // the former manual new[]/delete[] pair.
  std::vector<DawgPositionVector> active_dawgs(char_choices.size() + 1);
  init_active_dawgs(&active_dawgs[0], true);
  DawgArgs dawg_args(&active_dawgs[0], &active_dawgs[1], NO_PERM);
  WERD_CHOICE word(&getUnicharset(), MAX_WERD_LENGTH);

  float certainties[MAX_WERD_LENGTH];
  this->go_deeper_fxn_ = &tesseract::Dict::go_deeper_dawg_fxn;
  int attempts_left = max_permuter_attempts;
  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr, char_choices, 0, nullptr,
                  &word, certainties, &rating_limit, best_choice, &attempts_left, &dawg_args);
  return best_choice;
}

◆ DebugWordChoices()

void tesseract::Dict::DebugWordChoices ( )

Prints the current choices for this word to stdout.

◆ def_letter_is_okay()

int tesseract::Dict::def_letter_is_okay	(	void *	void_dawg_args,
		const UNICHARSET &	unicharset,
		UNICHAR_ID	unichar_id,
		bool	word_end
	)		const

Returns the maximal permuter code (from ccstruct/ratngs.h) if in light of the current state the letter at word_index in the given word is allowed according to at least one of the dawgs in dawgs_, otherwise returns NO_PERM.

The state is described by void_dawg_args, which are interpreted as DawgArgs and contain relevant active dawg positions. Each entry in the active_dawgs vector contains an index into the dawgs_ vector and an EDGE_REF that indicates the last edge followed in the dawg. It also may contain a position in the punctuation dawg which describes surrounding punctuation (see struct DawgPosition).

Input: At word_index 0 dawg_args->active_dawgs should contain an entry for each dawg that may start at the beginning of a word, with punc_ref and edge_ref initialized to NO_EDGE. Since the punctuation dawg includes the empty pattern " " (meaning anything without surrounding punctuation), having a single entry for the punctuation dawg will cover all dawgs reachable there from – that includes all number and word dawgs. The only dawg non-reachable from the punctuation_dawg is the pattern dawg. If hyphen state needs to be applied, initial dawg_args->active_dawgs can be copied from the saved hyphen state (maintained by Dict). For word_index > 0 the corresponding state (active_dawgs and punc position) can be obtained from dawg_args->updated_dawgs passed to def_letter_is_okay for word_index-1. Note: the function assumes that active_dawgs, and updated_dawgs member variables of dawg_args are not nullptr.

Output: The function fills in dawg_args->updated_dawgs vector with the entries for dawgs that contain the word up to the letter at word_index.

Definition at line 406 of file dict.cpp.

                                                                         {
  // Advances every position in dawg_args->active_dawgs by unichar_id, stores
  // the surviving positions in dawg_args->updated_dawgs, sets
  // dawg_args->valid_end, and returns the resulting dawg_args->permuter.
  // void_dawg_args is interpreted as a DawgArgs pointer.
  auto *dawg_args = static_cast<DawgArgs *>(void_dawg_args);
 
  ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
 
  if (dawg_debug_level >= 3) {
    tprintf(
        "def_letter_is_okay: current unichar=%s word_end=%d"
        " num active dawgs=%zu\n",
        getUnicharset().debug_str(unichar_id).c_str(), word_end, dawg_args->active_dawgs->size());
  }
 
  // Do not accept words that contain kPatternUnicharID.
  // (otherwise pattern dawgs would not function correctly).
  // Do not accept words containing INVALID_UNICHAR_IDs.
  if (unichar_id == Dawg::kPatternUnicharID || unichar_id == INVALID_UNICHAR_ID) {
    dawg_args->permuter = NO_PERM;
    return NO_PERM;
  }
 
  // Initialization.
  PermuterType curr_perm = NO_PERM;
  dawg_args->updated_dawgs->clear();
  dawg_args->valid_end = false;
 
  // Go over the active_dawgs vector and insert DawgPosition records
  // with the updated ref (an edge with the corresponding unichar id) into
  // dawg_args->updated_pos.
  for (unsigned a = 0; a < dawg_args->active_dawgs->size(); ++a) {
    const DawgPosition &pos = (*dawg_args->active_dawgs)[a];
    // A negative index means this position has no dawg of that kind.
    const Dawg *punc_dawg = pos.punc_index >= 0 ? dawgs_[pos.punc_index] : nullptr;
    const Dawg *dawg = pos.dawg_index >= 0 ? dawgs_[pos.dawg_index] : nullptr;
 
    if (!dawg && !punc_dawg) {
      // shouldn't happen.
      tprintf("Received DawgPosition with no dawg or punc_dawg.  wth?\n");
      continue;
    }
    if (!dawg) {
      // We're in the punctuation dawg.  A core dawg has not been chosen.
      NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
      // A kPatternUnicharID edge out of the current punctuation node is what
      // allows stepping from the punctuation dawg into a successor dawg.
      EDGE_REF punc_transition_edge =
          punc_dawg->edge_char_of(punc_node, Dawg::kPatternUnicharID, word_end);
      if (punc_transition_edge != NO_EDGE) {
        // Find all successors, and see which can transition.
        const SuccessorList &slist = *(successors_[pos.punc_index]);
        for (int sdawg_index : slist) {
          const Dawg *sdawg = dawgs_[sdawg_index];
          UNICHAR_ID ch = char_for_dawg(unicharset, unichar_id, sdawg);
          // Each successor dawg is entered from its node 0.
          EDGE_REF dawg_edge = sdawg->edge_char_of(0, ch, word_end);
          if (dawg_edge != NO_EDGE) {
            if (dawg_debug_level >= 3) {
              tprintf("Letter found in dawg %d\n", sdawg_index);
            }
            dawg_args->updated_dawgs->add_unique(
                DawgPosition(sdawg_index, dawg_edge, pos.punc_index, punc_transition_edge, false),
                dawg_debug_level > 0, "Append transition from punc dawg to current dawgs: ");
            if (sdawg->permuter() > curr_perm) {
              curr_perm = sdawg->permuter();
            }
            if (sdawg->end_of_word(dawg_edge) && punc_dawg->end_of_word(punc_transition_edge)) {
              dawg_args->valid_end = true;
            }
          }
        }
      }
      // Independently of the transition above, try to consume unichar_id as
      // punctuation within the punctuation dawg itself.
      EDGE_REF punc_edge = punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
      if (punc_edge != NO_EDGE) {
        if (dawg_debug_level >= 3) {
          tprintf("Letter found in punctuation dawg\n");
        }
        dawg_args->updated_dawgs->add_unique(
            DawgPosition(-1, NO_EDGE, pos.punc_index, punc_edge, false), dawg_debug_level > 0,
            "Extend punctuation dawg: ");
        if (PUNC_PERM > curr_perm) {
          curr_perm = PUNC_PERM;
        }
        if (punc_dawg->end_of_word(punc_edge)) {
          dawg_args->valid_end = true;
        }
      }
      continue;
    }
 
    if (punc_dawg && dawg->end_of_word(pos.dawg_ref)) {
      // We can end the main word here.
      //  If we can continue on the punc ref, add that possibility.
      NODE_REF punc_node = GetStartingNode(punc_dawg, pos.punc_ref);
      EDGE_REF punc_edge =
          punc_node == NO_EDGE ? NO_EDGE : punc_dawg->edge_char_of(punc_node, unichar_id, word_end);
      if (punc_edge != NO_EDGE) {
        // The new position is flagged back_to_punc (last argument true).
        dawg_args->updated_dawgs->add_unique(
            DawgPosition(pos.dawg_index, pos.dawg_ref, pos.punc_index, punc_edge, true),
            dawg_debug_level > 0, "Return to punctuation dawg: ");
        if (dawg->permuter() > curr_perm) {
          curr_perm = dawg->permuter();
        }
        if (punc_dawg->end_of_word(punc_edge)) {
          dawg_args->valid_end = true;
        }
      }
    }
 
    // A position flagged back_to_punc is not advanced in its main dawg.
    if (pos.back_to_punc) {
      continue;
    }
 
    // If we are dealing with the pattern dawg, look up all the
    // possible edges, not only for the exact unichar_id, but also
    // for all its character classes (alpha, digit, etc).
    if (dawg->type() == DAWG_TYPE_PATTERN) {
      ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args, &curr_perm);
      // There can't be any successors to dawg that is of type
      // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition.
      continue;
    }
 
    // Find the edge out of the node for the unichar_id.
    NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
    EDGE_REF edge =
        (node == NO_EDGE)
            ? NO_EDGE
            : dawg->edge_char_of(node, char_for_dawg(unicharset, unichar_id, dawg), word_end);
 
    if (dawg_debug_level >= 3) {
      tprintf("Active dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node, edge);
    }
 
    if (edge != NO_EDGE) { // the unichar was found in the current dawg
      if (dawg_debug_level >= 3) {
        tprintf("Letter found in dawg %d\n", pos.dawg_index);
      }
      // At the end of the word the attached punctuation dawg must also be at
      // an end-of-word edge, otherwise this position is dropped.
      if (word_end && punc_dawg && !punc_dawg->end_of_word(pos.punc_ref)) {
        if (dawg_debug_level >= 3) {
          tprintf("Punctuation constraint not satisfied at end of word.\n");
        }
        continue;
      }
      if (dawg->permuter() > curr_perm) {
        curr_perm = dawg->permuter();
      }
      if (dawg->end_of_word(edge) &&
          (punc_dawg == nullptr || punc_dawg->end_of_word(pos.punc_ref))) {
        dawg_args->valid_end = true;
      }
      dawg_args->updated_dawgs->add_unique(
          DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, false),
          dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
    }
  } // end for
  // Update dawg_args->permuter if it used to be NO_PERM or became NO_PERM
  // or if we found the current letter in a non-punctuation dawg. This
  // allows preserving information on which dawg the "core" word came from.
  // Keep the old value of dawg_args->permuter if it is COMPOUND_PERM.
  if (dawg_args->permuter == NO_PERM || curr_perm == NO_PERM ||
      (curr_perm != PUNC_PERM && dawg_args->permuter != COMPOUND_PERM)) {
    dawg_args->permuter = curr_perm;
  }
  if (dawg_debug_level >= 2) {
    tprintf("Returning %d for permuter code for this character.\n", dawg_args->permuter);
  }
  return dawg_args->permuter;
}

◆ def_probability_in_context()

double tesseract::Dict::def_probability_in_context	(	const char *	lang,
		const char *	context,
		int	context_bytes,
		const char *	character,
		int	character_bytes
	)

inline

Default (no-op) implementation of probability in context function.

Definition at line 364 of file dict.h.

                                                                                {
    // Every argument is intentionally unused; each is referenced once in a
    // void cast and the result is always zero.
    static_cast<void>(lang);
    static_cast<void>(context);
    static_cast<void>(context_bytes);
    static_cast<void>(character);
    static_cast<void>(character_bytes);
    return 0.0;
  }

◆ default_dawgs()

void tesseract::Dict::default_dawgs	(	DawgPositionVector *	anylength_dawgs,
		bool	suppress_patterns
	)		const

Definition at line 624 of file dict.cpp.

                                                                                       {
  // The punctuation dawg only counts as available when it exists and has a
  // kPatternUnicharID edge out of node 0.
  const bool punc_dawg_available =
      punc_dawg_ != nullptr &&
      punc_dawg_->edge_char_of(0, Dawg::kPatternUnicharID, true) != NO_EDGE;

  for (unsigned i = 0; i < dawgs_.size(); i++) {
    if (dawgs_[i] == nullptr) {
      continue;
    }
    if (suppress_patterns && dawgs_[i]->type() == DAWG_TYPE_PATTERN) {
      continue;
    }
    const int dawg_ty = dawgs_[i]->type();
    const bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
    if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
      // A punctuation dawg is always added, as a punc-only start position.
      dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false));
      if (dawg_debug_level >= 3) {
        tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
      }
      continue;
    }
    if (punc_dawg_available && subsumed_by_punc) {
      // Skipped: this dawg is a successor of the available punctuation dawg.
      continue;
    }
    dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false));
    if (dawg_debug_level >= 3) {
      tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
    }
  }
}

◆ double_VAR_H() [1/14]

tesseract::Dict::double_VAR_H ( certainty_scale )

◆ double_VAR_H() [2/14]

tesseract::Dict::double_VAR_H ( doc_dict_certainty_threshold )

◆ double_VAR_H() [3/14]

tesseract::Dict::double_VAR_H ( doc_dict_pending_threshold )

◆ double_VAR_H() [4/14]

tesseract::Dict::double_VAR_H ( segment_penalty_dict_case_bad )

◆ double_VAR_H() [5/14]

tesseract::Dict::double_VAR_H ( segment_penalty_dict_case_ok )

◆ double_VAR_H() [6/14]

tesseract::Dict::double_VAR_H ( segment_penalty_dict_frequent_word )

◆ double_VAR_H() [7/14]

tesseract::Dict::double_VAR_H ( segment_penalty_dict_nonword )

◆ double_VAR_H() [8/14]

tesseract::Dict::double_VAR_H ( segment_penalty_garbage )

◆ double_VAR_H() [9/14]

tesseract::Dict::double_VAR_H ( stopper_allowable_character_badness )

◆ double_VAR_H() [10/14]

tesseract::Dict::double_VAR_H ( stopper_certainty_per_char )

◆ double_VAR_H() [11/14]

tesseract::Dict::double_VAR_H ( stopper_nondict_certainty_base )

◆ double_VAR_H() [12/14]

tesseract::Dict::double_VAR_H ( stopper_phase2_certainty_rejection_offset )

◆ double_VAR_H() [13/14]

tesseract::Dict::double_VAR_H ( xheight_penalty_inconsistent )

◆ double_VAR_H() [14/14]

tesseract::Dict::double_VAR_H ( xheight_penalty_subscripts )

◆ End()

void tesseract::Dict::End ( )

Definition at line 379 of file dict.cpp.

               {
  if (dawgs_.empty()) {
    return; // Not safe to call twice.
  }
  // Offer each dawg to the cache first; delete it here only when FreeDawg()
  // reports false.
  for (auto &entry : dawgs_) {
    const bool released_by_cache = dawg_cache_->FreeDawg(entry);
    if (!released_by_cache) {
      delete entry;
    }
  }
  // The bigram dawg is not stored in dawgs_, so it is released separately.
  dawg_cache_->FreeDawg(bigram_dawg_);
  if (dawg_cache_is_ours_) {
    delete dawg_cache_;
    dawg_cache_ = nullptr;
  }
  for (auto successor_list : successors_) {
    delete successor_list;
  }
  successors_.clear();
  dawgs_.clear();
  document_words_ = nullptr;
  delete pending_words_;
  pending_words_ = nullptr;
}

◆ EndDangerousAmbigs()

void tesseract::Dict::EndDangerousAmbigs ( )

Definition at line 358 of file stopper.cpp.

{}

◆ FinishLoad()

bool tesseract::Dict::FinishLoad ( )

Definition at line 357 of file dict.cpp.

                      {
  if (dawgs_.empty()) {
    return false;
  }
  // Build successors_ in step with dawgs_: entry i lists the indices (into
  // dawgs_) of every dawg that may follow dawg i. A null dawg gets an empty
  // list so that the two vectors stay aligned.
  successors_.reserve(dawgs_.size());
  for (const Dawg *dawg : dawgs_) {
    auto *successor_ids = new SuccessorList();
    if (dawg != nullptr) {
      for (unsigned j = 0; j < dawgs_.size(); ++j) {
        const Dawg *candidate = dawgs_[j];
        if (candidate == nullptr) {
          continue;
        }
        // Successors must share the language and be permitted by the
        // kDawgSuccessors table.
        if (dawg->lang() == candidate->lang() &&
            kDawgSuccessors[dawg->type()][candidate->type()]) {
          successor_ids->push_back(j);
        }
      }
    }
    successors_.push_back(successor_ids);
  }
  return true;
}

◆ fragment_state_okay()

bool tesseract::Dict::fragment_state_okay	(	UNICHAR_ID	curr_unichar_id,
		float	curr_rating,
		float	curr_certainty,
		const CHAR_FRAGMENT_INFO *	prev_char_frag_info,
		const char *	debug,
		int	word_ending,
		CHAR_FRAGMENT_INFO *	char_frag_info
	)

Definition at line 288 of file permdawg.cpp.

                                                                                    {
  // Fills *char_frag_info for curr_unichar_id given the previous fragment
  // state, and returns false when the fragment sequence is not acceptable
  // (incomplete, non-matching, non-starting, or a fragment at word end).
  const CHAR_FRAGMENT *this_fragment = getUnicharset().get_fragment(curr_unichar_id);
  // prev_char_frag_info may be null, in which case there is no previous fragment.
  const CHAR_FRAGMENT *prev_fragment =
      prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
 
  // Print debug info for fragments.
  if (debug && (prev_fragment || this_fragment)) {
    tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
            getUnicharset().debug_str(curr_unichar_id).c_str(), word_ending);
    if (prev_fragment) {
      tprintf("prev_fragment %s\n", prev_fragment->to_string().c_str());
    }
    if (this_fragment) {
      tprintf("this_fragment %s\n", this_fragment->to_string().c_str());
    }
  }
 
  // Start from the values for a single piece; the fragment branches below
  // overwrite them where needed.
  char_frag_info->unichar_id = curr_unichar_id;
  char_frag_info->fragment = this_fragment;
  char_frag_info->rating = curr_rating;
  char_frag_info->certainty = curr_certainty;
  char_frag_info->num_fragments = 1;
  // A pending fragment followed by a non-fragment is rejected.
  if (prev_fragment && !this_fragment) {
    if (debug) {
      tprintf("Skip choice with incomplete fragment\n");
    }
    return false;
  }
  if (this_fragment) {
    // We are dealing with a fragment.
    // The id stays invalid unless the ending piece is reached below.
    char_frag_info->unichar_id = INVALID_UNICHAR_ID;
    if (prev_fragment) {
      if (!this_fragment->is_continuation_of(prev_fragment)) {
        if (debug) {
          tprintf("Non-matching fragment piece\n");
        }
        return false;
      }
      if (this_fragment->is_ending()) {
        // Final piece: report the full character and clear the fragment.
        char_frag_info->unichar_id = getUnicharset().unichar_to_id(this_fragment->get_unichar());
        char_frag_info->fragment = nullptr;
        if (debug) {
          tprintf("Built character %s from fragments\n",
                  getUnicharset().debug_str(char_frag_info->unichar_id).c_str());
        }
      } else {
        if (debug) {
          tprintf("Record fragment continuation\n");
        }
        char_frag_info->fragment = this_fragment;
      }
      // Update certainty and rating.
      // Ratings are summed, the piece count is incremented, and the
      // certainty is the minimum of the two pieces.
      char_frag_info->rating = prev_char_frag_info->rating + curr_rating;
      char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
      char_frag_info->certainty = std::min(curr_certainty, prev_char_frag_info->certainty);
    } else {
      // No previous fragment: only a beginning piece is acceptable.
      if (this_fragment->is_beginning()) {
        if (debug) {
          tprintf("Record fragment beginning\n");
        }
      } else {
        if (debug) {
          tprintf("Non-starting fragment piece with no prev_fragment\n");
        }
        return false;
      }
    }
  }
  // A fragment still recorded at the end of the word is rejected.
  if (word_ending && char_frag_info->fragment) {
    if (debug) {
      tprintf("Word cannot end with a fragment\n");
    }
    return false;
  }
  return true;
}

◆ getCCUtil() [1/2]

CCUtil * tesseract::Dict::getCCUtil ( )

inline

Definition at line 101 of file dict.h.

                      {
    // Non-const overload: returns the stored ccutil_ pointer as a mutable
    // CCUtil pointer.
    return ccutil_;
  }

◆ getCCUtil() [2/2]

const CCUtil * tesseract::Dict::getCCUtil ( ) const

inline

Definition at line 98 of file dict.h.

                                  {
    // Const overload: returns the stored ccutil_ pointer as a pointer to
    // const CCUtil.
    return ccutil_;
  }

◆ GetDawg()

const Dawg * tesseract::Dict::GetDawg ( int index ) const

inline

Return i-th dawg pointer recorded in the dawgs_ vector.

Definition at line 385 of file dict.h.

                                              {
    // Plain indexed access into dawgs_; no range check on index is done here.
    return dawgs_[index];
  }

◆ GetPuncDawg()

const Dawg * tesseract::Dict::GetPuncDawg ( ) const

inline

Return the pointer to the punctuation dawg.

Definition at line 389 of file dict.h.

                                         {
    // Returns the punc_dawg_ member as-is.
    return punc_dawg_;
  }

◆ GetStartingNode()

static NODE_REF tesseract::Dict::GetStartingNode	(	const Dawg *	dawg,
		EDGE_REF	edge_ref
	)

inlinestatic

Returns the appropriate next node given the EDGE_REF.

Definition at line 397 of file dict.h.

                                                                              {
    if (edge_ref == NO_EDGE) {
      return 0; // beginning to explore the dawg
    }
    const NODE_REF next = dawg->next_node(edge_ref);
    // A next node of 0 marks the end of a word and is reported as NO_EDGE.
    if (next == 0) {
      return NO_EDGE; // end of word
    }
    return next;
  }

◆ GetUnambigDawg()

const Dawg * tesseract::Dict::GetUnambigDawg ( ) const

inline

Return the pointer to the unambiguous words dawg.

Definition at line 393 of file dict.h.

                                            {
    // Returns the unambig_dawg_ member as-is.
    return unambig_dawg_;
  }

◆ getUnicharAmbigs()

const UnicharAmbigs & tesseract::Dict::getUnicharAmbigs ( ) const

inline

Definition at line 111 of file dict.h.

                                                {
    // Forwards to the unichar_ambigs member of the object returned by
    // getCCUtil().
    return getCCUtil()->unichar_ambigs;
  }

◆ getUnicharset() [1/2]

UNICHARSET & tesseract::Dict::getUnicharset ( )

inline

Definition at line 107 of file dict.h.

                              {
    // Non-const overload: mutable reference to the unicharset member of the
    // object returned by getCCUtil().
    return getCCUtil()->unicharset;
  }

◆ getUnicharset() [2/2]

const UNICHARSET & tesseract::Dict::getUnicharset ( ) const

inline

Definition at line 104 of file dict.h.

                                          {
    // Const overload: read-only reference to the unicharset member of the
    // object returned by getCCUtil().
    return getCCUtil()->unicharset;
  }

◆ GlobalDawgCache()

DawgCache * tesseract::Dict::GlobalDawgCache ( )

static

Initialize Dict class - load dawgs from [lang].traineddata and user-specified wordlist and pattern list.

Definition at line 172 of file dict.cpp.

                                 {
  // This global cache (a singleton) will outlive every Tesseract instance
  // (even those that someone else might declare as global static variables).
  // Function-local static: the same DawgCache object is returned by every call.
  static DawgCache cache;
  return &cache;
}

◆ go_deeper_dawg_fxn()

void tesseract::Dict::go_deeper_dawg_fxn	(	const char *	debug,
		const BLOB_CHOICE_LIST_VECTOR &	char_choices,
		int	char_choice_index,
		const CHAR_FRAGMENT_INFO *	prev_char_frag_info,
		bool	word_ending,
		WERD_CHOICE *	word,
		float	certainties[],
		float *	limit,
		WERD_CHOICE *	best_choice,
		int *	attempts_left,
		void *	void_more_args
	)

If the choice being composed so far could be a dictionary word and we have not reached the end of the word keep exploring the char_choices further.

Definition at line 43 of file permdawg.cpp.

                                                    {
  // Checks the last unichar of word against the dawgs and either records a
  // finished word in best_choice or continues the search at the next position.
  // void_more_args is interpreted as a DawgArgs pointer.
  auto *more_args = static_cast<DawgArgs *>(void_more_args);
  // The incoming word_ending value is overwritten: it is recomputed here from
  // char_choice_index and the number of char_choices.
  word_ending = (static_cast<unsigned>(char_choice_index) == char_choices.size() - 1);
  // Index of the last unichar currently in word.
  int word_index = word->length() - 1;
  // Give up when the best choice found so far is rated below the limit.
  if (best_choice->rating() < *limit) {
    return;
  }
  // Look up char in DAWG
 
  // If the current unichar is an ngram first try calling
  // letter_is_okay() for each unigram it contains separately.
  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
  bool checked_unigrams = false;
  if (getUnicharset().get_isngram(orig_uch_id)) {
    if (dawg_debug_level) {
      tprintf("checking unigrams in an ngram %s\n", getUnicharset().debug_str(orig_uch_id).c_str());
    }
    int num_unigrams = 0;
    // Temporarily take the ngram off the word; it is put back below.
    word->remove_last_unichar_id();
    std::vector<UNICHAR_ID> encoding;
    const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
    // Since the string came out of the unicharset, failure is impossible.
    ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr, nullptr));
    bool unigrams_ok = true;
    // Construct DawgArgs that reflect the current state.
    // These are local copies, so more_args is untouched unless every unigram
    // is accepted.
    DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
    DawgPositionVector unigram_updated_dawgs;
    DawgArgs unigram_dawg_args(&unigram_active_dawgs, &unigram_updated_dawgs, more_args->permuter);
    // Check unigrams in the ngram with letter_is_okay().
    for (size_t i = 0; unigrams_ok && i < encoding.size(); ++i) {
      UNICHAR_ID uch_id = encoding[i];
      ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
      ++num_unigrams;
      word->append_unichar_id(uch_id, 1, 0.0, 0.0);
      // Only the last unigram can be at the end of the word.
      unigrams_ok = (this->*letter_is_okay_)(&unigram_dawg_args, *word->unicharset(),
                                             word->unichar_id(word_index + num_unigrams - 1),
                                             word_ending && i == encoding.size() - 1);
      // The updated dawgs become the active dawgs for the next unigram.
      (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
      if (dawg_debug_level) {
        tprintf("unigram %s is %s\n", getUnicharset().debug_str(uch_id).c_str(),
                unigrams_ok ? "OK" : "not OK");
      }
    }
    // Restore the word and copy the updated dawg state if needed.
    while (num_unigrams-- > 0) {
      word->remove_last_unichar_id();
    }
    word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
    if (unigrams_ok) {
      checked_unigrams = true;
      more_args->permuter = unigram_dawg_args.permuter;
      *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
    }
  }
 
  // Check which dawgs from the dawgs_ vector contain the word
  // up to and including the current unichar.
  // When the unigram check already succeeded, letter_is_okay_ is not called
  // again for the ngram as a whole.
  if (checked_unigrams || (this->*letter_is_okay_)(more_args, *word->unicharset(),
                                                   word->unichar_id(word_index), word_ending)) {
    // Add a new word choice
    if (word_ending) {
      if (dawg_debug_level) {
        tprintf("found word = %s\n", word->debug_string().c_str());
      }
      if (strcmp(output_ambig_words_file.c_str(), "") != 0) {
        // The output file is opened on first use.
        if (output_ambig_words_file_ == nullptr) {
          output_ambig_words_file_ = fopen(output_ambig_words_file.c_str(), "wb+");
          if (output_ambig_words_file_ == nullptr) {
            tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str());
            exit(1);
          }
          // NOTE(review): the word is written here and again just below, so
          // the first word appears twice in the file - confirm this is intended.
          std::string word_str;
          word->string_and_lengths(&word_str, nullptr);
          word_str += " ";
          fprintf(output_ambig_words_file_, "%s", word_str.c_str());
        }
        std::string word_str;
        word->string_and_lengths(&word_str, nullptr);
        word_str += " ";
        fprintf(output_ambig_words_file_, "%s", word_str.c_str());
      }
      // adjusted_word is an alias for word, so the permuter is set on word itself.
      WERD_CHOICE *adjusted_word = word;
      adjusted_word->set_permuter(more_args->permuter);
      update_best_choice(*adjusted_word, best_choice);
    } else { // search the next letter
      // Make updated_* point to the next entries in the DawgPositionVector
      // arrays (that were originally created in dawg_permute_and_select)
      ++(more_args->updated_dawgs);
      // Make active_dawgs and constraints point to the updated ones.
      ++(more_args->active_dawgs);
      permute_choices(debug, char_choices, char_choice_index + 1, prev_char_frag_info, word,
                      certainties, limit, best_choice, attempts_left, more_args);
      // Restore previous state to explore another letter in this position.
      --(more_args->updated_dawgs);
      --(more_args->active_dawgs);
    }
  } else {
    if (dawg_debug_level) {
      tprintf("last unichar not OK at index %d in %s\n", word_index, word->debug_string().c_str());
    }
  }
}

◆ good_choice()

int tesseract::Dict::good_choice ( const WERD_CHOICE & choice )

Returns true if a good answer is found for the unknown blob rating.

◆ has_hyphen_end() [1/2]

bool tesseract::Dict::has_hyphen_end	(	const UNICHARSET *	unicharset,
		UNICHAR_ID	unichar_id,
		bool	first_pos
	)		const

inline

Check whether the word has a hyphen at the end.

Definition at line 154 of file dict.h.

                                                   {
    // A hyphen end is only possible for the last word on a line, and never
    // for the unichar in the first position.
    const bool may_end_with_hyphen = last_word_on_line_ && !first_pos;
    if (!may_end_with_hyphen) {
      return false;
    }
    ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
    const auto &normalized = unicharset->normed_ids(unichar_id);
    if (normalized.size() != 1) {
      return false;
    }
    return normalized[0] == hyphen_unichar_id_;
  }

◆ has_hyphen_end() [2/2]

bool tesseract::Dict::has_hyphen_end ( const WERD_CHOICE & word ) const

inline

Same as above, but check the unichar at the end of the word.

Definition at line 164 of file dict.h.

                                                            {
    // An empty word has no last unichar to inspect. Without this guard the
    // index below would be -1 and word.unichar_id() would be called with an
    // out-of-range position.
    if (word.length() == 0) {
      return false;
    }
    // Test the last unichar; it counts as the first position when the word
    // has exactly one unichar.
    const int word_index = word.length() - 1;
    return has_hyphen_end(word.unicharset(), word.unichar_id(word_index), word_index == 0);
  }

◆ hyphen_base_size()

int tesseract::Dict::hyphen_base_size ( ) const

inline

Size of the base word (the part on the line before) of a hyphenated word.

Definition at line 139 of file dict.h.

                                      {
    // With no hyphenated word recorded the base size is zero.
    if (!this->hyphenated()) {
      return 0;
    }
    return hyphen_word_->length();
  }

◆ hyphenated()

bool tesseract::Dict::hyphenated ( ) const

inline

Returns true if we've recorded the beginning of a hyphenated word.

Definition at line 135 of file dict.h.

                                 {
    // True only when last_word_on_line_ is false and a hyphen word is stored.
    if (last_word_on_line_) {
      return false;
    }
    return hyphen_word_ != nullptr;
  }

◆ init_active_dawgs()

void tesseract::Dict::init_active_dawgs	(	DawgPositionVector *	active_dawgs,
		bool	ambigs_mode
	)		const

Fill the given active_dawgs vector with dawgs that could contain the beginning of the word. If hyphenated() returns true, copy the entries from hyphen_active_dawgs_ instead.

Definition at line 610 of file dict.cpp.

                                                                                     {
  // Without a recorded hyphenated word, start from the default dawg set.
  if (!hyphenated()) {
    default_dawgs(active_dawgs, ambigs_mode);
    return;
  }
  // Otherwise resume from the dawg positions saved with the hyphenated word.
  *active_dawgs = hyphen_active_dawgs_;
  if (dawg_debug_level >= 3) {
    for (const auto &saved : hyphen_active_dawgs_) {
      tprintf("Adding hyphen beginning dawg [%d, " REFFORMAT "]\n",
              saved.dawg_index, saved.dawg_ref);
    }
  }
}

◆ INT_VAR_H() [1/6]

tesseract::Dict::INT_VAR_H ( dawg_debug_level )

◆ INT_VAR_H() [2/6]

tesseract::Dict::INT_VAR_H ( hyphen_debug_level )

◆ INT_VAR_H() [3/6]

tesseract::Dict::INT_VAR_H ( max_permuter_attempts )

◆ INT_VAR_H() [4/6]

tesseract::Dict::INT_VAR_H ( stopper_debug_level )

◆ INT_VAR_H() [5/6]

tesseract::Dict::INT_VAR_H ( stopper_smallword_size )

◆ INT_VAR_H() [6/6]

tesseract::Dict::INT_VAR_H ( tessedit_truncate_wordchoice_log )

◆ is_apostrophe()

bool tesseract::Dict::is_apostrophe ( UNICHAR_ID unichar_id )

inline

Definition at line 125 of file dict.h.

                                                   {
    const UNICHARSET &unicharset = getUnicharset();
    ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
    // An apostrophe normalizes to exactly one id, the apostrophe id.
    const auto &normalized = unicharset.normed_ids(unichar_id);
    if (normalized.size() != 1) {
      return false;
    }
    return normalized[0] == apostrophe_unichar_id_;
  }

◆ IsSpaceDelimitedLang()

bool tesseract::Dict::IsSpaceDelimitedLang ( ) const

Returns true if the language is space-delimited (i.e. the unicharset has no Han, Katakana, or Thai script).

Definition at line 912 of file dict.cpp.

                                      {
  // The language is treated as not space-delimited as soon as the unicharset
  // reports a positive script id for Han, Katakana, or Thai.
  const UNICHARSET &charset = getUnicharset();
  const bool has_unspaced_script =
      charset.han_sid() > 0 || charset.katakana_sid() > 0 || charset.thai_sid() > 0;
  return !has_unspaced_script;
}

◆ LengthOfShortestAlphaRun()

int tesseract::Dict::LengthOfShortestAlphaRun ( const WERD_CHOICE & WordChoice ) const

Returns the length of the shortest alpha run in WordChoice.

Definition at line 443 of file stopper.cpp.

                                                                      {
  // best holds the shortest completed alpha run seen so far (INT32_MAX while
  // none has been seen); run is the length of the run currently in progress.
  int best = INT32_MAX;
  int run = 0;
  for (unsigned w = 0; w < WordChoice.length(); ++w) {
    if (WordChoice.unicharset()->get_isalpha(WordChoice.unichar_id(w))) {
      ++run;
      continue;
    }
    // A non-alpha unichar closes the current run, if there is one.
    if (run > 0) {
      if (run < best) {
        best = run;
      }
      run = 0;
    }
  }
  // A run that reaches the end of the word is still open and must be counted.
  if (run > 0 && run < best) {
    return run;
  }
  // No alpha run at all yields 0.
  return best == INT32_MAX ? 0 : best;
}

◆ LetterIsOkay()

int tesseract::Dict::LetterIsOkay	(	void *	void_dawg_args,
		const UNICHARSET &	unicharset,
		UNICHAR_ID	unichar_id,
		bool	word_end
	)		const

inline

Calls letter_is_okay_ member function.

Definition at line 348 of file dict.h.

                                        {
    // Calls through the letter_is_okay_ pointer-to-member on this object,
    // passing all four arguments on unchanged.
    return (this->*letter_is_okay_)(void_dawg_args, unicharset, unichar_id, word_end);
  }

◆ Load()

void tesseract::Dict::Load	(	const std::string &	lang,
		TessdataManager *	data_file
	)

Definition at line 200 of file dict.cpp.

                                                                 {
  // Fetches one squished dawg of the requested type from the dawg cache.
  const auto fetch_dawg = [&](auto dawg_type) {
    return dawg_cache_->GetSquishedDawg(lang, dawg_type, dawg_debug_level, data_file);
  };
  // Appends a dawg to dawgs_, skipping dawgs that could not be loaded.
  const auto add_if_loaded = [this](Dawg *dawg) {
    if (dawg) {
      dawgs_.push_back(dawg);
    }
  };

  // Load dawgs_.
  if (load_punc_dawg) {
    punc_dawg_ = fetch_dawg(TESSDATA_PUNC_DAWG);
    add_if_loaded(punc_dawg_);
  }
  if (load_system_dawg) {
    add_if_loaded(fetch_dawg(TESSDATA_SYSTEM_DAWG));
  }
  if (load_number_dawg) {
    add_if_loaded(fetch_dawg(TESSDATA_NUMBER_DAWG));
  }
  if (load_bigram_dawg) {
    // The bigram_dawg_ is NOT used like the other dawgs! DO NOT add to the
    // dawgs_!!
    bigram_dawg_ = fetch_dawg(TESSDATA_BIGRAM_DAWG);
  }
  if (load_freq_dawg) {
    freq_dawg_ = fetch_dawg(TESSDATA_FREQ_DAWG);
    add_if_loaded(freq_dawg_);
  }
  if (load_unambig_dawg) {
    unambig_dawg_ = fetch_dawg(TESSDATA_UNAMBIG_DAWG);
    add_if_loaded(unambig_dawg_);
  }

  // User word list: an explicit file name wins over prefix + suffix.
  std::string name;
  const bool want_user_words = !user_words_suffix.empty() || !user_words_file.empty();
  if (want_user_words) {
    Trie *user_words =
        new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
    if (user_words_file.empty()) {
      name = getCCUtil()->language_data_path_prefix;
      name += user_words_suffix;
    } else {
      name = user_words_file;
    }
    const bool loaded = user_words->read_and_add_word_list(name.c_str(), getUnicharset(),
                                                           Trie::RRP_REVERSE_IF_HAS_RTL);
    if (loaded) {
      dawgs_.push_back(user_words);
    } else {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete user_words;
    }
  }

  // User pattern list: same file-name resolution as the user word list.
  const bool want_user_patterns = !user_patterns_suffix.empty() || !user_patterns_file.empty();
  if (want_user_patterns) {
    Trie *user_patterns = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
                                   getUnicharset().size(), dawg_debug_level);
    user_patterns->initialize_patterns(&(getUnicharset()));
    if (user_patterns_file.empty()) {
      name = getCCUtil()->language_data_path_prefix;
      name += user_patterns_suffix;
    } else {
      name = user_patterns_file;
    }
    if (user_patterns->read_pattern_list(name.c_str(), getUnicharset())) {
      dawgs_.push_back(user_patterns);
    } else {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete user_patterns;
    }
  }

  // The document dictionary starts empty and is part of the searched dawgs.
  document_words_ =
      new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
  dawgs_.push_back(document_words_);

  // This dawg is temporary and should not be searched by letter_is_ok.
  pending_words_ =
      new Trie(DAWG_TYPE_WORD, lang, NO_PERM, getUnicharset().size(), dawg_debug_level);
}

◆ LoadLSTM()

void tesseract::Dict::LoadLSTM	(	const std::string &	lang,
		TessdataManager *	data_file
	)

Definition at line 291 of file dict.cpp.

                                                                     {
  // Fetches one squished dawg of the requested type from the dawg cache.
  const auto fetch_dawg = [&](auto dawg_type) {
    return dawg_cache_->GetSquishedDawg(lang, dawg_type, dawg_debug_level, data_file);
  };
  // Appends a dawg to dawgs_, skipping dawgs that could not be loaded.
  const auto add_if_loaded = [this](Dawg *dawg) {
    if (dawg) {
      dawgs_.push_back(dawg);
    }
  };

  // Load dawgs_.
  if (load_punc_dawg) {
    punc_dawg_ = fetch_dawg(TESSDATA_LSTM_PUNC_DAWG);
    add_if_loaded(punc_dawg_);
  }
  if (load_system_dawg) {
    add_if_loaded(fetch_dawg(TESSDATA_LSTM_SYSTEM_DAWG));
  }
  if (load_number_dawg) {
    add_if_loaded(fetch_dawg(TESSDATA_LSTM_NUMBER_DAWG));
  }

  // stolen from Dict::Load (but needs params_ from Tesseract
  // langdata/config/api):
  // User word list: an explicit file name wins over prefix + suffix.
  std::string name;
  const bool want_user_words = !user_words_suffix.empty() || !user_words_file.empty();
  if (want_user_words) {
    Trie *user_words =
        new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
    if (user_words_file.empty()) {
      name = getCCUtil()->language_data_path_prefix;
      name += user_words_suffix;
    } else {
      name = user_words_file;
    }
    const bool loaded = user_words->read_and_add_word_list(name.c_str(), getUnicharset(),
                                                           Trie::RRP_REVERSE_IF_HAS_RTL);
    if (loaded) {
      dawgs_.push_back(user_words);
    } else {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete user_words;
    }
  }

  // User pattern list: same file-name resolution as the user word list.
  const bool want_user_patterns = !user_patterns_suffix.empty() || !user_patterns_file.empty();
  if (want_user_patterns) {
    Trie *user_patterns = new Trie(DAWG_TYPE_PATTERN, lang, USER_PATTERN_PERM,
                                   getUnicharset().size(), dawg_debug_level);
    user_patterns->initialize_patterns(&(getUnicharset()));
    if (user_patterns_file.empty()) {
      name = getCCUtil()->language_data_path_prefix;
      name += user_patterns_suffix;
    } else {
      name = user_patterns_file;
    }
    if (user_patterns->read_pattern_list(name.c_str(), getUnicharset())) {
      dawgs_.push_back(user_patterns);
    } else {
      tprintf("Error: failed to load %s\n", name.c_str());
      delete user_patterns;
    }
  }
}

◆ NoDangerousAmbig()

bool tesseract::Dict::NoDangerousAmbig	(	WERD_CHOICE *	BestChoice,
		DANGERR *	fixpt,
		bool	fix_replaceable,
		MATRIX *	ratings
	)

Definition at line 158 of file stopper.cpp.

                                             {
  // Checks best_choice against the ambiguity tables from getUnicharAmbigs().
  // When fix_replaceable is true, a first pass rewrites matching n-grams in
  // best_choice via ReplaceAmbig(); the last pass collects dangerous
  // ambiguities into ambig_blob_choices and asks dawg_permute_and_select()
  // whether an alternative word can be built from them.
  // If fixpt is non-null, a DANGERR_INFO entry is appended for each ambiguity
  // that is found. Returns true when no dangerous ambiguity remains
  // (i.e. !ambigs_found).
  if (stopper_debug_level > 2) {
    tprintf("\nRunning NoDangerousAmbig() for %s\n", best_choice->debug_string().c_str());
  }
 
  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
  // for each unichar id in BestChoice.
  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
  bool ambigs_found = false;
  // For each position in best_choice:
  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
  // -- look for ambiguities corresponding to wrong_ngram in the list while
  //    adding the following unichar_ids from best_choice to wrong_ngram
  //
  // Repeat the above procedure twice: first time look through
  // ambigs to be replaced and replace all the ambiguities found;
  // second time look through dangerous ambiguities and construct
  // ambig_blob_choices with a fake blob choice for each ambiguity
  // and pass them to dawg_permute_and_select() to search for
  // ambiguous words in the dictionaries.
  //
  // Note that during the execution of the for loop (on the first pass)
  // if replacements are made the length of best_choice might change.
  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
    // Only pass 0 of a fix_replaceable run uses the replace table; every
    // other pass searches the dangerous-ambiguity table.
    bool replace = (fix_replaceable && pass == 0);
    const UnicharAmbigsVector &table =
        replace ? getUnicharAmbigs().replace_ambigs() : getUnicharAmbigs().dang_ambigs();
    if (!replace) {
      // Initialize ambig_blob_choices with lists containing a single
      // unichar id for the corresponding position in best_choice.
      // best_choice consisting of only the original letters will
      // have a rating of 0.0.
      for (unsigned i = 0; i < best_choice->length(); ++i) {
        auto *lst = new BLOB_CHOICE_LIST();
        BLOB_CHOICE_IT lst_it(lst);
        // TODO(rays/antonova) Put real xheights and y shifts here.
        lst_it.add_to_end(
            new BLOB_CHOICE(best_choice->unichar_id(i), 0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG));
        ambig_blob_choices.push_back(lst);
      }
    }
    UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
    int wrong_ngram_index;
    // blob_index tracks the first blob of position i; it advances by the
    // number of blobs (state) of each unichar in the loop increment.
    int blob_index = 0;
    for (unsigned i = 0; i < best_choice->length(); blob_index += best_choice->state(i), ++i) {
      auto curr_unichar_id = best_choice->unichar_id(i);
      if (stopper_debug_level > 2) {
        tprintf("Looking for %s ngrams starting with %s:\n", replace ? "replaceable" : "ambiguous",
                getUnicharset().debug_str(curr_unichar_id).c_str());
      }
      int num_wrong_blobs = best_choice->state(i);
      wrong_ngram_index = 0;
      wrong_ngram[wrong_ngram_index] = curr_unichar_id;
      if (curr_unichar_id == INVALID_UNICHAR_ID || static_cast<size_t>(curr_unichar_id) >= table.size() ||
          table[curr_unichar_id] == nullptr) {
        continue; // there is no ambig spec for this unichar id
      }
      AmbigSpec_IT spec_it(table[curr_unichar_id]);
      // The iterator is advanced explicitly inside the body: on compare == -1
      // the candidate n-gram is extended and the same spec is compared again.
      for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
        const AmbigSpec *ambig_spec = spec_it.data();
        // Terminate the candidate n-gram before comparing it.
        wrong_ngram[wrong_ngram_index + 1] = INVALID_UNICHAR_ID;
        int compare = UnicharIdArrayUtils::compare(wrong_ngram, ambig_spec->wrong_ngram);
        if (stopper_debug_level > 2) {
          tprintf("candidate ngram: ");
          UnicharIdArrayUtils::print(wrong_ngram, getUnicharset());
          tprintf("current ngram from spec: ");
          UnicharIdArrayUtils::print(ambig_spec->wrong_ngram, getUnicharset());
          tprintf("comparison result: %d\n", compare);
        }
        if (compare == 0) {
          // Record the place where we found an ambiguity.
          if (fixpt != nullptr) {
            UNICHAR_ID leftmost_id = ambig_spec->correct_fragments[0];
            fixpt->push_back(DANGERR_INFO(blob_index, blob_index + num_wrong_blobs, replace,
                                          getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
                                          leftmost_id));
            // NOTE(review): the debug line prints a literal false for the
            // third field although the recorded entry stores `replace` —
            // confirm whether that is intended.
            if (stopper_debug_level > 1) {
              tprintf("fixpt+=(%d %d %d %d %s)\n", blob_index, blob_index + num_wrong_blobs, false,
                      getUnicharset().get_isngram(ambig_spec->correct_ngram_id),
                      getUnicharset().id_to_unichar(leftmost_id));
            }
          }
 
          if (replace) {
            if (stopper_debug_level > 2) {
              tprintf("replace ambiguity with %s : ",
                      getUnicharset().id_to_unichar(ambig_spec->correct_ngram_id));
              UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset());
            }
            ReplaceAmbig(i, ambig_spec->wrong_ngram_size, ambig_spec->correct_ngram_id, best_choice,
                         ratings);
          } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
            // CASE_AMBIG specs are skipped at the first position (i == 0).
            // We found dang ambig - update ambig_blob_choices.
            if (stopper_debug_level > 2) {
              tprintf("found ambiguity: ");
              UnicharIdArrayUtils::print(ambig_spec->correct_fragments, getUnicharset());
            }
            ambigs_found = true;
            for (int tmp_index = 0; tmp_index <= wrong_ngram_index; ++tmp_index) {
              // Add a blob choice for the corresponding fragment of the
              // ambiguity. These fake blob choices are initialized with
              // negative ratings (which are not possible for real blob
              // choices), so that dawg_permute_and_select() considers any
              // word not consisting of only the original letters a better
              // choice and stops searching for alternatives once such a
              // choice is found.
              BLOB_CHOICE_IT bc_it(ambig_blob_choices[i + tmp_index]);
              bc_it.add_to_end(new BLOB_CHOICE(ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
                                               -1, 0, 1, 0, BCC_AMBIG));
            }
          }
          spec_it.forward();
        } else if (compare == -1) {
          unsigned next_index;
          if (wrong_ngram_index + 1 < ambig_spec->wrong_ngram_size &&
              ((next_index = wrong_ngram_index + 1 + i) < best_choice->length())) {
            // Add the next unichar id to wrong_ngram and keep looking for
            // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
            wrong_ngram[++wrong_ngram_index] = best_choice->unichar_id(next_index);
            num_wrong_blobs += best_choice->state(next_index);
          } else {
            break; // no more matching ambigs in this AMBIG_SPEC_LIST
          }
        } else {
          spec_it.forward();
        }
      } // end searching AmbigSpec_LIST
    }   // end searching best_choice
  }     // end searching replace and dangerous ambigs
 
  // If any ambiguities were found permute the constructed ambig_blob_choices
  // to see if an alternative dictionary word can be found.
  if (ambigs_found) {
    if (stopper_debug_level > 2) {
      tprintf("\nResulting ambig_blob_choices:\n");
      for (unsigned i = 0; i < ambig_blob_choices.size(); ++i) {
        print_ratings_list("", ambig_blob_choices.at(i), getUnicharset());
        tprintf("\n");
      }
    }
    WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
    // Only the fake ambiguity choices carry negative ratings, so a negative
    // total rating means the alternative word uses at least one of them.
    ambigs_found = (alt_word->rating() < 0.0);
    if (ambigs_found) {
      if (stopper_debug_level >= 1) {
        tprintf("Stopper: Possible ambiguous word = %s\n", alt_word->debug_string().c_str());
      }
      if (fixpt != nullptr) {
        // Note: Currently character choices combined from fragments can only
        // be generated by NoDangerousAmbig(). This code should be updated if
        // the capability to produce classifications combined from character
        // fragments is added to other functions.
        int orig_i = 0;
        for (unsigned i = 0; i < alt_word->length(); ++i) {
          const UNICHARSET &uchset = getUnicharset();
          bool replacement_is_ngram = uchset.get_isngram(alt_word->unichar_id(i));
          UNICHAR_ID leftmost_id = alt_word->unichar_id(i);
          if (replacement_is_ngram) {
            // we have to extract the leftmost unichar from the ngram.
            const char *str = uchset.id_to_unichar(leftmost_id);
            int step = uchset.step(str);
            if (step) {
              leftmost_id = uchset.unichar_to_id(str, step);
            }
          }
          int end_i = orig_i + alt_word->state(i);
          if (alt_word->state(i) > 1 || (orig_i + 1 == end_i && replacement_is_ngram)) {
            // Compute proper blob indices.
            int blob_start = 0;
            for (int j = 0; j < orig_i; ++j) {
              blob_start += best_choice->state(j);
            }
            int blob_end = blob_start;
            for (int j = orig_i; j < end_i; ++j) {
              blob_end += best_choice->state(j);
            }
            fixpt->push_back(
                DANGERR_INFO(blob_start, blob_end, true, replacement_is_ngram, leftmost_id));
            // NOTE(review): the debug line prints orig_i/end_i while the
            // recorded entry uses blob_start/blob_end — confirm intent.
            if (stopper_debug_level > 1) {
              tprintf("fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i, true,
                      replacement_is_ngram, uchset.id_to_unichar(leftmost_id));
            }
          }
          orig_i += alt_word->state(i);
        }
      }
    }
    delete alt_word;
  }
  if (output_ambig_words_file_ != nullptr) {
    fprintf(output_ambig_words_file_, "\n");
  }
 
  // Release the lists allocated for ambig_blob_choices.
  for (auto data : ambig_blob_choices) {
    delete data;
  }
  return !ambigs_found;
}

◆ NumDawgs()

int tesseract::Dict::NumDawgs ( ) const

inline

Return the number of dawgs in the dawgs_ vector.

Definition at line 381 of file dict.h.

                              {
    // The count is the current size of the dawgs_ container.
    const int dawg_count = dawgs_.size();
    return dawg_count;
  }

◆ permute_choices()

void tesseract::Dict::permute_choices	(	const char *	debug,
		const BLOB_CHOICE_LIST_VECTOR &	char_choices,
		int	char_choice_index,
		const CHAR_FRAGMENT_INFO *	prev_char_frag_info,
		WERD_CHOICE *	word,
		float	certainties[],
		float *	limit,
		WERD_CHOICE *	best_choice,
		int *	attempts_left,
		void *	more_args
	)

permute_choices

Call append_choices() for each BLOB_CHOICE in BLOB_CHOICE_LIST with the given char_choice_index in char_choices.

Definition at line 187 of file permdawg.cpp.

                                                                                          {
  if (debug) {
    tprintf(
        "%s permute_choices: char_choice_index=%d"
        " limit=%g rating=%g, certainty=%g word=%s\n",
        debug, char_choice_index, *limit, word->rating(), word->certainty(),
        word->debug_string().c_str());
  }
  // Nothing to do once the index is past the last choice list.
  if (static_cast<unsigned>(char_choice_index) >= char_choices.size()) {
    return;
  }
  BLOB_CHOICE_IT choice_it;
  choice_it.set_to_list(char_choices.at(char_choice_index));
  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); choice_it.forward()) {
    // Every candidate consumes one attempt before it is expanded.
    --(*attempts_left);
    append_choices(debug, char_choices, *(choice_it.data()), char_choice_index,
                   prev_char_frag_info, word, certainties, limit, best_choice, attempts_left,
                   more_args);
    if (*attempts_left > 0) {
      continue;
    }
    // Attempt budget exhausted: stop iterating over this list.
    if (debug) {
      tprintf("permute_choices(): attempts_left is 0\n");
    }
    break;
  }
}

◆ ProbabilityInContext()

double tesseract::Dict::ProbabilityInContext	(	const char *	context,
		int	context_bytes,
		const char *	character,
		int	character_bytes
	)

inline

Calls probability_in_context_ member function.

Definition at line 357 of file dict.h.

                                                   {
    // Dispatch through probability_in_context_, passing the language name
    // held by the CCUtil object as the first argument.
    const char *lang_name = getCCUtil()->lang.c_str();
    return (this->*probability_in_context_)(lang_name, context, context_bytes, character,
                                            character_bytes);
  }

◆ ProcessPatternEdges()

void tesseract::Dict::ProcessPatternEdges	(	const Dawg *	dawg,
		const DawgPosition &	info,
		UNICHAR_ID	unichar_id,
		bool	word_end,
		DawgArgs *	dawg_args,
		PermuterType *	current_permuter
	)		const

For each of the character classes of the given unichar_id (and the unichar_id itself), finds the corresponding outgoing node or self-loop in the given dawg and (after checking that it is valid) records it in dawg_args->updated_dawgs. Updates current_permuter if any valid edges were found.

Definition at line 571 of file dict.cpp.

                                                                                                  {
  NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
  // Candidates are the exact unichar_id first, followed by whatever pattern
  // ids the dawg appends for it.
  std::vector<UNICHAR_ID> candidates(1, unichar_id);
  dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &candidates);
  for (int candidate : candidates) {
    // Attempt 0 looks at the outgoing edges of node, attempt 1 at self-loops.
    for (int attempt = 0; attempt < 2; ++attempt) {
      EDGE_REF edge;
      if (attempt == 0) {
        edge = dawg->edge_char_of(node, candidate, word_end);
      } else {
        edge = dawg->pattern_loop_edge(pos.dawg_ref, candidate, word_end);
      }
      if (edge != NO_EDGE) {
        if (dawg_debug_level >= 3) {
          tprintf("Pattern dawg: [%d, " REFFORMAT "] edge=" REFFORMAT "\n", pos.dawg_index, node,
                  edge);
          tprintf("Letter found in pattern dawg %d\n", pos.dawg_index);
        }
        // Only ever raise the reported permuter.
        if (dawg->permuter() > *curr_perm) {
          *curr_perm = dawg->permuter();
        }
        if (dawg->end_of_word(edge)) {
          dawg_args->valid_end = true;
        }
        dawg_args->updated_dawgs->add_unique(
            DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, pos.back_to_punc),
            dawg_debug_level > 0, "Append current dawg to updated active dawgs: ");
      }
    }
  }
}

◆ ReplaceAmbig()

void tesseract::Dict::ReplaceAmbig	(	int	wrong_ngram_begin_index,
		int	wrong_ngram_size,
		UNICHAR_ID	correct_ngram_id,
		WERD_CHOICE *	werd_choice,
		MATRIX *	ratings
	)

Definition at line 370 of file stopper.cpp.

                                                                                                {
  // Replaces the wrong_ngram_size unichars of werd_choice that start at
  // wrong_ngram_begin_index with the single unichar correct_ngram_id, and
  // makes sure the ratings matrix holds a BLOB_CHOICE for the replacement
  // in the cell spanning all replaced blobs.
  // NOTE(review): wrong_ngram_size is presumably >= 1; with 0 the division
  // below and the copy from old_choice (still nullptr) would be invalid —
  // confirm against callers.
  int num_blobs_to_replace = 0;
  int begin_blob_index = 0;
  int i;
  // Rating and certainty for the new BLOB_CHOICE are derived from the
  // replaced choices.
  float new_rating = 0.0f;
  float new_certainty = 0.0f;
  BLOB_CHOICE *old_choice = nullptr;
  // Positions before the n-gram only advance begin_blob_index; positions
  // inside it accumulate rating, certainty and the number of blobs covered.
  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
    if (i >= wrong_ngram_begin_index) {
      int num_blobs = werd_choice->state(i);
      int col = begin_blob_index + num_blobs_to_replace;
      int row = col + num_blobs - 1;
      BLOB_CHOICE_LIST *choices = ratings->get(col, row);
      ASSERT_HOST(choices != nullptr);
      old_choice = FindMatchingChoice(werd_choice->unichar_id(i), choices);
      ASSERT_HOST(old_choice != nullptr);
      new_rating += old_choice->rating();
      new_certainty += old_choice->certainty();
      num_blobs_to_replace += num_blobs;
    } else {
      begin_blob_index += werd_choice->state(i);
    }
  }
  // The rating is the sum over the replaced choices; the certainty is their
  // mean.
  new_certainty /= wrong_ngram_size;
  // If there is no entry in the ratings matrix, add it.
  MATRIX_COORD coord(begin_blob_index, begin_blob_index + num_blobs_to_replace - 1);
  if (!coord.Valid(*ratings)) {
    ratings->IncreaseBandSize(coord.row - coord.col + 1);
  }
  if (ratings->get(coord.col, coord.row) == nullptr) {
    ratings->put(coord.col, coord.row, new BLOB_CHOICE_LIST);
  }
  BLOB_CHOICE_LIST *new_choices = ratings->get(coord.col, coord.row);
  BLOB_CHOICE *choice = FindMatchingChoice(correct_ngram_id, new_choices);
  if (choice != nullptr) {
    // Already there. Upgrade if new rating better.
    if (new_rating < choice->rating()) {
      choice->set_rating(new_rating);
    }
    if (new_certainty < choice->certainty()) {
      choice->set_certainty(new_certainty);
    }
    // DO NOT SORT!! It will mess up the iterator in LanguageModel::UpdateState.
  } else {
    // Need a new choice with the correct_ngram_id.
    // It starts as a copy of the last replaced choice and then has its id,
    // rating, certainty, classifier and matrix cell overwritten.
    choice = new BLOB_CHOICE(*old_choice);
    choice->set_unichar_id(correct_ngram_id);
    choice->set_rating(new_rating);
    choice->set_certainty(new_certainty);
    choice->set_classifier(BCC_AMBIG);
    choice->set_matrix_cell(coord.col, coord.row);
    BLOB_CHOICE_IT it(new_choices);
    it.add_to_end(choice);
  }
  // Remove current unichar from werd_choice. On the last iteration
  // set the correct replacement unichar instead of removing a unichar.
  for (int replaced_count = 0; replaced_count < wrong_ngram_size; ++replaced_count) {
    if (replaced_count + 1 == wrong_ngram_size) {
      werd_choice->set_blob_choice(wrong_ngram_begin_index, num_blobs_to_replace, choice);
    } else {
      werd_choice->remove_unichar_id(wrong_ngram_begin_index + 1);
    }
  }
  if (stopper_debug_level >= 1) {
    werd_choice->print("ReplaceAmbig() ");
    tprintf("Modified blob_choices: ");
    print_ratings_list("\n", new_choices, getUnicharset());
  }
}

◆ reset_hyphen_vars()

void tesseract::Dict::reset_hyphen_vars ( bool last_word_on_line )

Unless the previous word was the last one on the line, and the current one is not (thus it is the first one on the line), erase hyphen_word_, clear hyphen_active_dawgs_, update last_word_on_line_.

Definition at line 27 of file hyphen.cpp.

                                                   {
  // The hyphen state survives only when the previous word ended a line and
  // the current word does not (so it is the first word on a new line).
  const bool first_word_on_new_line =
      (last_word_on_line_ == true) && (last_word_on_line == false);
  if (!first_word_on_new_line && hyphen_word_ != nullptr) {
    delete hyphen_word_;
    hyphen_word_ = nullptr;
    hyphen_active_dawgs_.clear();
  }
  if (hyphen_debug_level) {
    tprintf("reset_hyphen_vars: last_word_on_line %d -> %d\n", last_word_on_line_,
            last_word_on_line);
  }
  // Remember the flag for the next call.
  last_word_on_line_ = last_word_on_line;
}

◆ ResetDocumentDictionary()

void tesseract::Dict::ResetDocumentDictionary ( )

inline

Definition at line 297 of file dict.h.

                                 {
    // Empty both tries, skipping whichever has not been created.
    if (pending_words_) {
      pending_words_->clear();
    }
    if (document_words_) {
      document_words_->clear();
    }
  }

◆ set_hyphen_word()

void tesseract::Dict::set_hyphen_word	(	const WERD_CHOICE &	word,
		const DawgPositionVector &	active_dawgs
	)

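Update hyphen_word_ and copy the given DawgPositionVector into hyphen_active_dawgs_. The update only happens when the given word has a better (lower) rating than the current hyphen_word_.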

Definition at line 44 of file hyphen.cpp.

                                                                                          {
  // Lazily create hyphen_word_ as a "bad" word so any real word can beat it.
  if (!hyphen_word_) {
    hyphen_word_ = new WERD_CHOICE(word.unicharset());
    hyphen_word_->make_bad();
  }
  const bool word_is_better = word.rating() < hyphen_word_->rating();
  if (word_is_better) {
    *hyphen_word_ = word;
    // Remove the last unichar id as it is a hyphen, and remove
    // any unichar_string/lengths that are present.
    hyphen_word_->remove_last_unichar_id();
    hyphen_active_dawgs_ = active_dawgs;
  }
  if (hyphen_debug_level) {
    hyphen_word_->print("set_hyphen_word: ");
  }
}

◆ SettupStopperPass1()

void tesseract::Dict::SettupStopperPass1 ( )

Sets up stopper variables in preparation for the first pass.

Definition at line 362 of file stopper.cpp.

                              {
  // Clear the rejection offset for the first pass.
  reject_offset_ = 0.0;
}

◆ SettupStopperPass2()

void tesseract::Dict::SettupStopperPass2 ( )

Sets up stopper variables in preparation for the second pass.

Definition at line 366 of file stopper.cpp.

                              {
  // For the second pass the rejection offset is taken from the
  // stopper_phase2_certainty_rejection_offset variable.
  reject_offset_ = stopper_phase2_certainty_rejection_offset;
}

◆ SetupForLoad()

void tesseract::Dict::SetupForLoad ( DawgCache * dawg_cache )

Definition at line 180 of file dict.cpp.

                                             {
  // Tear down any previously loaded dawgs before setting up again.
  if (dawgs_.size() > 0) {
    this->End();
  }

  // Cache the ids of the symbols that get special treatment.
  auto &uset = getUnicharset();
  apostrophe_unichar_id_ = uset.unichar_to_id(kApostropheSymbol);
  question_unichar_id_ = uset.unichar_to_id(kQuestionSymbol);
  slash_unichar_id_ = uset.unichar_to_id(kSlashSymbol);
  hyphen_unichar_id_ = uset.unichar_to_id(kHyphenSymbol);

  // Use the caller's cache when one is given; otherwise create one and
  // record that this object created it.
  dawg_cache_is_ours_ = (dawg_cache == nullptr);
  dawg_cache_ = dawg_cache_is_ours_ ? new DawgCache() : dawg_cache;
}

◆ SetWildcardID()

void tesseract::Dict::SetWildcardID ( UNICHAR_ID id )

inline

Definition at line 374 of file dict.h.

                                           {
    // Store the given id as the wildcard unichar id.
    wildcard_unichar_id_ = id;
  }

◆ SetWordsegRatingAdjustFactor()

void tesseract::Dict::SetWordsegRatingAdjustFactor ( float f )

inline

Set wordseg_rating_adjust_factor_ to the given value.

Definition at line 469 of file dict.h.

                                                    {
    // Store the given factor in wordseg_rating_adjust_factor_.
    wordseg_rating_adjust_factor_ = f;
  }

◆ STRING_VAR_H() [1/6]

tesseract::Dict::STRING_VAR_H ( output_ambig_words_file )

◆ STRING_VAR_H() [2/6]

tesseract::Dict::STRING_VAR_H ( user_patterns_file )

◆ STRING_VAR_H() [3/6]

tesseract::Dict::STRING_VAR_H ( user_patterns_suffix )

◆ STRING_VAR_H() [4/6]

tesseract::Dict::STRING_VAR_H ( user_words_file )

Variable members. These have to be declared and initialized after image_ptr_, which contains the pointer to the params vector - the member of its base CCUtil class.

◆ STRING_VAR_H() [5/6]

tesseract::Dict::STRING_VAR_H ( user_words_suffix )

◆ STRING_VAR_H() [6/6]

tesseract::Dict::STRING_VAR_H ( word_to_debug )

◆ UniformCertainties()

int tesseract::Dict::UniformCertainties ( const WERD_CHOICE & word )

Returns true if the certainty of the BestChoice word is within a reasonable range of the average certainties for the best choices for each character in the segmentation. This test is used to catch words in which one character is much worse than the other characters in the word (i.e. false will be returned in that case). The algorithm computes the mean and std deviation of the certainties in the word with the worst certainty thrown out.

Definition at line 464 of file stopper.cpp.

                                                    {
  int num_chars = word.length();

  // Words shorter than three characters are always accepted.
  if (num_chars < 3) {
    return true;
  }

  // Accumulate sum and sum of squares of the per-character certainties and
  // remember the single worst one.
  float worst_certainty = FLT_MAX;
  double sum = 0.0;
  double sum_of_squares = 0.0;
  for (int pos = 0; pos < num_chars; ++pos) {
    const float certainty = word.certainty(pos);
    sum += certainty;
    sum_of_squares += static_cast<double>(certainty) * certainty;
    if (certainty < worst_certainty) {
      worst_certainty = certainty;
    }
  }

  // Subtract off worst certainty from statistics.
  --num_chars;
  sum -= worst_certainty;
  sum_of_squares -= static_cast<double>(worst_certainty) * worst_certainty;

  const float mean = sum / num_chars;
  double variance =
      ((num_chars * sum_of_squares - sum * sum) / (num_chars * (num_chars - 1)));
  // Clamp a negative variance to zero before taking the square root.
  if (variance < 0.0) {
    variance = 0.0;
  }
  const float std_dev = sqrt(variance);

  // The threshold is capped from above by stopper_nondict_certainty_base.
  float threshold = mean - stopper_allowable_character_badness * std_dev;
  if (threshold > stopper_nondict_certainty_base) {
    threshold = stopper_nondict_certainty_base;
  }

  const bool non_uniform = word.certainty() < threshold;
  if (!non_uniform) {
    return true;
  }
  if (stopper_debug_level >= 1) {
    tprintf(
        "Stopper: Non-uniform certainty = %4.1f"
        " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
        word.certainty(), mean, std_dev, threshold);
  }
  return false;
}

◆ update_best_choice()

void tesseract::Dict::update_best_choice	(	const WERD_CHOICE &	word,
		WERD_CHOICE *	best_choice
	)

inline

Copies word into best_choice if its rating is smaller than that of best_choice.

Definition at line 182 of file dict.h.

                                                                                    {
    // Keep the current best_choice unless word has a strictly lower rating.
    const bool word_is_better = word.rating() < best_choice->rating();
    if (!word_is_better) {
      return;
    }
    *best_choice = word;
  }

◆ valid_bigram()

bool tesseract::Dict::valid_bigram	(	const WERD_CHOICE &	word1,
		const WERD_CHOICE &	word2
	)		const

Definition at line 836 of file dict.cpp.

                                                                                {
  // Without a bigram dawg nothing can be validated.
  if (bigram_dawg_ == nullptr) {
    return false;
  }

  // Extract the core word from the middle of each word with any digits
  //         replaced with question marks.
  unsigned w1start, w1end, w2start, w2end;
  word1.punct_stripped(&w1start, &w1end);
  word2.punct_stripped(&w2start, &w2end);

  // We don't want to penalize a single guillemet, hyphen, etc.
  // But our bigram list doesn't have any information about punctuation.
  if (w1start >= w1end) {
    return word1.length() < 3;
  }
  if (w2start >= w2end) {
    return word2.length() < 3;
  }

  const UNICHARSET &uchset = getUnicharset();
  std::vector<UNICHAR_ID> bigram_string;
  bigram_string.reserve(w1end + w2end + 1);

  // Appends the normalized ids of source[first, last) to bigram_string; a
  // unichar that normalizes to a single digit becomes the question mark id.
  const auto append_core = [&](const WERD_CHOICE &source, unsigned first, unsigned last) {
    for (auto pos = first; pos < last; pos++) {
      const auto &normed_ids = getUnicharset().normed_ids(source.unichar_id(pos));
      const bool single_digit = normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]);
      if (single_digit) {
        bigram_string.push_back(question_unichar_id_);
      } else {
        bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
      }
    }
  };

  append_core(word1, w1start, w1end);
  bigram_string.push_back(UNICHAR_SPACE);
  append_core(word2, w2start, w2end);

  // Build the word that is looked up in the bigram dawg.
  WERD_CHOICE normalized_word(&uchset, bigram_string.size());
  for (int id : bigram_string) {
    normalized_word.append_unichar_id_space_allocated(id, 1, 0.0f, 0.0f);
  }
  return bigram_dawg_->word_in_dawg(normalized_word);
}

◆ valid_punctuation()

bool tesseract::Dict::valid_punctuation ( const WERD_CHOICE & word )

Returns true if the word contains a valid punctuation pattern. Note: Since the domains of punctuation symbols and symbols used in numbers are not disjoint, a valid number might contain an invalid punctuation pattern (e.g. .99).

Definition at line 883 of file dict.cpp.

                                                    {
  if (word.empty()) {
    return NO_PERM;
  }
  WERD_CHOICE new_word(word.unicharset());
  auto last_index = word.length() - 1;
  int new_len = 0;
  for (unsigned i = 0; i <= last_index; ++i) {
    UNICHAR_ID unichar_id = (word.unichar_id(i));
    if (getUnicharset().get_ispunctuation(unichar_id)) {
      new_word.append_unichar_id(unichar_id, 1, 0.0, 0.0);
    } else if (!getUnicharset().get_isalpha(unichar_id) &&
               !getUnicharset().get_isdigit(unichar_id)) {
      return false; // neither punc, nor alpha, nor digit
    } else if ((new_len = new_word.length()) == 0 ||
               new_word.unichar_id(new_len - 1) != Dawg::kPatternUnicharID) {
      new_word.append_unichar_id(Dawg::kPatternUnicharID, 1, 0.0, 0.0);
    }
  }
  for (unsigned i = 0; i < dawgs_.size(); ++i) {
    if (dawgs_[i] != nullptr && dawgs_[i]->type() == DAWG_TYPE_PUNCTUATION &&
        dawgs_[i]->word_in_dawg(new_word)) {
      return true;
    }
  }
  return false;
}

◆ valid_word() [1/3]

int tesseract::Dict::valid_word ( const char * string ) const

inline

This function is used by api/tesseract_cube_combiner.cpp.

Definition at line 450 of file dict.h.

                                           {
    // Wrap the C string in a WERD_CHOICE built over this dictionary's
    // unicharset, then defer to the WERD_CHOICE overload.
    const WERD_CHOICE converted(string, getUnicharset());
    return valid_word(converted);
  }

◆ valid_word() [2/3]

int tesseract::Dict::valid_word ( const WERD_CHOICE & word ) const

inline

Definition at line 443 of file dict.h.

                                                {
    // Numbers are not accepted here: return NO_PERM for words with digits.
    const bool numbers_ok = false;
    return valid_word(word, numbers_ok);
  }

◆ valid_word() [3/3]

int tesseract::Dict::valid_word	(	const WERD_CHOICE &	word,
		bool	numbers_ok
	)		const

Definition at line 801 of file dict.cpp.

                                                                     {
  // Checks the word letter by letter via letter_is_okay_ and returns the
  // resulting permuter if valid_word_permuter() accepts it (taking
  // numbers_ok into account), otherwise NO_PERM.
  const WERD_CHOICE *word_ptr = &word;
  WERD_CHOICE temp_word(word.unicharset());
  // If the beginning of a hyphenated word has been recorded and it uses the
  // same unicharset, check the recorded part followed by this word instead
  // of this word alone.
  // NOTE(review): presumably copy_hyphen_info() fills temp_word with the
  // recorded base word - confirm against hyphen.cpp.
  if (hyphenated() && hyphen_word_->unicharset() == word.unicharset()) {
    copy_hyphen_info(&temp_word);
    temp_word += word;
    word_ptr = &temp_word;
  }
  if (word_ptr->empty()) {
    return NO_PERM;
  }
  // Allocate vectors for holding current and updated
  // active_dawgs and initialize them.
  DawgPositionVector active_dawgs[2];
  init_active_dawgs(&(active_dawgs[0]), false);
  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
  int last_index = word_ptr->length() - 1;
  // Call letter_is_okay for each letter in the word.
  // The scan starts at hyphen_base_size(), i.e. past the base part of a
  // hyphenated word; the last argument tells the callee whether this is the
  // final letter.
  for (int i = hyphen_base_size(); i <= last_index; ++i) {
    if (!((this->*letter_is_okay_)(&dawg_args, *word_ptr->unicharset(), word_ptr->unichar_id(i),
                                   i == last_index))) {
      // Stop at the first rejected letter; dawg_args.permuter keeps whatever
      // value it holds at this point.
      break;
    }
    // Swap active_dawgs, constraints with the corresponding updated vector.
    // The two pointers ping-pong between active_dawgs[0] and active_dawgs[1]:
    // whichever slot was just written as "updated" becomes "active" for the
    // next letter.
    if (dawg_args.updated_dawgs == &(active_dawgs[1])) {
      dawg_args.updated_dawgs = &(active_dawgs[0]);
      ++(dawg_args.active_dawgs);
    } else {
      ++(dawg_args.updated_dawgs);
      dawg_args.active_dawgs = &(active_dawgs[0]);
    }
  }
  // Only report permuters that valid_word_permuter() accepts; anything else
  // is mapped to NO_PERM.
  return valid_word_permuter(dawg_args.permuter, numbers_ok) ? dawg_args.permuter : NO_PERM;
}

◆ valid_word_or_number()

int tesseract::Dict::valid_word_or_number ( const WERD_CHOICE & word ) const

inline

Definition at line 446 of file dict.h.

                                                          {
    // Numbers are accepted here: return NUMBER_PERM for valid numbers.
    const bool numbers_ok = true;
    return valid_word(word, numbers_ok);
  }

◆ valid_word_permuter()

static bool tesseract::Dict::valid_word_permuter	(	uint8_t	perm,
		bool	numbers_ok
	)

inlinestatic

Check all the DAWGs to see if this word is in any of them.

Read/Write/Access special purpose dawgs which contain words only of a certain length (used for phrase search for non-space-delimited languages).

Definition at line 437 of file dict.h.

                                                                        {
    // Returns true when perm is one of the dawg-backed or compound permuters
    // listed below; NUMBER_PERM is accepted only when numbers_ok is set.
    switch (perm) {
      case SYSTEM_DAWG_PERM:
      case FREQ_DAWG_PERM:
      case DOC_DAWG_PERM:
      case USER_DAWG_PERM:
      case USER_PATTERN_PERM:
      case COMPOUND_PERM:
        return true;
      case NUMBER_PERM:
        return numbers_ok;
      default:
        return false;
    }
  }

◆ WildcardID()

UNICHAR_ID tesseract::Dict::WildcardID ( ) const

inline

Definition at line 377 of file dict.h.

                                       {
    // Accessor for the wildcard unichar id stored in this Dict.
    const UNICHAR_ID wildcard_id = wildcard_unichar_id_;
    return wildcard_id;
  }

Member Data Documentation

◆ go_deeper_fxn_

void(Dict::* tesseract::Dict::go_deeper_fxn_) (const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)

Pointer to go_deeper function.

Definition at line 210 of file dict.h.

◆ letter_is_okay_

int(Dict::* tesseract::Dict::letter_is_okay_) (void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const

Definition at line 345 of file dict.h.

◆ probability_in_context_

double(Dict::* tesseract::Dict::probability_in_context_) (const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)

Probability in context function used by the ngram permuter.

Definition at line 354 of file dict.h.

The documentation for this class was generated from the following files:

/media/home/debian/src/github/tesseract-ocr/tesseract/src/dict/dict.h
/media/home/debian/src/github/tesseract-ocr/tesseract/src/dict/context.cpp
/media/home/debian/src/github/tesseract-ocr/tesseract/src/dict/dict.cpp
/media/home/debian/src/github/tesseract-ocr/tesseract/src/dict/hyphen.cpp
/media/home/debian/src/github/tesseract-ocr/tesseract/src/dict/permdawg.cpp
/media/home/debian/src/github/tesseract-ocr/tesseract/src/dict/stopper.cpp

Public Member Functions

Static Public Member Functions

Public Attributes

Detailed Description

Constructor & Destructor Documentation

◆ Dict()

◆ ~Dict()

Member Function Documentation

◆ absolute_garbage()

◆ AcceptableChoice()

◆ AcceptableResult()

◆ add_document_word()

◆ adjust_word()

◆ append_choices()

◆ BOOL_VAR_H() [1/10]

◆ BOOL_VAR_H() [2/10]

◆ BOOL_VAR_H() [3/10]

◆ BOOL_VAR_H() [4/10]

◆ BOOL_VAR_H() [5/10]

◆ BOOL_VAR_H() [6/10]

◆ BOOL_VAR_H() [7/10]

◆ BOOL_VAR_H() [8/10]

◆ BOOL_VAR_H() [9/10]

◆ BOOL_VAR_H() [10/10]

◆ case_ok()

◆ char_for_dawg()

◆ compound_marker()

◆ copy_hyphen_info()

◆ dawg_permute_and_select()

◆ DebugWordChoices()

◆ def_letter_is_okay()

◆ def_probability_in_context()

◆ default_dawgs()

◆ double_VAR_H() [1/14]

◆ double_VAR_H() [2/14]

◆ double_VAR_H() [3/14]

◆ double_VAR_H() [4/14]

◆ double_VAR_H() [5/14]

◆ double_VAR_H() [6/14]

◆ double_VAR_H() [7/14]

◆ double_VAR_H() [8/14]

◆ double_VAR_H() [9/14]

◆ double_VAR_H() [10/14]

◆ double_VAR_H() [11/14]

◆ double_VAR_H() [12/14]

◆ double_VAR_H() [13/14]

◆ double_VAR_H() [14/14]

◆ End()

◆ EndDangerousAmbigs()

◆ FinishLoad()

◆ fragment_state_okay()

◆ getCCUtil() [1/2]

◆ getCCUtil() [2/2]

◆ GetDawg()

◆ GetPuncDawg()

◆ GetStartingNode()

◆ GetUnambigDawg()

◆ getUnicharAmbigs()

◆ getUnicharset() [1/2]

◆ getUnicharset() [2/2]

◆ GlobalDawgCache()

◆ go_deeper_dawg_fxn()

◆ good_choice()

◆ has_hyphen_end() [1/2]

◆ has_hyphen_end() [2/2]

◆ hyphen_base_size()

◆ hyphenated()

◆ init_active_dawgs()

◆ INT_VAR_H() [1/6]

◆ INT_VAR_H() [2/6]

◆ INT_VAR_H() [3/6]

◆ INT_VAR_H() [4/6]

◆ INT_VAR_H() [5/6]

◆ INT_VAR_H() [6/6]

◆ is_apostrophe()

◆ IsSpaceDelimitedLang()

◆ LengthOfShortestAlphaRun()

◆ LetterIsOkay()

◆ Load()

◆ LoadLSTM()