24#ifndef DISABLED_LEGACY_ENGINE
44 float CertaintyThreshold = stopper_nondict_certainty_base;
47 if (stopper_no_acceptable_choices) {
51 if (best_choice.
empty()) {
57 bool is_case_ok =
case_ok(best_choice);
59 if (stopper_debug_level >= 1) {
60 const char *xht =
"UNKNOWN";
61 switch (xheight_consistency) {
74 tprintf(
"\nStopper: %s (word=%c, case=%c, xht_ok=%s=[%g,%g])\n",
79 if (reject_offset_ <= 0.0f && !is_valid_word) {
82 if (is_valid_word && is_case_ok) {
84 WordSize -= stopper_smallword_size;
88 CertaintyThreshold += WordSize * stopper_certainty_per_char;
91 if (stopper_debug_level >= 1) {
92 tprintf(
"Stopper: Rating = %4.1f, Certainty = %4.1f, Threshold = %4.1f\n",
96 if (no_dang_ambigs && best_choice.
certainty() > CertaintyThreshold &&
100 if (stopper_debug_level >= 1) {
102 "AcceptableChoice() returned false"
103 " (no_dang_ambig:%d cert:%.4g thresh:%g uniform:%d)\n",
104 no_dang_ambigs, best_choice.
certainty(), CertaintyThreshold,
115 float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
118 if (stopper_debug_level >= 1) {
119 tprintf(
"\nRejecter: %s (word=%c, case=%c, unambig=%c, multiple=%c)\n",
131 WordSize -= stopper_smallword_size;
135 CertaintyThreshold += WordSize * stopper_certainty_per_char;
138 if (stopper_debug_level >= 1) {
144 if (stopper_debug_level >= 1) {
149 if (stopper_debug_level >= 1) {
156#if !defined(DISABLED_LEGACY_ENGINE)
160 if (stopper_debug_level > 2) {
167 bool ambigs_found =
false;
183 for (
int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
184 bool replace = (fix_replaceable && pass == 0);
192 for (
unsigned i = 0;
i < best_choice->
length(); ++
i) {
193 auto *lst =
new BLOB_CHOICE_LIST();
194 BLOB_CHOICE_IT lst_it(lst);
198 ambig_blob_choices.push_back(lst);
202 int wrong_ngram_index;
204 for (
unsigned i = 0;
i < best_choice->
length(); blob_index += best_choice->
state(
i), ++
i) {
206 if (stopper_debug_level > 2) {
207 tprintf(
"Looking for %s ngrams starting with %s:\n", replace ?
"replaceable" :
"ambiguous",
210 int num_wrong_blobs = best_choice->
state(
i);
211 wrong_ngram_index = 0;
212 wrong_ngram[wrong_ngram_index] = curr_unichar_id;
213 if (curr_unichar_id == INVALID_UNICHAR_ID ||
static_cast<size_t>(curr_unichar_id) >= table.size() ||
214 table[curr_unichar_id] ==
nullptr) {
217 AmbigSpec_IT spec_it(table[curr_unichar_id]);
218 for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
219 const AmbigSpec *ambig_spec = spec_it.data();
220 wrong_ngram[wrong_ngram_index + 1] = INVALID_UNICHAR_ID;
222 if (stopper_debug_level > 2) {
225 tprintf(
"current ngram from spec: ");
227 tprintf(
"comparison result: %d\n", compare);
231 if (fixpt !=
nullptr) {
233 fixpt->push_back(
DANGERR_INFO(blob_index, blob_index + num_wrong_blobs, replace,
236 if (stopper_debug_level > 1) {
237 tprintf(
"fixpt+=(%d %d %d %d %s)\n", blob_index, blob_index + num_wrong_blobs,
false,
244 if (stopper_debug_level > 2) {
245 tprintf(
"replace ambiguity with %s : ",
253 if (stopper_debug_level > 2) {
258 for (
int tmp_index = 0; tmp_index <= wrong_ngram_index; ++tmp_index) {
266 BLOB_CHOICE_IT bc_it(ambig_blob_choices[
i + tmp_index]);
272 }
else if (compare == -1) {
275 ((next_index = wrong_ngram_index + 1 +
i) < best_choice->
length())) {
278 wrong_ngram[++wrong_ngram_index] = best_choice->
unichar_id(next_index);
279 num_wrong_blobs += best_choice->
state(next_index);
293 if (stopper_debug_level > 2) {
294 tprintf(
"\nResulting ambig_blob_choices:\n");
295 for (
unsigned i = 0;
i < ambig_blob_choices.size(); ++
i) {
301 ambigs_found = (alt_word->
rating() < 0.0);
303 if (stopper_debug_level >= 1) {
306 if (fixpt !=
nullptr) {
312 for (
unsigned i = 0;
i < alt_word->
length(); ++
i) {
316 if (replacement_is_ngram) {
319 int step = uchset.
step(str);
324 int end_i = orig_i + alt_word->
state(
i);
325 if (alt_word->
state(
i) > 1 || (orig_i + 1 == end_i && replacement_is_ngram)) {
328 for (
int j = 0; j < orig_i; ++j) {
329 blob_start += best_choice->
state(j);
331 int blob_end = blob_start;
332 for (
int j = orig_i; j < end_i; ++j) {
333 blob_end += best_choice->
state(j);
336 DANGERR_INFO(blob_start, blob_end,
true, replacement_is_ngram, leftmost_id));
337 if (stopper_debug_level > 1) {
338 tprintf(
"fixpt->dangerous+=(%d %d %d %d %s)\n", orig_i, end_i,
true,
342 orig_i += alt_word->
state(
i);
348 if (output_ambig_words_file_ !=
nullptr) {
349 fprintf(output_ambig_words_file_,
"\n");
352 for (
auto data : ambig_blob_choices) {
355 return !ambigs_found;
363 reject_offset_ = 0.0;
367 reject_offset_ = stopper_phase2_certainty_rejection_offset;
372 int num_blobs_to_replace = 0;
373 int begin_blob_index = 0;
377 float new_rating = 0.0f;
378 float new_certainty = 0.0f;
380 for (
i = 0;
i < wrong_ngram_begin_index + wrong_ngram_size; ++
i) {
381 if (
i >= wrong_ngram_begin_index) {
382 int num_blobs = werd_choice->
state(
i);
383 int col = begin_blob_index + num_blobs_to_replace;
384 int row = col + num_blobs - 1;
385 BLOB_CHOICE_LIST *choices = ratings->
get(col, row);
389 new_rating += old_choice->
rating();
390 new_certainty += old_choice->
certainty();
391 num_blobs_to_replace += num_blobs;
393 begin_blob_index += werd_choice->
state(
i);
396 new_certainty /= wrong_ngram_size;
398 MATRIX_COORD coord(begin_blob_index, begin_blob_index + num_blobs_to_replace - 1);
399 if (!coord.
Valid(*ratings)) {
402 if (ratings->
get(coord.
col, coord.
row) ==
nullptr) {
403 ratings->
put(coord.
col, coord.
row,
new BLOB_CHOICE_LIST);
405 BLOB_CHOICE_LIST *new_choices = ratings->
get(coord.
col, coord.
row);
407 if (choice !=
nullptr) {
409 if (new_rating < choice->rating()) {
412 if (new_certainty < choice->certainty()) {
424 BLOB_CHOICE_IT it(new_choices);
425 it.add_to_end(choice);
429 for (
int replaced_count = 0; replaced_count < wrong_ngram_size; ++replaced_count) {
430 if (replaced_count + 1 == wrong_ngram_size) {
431 werd_choice->
set_blob_choice(wrong_ngram_begin_index, num_blobs_to_replace, choice);
436 if (stopper_debug_level >= 1) {
437 werd_choice->
print(
"ReplaceAmbig() ");
438 tprintf(
"Modified blob_choices: ");
444 int shortest = INT32_MAX;
446 for (
unsigned w = 0; w < WordChoice.
length(); ++w) {
449 }
else if (curr_len > 0) {
450 if (curr_len < shortest) {
456 if (curr_len > 0 && curr_len < shortest) {
458 }
else if (shortest == INT32_MAX) {
466 float WorstCertainty = FLT_MAX;
467 float CertaintyThreshold;
468 double TotalCertainty;
469 double TotalCertaintySquared;
472 int word_length = word.
length();
474 if (word_length < 3) {
478 TotalCertainty = TotalCertaintySquared = 0.0;
479 for (
int i = 0;
i < word_length; ++
i) {
481 TotalCertainty += Certainty;
482 TotalCertaintySquared +=
static_cast<double>(Certainty) * Certainty;
483 if (Certainty < WorstCertainty) {
484 WorstCertainty = Certainty;
490 TotalCertainty -= WorstCertainty;
491 TotalCertaintySquared -=
static_cast<double>(WorstCertainty) * WorstCertainty;
493 Mean = TotalCertainty / word_length;
494 Variance = ((word_length * TotalCertaintySquared - TotalCertainty * TotalCertainty) /
495 (word_length * (word_length - 1)));
496 if (Variance < 0.0) {
499 StdDev = sqrt(Variance);
501 CertaintyThreshold =
Mean - stopper_allowable_character_badness * StdDev;
502 if (CertaintyThreshold > stopper_nondict_certainty_base) {
503 CertaintyThreshold = stopper_nondict_certainty_base;
506 if (word.
certainty() < CertaintyThreshold) {
507 if (stopper_debug_level >= 1) {
509 "Stopper: Non-uniform certainty = %4.1f"
510 " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
std::vector< AmbigSpec_LIST * > UnicharAmbigsVector
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET ¤t_unicharset)
void tprintf(const char *format,...)
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
std::vector< DANGERR_INFO > DANGERR
std::vector< BLOB_CHOICE_LIST * > BLOB_CHOICE_LIST_VECTOR
void put(ICOORD pos, const T &thing)
void IncreaseBandSize(int bandwidth)
bool Valid(const MATRIX &m) const
WERD_CHOICE * best_choice
WERD_CHOICE_LIST best_choices
void set_certainty(float newrat)
void set_unichar_id(UNICHAR_ID newunichar_id)
void set_classifier(BlobChoiceClassifier classifier)
void set_matrix_cell(int col, int row)
void set_rating(float newrat)
float max_x_height() const
std::string debug_string() const
void remove_unichar_id(unsigned index)
void set_blob_choice(unsigned index, int blob_count, const BLOB_CHOICE *blob_choice)
UNICHAR_ID unichar_id(unsigned index) const
bool dangerous_ambig_found() const
unsigned state(unsigned index) const
const UNICHARSET * unicharset() const
float min_x_height() const
std::string & unichar_string()
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
UNICHAR_ID correct_ngram_id
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE+1]
const UnicharAmbigsVector & replace_ambigs() const
const UnicharAmbigsVector & dang_ambigs() const
bool get_isalpha(UNICHAR_ID unichar_id) const
const char * id_to_unichar(UNICHAR_ID id) const
bool get_isngram(UNICHAR_ID unichar_id) const
int step(const char *str) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
bool AcceptableResult(WERD_RES *word) const
int UniformCertainties(const WERD_CHOICE &word)
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const
Returns the length of the shortest alpha run in WordChoice.
void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice, MATRIX *ratings)
static bool valid_word_permuter(uint8_t perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
const UnicharAmbigs & getUnicharAmbigs() const
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
void EndDangerousAmbigs()
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
const UNICHARSET & getUnicharset() const