21# include "config_auto.h"
26#ifdef DISABLED_LEGACY_ENGINE
33 const WERD_CHOICE &word = *werd_res->best_choice;
34 int dict_word_type = werd_res->tesseract->dict_word(word);
69 if (word->
done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
71 if (tessedit_rejection_debug) {
72 tprintf(
"one_ell_conflict detected\n");
78 if (tessedit_rejection_debug) {
79 tprintf(
"non-dict or ambig word detected\n");
83 if (tessedit_rejection_debug) {
105 if (tessedit_reject_mode == 0) {
109 }
else if (tessedit_reject_mode == 5) {
129 if (rej_use_tess_blanks &&
135 if (rej_use_good_perm) {
139 (!rej_use_sensible_wd ||
144 if (rej_alphas_in_number_perm) {
145 for (
int i = 0, offset = 0; best_choice->
unichar_string()[offset] !=
'\0';
162 tprintf(
"BAD tessedit_reject_mode\n");
163 ASSERT_HOST(
"Fatal error encountered!" ==
nullptr);
166 if (tessedit_image_border > -1) {
171 if (tessedit_rejection_debug) {
229 float bestgap = 0.0f;
232 auto blob_count = word->
length();
233 std::vector<float> ratings;
234 ratings.reserve(blob_count);
235 for (
unsigned i = 0;
i < blob_count; ++
i) {
238 std::sort(ratings.begin(), ratings.end());
239 gapstart = ratings[0] - 1;
240 if (blob_count >= 3) {
241 for (
unsigned index = 0; index < blob_count - 1; index++) {
242 if (ratings[index + 1] - ratings[index] > bestgap) {
243 bestgap = ratings[index + 1] - ratings[index];
245 gapstart = ratings[index];
249 threshold = gapstart + bestgap / 2;
265 if (word_box.
left() < tessedit_image_border || word_box.
bottom() < tessedit_image_border ||
269 for (
int blobindex = 0; blobindex < blobcount; blobindex++) {
271 if (blob_box.
left() < tessedit_image_border || blob_box.
bottom() < tessedit_image_border ||
274 word->
reject_map[blobindex].setrej_edge_char();
291 int16_t first_alphanum_index_;
292 int16_t first_alphanum_offset_;
295 bool non_conflict_set_char;
296 bool conflict =
false;
305 word_len = strlen(lengths);
310 if (strpbrk(word, conflict_set_I_l_1.c_str()) ==
nullptr) {
319 for (
i = 0, offset = 0, non_conflict_set_char =
false; (
i < word_len) && !non_conflict_set_char;
320 offset += lengths[
i++]) {
323 !conflict_set_I_l_1.contains(word[offset]);
325 if (!non_conflict_set_char) {
343 dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type !=
DOC_DAWG_PERM));
345 if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) ||
346 (dict_perm_type && dict_word_ok)) {
349 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] ==
'I') {
354 word_res->
reject_map[first_alphanum_index_].setrej_1Il_conflict();
363 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] ==
'l') {
368 word_res->
reject_map[first_alphanum_index_].setrej_1Il_conflict();
392 if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] ==
'l') {
399 }
else if (lengths[first_alphanum_index_] == 1 && word[first_alphanum_offset_] ==
'I') {
419 for (
i = 0, offset = 0; word[offset] !=
'\0';
421 if ((!allow_1s || (word[offset] !=
'1')) &&
422 conflict_set_I_l_1.contains(word[offset])) {
439 if (conflict_set_I_l_1.contains(word[first_alphanum_offset_])) {
441 word_res->
reject_map[first_alphanum_index_].setrej_1Il_conflict();
461 for (
i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[
i++]) {
474 for (
i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[
i++]) {
488 for (
i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[
i++]) {
500 for (
i = 0, offset = 0; word[offset] !=
'\0'; offset += word_lengths[
i++]) {
502 (word_lengths[
i] != 1 || word[offset] !=
'1')) {
517 bool accepted_1Il =
false;
521 if (conflict_set_I_l_1.contains(s[offset])) {
536 if (conflict_set_I_l_1.contains(s[offset]) && word->
reject_map[
i].accepted()) {
560 rej_whole_of_mostly_reject_word_fract) {
581 int16_t char_quality;
582 int16_t accepted_char_quality;
586 (char_quality == accepted_char_quality)) {
604 int prev_right = -9999;
609 if (tessedit_lower_flip_hyphen <= 1) {
615 for (
unsigned i = 0;
i < best_choice->
length() &&
i < num_blobs; ++
i) {
618 if (
i + 1 == num_blobs) {
625 (out_box.
right() < next_left)) {
626 aspect_ratio = out_box.
width() /
static_cast<float>(out_box.
height());
628 if (aspect_ratio >= tessedit_upper_flip_hyphen &&
637 if ((aspect_ratio > tessedit_lower_flip_hyphen) && word_res->
reject_map[
i].accepted()) {
641 }
else if (best_choice->
unichar_id(
i) == unichar_dash) {
642 if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word_res->
reject_map[
i].rejected())) {
647 if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word_res->
reject_map[
i].accepted())) {
653 prev_right = out_box.
right();
664 if (!tessedit_flip_0O) {
669 for (
unsigned i = 0;
i < best_choice->
length() &&
i < num_blobs; ++
i) {
686 for (
unsigned i = 1;
i < best_choice->
length(); ++
i) {
689 if ((
i + 1) < best_choice->
length() &&
696 (
i + 1) < best_choice->
length() &&
699 (
i + 2) < best_choice->
length() &&
707 (((
i + 1) < best_choice->
length() &&
711 (
i == best_choice->
length() - 1))) {
716 (
i + 1) < best_choice->
length() &&
722 (
i + 2) < best_choice->
length() &&
734 (
i + 2) < best_choice->
length() &&
744 (
i + 1) < best_choice->
length() &&
757 while (i < best_choice->length() && (best_choice->
unichar_id(
i) == unichar_O ||
769 return ch_set.
get_isupper(unichar_id) && !ch_set.
eq(unichar_id,
"O");
773 return ch_set.
get_isdigit(unichar_id) && !ch_set.
eq(unichar_id,
"0");
@ AC_INITIAL_CAP
ALL but initial lc.
@ AC_UNACCEPTABLE
Unacceptable word.
@ AC_UPPER_CASE
ALL upper case.
@ AC_LOWER_CASE
ALL lower case.
float compute_reject_threshold(WERD_CHOICE *word)
void tprintf(const char *format,...)
void reject_poor_matches(WERD_RES *word)
const int kBlnBaselineOffset
void reject_blanks(WERD_RES *word)
int16_t first_alphanum_index(const char *word, const char *word_lengths)
void reject_edge_blobs(WERD_RES *word)
void word_char_quality(WERD_RES *word, int16_t *match_count, int16_t *accepted_match_count)
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
int16_t alpha_count(const char *word, const char *word_lengths)
void dont_allow_1Il(WERD_RES *word)
int16_t count_alphanums(const WERD_CHOICE &word)
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
int16_t safe_dict_word(const WERD_RES *werd_res)
void set_done(WERD_RES *word, int16_t pass)
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
bool check_debug_pt(WERD_RES *word, int location)
void flip_hyphens(WERD_RES *word)
void reject_I_1_L(WERD_RES *word)
void reject_mostly_rejects(WERD_RES *word)
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
bool repeated_nonalphanum_wd(WERD_RES *word, ROW *row)
void flip_0O(WERD_RES *word)
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
TBOX bounding_box() const
std::vector< TBLOB * > blobs
unsigned NumBlobs() const
const TBOX & BlobBox(unsigned index) const
tesseract::Tesseract * tesseract
WERD_CHOICE * best_choice
const UNICHARSET * uch_set
tesseract::BoxWord * box_word
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
UNICHAR_ID unichar_id(unsigned index) const
bool dangerous_ambig_found() const
const std::string & unichar_lengths() const
std::string & unichar_string()
TDimension height() const
TDimension bottom() const
void rej_word_not_tess_accepted()
int16_t reject_count() const
void rej_word_contains_blanks()
void rej_word_small_xht()
void initialise(uint16_t length)
void rej_word_bad_permuter()
void rej_word_mostly_rej()
TBOX bounding_box() const
bool get_isalpha(UNICHAR_ID unichar_id) const
bool contains_unichar_id(UNICHAR_ID unichar_id) const
bool get_isupper(UNICHAR_ID unichar_id) const
bool get_isdigit(UNICHAR_ID unichar_id) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
bool get_enabled(UNICHAR_ID unichar_id) const
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
int dict_word(const WERD_CHOICE &word)