tesseract  4.0.0-beta.1-59-g2cc4
reject.cpp File Reference
#include "tessvars.h"
#include "scanutils.h"
#include <ctype.h>
#include <string.h>
#include "genericvector.h"
#include "reject.h"
#include "control.h"
#include "docqual.h"
#include "globaloc.h"
#include "globals.h"
#include "helpers.h"
#include "tesseractclass.h"

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract
 
void reject_blanks (WERD_RES *word)
 
void reject_poor_matches (WERD_RES *word)
 
float compute_reject_threshold (WERD_CHOICE *word)
 

Function Documentation

◆ CLISTIZEH()

CLISTIZEH ( STRING  )

Definition at line 48 of file reject.cpp.

56  {
57 void Tesseract::set_done(WERD_RES *word, int16_t pass) {
58  word->done = word->tess_accepted &&
59  (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
60  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
61  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
62  word->best_choice->permuter() == FREQ_DAWG_PERM ||
64  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
65  one_ell_conflict(word, FALSE)) {
66  if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
67  word->done = FALSE;
68  }
69  if (word->done && ((!word_from_dict &&
70  word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
71  if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
72  word->done = FALSE;
73  }
74  if (tessedit_rejection_debug) {
75  tprintf("set_done(): done=%d\n", word->done);
76  word->best_choice->print("");
77  }
78 }
79 
80 
81 /*************************************************************************
82  * make_reject_map()
83  *
84  * Sets the done flag to indicate whether the resylt is acceptable.
85  *
86  * Sets a reject map for the word.
87  *************************************************************************/
88 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
89  int i;
90  int offset;
91 
92  flip_0O(word);
93  check_debug_pt(word, -1); // For trap only
94  set_done(word, pass); // Set acceptance
96  reject_blanks(word);
97  /*
98  0: Rays original heuristic - the baseline
99  */
100  if (tessedit_reject_mode == 0) {
101  if (!word->done)
102  reject_poor_matches(word);
103  } else if (tessedit_reject_mode == 5) {
104  /*
105  5: Reject I/1/l from words where there is no strong contextual confirmation;
106  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
107  and the whole of any words which are very small
108  */
109  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
111  } else {
112  one_ell_conflict(word, TRUE);
113  /*
114  Originally the code here just used the done flag. Now I have duplicated
115  and unpacked the conditions for setting the done flag so that each
116  mechanism can be turned on or off independently. This works WITHOUT
117  affecting the done flag setting.
118  */
119  if (rej_use_tess_accepted && !word->tess_accepted)
121 
122  if (rej_use_tess_blanks &&
123  (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
125 
126  WERD_CHOICE* best_choice = word->best_choice;
127  if (rej_use_good_perm) {
128  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
129  best_choice->permuter() == FREQ_DAWG_PERM ||
130  best_choice->permuter() == USER_DAWG_PERM) &&
131  (!rej_use_sensible_wd ||
132  acceptable_word_string(*word->uch_set,
133  best_choice->unichar_string().string(),
134  best_choice->unichar_lengths().string()) !=
135  AC_UNACCEPTABLE)) {
136  // PASSED TEST
137  } else if (best_choice->permuter() == NUMBER_PERM) {
138  if (rej_alphas_in_number_perm) {
139  for (i = 0, offset = 0;
140  best_choice->unichar_string()[offset] != '\0';
141  offset += best_choice->unichar_lengths()[i++]) {
142  if (word->reject_map[i].accepted() &&
143  word->uch_set->get_isalpha(
144  best_choice->unichar_string().string() + offset,
145  best_choice->unichar_lengths()[i]))
146  word->reject_map[i].setrej_bad_permuter();
147  // rej alpha
148  }
149  }
150  } else {
152  }
153  }
154  /* Ambig word rejection was here once !!*/
155  }
156  } else {
157  tprintf("BAD tessedit_reject_mode\n");
158  err_exit();
159  }
160 
161  if (tessedit_image_border > -1)
162  reject_edge_blobs(word);
163 
164  check_debug_pt (word, 10);
165  if (tessedit_rejection_debug) {
166  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
167  tprintf("Certainty: %f Rating: %f\n",
168  word->best_choice->certainty (), word->best_choice->rating ());
169  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
170  }
171 
172  flip_hyphens(word);
173  check_debug_pt(word, 20);
174 }
175 } // namespace tesseract
void flip_0O(WERD_RES *word)
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:178
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
#define TRUE
Definition: capi.h:45
Unacceptable word.
Definition: control.h:36
float y_scale() const
Definition: normalis.h:272
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:367
void rej_word_bad_permuter()
Definition: rejctmap.cpp:385
void print() const
Definition: ratngs.h:576
uint8_t permuter() const
Definition: ratngs.h:342
const int kBlnXHeight
Definition: normalis.h:28
void flip_hyphens(WERD_RES *word)
const STRING & unichar_lengths() const
Definition: ratngs.h:544
DENORM denorm
Definition: pageres.h:190
Definition: ocrrow.h:32
const STRING & unichar_string() const
Definition: ratngs.h:537
#define FALSE
Definition: capi.h:46
#define tprintf(...)
Definition: tprintf.h:31
BOOL8 tess_accepted
Definition: pageres.h:280
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
const char * string() const
Definition: strngs.cpp:198
bool dangerous_ambig_found() const
Definition: ratngs.h:359
void rej_word_contains_blanks()
Definition: rejctmap.cpp:376
int32_t length() const
Definition: strngs.cpp:193
float rating() const
Definition: ratngs.h:323
BOOL8 done
Definition: pageres.h:282
float certainty() const
Definition: ratngs.h:326
void err_exit()
Definition: globaloc.cpp:74
void rej_word_small_xht()
Definition: rejctmap.cpp:349
void initialise(int16_t length)
Definition: rejctmap.cpp:275
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:207
const UNICHARSET * uch_set
Definition: pageres.h:192

◆ compute_reject_threshold()

float compute_reject_threshold ( WERD_CHOICE * word )

Definition at line 226 of file reject.cpp.

226  {
227  float threshold; // rejection threshold
228  float bestgap = 0.0f; // biggest gap
229  float gapstart; // bottom of gap
230  // super iterator
231  BLOB_CHOICE_IT choice_it; // real iterator
232 
233  int blob_count = word->length();
234  GenericVector<float> ratings;
235  ratings.resize_no_init(blob_count);
236  for (int i = 0; i < blob_count; ++i) {
237  ratings[i] = word->certainty(i);
238  }
239  ratings.sort();
240  gapstart = ratings[0] - 1; // all reject if none better
241  if (blob_count >= 3) {
242  for (int index = 0; index < blob_count - 1; index++) {
243  if (ratings[index + 1] - ratings[index] > bestgap) {
244  bestgap = ratings[index + 1] - ratings[index];
245  // find biggest
246  gapstart = ratings[index];
247  }
248  }
249  }
250  threshold = gapstart + bestgap / 2;
251 
252  return threshold;
253 }
int length() const
Definition: ratngs.h:299
void resize_no_init(int size)
Definition: genericvector.h:66
float certainty() const
Definition: ratngs.h:326

◆ reject_blanks()

void reject_blanks ( WERD_RES * word )

Definition at line 178 of file reject.cpp.

178  {
179  int16_t i;
180  int16_t offset;
181 
182  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
183  offset += word->best_choice->unichar_lengths()[i], i += 1) {
184  if (word->best_choice->unichar_string()[offset] == ' ')
185  //rej unrecognised blobs
186  word->reject_map[i].setrej_tess_failure ();
187  }
188 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
const STRING & unichar_lengths() const
Definition: ratngs.h:544
const STRING & unichar_string() const
Definition: ratngs.h:537

◆ reject_poor_matches()

void reject_poor_matches ( WERD_RES * word )

Definition at line 207 of file reject.cpp.

207  {
208  float threshold = compute_reject_threshold(word->best_choice);
209  for (int i = 0; i < word->best_choice->length(); ++i) {
210  if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
211  word->reject_map[i].setrej_tess_failure();
212  else if (word->best_choice->certainty(i) < threshold)
213  word->reject_map[i].setrej_poor_match();
214  }
215 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
int length() const
Definition: ratngs.h:299
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:226
float certainty() const
Definition: ratngs.h:326