All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
reject.cpp File Reference
#include "tessvars.h"
#include "scanutils.h"
#include <ctype.h>
#include <string.h>
#include "genericvector.h"
#include "reject.h"
#include "control.h"
#include "docqual.h"
#include "globaloc.h"
#include "globals.h"
#include "helpers.h"
#include "tesseractclass.h"

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract
 
void reject_blanks (WERD_RES *word)
 
void reject_poor_matches (WERD_RES *word)
 
float compute_reject_threshold (WERD_CHOICE *word)
 

Function Documentation

CLISTIZEH ( STRING  )

Definition at line 48 of file reject.cpp.

56  {
57 void Tesseract::set_done(WERD_RES *word, inT16 pass) {
    // Decides word->done for the given pass. The flag starts as
    // "tess_accepted and the best-choice string contains no ' '"; the two
    // checks below can only clear it (they assign FALSE, never TRUE).
58  word->done = word->tess_accepted &&
59  (strchr(word->best_choice->unichar_string().string(), ' ') == NULL);
60  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
61  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
62  word->best_choice->permuter() == FREQ_DAWG_PERM ||
    // NOTE(review): listing line 63, which ends the expression above, is
    // missing from this extract (the numbering jumps from 62 to 64).
    // Pass 1 only: a word that is not from a dictionary permuter, or that is
    // flagged ambiguous, loses `done` if one_ell_conflict(word, FALSE) fires.
64  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
65  one_ell_conflict(word, FALSE)) {
66  if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
67  word->done = FALSE;
68  }
    // Any pass: clear `done` for ambiguous words, and for non-dictionary
    // words whose permuter is not NUMBER_PERM.
69  if (word->done && ((!word_from_dict &&
70  word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
71  if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
72  word->done = FALSE;
73  }
    // Optional trace of the final decision and the best choice.
74  if (tessedit_rejection_debug) {
75  tprintf("set_done(): done=%d\n", word->done);
76  word->best_choice->print("");
77  }
78 }
79 
80 
81 /*************************************************************************
82  * make_reject_map()
83  *
84  * Sets the done flag to indicate whether the result is acceptable.
85  *
86  * Sets a reject map for the word.
87  *************************************************************************/
88 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, inT16 pass) {
    // Overall flow: set the done flag via set_done(), mark blank characters
    // via reject_blanks(), then apply the rejection policy selected by
    // tessedit_reject_mode (only 0 and 5 are handled; anything else prints an
    // error and calls err_exit()). Edge-blob rejection, debug output and
    // flip_hyphens() follow.
    // NOTE(review): `row` is not referenced in the lines visible here.
89  int i;
90  int offset;
91 
92  flip_0O(word);
93  check_debug_pt(word, -1); // For trap only
94  set_done(word, pass); // Set acceptance
    // NOTE(review): listing line 95 is missing from this extract. The page's
    // cross-reference list includes REJMAP::initialise(inT16 length), so it
    // presumably initialises word->reject_map here - confirm against reject.cpp.
96  reject_blanks(word);
97  /*
98  0: Ray's original heuristic - the baseline
99  */
100  if (tessedit_reject_mode == 0) {
    // Mode 0: only words that set_done() did not accept get per-character
    // poor-match rejection.
101  if (!word->done)
102  reject_poor_matches(word);
103  } else if (tessedit_reject_mode == 5) {
104  /*
105  5: Reject I/1/l from words where there is no strong contextual confirmation;
106  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
107  and the whole of any words which are very small
108  */
109  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
    // NOTE(review): listing line 110 (the body of this branch) is missing.
    // The cross-reference list includes rej_word_small_xht(), presumably
    // called on word->reject_map here - confirm.
111  } else {
112  one_ell_conflict(word, TRUE);
113  /*
114  Originally the code here just used the done flag. Now I have duplicated
115  and unpacked the conditions for setting the done flag so that each
116  mechanism can be turned on or off independently. This works WITHOUT
117  affecting the done flag setting.
118  */
119  if (rej_use_tess_accepted && !word->tess_accepted)
    // NOTE(review): listing line 120 (the statement guarded by the `if`
    // above) is missing; presumably rej_word_not_tess_accepted() - confirm.
121 
122  if (rej_use_tess_blanks &&
123  (strchr (word->best_choice->unichar_string().string (), ' ') != NULL))
    // NOTE(review): listing line 124 (the statement guarded by the `if`
    // above) is missing; presumably rej_word_contains_blanks() - confirm.
125 
126  WERD_CHOICE* best_choice = word->best_choice;
127  if (rej_use_good_perm) {
    // Passes when the permuter is one of the three DAWG permuters and,
    // if rej_use_sensible_wd is set, acceptable_word_string() does not
    // return AC_UNACCEPTABLE.
128  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
129  best_choice->permuter() == FREQ_DAWG_PERM ||
130  best_choice->permuter() == USER_DAWG_PERM) &&
131  (!rej_use_sensible_wd ||
132  acceptable_word_string(*word->uch_set,
133  best_choice->unichar_string().string(),
134  best_choice->unichar_lengths().string()) !=
135  AC_UNACCEPTABLE)) {
136  // PASSED TEST
137  } else if (best_choice->permuter() == NUMBER_PERM) {
138  if (rej_alphas_in_number_perm) {
    // Walk the best-choice string one unichar at a time: `i` indexes
    // unichars / reject_map entries, `offset` is the position in
    // unichar_string(), advanced by unichar_lengths()[i]. Entries that
    // are still accepted and test alphabetic get setrej_bad_permuter().
139  for (i = 0, offset = 0;
140  best_choice->unichar_string()[offset] != '\0';
141  offset += best_choice->unichar_lengths()[i++]) {
142  if (word->reject_map[i].accepted() &&
143  word->uch_set->get_isalpha(
144  best_choice->unichar_string().string() + offset,
145  best_choice->unichar_lengths()[i]))
146  word->reject_map[i].setrej_bad_permuter();
147  // rej alpha
148  }
149  }
150  } else {
    // NOTE(review): listing line 151 (the body of this branch) is missing;
    // presumably rej_word_bad_permuter() - confirm.
152  }
153  }
154  /* Ambig word rejection was here once !!*/
155  }
156  } else {
157  tprintf("BAD tessedit_reject_mode\n");
158  err_exit();
159  }
160 
    // Edge-blob rejection runs only when tessedit_image_border is above -1.
161  if (tessedit_image_border > -1)
162  reject_edge_blobs(word);
163 
164  check_debug_pt (word, 10);
165  if (tessedit_rejection_debug) {
166  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
167  tprintf("Certainty: %f Rating: %f\n",
168  word->best_choice->certainty (), word->best_choice->rating ());
169  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
170  }
171 
172  flip_hyphens(word);
173  check_debug_pt(word, 20);
174 }
175 } // namespace tesseract
const int kBlnXHeight
Definition: normalis.h:28
BOOL8 tess_accepted
Definition: pageres.h:280
float rating() const
Definition: ratngs.h:324
void err_exit()
Definition: globaloc.cpp:74
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
#define tprintf(...)
Definition: tprintf.h:31
void flip_0O(WERD_RES *word)
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:178
void flip_hyphens(WERD_RES *word)
const STRING & unichar_lengths() const
Definition: ratngs.h:531
inT32 length() const
Definition: strngs.cpp:188
bool dangerous_ambig_found() const
Definition: ratngs.h:360
Definition: ocrrow.h:32
const STRING & unichar_string() const
Definition: ratngs.h:524
void rej_word_small_xht()
Definition: rejctmap.cpp:416
float certainty() const
Definition: ratngs.h:327
const UNICHARSET * uch_set
Definition: pageres.h:192
DENORM denorm
Definition: pageres.h:190
uinT8 permuter() const
Definition: ratngs.h:343
BOOL8 done
Definition: pageres.h:282
Unacceptable word.
Definition: control.h:36
#define FALSE
Definition: capi.h:29
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
#define TRUE
Definition: capi.h:28
float y_scale() const
Definition: normalis.h:272
void print() const
Definition: ratngs.h:563
void initialise(inT16 length)
Definition: rejctmap.cpp:318
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void rej_word_contains_blanks()
Definition: rejctmap.cpp:443
void rej_word_bad_permuter()
Definition: rejctmap.cpp:452
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:207
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:434
short inT16
Definition: host.h:100
float compute_reject_threshold ( WERD_CHOICE * word)

Definition at line 226 of file reject.cpp.

226  {
    // Computes a certainty threshold for `word`: the per-blob certainties are
    // collected and sorted, and the result is the midpoint of the largest gap
    // between adjacent sorted values. With fewer than 3 blobs the gap search
    // is skipped, bestgap stays 0, and the result is ratings[0] - 1.
227  float threshold; // rejection threshold
228  float bestgap = 0.0f; // biggest gap
229  float gapstart; // bottom of gap
230  // super iterator
    // NOTE(review): choice_it is declared but never used in this function.
231  BLOB_CHOICE_IT choice_it; // real iterator
232 
233  int blob_count = word->length();
234  GenericVector<float> ratings;
235  ratings.init_to_size(blob_count, 0.0f);
236  for (int i = 0; i < blob_count; ++i) {
237  ratings[i] = word->certainty(i);
238  }
    // presumably sorts ascending - the gap test below only finds a positive
    // gap when ratings[index + 1] > ratings[index]; confirm GenericVector::sort.
239  ratings.sort();
    // NOTE(review): ratings[0] is read even when blob_count == 0 - confirm
    // callers never pass an empty word.
240  gapstart = ratings[0] - 1; // all reject if none better
241  if (blob_count >= 3) {
242  for (int index = 0; index < blob_count - 1; index++) {
243  if (ratings[index + 1] - ratings[index] > bestgap) {
244  bestgap = ratings[index + 1] - ratings[index];
245  // find biggest
246  gapstart = ratings[index];
247  }
248  }
249  }
    // Midpoint of the chosen gap (or gapstart itself when no gap was found).
250  threshold = gapstart + bestgap / 2;
251 
252  return threshold;
253 }
int length() const
Definition: ratngs.h:300
float certainty() const
Definition: ratngs.h:327
void init_to_size(int size, T t)
void reject_blanks ( WERD_RES * word)

Definition at line 178 of file reject.cpp.

178  {
    // Marks every unichar of word->best_choice that begins with ' ' as a
    // tess failure in word->reject_map.
179  inT16 i;
180  inT16 offset;
181 
    // `i` indexes unichars / reject_map entries; `offset` is the position in
    // unichar_string(), advanced by unichar_lengths()[i] on each step. The
    // walk stops at the string's terminating '\0'.
182  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
183  offset += word->best_choice->unichar_lengths()[i], i += 1) {
184  if (word->best_choice->unichar_string()[offset] == ' ')
185  //rej unrecognised blobs
186  word->reject_map[i].setrej_tess_failure ();
187  }
188 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
const STRING & unichar_lengths() const
Definition: ratngs.h:531
const STRING & unichar_string() const
Definition: ratngs.h:524
short inT16
Definition: host.h:100
void reject_poor_matches ( WERD_RES * word)

Definition at line 207 of file reject.cpp.

207  {
    // Rejects individual positions of word->best_choice in word->reject_map:
    // UNICHAR_SPACE entries are marked as tess failures, and any other entry
    // whose certainty is below the threshold is marked as a poor match.
    // The threshold is computed once for the whole word.
208  float threshold = compute_reject_threshold(word->best_choice);
209  for (int i = 0; i < word->best_choice->length(); ++i) {
210  if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
211  word->reject_map[i].setrej_tess_failure();
212  else if (word->best_choice->certainty(i) < threshold)
213  word->reject_map[i].setrej_poor_match();
214  }
215 }
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
float certainty() const
Definition: ratngs.h:327
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:226