24static int LeadingUnicharsToChopped(WERD_RES *word,
int num_unichars) {
26 for (
int i = 0;
i < num_unichars;
i++) {
27 num_chopped += word->best_state[
i];
32static int TrailingUnicharsToChopped(WERD_RES *word,
int num_unichars) {
34 for (
int i = 0;
i < num_unichars;
i++) {
35 num_chopped += word->best_state[word->best_state.size() - 1 -
i];
46static void YOutlierPieces(WERD_RES *word,
int rebuilt_blob_index,
int super_y_bottom,
47 int sub_y_top,
ScriptPos *leading_pos,
int *num_leading_outliers,
48 ScriptPos *trailing_pos,
int *num_trailing_outliers) {
52 leading_pos = &sp_unused1;
54 if (!num_leading_outliers) {
55 num_leading_outliers = &unused1;
58 trailing_pos = &sp_unused2;
60 if (!num_trailing_outliers) {
61 num_trailing_outliers = &unused2;
64 *num_leading_outliers = *num_trailing_outliers = 0;
67 int chopped_start = LeadingUnicharsToChopped(word, rebuilt_blob_index);
68 int num_chopped_pieces = word->best_state[rebuilt_blob_index];
70 int trailing_outliers = 0;
71 for (
int i = 0;
i < num_chopped_pieces;
i++) {
72 TBOX box = word->chopped_word->blobs[chopped_start +
i]->bounding_box();
74 if (box.bottom() >= super_y_bottom) {
76 }
else if (box.top() <= sub_y_top) {
80 if (trailing_outliers ==
i) {
81 *num_leading_outliers = trailing_outliers;
82 *leading_pos = last_pos;
84 trailing_outliers = 0;
86 if (pos == last_pos) {
89 trailing_outliers = 1;
94 *num_trailing_outliers = trailing_outliers;
95 *trailing_pos = last_pos;
112 int num_leading, num_trailing;
114 float leading_certainty, trailing_certainty;
115 float avg_certainty, unlikely_threshold;
119 &sp_trailing, &trailing_certainty, &avg_certainty,
120 &unlikely_threshold);
122 const char *leading_pos = sp_leading ==
SP_SUBSCRIPT ?
"sub" :
"super";
123 const char *trailing_pos = sp_trailing ==
SP_SUBSCRIPT ?
"sub" :
"super";
131 int num_remainder_leading = 0, num_remainder_trailing = 0;
132 if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
135 int last_word_char = num_blobs - 1 - num_trailing;
138 last_char_certainty <= unlikely_threshold) {
140 YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
nullptr,
nullptr, &rpos,
141 &num_remainder_trailing);
142 if (num_trailing > 0 && rpos != sp_trailing) {
143 num_remainder_trailing = 0;
145 if (num_remainder_trailing > 0 && last_char_certainty < trailing_certainty) {
146 trailing_certainty = last_char_certainty;
149 bool another_blob_available =
150 (num_remainder_trailing == 0) || num_leading + num_trailing + 1 < num_blobs;
153 first_char_certainty <= unlikely_threshold) {
155 YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top, &lpos, &num_remainder_leading,
157 if (num_leading > 0 && lpos != sp_leading) {
158 num_remainder_leading = 0;
160 if (num_remainder_leading > 0 && first_char_certainty < leading_certainty) {
161 leading_certainty = first_char_certainty;
167 if (num_leading + num_trailing + num_remainder_leading + num_remainder_trailing == 0) {
171 if (superscript_debug >= 1) {
172 tprintf(
"Candidate for superscript detection: %s (",
174 if (num_leading || num_remainder_leading) {
175 tprintf(
"%d.%d %s-leading ", num_leading, num_remainder_leading, leading_pos);
177 if (num_trailing || num_remainder_trailing) {
178 tprintf(
"%d.%d %s-trailing ", num_trailing, num_remainder_trailing, trailing_pos);
182 if (superscript_debug >= 3) {
185 if (superscript_debug >= 2) {
186 tprintf(
" Certainties -- Average: %.2f Unlikely thresh: %.2f ", avg_certainty,
189 tprintf(
"Orig. leading (min): %.2f ", leading_certainty);
192 tprintf(
"Orig. trailing (min): %.2f ", trailing_certainty);
200 int num_chopped_leading = LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
201 int num_chopped_trailing = TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
203 int retry_leading = 0;
204 int retry_trailing = 0;
205 bool is_good =
false;
207 num_chopped_trailing, trailing_certainty, sp_trailing,
208 word, &is_good, &retry_leading, &retry_trailing);
211 }
else if (retry_leading || retry_trailing) {
212 int retry_chopped_leading = LeadingUnicharsToChopped(revised, retry_leading);
213 int retry_chopped_trailing = TrailingUnicharsToChopped(revised, retry_trailing);
215 retry_chopped_leading, leading_certainty, sp_leading, retry_chopped_trailing,
216 trailing_certainty, sp_trailing, revised, &is_good, &retry_leading, &retry_trailing);
251 ScriptPos *leading_pos,
float *leading_certainty,
252 int *num_rebuilt_trailing,
ScriptPos *trailing_pos,
253 float *trailing_certainty,
float *avg_certainty,
254 float *unlikely_threshold) {
255 *avg_certainty = *unlikely_threshold = 0.0f;
256 *num_rebuilt_leading = *num_rebuilt_trailing = 0;
257 *leading_certainty = *trailing_certainty = 0.0f;
265 *leading_pos = *trailing_pos =
SP_NORMAL;
266 int leading_outliers = 0;
267 int trailing_outliers = 0;
269 float normal_certainty_total = 0.0f;
270 float worst_normal_certainty = 0.0f;
273 for (
int b = 0; b < num_blobs; ++b) {
276 if (box.
bottom() >= super_y_bottom) {
278 }
else if (box.
top() <= sub_y_top) {
284 if (char_certainty < worst_normal_certainty) {
285 worst_normal_certainty = char_certainty;
288 normal_certainty_total += char_certainty;
290 if (trailing_outliers == b) {
291 leading_outliers = trailing_outliers;
292 *leading_pos = last_pos;
294 trailing_outliers = 0;
296 if (last_pos == pos) {
299 trailing_outliers = 1;
304 *trailing_pos = last_pos;
305 if (num_normal >= 3) {
307 normal_certainty_total -= worst_normal_certainty;
309 if (num_normal > 0) {
310 *avg_certainty = normal_certainty_total / num_normal;
311 *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
313 if (num_normal == 0 || (leading_outliers == 0 && trailing_outliers == 0)) {
320 for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0; *num_rebuilt_leading < leading_outliers;
321 (*num_rebuilt_leading)++) {
323 if (char_certainty > *unlikely_threshold) {
326 if (char_certainty < *leading_certainty) {
327 *leading_certainty = char_certainty;
332 for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
333 *num_rebuilt_trailing < trailing_outliers; (*num_rebuilt_trailing)++) {
334 int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
336 if (char_certainty > *unlikely_threshold) {
339 if (char_certainty < *trailing_certainty) {
340 *trailing_certainty = char_certainty;
370 ScriptPos leading_pos,
int num_chopped_trailing,
371 float trailing_certainty,
ScriptPos trailing_pos,
372 WERD_RES *word,
bool *is_good,
int *retry_rebuild_leading,
373 int *retry_rebuild_trailing) {
376 *retry_rebuild_leading = *retry_rebuild_trailing = 0;
385 if (num_chopped_leading > 0) {
387 split_word(prefix, num_chopped_leading, &core, &bb0);
392 if (num_chopped_trailing > 0) {
393 int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
398 int saved_cp_multiplier = classify_class_pruner_multiplier;
399 int saved_im_multiplier = classify_integer_matcher_multiplier;
402 classify_class_pruner_multiplier.set_value(0);
403 classify_integer_matcher_multiplier.set_value(0);
406 if (superscript_debug >= 3) {
407 tprintf(
" recognizing first %d chopped blobs\n", num_chopped_leading);
410 if (superscript_debug >= 2) {
416 classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
417 classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
420 if (superscript_debug >= 3) {
421 tprintf(
" recognizing middle %d chopped blobs\n",
422 num_chopped - num_chopped_leading - num_chopped_trailing);
427 classify_class_pruner_multiplier.set_value(0);
428 classify_integer_matcher_multiplier.set_value(0);
430 if (superscript_debug >= 3) {
431 tprintf(
" recognizing last %d chopped blobs\n", num_chopped_trailing);
434 if (superscript_debug >= 2) {
440 classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
441 classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
448 superscript_bettered_certainty * leading_certainty,
449 retry_rebuild_leading,
nullptr);
452 superscript_bettered_certainty * trailing_certainty,
nullptr,
453 retry_rebuild_trailing);
455 *is_good = good_prefix && good_suffix;
456 if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
478 if (superscript_debug >= 1) {
479 tprintf(
"%s superscript fix: %s\n", *is_good ?
"ACCEPT" :
"REJECT",
504 int *left_ok,
int *right_ok)
const {
505 unsigned initial_ok_run_count = 0;
506 unsigned ok_run_count = 0;
507 float worst_certainty = 0.0f;
511 for (
unsigned i = 0;
i < wc.
length();
i++) {
515 bool bad_certainty = char_certainty < certainty_threshold;
519 if (choice && fontinfo_table.
size() > 0) {
522 bool font1_is_italic = font_id1 >= 0 ? fontinfo_table.
at(font_id1).is_italic() :
false;
524 is_italic = font1_is_italic && (font_id2 < 0 || fontinfo_table.
at(font_id2).is_italic());
527 float height_fraction = 1.0f;
529 float normal_height = char_height;
531 int min_bot, max_bot, min_top, max_top;
533 float hi_height = max_top - max_bot;
534 float lo_height = min_top - min_bot;
535 normal_height = (hi_height + lo_height) / 2;
539 height_fraction = char_height / normal_height;
542 bool bad_height = height_fraction < superscript_scaledown_ratio;
546 tprintf(
" Rejecting: superscript is italic.\n");
549 tprintf(
" Rejecting: punctuation present.\n");
554 " Rejecting: don't believe character %s with certainty %.2f "
555 "which is less than threshold %.2f\n",
556 char_str, char_certainty, certainty_threshold);
560 " Rejecting: character %s seems too small @ %.2f versus "
562 char_str, char_height, normal_height);
565 if (bad_certainty || bad_height || is_punc || is_italic) {
566 if (ok_run_count ==
i) {
567 initial_ok_run_count = ok_run_count;
573 if (char_certainty < worst_certainty) {
574 worst_certainty = char_certainty;
577 bool all_ok = ok_run_count == wc.
length();
578 if (all_ok && debug) {
579 tprintf(
" Accept: worst revised certainty is %.2f\n", worst_certainty);
583 *left_ok = initial_ok_run_count;
586 *right_ok = ok_run_count;
@ W_REP_CHAR
repeated character
void tprintf(const char *format,...)
const char * ScriptPosToString(enum ScriptPos script_pos)
const int kBlnBaselineOffset
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
void split_word(WERD_RES *word, unsigned split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
bool SubAndSuperscriptFix(WERD_RES *word_res)
void recog_word_recursive(WERD_RES *word)
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
TBOX bounding_box() const
std::vector< TBLOB * > blobs
unsigned NumBlobs() const
int size() const
Return the size used.
const T & at(int id) const
Return the object from an id.
WERD_CHOICE * best_choice
void ConsumeWordResults(WERD_RES *word)
const FontInfo * fontinfo
void SetAllScriptPositions(tesseract::ScriptPos position)
BLOB_CHOICE * GetBlobChoice(unsigned index) const
int16_t fontinfo_id2() const
int16_t fontinfo_id() const
UNICHAR_ID unichar_id(unsigned index) const
const UNICHARSET * unicharset() const
std::string & unichar_string()
TDimension height() const
TDimension bottom() const
bool flag(WERD_FLAGS mask) const
const char * id_to_unichar(UNICHAR_ID id) const
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
bool top_bottom_useful() const
bool get_ispunctuation(UNICHAR_ID unichar_id) const
UnicityTable< FontInfo > & get_fontinfo_table()