All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
fixspace.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: fixspace.cpp (Formerly fixspace.c)
3  * Description: Implements a pass over the page res, exploring the alternative
4  * spacing possibilities, trying to use context to improve the
5  * word spacing
6 * Author: Phil Cheatle
7 * Created: Thu Oct 21 11:38:43 BST 1993
8 *
9 * (C) Copyright 1993, Hewlett-Packard Ltd.
10 ** Licensed under the Apache License, Version 2.0 (the "License");
11 ** you may not use this file except in compliance with the License.
12 ** You may obtain a copy of the License at
13 ** http://www.apache.org/licenses/LICENSE-2.0
14 ** Unless required by applicable law or agreed to in writing, software
15 ** distributed under the License is distributed on an "AS IS" BASIS,
16 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 ** See the License for the specific language governing permissions and
18 ** limitations under the License.
19 *
20 **********************************************************************/
21 
22 #include <ctype.h>
23 #include "reject.h"
24 #include "statistc.h"
25 #include "control.h"
26 #include "fixspace.h"
27 #include "genblob.h"
28 #include "tessvars.h"
29 #include "tessbox.h"
30 #include "globals.h"
31 #include "tesseractclass.h"
32 
33 #define PERFECT_WERDS 999
34 #define MAXSPACING 128 /*max expected spacing in pix */
35 
36 namespace tesseract {
37 
49  inT32 word_count,
50  PAGE_RES *page_res) {
51  BLOCK_RES_IT block_res_it;
52  ROW_RES_IT row_res_it;
53  WERD_RES_IT word_res_it_from;
54  WERD_RES_IT word_res_it_to;
55  WERD_RES *word_res;
56  WERD_RES_LIST fuzzy_space_words;
57  inT16 new_length;
58  BOOL8 prevent_null_wd_fixsp; // DONT process blobless wds
59  inT32 word_index; // current word
60 
61  block_res_it.set_to_list(&page_res->block_res_list);
62  word_index = 0;
63  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
64  block_res_it.forward()) {
65  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
66  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
67  row_res_it.forward()) {
68  word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
69  while (!word_res_it_from.at_last()) {
70  word_res = word_res_it_from.data();
71  while (!word_res_it_from.at_last() &&
72  !(word_res->combination ||
73  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
74  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
75  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
76  block_res_it.data()->block);
77  word_res = word_res_it_from.forward();
78  word_index++;
79  if (monitor != NULL) {
80  monitor->ocr_alive = TRUE;
81  monitor->progress = 90 + 5 * word_index / word_count;
82  if (monitor->deadline_exceeded() ||
83  (monitor->cancel != NULL &&
84  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
85  return;
86  }
87  }
88 
89  if (!word_res_it_from.at_last()) {
90  word_res_it_to = word_res_it_from;
91  prevent_null_wd_fixsp =
92  word_res->word->cblob_list()->empty();
93  if (check_debug_pt(word_res, 60))
94  debug_fix_space_level.set_value(10);
95  word_res_it_to.forward();
96  word_index++;
97  if (monitor != NULL) {
98  monitor->ocr_alive = TRUE;
99  monitor->progress = 90 + 5 * word_index / word_count;
100  if (monitor->deadline_exceeded() ||
101  (monitor->cancel != NULL &&
102  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
103  return;
104  }
105  while (!word_res_it_to.at_last () &&
106  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
107  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
108  if (check_debug_pt(word_res, 60))
109  debug_fix_space_level.set_value(10);
110  if (word_res->word->cblob_list()->empty())
111  prevent_null_wd_fixsp = TRUE;
112  word_res = word_res_it_to.forward();
113  }
114  if (check_debug_pt(word_res, 60))
115  debug_fix_space_level.set_value(10);
116  if (word_res->word->cblob_list()->empty())
117  prevent_null_wd_fixsp = TRUE;
118  if (prevent_null_wd_fixsp) {
119  word_res_it_from = word_res_it_to;
120  } else {
121  fuzzy_space_words.assign_to_sublist(&word_res_it_from,
122  &word_res_it_to);
123  fix_fuzzy_space_list(fuzzy_space_words,
124  row_res_it.data()->row,
125  block_res_it.data()->block);
126  new_length = fuzzy_space_words.length();
127  word_res_it_from.add_list_before(&fuzzy_space_words);
128  for (;
129  !word_res_it_from.at_last() && new_length > 0;
130  new_length--) {
131  word_res_it_from.forward();
132  }
133  }
134  if (test_pt)
135  debug_fix_space_level.set_value(0);
136  }
137  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
138  block_res_it.data()->block);
139  // Last word in row
140  }
141  }
142  }
143 }
144 
145 void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
146  ROW *row,
147  BLOCK* block) {
148  inT16 best_score;
149  WERD_RES_LIST current_perm;
150  inT16 current_score;
151  BOOL8 improved = FALSE;
152 
153  best_score = eval_word_spacing(best_perm); // default score
154  dump_words(best_perm, best_score, 1, improved);
155 
156  if (best_score != PERFECT_WERDS)
157  initialise_search(best_perm, current_perm);
158 
159  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
160  match_current_words(current_perm, row, block);
161  current_score = eval_word_spacing(current_perm);
162  dump_words(current_perm, current_score, 2, improved);
163  if (current_score > best_score) {
164  best_perm.clear();
165  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
166  best_score = current_score;
167  improved = TRUE;
168  }
169  if (current_score < PERFECT_WERDS)
170  transform_to_next_perm(current_perm);
171  }
172  dump_words(best_perm, best_score, 3, improved);
173 }
174 
175 } // namespace tesseract
176 
177 void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
178  WERD_RES_IT src_it(&src_list);
179  WERD_RES_IT new_it(&new_list);
180  WERD_RES *src_wd;
181  WERD_RES *new_wd;
182 
183  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
184  src_wd = src_it.data();
185  if (!src_wd->combination) {
186  new_wd = WERD_RES::deep_copy(src_wd);
187  new_wd->combination = FALSE;
188  new_wd->part_of_combo = FALSE;
189  new_it.add_after_then_move(new_wd);
190  }
191  }
192 }
193 
194 
195 namespace tesseract {
196 void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
197  BLOCK* block) {
198  WERD_RES_IT word_it(&words);
199  WERD_RES *word;
200  // Since we are not using PAGE_RES to iterate over words, we need to update
201  // prev_word_best_choice_ before calling classify_word_pass2().
203  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
204  word = word_it.data();
205  if ((!word->part_of_combo) && (word->box_word == NULL)) {
206  WordData word_data(block, row, word);
207  SetupWordPassN(2, &word_data);
208  classify_word_and_language(2, NULL, &word_data);
209  }
211  }
212 }
213 
214 
// Scores one candidate segmentation (word_res_list) for the fuzzy-space
// search.
//
// Returns PERFECT_WERDS when every counted word was judged done (see
// fixspace_thinks_word_done()) and survived the junction test below;
// otherwise returns the accumulated total_score. The score is built from:
//   - the reject_map length of each done word, credited only once the
//     following word (or the end of the list) has been examined;
//   - +1 for every "joined 1" inside a word;
//   - +1 for every "joined" punctuation character inside a word.
// When advancing, words flagged part_of_combo are stepped over.
//
// NOTE(review): the embedded listing numbers jump from 311 to 313 and the
// braces in this view do not balance, so one source line (apparently a
// condition opening a block around the punctuation loop) is missing here.
// Recover it from the real file before editing this function.
240 inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
241  WERD_RES_IT word_res_it(&word_res_list);
242  inT16 total_score = 0;
243  inT16 word_count = 0;
244  inT16 done_word_count = 0;
245  inT16 word_len;
246  inT16 i;
247  inT16 offset;
248  WERD_RES *word; // current word
249  inT16 prev_word_score = 0;
250  BOOL8 prev_word_done = FALSE;
251  BOOL8 prev_char_1 = FALSE; // prev ch a "1/I/l"?
252  BOOL8 prev_char_digit = FALSE; // prev ch 2..9 or 0
253  BOOL8 current_char_1 = FALSE;
254  BOOL8 current_word_ok_so_far;
255  STRING punct_chars = "!\"`',.:;";
256  BOOL8 prev_char_punct = FALSE;
257  BOOL8 current_char_punct = FALSE;
258  BOOL8 word_done = FALSE;
259 
260  do {
261  word = word_res_it.data();
262  word_done = fixspace_thinks_word_done(word);
263  word_count++;
264  if (word->tess_failed) {
// A failed word scores nothing itself, but the previous word's pending
// score is still committed and the carried-over state is reset.
265  total_score += prev_word_score;
266  if (prev_word_done)
267  done_word_count++;
268  prev_word_score = 0;
269  prev_char_1 = FALSE;
270  prev_char_digit = FALSE;
271  prev_word_done = FALSE;
272  } else {
273  /*
274  Can we add the prev word score and potentially count this word?
275  Yes IF it didnt end in a 1 when the first char of this word is a digit
276  AND it didnt end in a digit when the first char of this word is a 1
277  */
278  word_len = word->reject_map.length();
279  current_word_ok_so_far = FALSE;
// For a done word only a literal single-byte '1' counts as "a 1"; for a
// not-done word any character in conflict_set_I_l_1 does.
280  if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
281  (prev_char_digit && (
282  (word_done &&
283  word->best_choice->unichar_lengths().string()[0] == 1 &&
284  word->best_choice->unichar_string()[0] == '1') ||
285  (!word_done && STRING(conflict_set_I_l_1).contains(
286  word->best_choice->unichar_string()[0])))))) {
287  total_score += prev_word_score;
288  if (prev_word_done)
289  done_word_count++;
290  current_word_ok_so_far = word_done;
291  }
292 
// This word's own score stays pending in prev_word_score until the next
// word's first character has been checked.
293  if (current_word_ok_so_far) {
294  prev_word_done = TRUE;
295  prev_word_score = word_len;
296  } else {
297  prev_word_done = FALSE;
298  prev_word_score = 0;
299  }
300 
301  /* Add 1 to total score for every joined 1 regardless of context and
302  rejtn */
// NOTE(review): this loop indexes unichar_string() directly with the unichar
// index i, whereas the punctuation loop below steps a byte offset through
// unichar_lengths(); the two agree only for single-byte unichars - confirm.
303  for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
304  current_char_1 = word->best_choice->unichar_string()[i] == '1';
305  if (prev_char_1 || (current_char_1 && (i > 0)))
306  total_score++;
307  prev_char_1 = current_char_1;
308  }
309 
310  /* Add 1 to total score for every joined punctuation regardless of context
311  and rejtn */
// NOTE(review): listing line 312 is absent from this view (see header note).
313  for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
314  offset += word->best_choice->unichar_lengths()[i++]) {
315  current_char_punct =
316  punct_chars.contains(word->best_choice->unichar_string()[offset]);
317  if (prev_char_punct || (current_char_punct && i > 0))
318  total_score++;
319  prev_char_punct = current_char_punct;
320  }
321  }
// Record what the LAST character of this word looks like, for the junction
// test against the next word. The empty-bodied loop only advances offset to
// the byte position of the last unichar.
322  prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
323  for (i = 0, offset = 0; i < word_len - 1;
324  offset += word->best_choice->unichar_lengths()[i++]);
325  prev_char_1 =
326  ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
327  || (!word_done && STRING(conflict_set_I_l_1).contains(
328  word->best_choice->unichar_string()[offset])));
329  }
330  /* Find next word */
331  do {
332  word_res_it.forward();
333  } while (word_res_it.data()->part_of_combo);
334  } while (!word_res_it.at_first());
// Commit the score of the final word, which had no successor to test it.
335  total_score += prev_word_score;
336  if (prev_word_done)
337  done_word_count++;
338  if (done_word_count == word_count)
339  return PERFECT_WERDS;
340  else
341  return total_score;
342 }
343 
// Body of digit_or_numeric_punct(): reports whether the unichar at index
// char_position of word->best_choice is a digit according to word->uch_set,
// or - when the choice's permuter is NUMBER_PERM - satisfies a second test on
// the character at that position.
//
// NOTE(review): the signature line (listing line 344) is not present in this
// view; the cross-reference index at the end of the listing gives it as
// "BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position)".
// NOTE(review): listing line 355 is also absent, so the expression whose
// argument appears on line 356 cannot be read here - recover both lines from
// the real file before editing.
345  int i;
346  int offset;
347 
// Convert the unichar index into a byte offset by summing the byte lengths
// of the preceding unichars; on exit i == char_position (for non-negative
// char_position).
348  for (i = 0, offset = 0; i < char_position;
349  offset += word->best_choice->unichar_lengths()[i++]);
350  return (
351  word->uch_set->get_isdigit(
352  word->best_choice->unichar_string().string() + offset,
353  word->best_choice->unichar_lengths()[i]) ||
354  (word->best_choice->permuter() == NUMBER_PERM &&
356  word->best_choice->unichar_string().string()[offset])));
357 }
358 
359 } // namespace tesseract
360 
361 
373 void transform_to_next_perm(WERD_RES_LIST &words) {
374  WERD_RES_IT word_it(&words);
375  WERD_RES_IT prev_word_it(&words);
376  WERD_RES *word;
377  WERD_RES *prev_word;
378  WERD_RES *combo;
379  WERD *copy_word;
380  inT16 prev_right = -MAX_INT16;
381  TBOX box;
382  inT16 gap;
383  inT16 min_gap = MAX_INT16;
384 
385  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
386  word = word_it.data();
387  if (!word->part_of_combo) {
388  box = word->word->bounding_box();
389  if (prev_right > -MAX_INT16) {
390  gap = box.left() - prev_right;
391  if (gap < min_gap)
392  min_gap = gap;
393  }
394  prev_right = box.right();
395  }
396  }
397  if (min_gap < MAX_INT16) {
398  prev_right = -MAX_INT16; // back to start
399  word_it.set_to_list(&words);
400  // Note: we can't use cycle_pt due to inserted combos at start of list.
401  for (; (prev_right == -MAX_INT16) || !word_it.at_first();
402  word_it.forward()) {
403  word = word_it.data();
404  if (!word->part_of_combo) {
405  box = word->word->bounding_box();
406  if (prev_right > -MAX_INT16) {
407  gap = box.left() - prev_right;
408  if (gap <= min_gap) {
409  prev_word = prev_word_it.data();
410  if (prev_word->combination) {
411  combo = prev_word;
412  } else {
413  /* Make a new combination and insert before
414  * the first word being joined. */
415  copy_word = new WERD;
416  *copy_word = *(prev_word->word);
417  // deep copy
418  combo = new WERD_RES(copy_word);
419  combo->combination = TRUE;
420  combo->x_height = prev_word->x_height;
421  prev_word->part_of_combo = TRUE;
422  prev_word_it.add_before_then_move(combo);
423  }
424  combo->word->set_flag(W_EOL, word->word->flag(W_EOL));
425  if (word->combination) {
426  combo->word->join_on(word->word);
427  // Move blobs to combo
428  // old combo no longer needed
429  delete word_it.extract();
430  } else {
431  // Copy current wd to combo
432  combo->copy_on(word);
433  word->part_of_combo = TRUE;
434  }
435  combo->done = FALSE;
436  combo->ClearResults();
437  } else {
438  prev_word_it = word_it; // catch up
439  }
440  }
441  prev_right = box.right();
442  }
443  }
444  } else {
445  words.clear(); // signal termination
446  }
447 }
448 
449 namespace tesseract {
450 void Tesseract::dump_words(WERD_RES_LIST &perm, inT16 score,
451  inT16 mode, BOOL8 improved) {
452  WERD_RES_IT word_res_it(&perm);
453 
454  if (debug_fix_space_level > 0) {
455  if (mode == 1) {
456  stats_.dump_words_str = "";
457  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
458  word_res_it.forward()) {
459  if (!word_res_it.data()->part_of_combo) {
460  stats_.dump_words_str +=
461  word_res_it.data()->best_choice->unichar_string();
462  stats_.dump_words_str += ' ';
463  }
464  }
465  }
466 
467  if (debug_fix_space_level > 1) {
468  switch (mode) {
469  case 1:
470  tprintf("EXTRACTED (%d): \"", score);
471  break;
472  case 2:
473  tprintf("TESTED (%d): \"", score);
474  break;
475  case 3:
476  tprintf("RETURNED (%d): \"", score);
477  break;
478  }
479 
480  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
481  word_res_it.forward()) {
482  if (!word_res_it.data()->part_of_combo) {
483  tprintf("%s/%1d ",
484  word_res_it.data()->best_choice->unichar_string().string(),
485  (int)word_res_it.data()->best_choice->permuter());
486  }
487  }
488  tprintf("\"\n");
489  } else if (improved) {
490  tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
491  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
492  word_res_it.forward()) {
493  if (!word_res_it.data()->part_of_combo) {
494  tprintf("%s/%1d ",
495  word_res_it.data()->best_choice->unichar_string().string(),
496  (int)word_res_it.data()->best_choice->permuter());
497  }
498  }
499  tprintf("\"\n");
500  }
501  }
502 }
503 
505  if (word->done)
506  return TRUE;
507 
508  /*
509  Use all the standard pass 2 conditions for mode 5 in set_done() in
510  reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT
511  CARE WHETHER WE HAVE of/at on/an etc.
512  */
513  if (fixsp_done_mode > 0 &&
514  (word->tess_accepted ||
515  (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
516  fixsp_done_mode == 3) &&
517  (strchr(word->best_choice->unichar_string().string(), ' ') == NULL) &&
518  ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
519  (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
520  (word->best_choice->permuter() == USER_DAWG_PERM) ||
521  (word->best_choice->permuter() == NUMBER_PERM))) {
522  return TRUE;
523  } else {
524  return FALSE;
525  }
526 }
527 
528 
536 void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
537  BLOCK* block) {
538  WERD_RES *word_res;
539  WERD_RES_LIST sub_word_list;
540  WERD_RES_IT sub_word_list_it(&sub_word_list);
541  inT16 blob_index;
542  inT16 new_length;
543  float junk;
544 
545  word_res = word_res_it.data();
546  if (word_res->word->flag(W_REP_CHAR) ||
547  word_res->combination ||
548  word_res->part_of_combo ||
549  !word_res->word->flag(W_DONT_CHOP))
550  return;
551 
552  blob_index = worst_noise_blob(word_res, &junk);
553  if (blob_index < 0)
554  return;
555 
556  if (debug_fix_space_level > 1) {
557  tprintf("FP fixspace working on \"%s\"\n",
558  word_res->best_choice->unichar_string().string());
559  }
560  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
561  sub_word_list_it.add_after_stay_put(word_res_it.extract());
562  fix_noisy_space_list(sub_word_list, row, block);
563  new_length = sub_word_list.length();
564  word_res_it.add_list_before(&sub_word_list);
565  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
566  word_res_it.forward();
567  }
568 }
569 
570 void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
571  BLOCK* block) {
572  inT16 best_score;
573  WERD_RES_IT best_perm_it(&best_perm);
574  WERD_RES_LIST current_perm;
575  WERD_RES_IT current_perm_it(&current_perm);
576  WERD_RES *old_word_res;
577  inT16 current_score;
578  BOOL8 improved = FALSE;
579 
580  best_score = fp_eval_word_spacing(best_perm); // default score
581 
582  dump_words(best_perm, best_score, 1, improved);
583 
584  old_word_res = best_perm_it.data();
585  // Even deep_copy doesn't copy the underlying WERD unless its combination
586  // flag is true!.
587  old_word_res->combination = TRUE; // Kludge to force deep copy
588  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
589  old_word_res->combination = FALSE; // Undo kludge
590 
591  break_noisiest_blob_word(current_perm);
592 
593  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
594  match_current_words(current_perm, row, block);
595  current_score = fp_eval_word_spacing(current_perm);
596  dump_words(current_perm, current_score, 2, improved);
597  if (current_score > best_score) {
598  best_perm.clear();
599  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
600  best_score = current_score;
601  improved = TRUE;
602  }
603  if (current_score < PERFECT_WERDS) {
604  break_noisiest_blob_word(current_perm);
605  }
606  }
607  dump_words(best_perm, best_score, 3, improved);
608 }
609 
610 
616 void Tesseract::break_noisiest_blob_word(WERD_RES_LIST &words) {
617  WERD_RES_IT word_it(&words);
618  WERD_RES_IT worst_word_it;
619  float worst_noise_score = 9999;
620  int worst_blob_index = -1; // Noisiest blob of noisiest wd
621  int blob_index; // of wds noisiest blob
622  float noise_score; // of wds noisiest blob
623  WERD_RES *word_res;
624  C_BLOB_IT blob_it;
625  C_BLOB_IT rej_cblob_it;
626  C_BLOB_LIST new_blob_list;
627  C_BLOB_IT new_blob_it;
628  C_BLOB_IT new_rej_cblob_it;
629  WERD *new_word;
630  inT16 start_of_noise_blob;
631  inT16 i;
632 
633  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
634  blob_index = worst_noise_blob(word_it.data(), &noise_score);
635  if (blob_index > -1 && worst_noise_score > noise_score) {
636  worst_noise_score = noise_score;
637  worst_blob_index = blob_index;
638  worst_word_it = word_it;
639  }
640  }
641  if (worst_blob_index < 0) {
642  words.clear(); // signal termination
643  return;
644  }
645 
646  /* Now split the worst_word_it */
647 
648  word_res = worst_word_it.data();
649 
650  /* Move blobs before noise blob to a new bloblist */
651 
652  new_blob_it.set_to_list(&new_blob_list);
653  blob_it.set_to_list(word_res->word->cblob_list());
654  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
655  new_blob_it.add_after_then_move(blob_it.extract());
656  }
657  start_of_noise_blob = blob_it.data()->bounding_box().left();
658  delete blob_it.extract(); // throw out noise blob
659 
660  new_word = new WERD(&new_blob_list, word_res->word);
661  new_word->set_flag(W_EOL, FALSE);
662  word_res->word->set_flag(W_BOL, FALSE);
663  word_res->word->set_blanks(1); // After break
664 
665  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
666  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
667  for (;
668  (!rej_cblob_it.empty() &&
669  (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
670  rej_cblob_it.forward()) {
671  new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
672  }
673 
674  WERD_RES* new_word_res = new WERD_RES(new_word);
675  new_word_res->combination = TRUE;
676  worst_word_it.add_before_then_move(new_word_res);
677 
678  word_res->ClearResults();
679 }
680 
682  float *worst_noise_score) {
683  float noise_score[512];
684  int i;
685  int min_noise_blob; // 1st contender
686  int max_noise_blob; // last contender
687  int non_noise_count;
688  int worst_noise_blob; // Worst blob
689  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
690  float non_noise_limit = kBlnXHeight * 0.8;
691 
692  if (word_res->rebuild_word == NULL)
693  return -1; // Can't handle cube words.
694 
695  // Normalised.
696  int blob_count = word_res->box_word->length();
697  ASSERT_HOST(blob_count <= 512);
698  if (blob_count < 5)
699  return -1; // too short to split
700 
701  /* Get the noise scores for all blobs */
702 
703  #ifndef SECURE_NAMES
704  if (debug_fix_space_level > 5)
705  tprintf("FP fixspace Noise metrics for \"%s\": ",
706  word_res->best_choice->unichar_string().string());
707  #endif
708 
709  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
710  TBLOB* blob = word_res->rebuild_word->blobs[i];
711  if (word_res->reject_map[i].accepted())
712  noise_score[i] = non_noise_limit;
713  else
714  noise_score[i] = blob_noise_score(blob);
715 
716  if (debug_fix_space_level > 5)
717  tprintf("%1.1f ", noise_score[i]);
718  }
719  if (debug_fix_space_level > 5)
720  tprintf("\n");
721 
722  /* Now find the worst one which is far enough away from the end of the word */
723 
724  non_noise_count = 0;
725  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
726  if (noise_score[i] >= non_noise_limit) {
727  non_noise_count++;
728  }
729  }
730  if (non_noise_count < fixsp_non_noise_limit)
731  return -1;
732 
733  min_noise_blob = i;
734 
735  non_noise_count = 0;
736  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
737  i--) {
738  if (noise_score[i] >= non_noise_limit) {
739  non_noise_count++;
740  }
741  }
742  if (non_noise_count < fixsp_non_noise_limit)
743  return -1;
744 
745  max_noise_blob = i;
746 
747  if (min_noise_blob > max_noise_blob)
748  return -1;
749 
750  *worst_noise_score = small_limit;
751  worst_noise_blob = -1;
752  for (i = min_noise_blob; i <= max_noise_blob; i++) {
753  if (noise_score[i] < *worst_noise_score) {
754  worst_noise_blob = i;
755  *worst_noise_score = noise_score[i];
756  }
757  }
758  return worst_noise_blob;
759 }
760 
762  TBOX box; // BB of outline
763  inT16 outline_count = 0;
764  inT16 max_dimension;
765  inT16 largest_outline_dimension = 0;
766 
767  for (TESSLINE* ol = blob->outlines; ol != NULL; ol= ol->next) {
768  outline_count++;
769  box = ol->bounding_box();
770  if (box.height() > box.width()) {
771  max_dimension = box.height();
772  } else {
773  max_dimension = box.width();
774  }
775 
776  if (largest_outline_dimension < max_dimension)
777  largest_outline_dimension = max_dimension;
778  }
779 
780  if (outline_count > 5) {
781  // penalise LOTS of blobs
782  largest_outline_dimension *= 2;
783  }
784 
785  box = blob->bounding_box();
786  if (box.bottom() > kBlnBaselineOffset * 4 ||
787  box.top() < kBlnBaselineOffset / 2) {
788  // Lax blob is if high or low
789  largest_outline_dimension /= 2;
790  }
791 
792  return largest_outline_dimension;
793 }
794 } // namespace tesseract
795 
796 void fixspace_dbg(WERD_RES *word) {
797  TBOX box = word->word->bounding_box();
798  BOOL8 show_map_detail = FALSE;
799  inT16 i;
800 
801  box.print();
802  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
803  tprintf("Blob count: %d (word); %d/%d (rebuild word)\n",
804  word->word->cblob_list()->length(),
805  word->rebuild_word->NumBlobs(),
806  word->box_word->length());
807  word->reject_map.print(debug_fp);
808  tprintf("\n");
809  if (show_map_detail) {
810  tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
811  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
812  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
813  word->reject_map[i].full_print(debug_fp);
814  }
815  }
816 
817  tprintf("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
818  tprintf("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
819 }
820 
821 
830 namespace tesseract {
831 inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
832  WERD_RES_IT word_it(&word_res_list);
833  WERD_RES *word;
834  inT16 word_length;
835  inT16 score = 0;
836  inT16 i;
837  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
838 
839  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
840  word = word_it.data();
841  if (word->rebuild_word == NULL)
842  continue; // Can't handle cube words.
843  word_length = word->reject_map.length();
844  if (word->done ||
845  word->tess_accepted ||
846  word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
847  word->best_choice->permuter() == FREQ_DAWG_PERM ||
848  word->best_choice->permuter() == USER_DAWG_PERM ||
849  safe_dict_word(word) > 0) {
850  int num_blobs = word->rebuild_word->NumBlobs();
851  UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
852  for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
853  TBLOB* blob = word->rebuild_word->blobs[i];
854  if (word->best_choice->unichar_id(i) == space ||
855  blob_noise_score(blob) < small_limit) {
856  score -= 1; // penalise possibly erroneous non-space
857  } else if (word->reject_map[i].accepted()) {
858  score++;
859  }
860  }
861  }
862  }
863  if (score < 0)
864  score = 0;
865  return score;
866 }
867 
868 } // namespace tesseract
const int kBlnXHeight
Definition: normalis.h:28
BOOL8 tess_accepted
Definition: pageres.h:280
Definition: blobs.h:261
void join_on(WERD *other)
Definition: werd.cpp:211
tesseract::BoxWord * box_word
Definition: pageres.h:250
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:616
void ClearResults()
Definition: pageres.cpp:1140
BLOCK_RES_LIST block_res_list
Definition: pageres.h:62
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:681
inT32 length() const
Definition: rejctmap.h:237
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:177
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:373
volatile inT8 ocr_alive
Definition: ocrclass.h:117
#define tprintf(...)
Definition: tprintf.h:31
void fixspace_dbg(WERD_RES *word)
Definition: fixspace.cpp:796
void * cancel_this
Definition: ocrclass.h:120
void print() const
Definition: rect.h:270
const STRING & unichar_lengths() const
Definition: ratngs.h:531
unsigned char BOOL8
Definition: host.h:113
TBOX bounding_box() const
Definition: werd.cpp:160
CMD_EVENTS mode
Definition: pgedit.cpp:116
inT16 right() const
Definition: rect.h:75
float x_height
Definition: pageres.h:295
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
inT16 eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:240
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:630
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: ocrrow.h:32
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:416
const STRING & unichar_string() const
Definition: ratngs.h:524
Definition: werd.h:35
BOOL8 part_of_combo
Definition: pageres.h:316
BOOL8 combination
Definition: pageres.h:315
int NumBlobs() const
Definition: blobs.h:425
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
void full_print(FILE *fp)
Definition: rejctmap.cpp:406
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:145
CANCEL_FUNC cancel
Definition: ocrclass.h:119
Definition: werd.h:36
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1767
inT16 left() const
Definition: rect.h:68
TWERD * rebuild_word
Definition: pageres.h:244
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
bool deadline_exceeded() const
Definition: ocrclass.h:144
Definition: ocrblock.h:30
const UNICHARSET * uch_set
Definition: pageres.h:192
uinT8 permuter() const
Definition: ratngs.h:343
const int kBlnBaselineOffset
Definition: normalis.h:29
int UNICHAR_ID
Definition: unichar.h:33
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:570
Definition: werd.h:60
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:171
inT16 bottom() const
Definition: rect.h:61
BOOL8 done
Definition: pageres.h:282
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:196
WERD * word
Definition: pageres.h:175
int c_blob_comparator(const void *blob1p, const void *blob2p)
Definition: genblob.cpp:30
inT16 height() const
Definition: rect.h:104
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:761
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1268
inT16 width() const
Definition: rect.h:111
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
BOOL8 fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:504
const int length() const
Definition: boxword.h:85
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:831
#define FALSE
Definition: capi.h:29
BOOL8 tess_failed
Definition: pageres.h:272
inT16 progress
Definition: ocrclass.h:115
inT16 reject_count()
Definition: rejctmap.h:243
Definition: rect.h:30
#define PERFECT_WERDS
Definition: fixspace.cpp:33
#define TRUE
Definition: capi.h:28
#define MAX_INT16
Definition: host.h:119
FILE * debug_fp
Definition: tessvars.cpp:24
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
Definition: strngs.h:44
#define NULL
Definition: host.h:144
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:536
TBOX bounding_box() const
Definition: blobs.cpp:482
TESSLINE * outlines
Definition: blobs.h:377
void copy_on(WERD_RES *word_res)
Definition: pageres.h:641
void print(FILE *fp)
Definition: rejctmap.cpp:394
const char * string() const
Definition: strngs.cpp:193
inT16 top() const
Definition: rect.h:54
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:48
BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:344
void set_blanks(uinT8 new_blanks)
Definition: werd.h:107
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
Definition: fixspace.cpp:450
BOOL8 contains(const char c) const
Definition: strngs.cpp:184
short inT16
Definition: host.h:100
int inT32
Definition: host.h:102