All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
pageres.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pageres.cpp (Formerly page_res.c)
3  * Description: Hierarchy of results classes from PAGE_RES to WERD_RES
4  * and an iterator class to iterate over the words.
5  * Main purposes:
6  * Easy way to iterate over the words without a 3-nested loop.
7  * Holds data used during word recognition.
8  * Holds information about alternative spacing paths.
9  * Author: Phil Cheatle
10  * Created: Tue Sep 22 08:42:49 BST 1992
11  *
12  * (C) Copyright 1992, Hewlett-Packard Ltd.
13  ** Licensed under the Apache License, Version 2.0 (the "License");
14  ** you may not use this file except in compliance with the License.
15  ** You may obtain a copy of the License at
16  ** http://www.apache.org/licenses/LICENSE-2.0
17  ** Unless required by applicable law or agreed to in writing, software
18  ** distributed under the License is distributed on an "AS IS" BASIS,
19  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20  ** See the License for the specific language governing permissions and
21  ** limitations under the License.
22  *
23  **********************************************************************/
24 #include <stdlib.h>
25 #ifdef __UNIX__
26 #include <assert.h>
27 #endif
28 #include "blamer.h"
29 #include "pageres.h"
30 #include "blobs.h"
31 
34 
35 // Gain factor for computing thresholds that determine the ambiguity of a word.
36 static const double kStopperAmbiguityThresholdGain = 8.0;
37 // Constant offset for computing thresholds that determine the ambiguity of a
38 // word.
39 static const double kStopperAmbiguityThresholdOffset = 1.5;
40 // Max number of broken pieces to associate.
42 // Max ratio of word box height to line size to allow it to be processed as
43 // a line with other words.
44 const double kMaxWordSizeRatio = 1.25;
45 // Max ratio of line box height to line size to allow a new word to be added.
46 const double kMaxLineSizeRatio = 1.25;
47 // Max ratio of word gap to line size to allow a new word to be added.
48 const double kMaxWordGapRatio = 2.0;
49 
50 // Computes and returns a threshold of certainty difference used to determine
51 // which words to keep, based on the adjustment factors of the two words.
// The value is (f2 - f1) * kStopperAmbiguityThresholdGain -
// kStopperAmbiguityThresholdOffset, so with the constants defined above it is
// -1.5 when f1 == f2 and rises by 8.0 for every unit by which f2 exceeds f1.
// Callers in this file pass the best choice's adjust_factor() as f1 and the
// candidate choice's adjust_factor() as f2.
52 // TODO(rays) This is horrible. Replace with an enhanced params training model.
53 static double StopperAmbigThreshold(double f1, double f2) {
54  return (f2 - f1) * kStopperAmbiguityThresholdGain -
55  kStopperAmbiguityThresholdOffset;
56 }
57 
58 /*************************************************************************
59  * PAGE_RES::PAGE_RES
60  *
61  * Constructor for page results
 *
 * Calls Init(), then walks the_block_list and appends one newly allocated
 * BLOCK_RES (constructed with the same merge_similar_words flag) to
 * block_res_list for every BLOCK, in list order. Finally stores
 * prev_word_best_choice_ptr in prev_word_best_choice.
 *
 * NOTE(review): listing line 63, where the signature begins, is absent from
 * this extract; only the trailing parameters are visible below.
62  *************************************************************************/
64  bool merge_similar_words,
65  BLOCK_LIST *the_block_list,
66  WERD_CHOICE **prev_word_best_choice_ptr) {
67  Init();
68  BLOCK_IT block_it(the_block_list);
69  BLOCK_RES_IT block_res_it(&block_res_list);
70  for (block_it.mark_cycle_pt();
71  !block_it.cycled_list(); block_it.forward()) {
72  block_res_it.add_to_end(new BLOCK_RES(merge_similar_words,
73  block_it.data()));
74  }
75  prev_word_best_choice = prev_word_best_choice_ptr;
76 }
77 
78 /*************************************************************************
79  * BLOCK_RES::BLOCK_RES
80  *
81  * Constructor for BLOCK results
 *
 * Resets the counters (char_count, rej_count, row_count) to 0, marks the
 * font data as unassigned (font_class = -1, x_height = -1.0, bold and italic
 * FALSE), stores the_block in block, and appends one newly allocated ROW_RES
 * to row_res_list for every ROW of the_block, passing merge_similar_words on.
 *
 * NOTE(review): listing line 92 is absent from this extract, so one member
 * initialisation is not visible here.
82  *************************************************************************/
83 
84 BLOCK_RES::BLOCK_RES(bool merge_similar_words, BLOCK *the_block) {
85  ROW_IT row_it (the_block->row_list ());
86  ROW_RES_IT row_res_it(&row_res_list);
87 
88  char_count = 0;
89  rej_count = 0;
90  font_class = -1; //not assigned
91  x_height = -1.0;
93  bold = FALSE;
94  italic = FALSE;
95  row_count = 0;
96 
97  block = the_block;
98 
99  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
100  row_res_it.add_to_end(new ROW_RES(merge_similar_words, row_it.data()));
101  }
102 }
103 
104 /*************************************************************************
105  * ROW_RES::ROW_RES
106  *
107  * Constructor for ROW results
 *
 * Builds one WERD_RES per WERD in the_row and appends it to word_res_list.
 * A run of words that belong together is additionally represented by a
 * "combination" WERD_RES (combination == TRUE) that wraps a deep copy of the
 * first word of the run and is added to the list ahead of its parts. Every
 * part is flagged part_of_combo; parts after the first are merged into the
 * combination with copy_on().
 *
 * If merge_similar_words is true, the runs are decided here from the box
 * height and gap heuristics (kMaxWordSizeRatio, kMaxLineSizeRatio,
 * kMaxWordGapRatio, all relative to line_height) and the decision is written
 * to the next word's W_FUZZY_NON flag. Otherwise the W_FUZZY_NON flag already
 * present on the next word is used as-is.
 *
 * NOTE(review): listing lines 117-118 are absent from this extract.
108  *************************************************************************/
109 
110 ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) {
111  WERD_IT word_it(the_row->word_list());
112  WERD_RES_IT word_res_it(&word_res_list);
113  WERD_RES *combo = NULL; // current combination of fuzzies
114  WERD *copy_word;
115 
116  char_count = 0;
117  rej_count = 0;
119 
120  row = the_row;
   // Carries the "join the next word" decision from one iteration to the next.
121  bool add_next_word = false;
   // Bounding box of the run accumulated so far (merge_similar_words only).
122  TBOX union_box;
123  float line_height = the_row->x_height() + the_row->ascenders() -
124  the_row->descenders();
125  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
126  WERD_RES* word_res = new WERD_RES(word_it.data());
127  word_res->x_height = the_row->x_height();
128  if (add_next_word) {
129  ASSERT_HOST(combo != NULL);
130  // We are adding this word to the combination.
131  word_res->part_of_combo = TRUE;
132  combo->copy_on(word_res);
133  } else if (merge_similar_words) {
     // Start of a potential new run: it may only grow if this word is not a
     // repeated-char word and is not too tall for the line.
134  union_box = word_res->word->bounding_box();
135  add_next_word = !word_res->word->flag(W_REP_CHAR) &&
136  union_box.height() <= line_height * kMaxWordSizeRatio;
137  word_res->odd_size = !add_next_word;
138  }
139  WERD* next_word = word_it.data_relative(1);
140  if (merge_similar_words) {
141  if (add_next_word && !next_word->flag(W_REP_CHAR)) {
142  // Next word will be added on if all of the following are true:
143  // Not a rep char.
144  // Box height small enough.
145  // Union box height small enough.
146  // Horizontal gap small enough.
147  TBOX next_box = next_word->bounding_box();
     // Remember the right edge before union_box is grown to include next_box.
148  int prev_right = union_box.right();
149  union_box += next_box;
150  if (next_box.height() > line_height * kMaxWordSizeRatio ||
151  union_box.height() > line_height * kMaxLineSizeRatio ||
152  next_box.left() > prev_right + line_height * kMaxWordGapRatio) {
153  add_next_word = false;
154  }
155  }
     // Record the decision on the word itself.
156  next_word->set_flag(W_FUZZY_NON, add_next_word);
157  } else {
158  add_next_word = next_word->flag(W_FUZZY_NON);
159  }
160  if (add_next_word) {
161  if (combo == NULL) {
     // First word of a new run: create the combination from a copy of it and
     // list the combination before its parts.
162  copy_word = new WERD;
163  *copy_word = *(word_it.data()); // deep copy
164  combo = new WERD_RES(copy_word);
165  combo->x_height = the_row->x_height();
166  combo->combination = TRUE;
167  word_res_it.add_to_end(combo);
168  }
169  word_res->part_of_combo = TRUE;
170  } else {
     // The run (if any) ends with this word.
171  combo = NULL;
172  }
173  word_res_it.add_to_end(word_res);
174  }
175 }
176 
177 
// NOTE(review): the signature (listing line 178) is absent from this extract.
// From the body this is a copy-assignment from `source`: it assigns the
// ELIST_LINK base, calls Clear(), copies the fields below and returns *this.
// Pointer members with a non-NULL source are newly allocated copies; the
// ratings matrix and seam_array are deliberately not copied (see comments).
179  this->ELIST_LINK::operator=(source);
180  Clear();
   // A combination gets its own copy of the WERD; any other word shares the
   // source's WERD pointer.
181  if (source.combination) {
182  word = new WERD;
183  *word = *(source.word); // deep copy
184  } else {
185  word = source.word; // pt to same word
186  }
187  if (source.bln_boxes != NULL)
188  bln_boxes = new tesseract::BoxWord(*source.bln_boxes);
189  if (source.chopped_word != NULL)
190  chopped_word = new TWERD(*source.chopped_word);
191  if (source.rebuild_word != NULL)
192  rebuild_word = new TWERD(*source.rebuild_word);
193  // TODO(rays) Do we ever need to copy the seam_array?
194  blob_row = source.blob_row;
195  denorm = source.denorm;
196  if (source.box_word != NULL)
197  box_word = new tesseract::BoxWord(*source.box_word);
198  best_state = source.best_state;
199  correct_text = source.correct_text;
200  blob_widths = source.blob_widths;
201  blob_gaps = source.blob_gaps;
202  // None of the uses of operator= require the ratings matrix to be copied,
203  // so don't as it would be really slow.
204 
205  // Copy the cooked choices.
   // The const_cast is only needed to construct an iterator over the const
   // source list; the source choices are read, not modified, in this loop.
206  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&source.best_choices));
207  WERD_CHOICE_IT wc_dest_it(&best_choices);
208  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
209  const WERD_CHOICE *choice = wc_it.data();
210  wc_dest_it.add_after_then_move(new WERD_CHOICE(*choice));
211  }
   // best_choice is pointed at the first element of the copied list, or NULL
   // if the list is empty.
212  if (!wc_dest_it.empty()) {
213  wc_dest_it.move_to_first();
214  best_choice = wc_dest_it.data();
215  } else {
216  best_choice = NULL;
217  }
218 
219  if (source.raw_choice != NULL) {
220  raw_choice = new WERD_CHOICE(*source.raw_choice);
221  } else {
222  raw_choice = NULL;
223  }
224  if (source.ep_choice != NULL) {
225  ep_choice = new WERD_CHOICE(*source.ep_choice);
226  } else {
227  ep_choice = NULL;
228  }
229  reject_map = source.reject_map;
230  combination = source.combination;
231  part_of_combo = source.part_of_combo;
232  CopySimpleFields(source);
233  if (source.blamer_bundle != NULL) {
234  blamer_bundle = new BlamerBundle(*(source.blamer_bundle));
235  }
236  return *this;
237 }
238 
239 // Copies basic fields that don't involve pointers that might be useful
240 // to copy when making one WERD_RES from another.
// uch_set and tesseract are assigned from pointers elsewhere in this file;
// here only their values are copied from source, nothing is cloned.
// NOTE(review): the signature (listing line 241) and several assignments
// (listing lines 244, 246, 253, 254, 257, 259) are absent from this extract,
// so the field list below is incomplete.
242  tess_failed = source.tess_failed;
243  tess_accepted = source.tess_accepted;
245  done = source.done;
247  small_caps = source.small_caps;
248  odd_size = source.odd_size;
249  italic = source.italic;
250  bold = source.bold;
251  fontinfo = source.fontinfo;
252  fontinfo2 = source.fontinfo2;
255  x_height = source.x_height;
256  caps_height = source.caps_height;
258  guessed_x_ht = source.guessed_x_ht;
260  reject_spaces = source.reject_spaces;
261  uch_set = source.uch_set;
262  tesseract = source.tesseract;
263 }
264 
265 // Initializes a blank (default constructed) WERD_RES from one that has
266 // already been recognized.
267 // Use SetupFor*Recognition afterwards to complete the setup and make
268 // it ready for a retry recognition.
// Shares source.word (the pointer is copied, not the WERD), copies the simple
// fields via CopySimpleFields, and allocates a fresh default-constructed
// BlamerBundle when source has one.
// NOTE(review): the signature (listing line 269) and listing line 274 (inside
// the if below) are absent from this extract.
270  word = source.word;
271  CopySimpleFields(source);
272  if (source.blamer_bundle != NULL) {
273  blamer_bundle = new BlamerBundle();
275  }
276 }
277 
278 // Sets up the members used in recognition: bln_boxes, chopped_word,
279 // seam_array, denorm. Returns false if
280 // the word is empty and sets up fake results. If use_body_size is
281 // true and row->body_size is set, then body_size will be used for
282 // blob normalization instead of xheight + ascrise. This flag is for
283 // those languages that are using CJK pitch model and thus it has to
284 // be true if and only if tesseract->textord_use_cjk_fp_model is
285 // true.
286 // If allow_detailed_fx is true, the feature extractor will receive fine
287 // precision outline information, allowing smoother features and better
288 // features on low resolution images.
289 // The norm_mode_hint sets the default mode for normalization in absence
290 // of any of the above flags.
291 // norm_box is used to override the word bounding box to determine the
292 // normalization scale and offset.
293 // Returns false if the word is empty and sets up fake results.
// Details visible in the body:
//  - norm_mode is an int that is cast to tesseract::OcrEngineMode to form
//    norm_mode_hint.
//  - The fake path is taken when the word's cblob_list is empty (unless the
//    mode is OEM_CUBE_ONLY) or when block has a poly_block that is not text.
//    It calls SetupFake, clears W_REP_CHAR on the word and returns false.
//  - row and block may be NULL; both are checked before being dereferenced
//    here and are otherwise only forwarded to BLNormalize.
//  - On success ratings is allocated as
//    MATRIX(num_blobs, kWordrecMaxNumJoinChunks) and tess_failed is cleared.
// NOTE(review): listing line 324 is absent from this extract.
294 bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset_in,
295  tesseract::Tesseract* tess, Pix* pix,
296  int norm_mode,
297  const TBOX* norm_box,
298  bool numeric_mode,
299  bool use_body_size,
300  bool allow_detailed_fx,
301  ROW *row, const BLOCK* block) {
302  tesseract::OcrEngineMode norm_mode_hint =
303  static_cast<tesseract::OcrEngineMode>(norm_mode);
304  tesseract = tess;
305  POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL;
306  if ((norm_mode_hint != tesseract::OEM_CUBE_ONLY &&
307  word->cblob_list()->empty()) || (pb != NULL && !pb->IsText())) {
308  // Empty words occur when all the blobs have been moved to the rej_blobs
309  // list, which seems to occur frequently in junk.
310  SetupFake(unicharset_in);
311  word->set_flag(W_REP_CHAR, false);
312  return false;
313  }
314  ClearResults();
315  SetupWordScript(unicharset_in);
316  chopped_word = TWERD::PolygonalCopy(allow_detailed_fx, word);
   // Use the row's body_size for normalization only when requested and set;
   // otherwise fall back to this word's x_height.
317  float word_xheight = use_body_size && row != NULL && row->body_size() > 0.0f
318  ? row->body_size() : x_height;
319  chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE),
320  word_xheight, baseline_shift, numeric_mode,
321  norm_mode_hint, norm_box, &denorm);
322  blob_row = row;
323  SetupBasicsFromChoppedWord(unicharset_in);
325  int num_blobs = chopped_word->NumBlobs();
326  ratings = new MATRIX(num_blobs, kWordrecMaxNumJoinChunks);
327  tess_failed = false;
328  return true;
329 }
330 
331 // Set up the seam array, bln_boxes, best_choice, and raw_choice to empty
332 // accumulators from a made chopped word. We presume the fields are already
333 // empty.
339 }
340 
341 // Sets up the members used in recognition for an empty recognition result:
342 // bln_boxes, chopped_word, seam_array, denorm, best_choice, raw_choice.
// Clears any previous results, creates empty chopped_word and rebuild_word,
// and then either
//  - passes every blob of word->cblob_list() through to box_word with one
//    default-constructed BLOB_CHOICE each (handed to FakeClassifyWord), or
//  - if the word has no blobs, logs a single WERD_CHOICE marked with
//    make_bad() as both the raw and the cooked choice.
// In both cases the word ends up with tess_failed == true and done == true.
// NOTE(review): listing lines 348-349 are absent from this extract;
// box_word is used below, so it is presumably set up there - confirm.
343 void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) {
344  ClearResults();
345  SetupWordScript(unicharset_in);
346  chopped_word = new TWERD;
347  rebuild_word = new TWERD;
350  int blob_count = word->cblob_list()->length();
351  if (blob_count > 0) {
     // Temporary array only; the BLOB_CHOICEs themselves are consumed by
     // FakeClassifyWord, so just the array is deleted afterwards.
352  BLOB_CHOICE** fake_choices = new BLOB_CHOICE*[blob_count];
353  // For non-text blocks, just pass any blobs through to the box_word
354  // and call the word failed with a fake classification.
355  C_BLOB_IT b_it(word->cblob_list());
356  int blob_id = 0;
357  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
358  TBOX box = b_it.data()->bounding_box();
359  box_word->InsertBox(box_word->length(), box);
360  fake_choices[blob_id++] = new BLOB_CHOICE;
361  }
362  FakeClassifyWord(blob_count, fake_choices);
363  delete [] fake_choices;
364  } else {
     // NOTE(review): this local `word` (a WERD_CHOICE*) shadows the member
     // `word` for the rest of this branch.
365  WERD_CHOICE* word = new WERD_CHOICE(&unicharset_in);
366  word->make_bad();
367  LogNewRawChoice(word);
368  // Ownership of word is taken by *this WERD_RES in LogNewCookedChoice.
369  LogNewCookedChoice(1, false, word);
370  }
371  tess_failed = true;
372  done = true;
373 }
374 
// Stores a pointer to the unicharset `uch` in uch_set, sets the word's script
// id to uch.default_sid(), and sets W_SCRIPT_IS_LATIN according to whether
// that script id equals uch.latin_sid().
// NOTE(review): the signature (listing line 375) and listing line 379 are
// absent from this extract.
376  uch_set = &uch;
377  int script = uch.default_sid();
378  word->set_script_id(script);
380  word->set_flag(W_SCRIPT_IS_LATIN, script == uch.latin_sid());
381 }
382 
383 // Sets up the blamer_bundle if it is not null, using the initialized denorm.
// NOTE(review): the signature (listing line 384) and the statement inside the
// if (listing line 386) are absent from this extract; only the NULL guard on
// blamer_bundle is visible.
385  if (blamer_bundle != NULL) {
387  }
388 }
389 
390 // Computes the blob_widths and blob_gaps from the chopped_word.
// For each blob b of chopped_word, its bounding-box width is appended to
// blob_widths. For every blob except the last, a value computed from
// left(blob b+1) - right(blob b) is used on the (partly missing) statement
// inside the inner if; presumably it is appended to blob_gaps - confirm.
// NOTE(review): the signature and the first body line (listing lines 391-392)
// and the start of the statement at listing line 400 are absent from this
// extract.
393  blob_gaps.truncate(0);
394  int num_blobs = chopped_word->NumBlobs();
395  for (int b = 0; b < num_blobs; ++b) {
396  TBLOB *blob = chopped_word->blobs[b];
397  TBOX box = blob->bounding_box();
398  blob_widths.push_back(box.width());
399  if (b + 1 < num_blobs) {
401  chopped_word->blobs[b + 1]->bounding_box().left() - box.right());
402  }
403  }
404 }
405 
406 // Updates internal data to account for a new SEAM (chop) at the given
407 // blob_number. Fixes the ratings matrix and states in the choices, as well
408 // as the blob widths and gaps.
// The seam is prepared against the current seam_array and blobs, then
// inserted into seam_array at index blob_number. The remaining fix-ups
// (ratings, raw_choice and every entry of best_choices) are only performed
// when a ratings matrix exists.
// NOTE(review): listing line 424 is absent from this extract.
409 void WERD_RES::InsertSeam(int blob_number, SEAM* seam) {
410  // Insert the seam into the SEAMS array.
411  seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true);
412  seam_array.insert(seam, blob_number);
413  if (ratings != NULL) {
414  // Expand the ratings matrix.
     // ratings is replaced by the pointer ConsumeAndMakeBigger returns.
415  ratings = ratings->ConsumeAndMakeBigger(blob_number);
416  // Fix all the segmentation states.
417  if (raw_choice != NULL)
418  raw_choice->UpdateStateForSplit(blob_number);
419  WERD_CHOICE_IT wc_it(&best_choices);
420  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward()) {
421  WERD_CHOICE* choice = wc_it.data();
422  choice->UpdateStateForSplit(blob_number);
423  }
425  }
426 }
427 
428 // Returns true if all the word choices except the first have adjust_factors
429 // worse than the given threshold.
// "Worse" here means strictly greater than threshold: any alternative whose
// adjust_factor() is <= threshold makes the result false.
// NOTE(review): the signature (listing line 430) is absent from this extract.
431  // The choices are not changed by this iteration.
432  WERD_CHOICE_IT wc_it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
   // Step past the first entry up front and stop when the iterator is back
   // at the first entry, so the first choice itself is never examined.
433  for (wc_it.forward(); !wc_it.at_first(); wc_it.forward()) {
434  WERD_CHOICE* choice = wc_it.data();
435  if (choice->adjust_factor() <= threshold)
436  return false;
437  }
438  return true;
439 }
440 
441 // Returns true if the current word is ambiguous (by number of answers or
442 // by dangerous ambigs.)
// That is: best_choices is not a singleton, or best_choice reports
// dangerous_ambig_found(). best_choice is dereferenced without a NULL check
// when best_choices is a singleton.
// NOTE(review): the signature (listing line 443) is absent from this extract.
444  return !best_choices.singleton() || best_choice->dangerous_ambig_found();
445 }
446 
447 // Returns true if the ratings matrix size matches the sum of each of the
448 // segmentation states.
// Checks raw_choice first and then every entry of best_choices. On the first
// mismatch a diagnostic is printed with tprintf and false is returned.
// ratings and raw_choice are dereferenced without a NULL check.
// NOTE(review): the signature (listing line 449) is absent from this extract.
450  int ratings_dim = ratings->dimension();
451  if (raw_choice->TotalOfStates() != ratings_dim) {
452  tprintf("raw_choice has total of states = %d vs ratings dim of %d\n",
453  raw_choice->TotalOfStates(), ratings_dim);
454  return false;
455  }
456  WERD_CHOICE_IT it(&best_choices);
   // index is the position of the current choice in best_choices; it is only
   // used to label the diagnostic below.
457  int index = 0;
458  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
459  WERD_CHOICE* choice = it.data();
460  if (choice->TotalOfStates() != ratings_dim) {
     // The format has three %d conversions, so index must be passed first;
     // previously it was omitted, leaving the last conversion without an
     // argument.
461  tprintf("Cooked #%d has total of states = %d vs ratings dim of %d\n",
462  index, choice->TotalOfStates(), ratings_dim);
463  return false;
464  }
465  }
466  return true;
467 }
468 
469 // Prints a list of words found if debug is true or the word result matches
470 // the word_to_debug.
// "Matches" means word_to_debug is a non-NULL, non-empty string, best_choice
// is non-NULL, and best_choice->unichar_string() equals word_to_debug.
// Output is the raw choice (if any) followed by every entry of best_choices,
// each labelled with its position in the list starting from 0.
471 void WERD_RES::DebugWordChoices(bool debug, const char* word_to_debug) {
472  if (debug ||
473  (word_to_debug != NULL && *word_to_debug != '\0' && best_choice != NULL &&
474  best_choice->unichar_string() == STRING(word_to_debug))) {
475  if (raw_choice != NULL)
476  raw_choice->print("\nBest Raw Choice");
477 
478  WERD_CHOICE_IT it(&best_choices);
479  int index = 0;
480  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward(), ++index) {
481  WERD_CHOICE* choice = it.data();
482  STRING label;
483  label.add_str_int("\nCooked Choice #", index);
484  choice->print(label.string());
485  }
486  }
487 }
488 
489 // Prints the top choice along with the accepted/done flags.
// msg is passed to best_choice->print() as its label; if best_choice is NULL
// the text "<Null choice>" is printed instead.
// NOTE(review): listing line 492, which holds the arguments of the first
// tprintf call, is absent from this extract.
490 void WERD_RES::DebugTopChoice(const char* msg) const {
491  tprintf("Best choice: accepted=%d, adaptable=%d, done=%d : ",
493  if (best_choice == NULL)
494  tprintf("<Null choice>\n");
495  else
496  best_choice->print(msg);
497 }
498 
499 // Removes from best_choices all choices which are not within a reasonable
500 // range of the best choice.
501 // TODO(rays) incorporate the information used here into the params training
502 // re-ranker, in place of this heuristic that is based on the previous
503 // adjustment factor.
// Does nothing if there is no best_choice or best_choices has one entry.
// Every other entry is compared with best_choice chunk by chunk; it is
// removed from the list and deleted as soon as a position is found where the
// two choices disagree on the unichar and the alternative's certainty is
// below the best choice's certainty by more than the threshold allows.
// debug_level >= 2 enables diagnostic printing.
504 void WERD_RES::FilterWordChoices(int debug_level) {
505  if (best_choice == NULL || best_choices.singleton())
506  return;
507 
508  if (debug_level >= 2)
509  best_choice->print("\nFiltering against best choice");
510  WERD_CHOICE_IT it(&best_choices);
   // index is only used in the debug label. It starts at 0 for the first
   // choice visited, which is the one after the iterator's initial position
   // (presumably the first alternative after best_choice - confirm).
511  int index = 0;
512  for (it.forward(); !it.at_first(); it.forward(), ++index) {
513  WERD_CHOICE* choice = it.data();
514  float threshold = StopperAmbigThreshold(best_choice->adjust_factor(),
515  choice->adjust_factor());
516  // i, j index the blob choice in choice, best_choice.
517  // chunk is an index into the chopped_word blobs (AKA chunks).
518  // Since the two words may use different segmentations of the chunks, we
519  // iterate over the chunks to find out whether a comparable blob
520  // classification is much worse than the best result.
521  int i = 0, j = 0, chunk = 0;
522  // Each iteration of the while deals with 1 chunk. On entry choice_chunk
523  // and best_chunk are the indices of the first chunk in the NEXT blob,
524  // i.e. we don't have to increment i, j while chunk < choice_chunk and
525  // best_chunk respectively.
526  int choice_chunk = choice->state(0), best_chunk = best_choice->state(0);
527  while (i < choice->length() && j < best_choice->length()) {
528  if (choice->unichar_id(i) != best_choice->unichar_id(j) &&
529  choice->certainty(i) - best_choice->certainty(j) < threshold) {
530  if (debug_level >= 2) {
531  STRING label;
532  label.add_str_int("\nDiscarding bad choice #", index);
533  choice->print(label.string());
534  tprintf("i %d j %d Chunk %d Choice->Blob[i].Certainty %.4g"
535  " BestChoice->ChunkCertainty[Chunk] %g Threshold %g\n",
536  i, j, chunk, choice->certainty(i),
537  best_choice->certainty(j), threshold);
538  }
     // Remove the choice from the list and free it; no further chunks of
     // this choice need to be examined.
539  delete it.extract();
540  break;
541  }
542  ++chunk;
543  // If needed, advance choice_chunk to keep up with chunk.
544  while (choice_chunk < chunk && ++i < choice->length())
545  choice_chunk += choice->state(i);
546  // If needed, advance best_chunk to keep up with chunk.
547  while (best_chunk < chunk && ++j < best_choice->length())
548  best_chunk += best_choice->state(j);
549  }
550  }
551 }
552 
// Writes one threshold per blob of best_choice into thresholds[0 ..
// best_choice->length() - 1]; the caller must supply an array of at least
// that many floats.
// For blob i, every chunk covered by that blob is compared against the
// raw_choice blob covering the same chunk. Where the unichar ids differ, the
// raw blob's certainty is accumulated. If there was at least one such chunk:
//   threshold = (mean accumulated certainty / -certainty_scale) *
//               (1.0 - rating_margin)
// otherwise threshold = max_rating. The result is then clamped to
// [min_rating, max_rating].
// best_choice and raw_choice are dereferenced without NULL checks.
553 void WERD_RES::ComputeAdaptionThresholds(float certainty_scale,
554  float min_rating,
555  float max_rating,
556  float rating_margin,
557  float* thresholds) {
   // chunk walks all chunks once across the whole word; end_chunk and
   // end_raw_chunk are the one-past-the-end chunk indices of the current
   // best_choice blob and raw_choice blob respectively.
558  int chunk = 0;
559  int end_chunk = best_choice->state(0);
560  int end_raw_chunk = raw_choice->state(0);
561  int raw_blob = 0;
562  for (int i = 0; i < best_choice->length(); i++, thresholds++) {
563  float avg_rating = 0.0f;
564  int num_error_chunks = 0;
565 
566  // For each chunk in best choice blob i, count non-matching raw results.
567  while (chunk < end_chunk) {
568  if (chunk >= end_raw_chunk) {
569  ++raw_blob;
570  end_raw_chunk += raw_choice->state(raw_blob);
571  }
572  if (best_choice->unichar_id(i) !=
573  raw_choice->unichar_id(raw_blob)) {
574  avg_rating += raw_choice->certainty(raw_blob);
575  ++num_error_chunks;
576  }
577  ++chunk;
578  }
579 
580  if (num_error_chunks > 0) {
581  avg_rating /= num_error_chunks;
582  *thresholds = (avg_rating / -certainty_scale) * (1.0 - rating_margin);
583  } else {
584  *thresholds = max_rating;
585  }
586 
     // Clamp to [min_rating, max_rating].
587  if (*thresholds > max_rating)
588  *thresholds = max_rating;
589  if (*thresholds < min_rating)
590  *thresholds = min_rating;
591  }
592 }
593 
594 // Saves a copy of the word_choice if it has the best unadjusted rating.
595 // Returns true if the word_choice was the new best.
// "Best" means raw_choice is currently NULL or word_choice->rating() is
// strictly lower than raw_choice->rating(). The previous raw_choice is
// deleted and replaced by a newly allocated copy; word_choice itself is not
// taken over by the visible code.
// NOTE(review): the signature (listing line 596) and listing line 600 are
// absent from this extract.
597  if (raw_choice == NULL || word_choice->rating() < raw_choice->rating()) {
598  delete raw_choice;
599  raw_choice = new WERD_CHOICE(*word_choice);
601  return true;
602  }
603  return false;
604 }
605 
606 // Consumes word_choice by adding it to best_choices, (taking ownership) if
607 // the certainty for word_choice is some distance of the best choice in
608 // best_choices, or by deleting the word_choice and returning false.
609 // The best_choices list is kept in sorted order by rating. Duplicates are
610 // removed, and the list is kept no longer than max_num_choices in length.
611 // Returns true if the word_choice is still a valid pointer.
// word_choice is deleted (and false returned) in three cases: its certainty
// is too far below best_choice's, an existing entry with the same
// unichar_string() has a rating that is not worse, or it was not inserted
// because the list already holds max_num_choices better entries.
// debug enables diagnostic printing.
// NOTE(review): listing line 633 (one argument line of the tprintf in the
// debug branch) is absent from this extract.
612 bool WERD_RES::LogNewCookedChoice(int max_num_choices, bool debug,
613  WERD_CHOICE* word_choice) {
614  if (best_choice != NULL) {
615  // Throw out obviously bad choices to save some work.
616  // TODO(rays) Get rid of this! This piece of code produces different
617  // results according to the order in which words are found, which is an
618  // undesirable behavior. It would be better to keep all the choices and
619  // prune them later when more information is available.
620  float max_certainty_delta =
621  StopperAmbigThreshold(best_choice->adjust_factor(),
622  word_choice->adjust_factor());
     // Cap the delta so that it is never above -kStopperAmbiguityThresholdOffset.
623  if (max_certainty_delta > -kStopperAmbiguityThresholdOffset)
624  max_certainty_delta = -kStopperAmbiguityThresholdOffset;
625  if (word_choice->certainty() - best_choice->certainty() <
626  max_certainty_delta) {
627  if (debug) {
628  STRING bad_string;
629  word_choice->string_and_lengths(&bad_string, NULL);
630  tprintf("Discarding choice \"%s\" with an overly low certainty"
631  " %.3f vs best choice certainty %.3f (Threshold: %.3f)\n",
632  bad_string.string(), word_choice->certainty(),
634  max_certainty_delta + best_choice->certainty());
635  }
636  delete word_choice;
637  return false;
638  }
639  }
640 
641  // Insert in the list in order of increasing rating, but knock out worse
642  // string duplicates.
643  WERD_CHOICE_IT it(&best_choices);
644  const STRING& new_str = word_choice->unichar_string();
645  bool inserted = false;
   // Counts the entries kept so far, including word_choice once inserted.
646  int num_choices = 0;
647  if (!it.empty()) {
648  do {
649  WERD_CHOICE* choice = it.data();
650  if (choice->rating() > word_choice->rating() && !inserted) {
651  // Time to insert.
652  it.add_before_stay_put(word_choice);
653  inserted = true;
654  if (num_choices == 0)
655  best_choice = word_choice; // This is the new best.
656  ++num_choices;
657  }
658  if (choice->unichar_string() == new_str) {
659  if (inserted) {
660  // New is better.
661  delete it.extract();
662  } else {
663  // Old is better.
664  if (debug) {
665  tprintf("Discarding duplicate choice \"%s\", rating %g vs %g\n",
666  new_str.string(), word_choice->rating(), choice->rating());
667  }
668  delete word_choice;
669  return false;
670  }
671  } else {
672  ++num_choices;
     // Drop entries that no longer fit within max_num_choices.
673  if (num_choices > max_num_choices)
674  delete it.extract();
675  }
676  it.forward();
677  } while (!it.at_first());
678  }
   // Not better than any existing entry: append only if there is room.
679  if (!inserted && num_choices < max_num_choices) {
680  it.add_to_end(word_choice);
681  inserted = true;
682  if (num_choices == 0)
683  best_choice = word_choice; // This is the new best.
684  }
685  if (debug) {
686  if (inserted)
687  tprintf("New %s", best_choice == word_choice ? "Best" : "Secondary");
688  else
689  tprintf("Poor");
690  word_choice->print(" Word Choice");
691  }
692  if (!inserted) {
693  delete word_choice;
694  return false;
695  }
696  return true;
697 }
698 
699 
700 // Simple helper moves the ownership of the pointer data from src to dest,
701 // first deleting anything in dest, and nulling out src afterwards.
// dest and src are pointers to the pointer variables themselves. *dest is
// deleted unconditionally before being overwritten, so dest and src must not
// refer to the same pointer variable (otherwise *dest would end up NULL and
// the object destroyed).
702 template<class T> static void MovePointerData(T** dest, T**src) {
703  delete *dest;
704  *dest = *src;
705  *src = NULL;
706 }
707 
708 // Prints a brief list of all the best choices.
// Output has the form: Alternates for "<best>": {"<c1>", "<c2>", ...}
// where <best> is best_choice's unichar_string() and the list contains every
// entry of best_choices in list order. best_choice is dereferenced without a
// NULL check.
// NOTE(review): the signature (listing line 709) is absent from this extract.
710  STRING alternates_str;
   // The const_cast only serves to construct an iterator; the list is not
   // modified here.
711  WERD_CHOICE_IT it(const_cast<WERD_CHOICE_LIST*>(&best_choices));
712  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
     // Separator between entries; the outer quotes come from the format below.
713  if (!it.at_first()) alternates_str += "\", \"";
714  alternates_str += it.data()->unichar_string();
715  }
716  tprintf("Alternates for \"%s\": {\"%s\"}\n",
717  best_choice->unichar_string().string(), alternates_str.string());
718 }
719 
720 // Returns the sum of the widths of the blob between start_blob and last_blob
721 // inclusive.
// The gaps between consecutive blobs inside the range are included in the
// sum; the gap after last_blob is not. Returns 0 if start_blob > last_blob.
// The indices are used directly on blob_widths and blob_gaps without a range
// check in this function.
722 int WERD_RES::GetBlobsWidth(int start_blob, int last_blob) {
723  int result = 0;
724  for (int b = start_blob; b <= last_blob; ++b) {
725  result += blob_widths[b];
726  if (b < last_blob)
727  result += blob_gaps[b];
728  }
729  return result;
730 }
731 // Returns the width of a gap between the specified blob and the next one.
// Returns 0 if blob_index is negative or not a valid index into blob_gaps.
732 int WERD_RES::GetBlobsGap(int blob_index) {
733  if (blob_index < 0 || blob_index >= blob_gaps.size())
734  return 0;
735  return blob_gaps[blob_index];
736 }
737 
738 // Returns the BLOB_CHOICE corresponding to the given index in the
739 // best choice word taken from the appropriate cell in the ratings MATRIX.
740 // Borrowed pointer, so do not delete. May return NULL if there is no
741 // BLOB_CHOICE matching the unichar_id at the given index.
// Also returns NULL when index is outside [0, best_choice->length()).
// best_choice is dereferenced without a NULL check.
// NOTE(review): the signature (listing line 742) is absent from this extract.
743  if (index < 0 || index >= best_choice->length()) return NULL;
744  BLOB_CHOICE_LIST* choices = GetBlobChoices(index);
745  return FindMatchingChoice(best_choice->unichar_id(index), choices);
746 }
747 
748 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the
749 // best choice word taken from the appropriate cell in the ratings MATRIX.
750 // Borrowed pointer, so do not delete.
// Simply forwards to best_choice->blob_choices(index, ratings); no checks on
// index, best_choice or ratings are made in this function.
751 BLOB_CHOICE_LIST* WERD_RES::GetBlobChoices(int index) const {
752  return best_choice->blob_choices(index, ratings);
753 }
754 
755 // Moves the results fields from word to this. This takes ownership of all
756 // the data, so word can be destructed.
// Pointer members are transferred with MovePointerData (the previous value
// held by this is deleted and the member in word is set to NULL). The vectors
// best_state, blob_widths and blob_gaps are transferred with move();
// seam_array is copied and then cleared in word; best_choices is emptied and
// then receives word's list via add_list_after. best_choice is a plain
// pointer copy.
// NOTE(review): the signature (listing line 757) and listing lines 763, 767,
// 770 and 780 are absent from this extract.
758  denorm = word->denorm;
759  blob_row = word->blob_row;
760  MovePointerData(&chopped_word, &word->chopped_word);
761  MovePointerData(&rebuild_word, &word->rebuild_word);
762  MovePointerData(&box_word, &word->box_word);
764  seam_array = word->seam_array;
765  word->seam_array.clear();
766  best_state.move(&word->best_state);
768  blob_widths.move(&word->blob_widths);
769  blob_gaps.move(&word->blob_gaps);
771  MovePointerData(&ratings, &word->ratings);
772  best_choice = word->best_choice;
773  MovePointerData(&raw_choice, &word->raw_choice);
774  best_choices.clear();
775  WERD_CHOICE_IT wc_it(&best_choices);
776  wc_it.add_list_after(&word->best_choices);
777  reject_map = word->reject_map;
778  if (word->blamer_bundle != NULL) {
     // If word has a blamer bundle, this is expected to have one already.
779  assert(blamer_bundle != NULL);
781  }
782  CopySimpleFields(*word);
783 }
784 
785 // Replace the best choice and rebuild box word.
786 // choice must be from the current best_choices list.
// NOTE(review): the signature (listing line 787) and listing lines 789 and
// 793-795 are absent from this extract; only the best_choice assignment and
// the SetupBoxWord call are visible.
788  best_choice = choice;
790  SetupBoxWord();
791  // Make up a fake reject map of the right length to keep the
792  // rejection pass happy.
796 }
797 
798 // Builds the rebuild_word and sets the best_state from the chopped_word and
799 // the best_choice->state.
// Any existing rebuild_word is deleted and replaced by a new TWERD. For each
// position i of best_choice, state(i) is appended to best_state and a copy of
// chopped_word->blobs[start] is appended to rebuild_word->blobs, where start
// is the running sum of the preceding states.
// NOTE(review): the signature (listing lines 800-801) and listing lines 806,
// 813 and 819 are absent from this extract. The two `length > 1` branches
// therefore show only the tail of their calls; presumably they join the
// chunks before the copy and split them again afterwards - confirm.
802  if (rebuild_word != NULL)
803  delete rebuild_word;
804  rebuild_word = new TWERD;
805  if (seam_array.empty())
807  best_state.truncate(0);
808  int start = 0;
809  for (int i = 0; i < best_choice->length(); ++i) {
810  int length = best_choice->state(i);
811  best_state.push_back(length);
812  if (length > 1) {
814  start + length - 1);
815  }
816  TBLOB* blob = chopped_word->blobs[start];
817  rebuild_word->blobs.push_back(new TBLOB(*blob));
818  if (length > 1) {
820  start + length - 1);
821  }
822  start += length;
823  }
824 }
825 
826 // Copies the chopped_word to the rebuild_word, faking a best_state as well.
827 // Also sets up the output box_word.
// After SetupBoxWord, best_state and correct_text each reserve room for
// box_word->length() entries.
// NOTE(review): the signature (listing line 828), listing line 831 and the
// loop body (listing lines 837-838) are absent from this extract.
829  if (rebuild_word != NULL)
830  delete rebuild_word;
832  SetupBoxWord();
833  int word_len = box_word->length();
834  best_state.reserve(word_len);
835  correct_text.reserve(word_len);
836  for (int i = 0; i < word_len; ++i) {
839  }
840 }
841 
842 // Sets/replaces the box_word with one made from the rebuild_word.
// NOTE(review): the signature (listing line 843) and listing lines 846-848
// are absent from this extract; only the deletion of the previous box_word is
// visible.
844  if (box_word != NULL)
845  delete box_word;
849 }
850 
851 // Sets up the script positions in the output best_choice using the best_choice
852 // to get the unichars, and the unicharset to get the target positions.
855 }
856 // Sets all the blobs in all the words (raw choice and best choices) to be
857 // the given position. (When a sub/superscript is recognized as a separate
858 // word, it falls victim to the rule that a whole word cannot be sub or
859 // superscript, so this function overrides that problem.)
// NOTE(review): the signature and the first body line (listing lines 860-861)
// are absent from this extract; only the loop over best_choices is visible,
// so the raw choice handling mentioned above cannot be seen here.
862  WERD_CHOICE_IT wc_it(&best_choices);
863  for (wc_it.mark_cycle_pt(); !wc_it.cycled_list(); wc_it.forward())
864  wc_it.data()->SetAllScriptPositions(position);
865 }
866 
867 // Classifies the word with some already-calculated BLOB_CHOICEs.
868 // The choices are an array of blob_count pointers to BLOB_CHOICE,
869 // providing a single classifier result for each blob.
870 // The BLOB_CHOICEs are consumed and the word takes ownership.
871 // The number of blobs in the box_word must match blob_count.
// Only the BLOB_CHOICE objects are consumed; the `choices` array itself is
// not freed here. ratings is rebuilt as MATRIX(blob_count, 1) with one
// single-entry BLOB_CHOICE_LIST on each diagonal cell (c, c). The reject map
// is initialised to blob_count entries and done is set to true.
// NOTE(review): listing lines 874, 876 and 885 are absent from this extract.
872 void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) {
873  // Setup the WERD_RES.
875  ASSERT_HOST(blob_count == box_word->length());
877  ClearRatings();
878  ratings = new MATRIX(blob_count, 1);
879  for (int c = 0; c < blob_count; ++c) {
880  BLOB_CHOICE_LIST* choice_list = new BLOB_CHOICE_LIST;
881  BLOB_CHOICE_IT choice_it(choice_list);
882  choice_it.add_after_then_move(choices[c]);
883  ratings->put(c, c, choice_list);
884  }
886  reject_map.initialise(blob_count);
887  done = true;
888 }
889 
890 // Creates a WERD_CHOICE for the word using the top choices from the leading
891 // diagonal of the ratings matrix.
// For each diagonal cell (b, b) the entry at the iterator's initial position
// in the cell's list supplies unichar_id, rating and certainty. A NULL or
// empty cell contributes UNICHAR_SPACE with rating MAX_INT32 and certainty
// -MAX_INT32. Every entry is appended with a state of 1. The result is
// logged as both the raw choice and a cooked choice.
// NOTE(review): the signature (listing line 892) is absent from this extract.
893  int num_blobs = ratings->dimension();
894  WERD_CHOICE* word_choice = new WERD_CHOICE(uch_set, num_blobs);
895  word_choice->set_permuter(TOP_CHOICE_PERM);
896  for (int b = 0; b < num_blobs; ++b) {
     // Defaults used when the cell has no classification.
897  UNICHAR_ID unichar_id = UNICHAR_SPACE;
898  float rating = MAX_INT32;
899  float certainty = -MAX_INT32;
900  BLOB_CHOICE_LIST* choices = ratings->get(b, b);
901  if (choices != NULL && !choices->empty()) {
902  BLOB_CHOICE_IT bc_it(choices);
903  BLOB_CHOICE* choice = bc_it.data();
904  unichar_id = choice->unichar_id();
905  rating = choice->rating();
906  certainty = choice->certainty();
907  }
908  word_choice->append_unichar_id_space_allocated(unichar_id, 1, rating,
909  certainty);
910  }
911  LogNewRawChoice(word_choice);
912  // Ownership of word_choice taken by word here.
913  LogNewCookedChoice(1, false, word_choice);
914 }
915 
916 // Copies the best_choice strings to the correct_text for adaption/training.
// Appends one STRING to correct_text for each unichar id in best_choice,
// using the text that uch_set maps the id to.
// NOTE(review): listing lines 917-919, including the function header, are
// missing from this extract.
920  for (int i = 0; i < best_choice->length(); ++i) {
921  UNICHAR_ID choice_id = best_choice->unichar_id(i);
922  const char* blob_choice = uch_set->id_to_unichar(choice_id);
923  correct_text.push_back(STRING(blob_choice));
924  }
925 }
926 
927 // Merges 2 adjacent blobs in the result if the permanent callback
928 // class_cb returns other than INVALID_UNICHAR_ID, AND the permanent
929 // callback box_cb is NULL or returns true, setting the merged blob
930 // result to the class returned from class_cb.
931 // Returns true if anything was merged.
// Both class_cb and box_cb are deleted before returning.
// NOTE(review): listing lines 932-934 (the function header) and 946 are
// missing from this extract.
935  ASSERT_HOST(best_choice->length() == 0 || ratings != NULL);
936  bool modified = false;
937  for (int i = 0; i + 1 < best_choice->length(); ++i) {
938  UNICHAR_ID new_id = class_cb->Run(best_choice->unichar_id(i),
939  best_choice->unichar_id(i+1));
940  if (new_id != INVALID_UNICHAR_ID &&
941  (box_cb == NULL || box_cb->Run(box_word->BlobBox(i),
942  box_word->BlobBox(i + 1)))) {
943  // Raw choice should not be fixed.
944  best_choice->set_unichar_id(new_id, i);
945  modified = true;
// If the coordinate for position i is not valid for the ratings matrix,
// grow the band size of the matrix first.
947  const MATRIX_COORD& coord = best_choice->MatrixCoord(i);
948  if (!coord.Valid(*ratings)) {
949  ratings->IncreaseBandSize(coord.row + 1 - coord.col);
950  }
// Make sure the choice list for position i has an entry for new_id.
951  BLOB_CHOICE_LIST* blob_choices = GetBlobChoices(i);
952  if (FindMatchingChoice(new_id, blob_choices) == NULL) {
953  // Insert a fake result.
954  BLOB_CHOICE* blob_choice = new BLOB_CHOICE;
955  blob_choice->set_unichar_id(new_id);
956  BLOB_CHOICE_IT bc_it(blob_choices);
957  bc_it.add_before_then_move(blob_choice);
958  }
959  }
960  }
961  delete class_cb;
962  delete box_cb;
963  return modified;
964 }
965 
966 // Merges 2 adjacent blobs in the result (index and index+1) and corrects
967 // all the data to account for the change.
// Updates reject_map, best_choice, rebuild_word, box_word and best_state.
// NOTE(review): the function header (listing line 968) is missing from this
// extract.
// reject_map only loses an entry while it is the same length as best_choice.
969  if (reject_map.length() == best_choice->length())
970  reject_map.remove_pos(index);
971  best_choice->remove_unichar_id(index + 1);
972  rebuild_word->MergeBlobs(index, index + 2);
973  box_word->MergeBoxes(index, index + 2);
// Fold the state count of index + 1 into index, when that entry exists.
974  if (index + 1 < best_state.length()) {
975  best_state[index] += best_state[index + 1];
976  best_state.remove(index + 1);
977  }
978 }
979 
980 // TODO(tkielbus) Decide between keeping this behavior here or modifying the
981 // training data.
982 
// Helper for fix_quotes.
// Reports whether the UTF-8 character starting at signed_str, which occupies
// length bytes, is a single-quote character: the one-byte ' or `, or one of
// the three-byte curved quotes E2 80 98 / E2 80 99.
// Returns non-zero for a quote and zero for anything else.
static int is_simple_quote(const char* signed_str, int length) {
  // Compare as unsigned bytes so that values above 0x7f match the literals.
  const unsigned char* bytes =
      reinterpret_cast<const unsigned char*>(signed_str);
  if (length == 1) {
    // Standard 1 byte quotes.
    return bytes[0] == '\'' || bytes[0] == '`';
  }
  if (length == 3) {
    // UTF-8 3 byte curved quotes share the first two bytes.
    return bytes[0] == 0xe2 && bytes[1] == 0x80 &&
           (bytes[2] == 0x98 || bytes[2] == 0x99);
  }
  return false;
}
999 
1000 // Callback helper for fix_quotes returns a double quote if both
1001 // arguments are quote, otherwise INVALID_UNICHAR_ID.
// Each id is turned into its text through uch_set and tested with
// is_simple_quote; the returned id is the one uch_set gives for the ASCII
// double-quote character.
// NOTE(review): the function header (listing line 1002) is missing from this
// extract.
1003  const char *ch = uch_set->id_to_unichar(id1);
1004  const char *next_ch = uch_set->id_to_unichar(id2);
1005  if (is_simple_quote(ch, strlen(ch)) &&
1006  is_simple_quote(next_ch, strlen(next_ch)))
1007  return uch_set->unichar_to_id("\"");
1008  return INVALID_UNICHAR_ID;
1009 }
1010 
1011 // Change pairs of quotes to double quotes.
// NOTE(review): listing lines 1012, 1014, 1017 and 1018 are missing from this
// extract, so the condition and the call shown below are incomplete. The
// symbol index at the end of the listing gives the header at line 1012 as
// void fix_quotes().
1013  if (!uch_set->contains_unichar("\"") ||
1015  return; // Don't create it if it is disallowed.
1016 
1019  NULL);
1020 }
1021 
1022 // Callback helper for fix_hyphens returns UNICHAR_ID of - if both
1023 // arguments are hyphen, otherwise INVALID_UNICHAR_ID.
// For this test a hyphen is a unichar whose text is exactly one byte long
// and is either '-' or '~'.
// NOTE(review): the function header (listing line 1024) is missing from this
// extract; the symbol index at the end of the listing gives it as
// UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2).
1025  const char *ch = uch_set->id_to_unichar(id1);
1026  const char *next_ch = uch_set->id_to_unichar(id2);
1027  if (strlen(ch) == 1 && strlen(next_ch) == 1 &&
1028  (*ch == '-' || *ch == '~') && (*next_ch == '-' || *next_ch == '~'))
1029  return uch_set->unichar_to_id("-");
1030  return INVALID_UNICHAR_ID;
1031 }
1032 
1033 // Callback helper for fix_hyphens returns true if box1 and box2 overlap
1034 // (assuming both on the same textline, are in order and a chopped em dash.)
1035 bool WERD_RES::HyphenBoxesOverlap(const TBOX& box1, const TBOX& box2) {
1036  return box1.right() >= box2.left();
1037 }
1038 
1039 // Change pairs of hyphens to a single hyphen if the bounding boxes touch
1040 // Typically a long dash which has been segmented.
// NOTE(review): listing lines 1041 (the function header), 1043 and 1046-1048
// are missing from this extract, so the condition below is incomplete and
// the merging call itself is not shown.
1042  if (!uch_set->contains_unichar("-") ||
1044  return; // Don't create it if it is disallowed.
1045 
1049 }
1050 
1051 // Callback helper for merge_tess_fails returns a space if both
1052 // arguments are space, otherwise INVALID_UNICHAR_ID.
// The space id is looked up in uch_set on every call.
// NOTE(review): the function header (listing line 1053) is missing from this
// extract.
1054  if (id1 == id2 && id1 == uch_set->unichar_to_id(" "))
1055  return id1;
1056  else
1057  return INVALID_UNICHAR_ID;
1058 }
1059 
1060 // Change pairs of tess failures to a single one
// NOTE(review): listing lines 1061-1063 are missing from this extract. They
// held the function header and the opening of the block that is closed at
// listing line 1067.
// The visible part checks that reject_map and box_word have the same length
// as best_choice.
1064  int len = best_choice->length();
1065  ASSERT_HOST(reject_map.length() == len);
1066  ASSERT_HOST(box_word->length() == len);
1067  }
1068 }
1069 
1070 // Returns true if the collection of count pieces, starting at start, are all
1071 // natural connected components, ie there are no real chops involved.
1072 bool WERD_RES::PiecesAllNatural(int start, int count) const {
1073  // all seams must have no splits.
1074  for (int index = start; index < start + count - 1; ++index) {
1075  if (index >= 0 && index < seam_array.size()) {
1076  SEAM* seam = seam_array[index];
1077  if (seam != NULL && seam->HasAnySplits()) return false;
1078  }
1079  }
1080  return true;
1081 }
1082 
1083 
// NOTE(review): the function header (listing line 1084) is missing from this
// extract. The visible body only calls Clear().
1085  Clear();
1086 }
1087 
// Resets the flag, count and size members listed below to their defaults.
// NOTE(review): listing lines 1088 (the function header), 1091, 1093 and
// 1109 are missing from this extract, so some assignments are not shown.
1089  tess_failed = FALSE;
1090  tess_accepted = FALSE;
1092  done = FALSE;
1094  small_caps = false;
1095  odd_size = false;
1096  italic = FALSE;
1097  bold = FALSE;
1098  // The fontinfos and tesseract count as non-pointers as they point to
1099  // data owned elsewhere.
1100  fontinfo = NULL;
1101  fontinfo2 = NULL;
1102  tesseract = NULL;
1103  fontinfo_id_count = 0;
1104  fontinfo_id2_count = 0;
1105  x_height = 0.0;
1106  caps_height = 0.0;
1107  baseline_shift = 0.0f;
1108  guessed_x_ht = TRUE;
1110  combination = FALSE;
1111  part_of_combo = FALSE;
1112  reject_spaces = FALSE;
1113 }
1114 
// Sets every pointer member listed below to NULL. Nothing is deleted here.
// NOTE(review): the function header (listing line 1115) is missing from this
// extract; the symbol index at the end of the listing gives it as
// void InitPointers().
1116  word = NULL;
1117  bln_boxes = NULL;
1118  blob_row = NULL;
1119  uch_set = NULL;
1120  chopped_word = NULL;
1121  rebuild_word = NULL;
1122  box_word = NULL;
1123  ratings = NULL;
1124  best_choice = NULL;
1125  raw_choice = NULL;
1126  ep_choice = NULL;
1127  blamer_bundle = NULL;
1128 }
1129 
// Releases the word (only when this is a combination), the blamer_bundle and
// then all results through ClearResults().
// NOTE(review): the function header (listing line 1130) is missing from this
// extract.
// word is deleted only when combination is set; otherwise the pointer is
// just cleared.
1131  if (word != NULL && combination) {
1132  delete word;
1133  }
1134  word = NULL;
1135  delete blamer_bundle;
1136  blamer_bundle = NULL;
1137  ClearResults();
1138 }
1139 
// Discards the recognition results held by this object: deletes bln_boxes,
// chopped_word, rebuild_word and box_word, empties the per-blob vectors, and
// calls ClearRatings() and ClearWordChoices().
// fontinfo, fontinfo2 and blob_row are only set to NULL, not deleted.
// NOTE(review): listing lines 1140 (the function header), 1165 and 1171 are
// missing from this extract; the symbol index at the end of the listing
// gives the header as void ClearResults().
1141  done = false;
1142  fontinfo = NULL;
1143  fontinfo2 = NULL;
1144  fontinfo_id_count = 0;
1145  fontinfo_id2_count = 0;
1146  if (bln_boxes != NULL) {
1147  delete bln_boxes;
1148  bln_boxes = NULL;
1149  }
1150  blob_row = NULL;
1151  if (chopped_word != NULL) {
1152  delete chopped_word;
1153  chopped_word = NULL;
1154  }
1155  if (rebuild_word != NULL) {
1156  delete rebuild_word;
1157  rebuild_word = NULL;
1158  }
1159  if (box_word != NULL) {
1160  delete box_word;
1161  box_word = NULL;
1162  }
1163  best_state.clear();
1164  correct_text.clear();
1166  seam_array.clear();
1167  blob_widths.clear();
1168  blob_gaps.clear();
1169  ClearRatings();
1170  ClearWordChoices();
1172 }
// Drops the word choices: raw_choice and ep_choice are deleted, the
// best_choices list is cleared, and best_choice is only set to NULL.
// NOTE(review): the function header (listing line 1173) is missing from this
// extract; the symbol index at the end of the listing gives it as
// void ClearWordChoices().
1174  best_choice = NULL;
1175  if (raw_choice != NULL) {
1176  delete raw_choice;
1177  raw_choice = NULL;
1178  }
1179  best_choices.clear();
1180  if (ep_choice != NULL) {
1181  delete ep_choice;
1182  ep_choice = NULL;
1183  }
1184 }
// Deletes the ratings matrix, if there is one, and resets the pointer.
// NOTE(review): listing lines 1185 (the function header) and 1187 are missing
// from this extract, so one statement inside the if is not shown.
1186  if (ratings != NULL) {
1188  delete ratings;
1189  ratings = NULL;
1190  }
1191 }
1192 
1193 
1194 bool PAGE_RES_IT::operator ==(const PAGE_RES_IT &other) const {
1195  return word_res == other.word_res &&
1196  row_res == other.row_res &&
1197  block_res == other.block_res;
1198 }
1199 
1200 int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
1201  ASSERT_HOST(page_res == other.page_res);
1202  if (other.block_res == NULL) {
1203  // other points to the end of the page.
1204  if (block_res == NULL)
1205  return 0;
1206  return -1;
1207  }
1208  if (block_res == NULL) {
1209  return 1; // we point to the end of the page.
1210  }
1211  if (block_res == other.block_res) {
1212  if (other.row_res == NULL || row_res == NULL) {
1213  // this should only happen if we hit an image block.
1214  return 0;
1215  }
1216  if (row_res == other.row_res) {
1217  // we point to the same block and row.
1218  ASSERT_HOST(other.word_res != NULL && word_res != NULL);
1219  if (word_res == other.word_res) {
1220  // we point to the same word!
1221  return 0;
1222  }
1223 
1224  WERD_RES_IT word_res_it(&row_res->word_res_list);
1225  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1226  word_res_it.forward()) {
1227  if (word_res_it.data() == word_res) {
1228  return -1;
1229  } else if (word_res_it.data() == other.word_res) {
1230  return 1;
1231  }
1232  }
1233  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
1234  }
1235 
1236  // we both point to the same block, but different rows.
1237  ROW_RES_IT row_res_it(&block_res->row_res_list);
1238  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1239  row_res_it.forward()) {
1240  if (row_res_it.data() == row_res) {
1241  return -1;
1242  } else if (row_res_it.data() == other.row_res) {
1243  return 1;
1244  }
1245  }
1246  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
1247  }
1248 
1249  // We point to different blocks.
1250  BLOCK_RES_IT block_res_it(&page_res->block_res_list);
1251  for (block_res_it.mark_cycle_pt();
1252  !block_res_it.cycled_list(); block_res_it.forward()) {
1253  if (block_res_it.data() == block_res) {
1254  return -1;
1255  } else if (block_res_it.data() == other.block_res) {
1256  return 1;
1257  }
1258  }
1259  // Shouldn't happen...
1260  ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == NULL);
1261  return 0;
1262 }
1263 
1264 // Inserts the new_word as a combination owned by a corresponding WERD_RES
1265 // before the current position. The simple fields of the WERD_RES are copied
1266 // from clone_res and the resulting WERD_RES is returned for further setup
1267 // with best_choice etc.
// Asserts that the current word_res is found in the current row.
// NOTE(review): listing lines 1268 (first line of the function header) and
// 1286 (the body of the at_first branch) are missing from this extract.
1269  WERD* new_word) {
1270  // Make a WERD_RES for the new_word.
1271  WERD_RES* new_res = new WERD_RES(new_word);
1272  new_res->CopySimpleFields(clone_res);
1273  new_res->combination = true;
1274  // Insert into the appropriate place in the ROW_RES.
1275  WERD_RES_IT wr_it(&row()->word_res_list);
1276  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1277  WERD_RES* word = wr_it.data();
1278  if (word == word_res)
1279  break;
1280  }
1281  ASSERT_HOST(!wr_it.cycled_list());
1282  wr_it.add_before_then_move(new_res);
1283  if (wr_it.at_first()) {
1284  // This is the new first word, so reset the member iterator so it
1285  // detects the cycled_list state correctly.
1287  }
1288  return new_res;
1289 }
1290 
1291 // Helper computes the boundaries between blobs in the word. The blob bounds
1292 // are likely very poor, if they come from LSTM, where it only outputs the
1293 // character at one pixel within it, so we find the midpoints between them.
1294 static void ComputeBlobEnds(const WERD_RES& word, C_BLOB_LIST* next_word_blobs,
1295  GenericVector<int>* blob_ends) {
1296  C_BLOB_IT blob_it(word.word->cblob_list());
1297  for (int i = 0; i < word.best_state.size(); ++i) {
1298  int length = word.best_state[i];
1299  // Get the bounding box of the fake blobs
1300  TBOX blob_box = blob_it.data()->bounding_box();
1301  blob_it.forward();
1302  for (int b = 1; b < length; ++b) {
1303  blob_box += blob_it.data()->bounding_box();
1304  blob_it.forward();
1305  }
1306  // This blob_box is crap, so for now we are only looking for the
1307  // boundaries between them.
1308  int blob_end = MAX_INT32;
1309  if (!blob_it.at_first() || next_word_blobs != NULL) {
1310  if (blob_it.at_first())
1311  blob_it.set_to_list(next_word_blobs);
1312  blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
1313  }
1314  blob_ends->push_back(blob_end);
1315  }
1316 }
1317 
1318 // Replaces the current WERD/WERD_RES with the given words. The given words
1319 // contain fake blobs that indicate the position of the characters. These are
1320 // replaced with real blobs from the current word as much as possible.
// On return words has been cleared, and the replaced WERD_RES has been
// deleted (together with its WERD when it was not a combination).
// NOTE(review): listing lines 1321-1322 (the function header), 1324 and 1445
// are missing from this extract; the symbol index at the end of the listing
// gives the header as
// void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words).
1323  if (words->empty()) {
1325  return;
1326  }
1327  WERD_RES* input_word = word();
1328  // Set the BOL/EOL flags on the words from the input word.
1329  if (input_word->word->flag(W_BOL)) {
1330  (*words)[0]->word->set_flag(W_BOL, true);
1331  } else {
1332  (*words)[0]->word->set_blanks(1);
1333  }
1334  words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
1335 
1336  // Move the blobs from the input word to the new set of words.
1337  // If the input word_res is a combination, then the replacements will also be
1338  // combinations, and will own their own words. If the input word_res is not a
1339  // combination, then the final replacements will not be either, (although it
1340  // is allowed for the input words to be combinations) and their words
1341  // will get put on the row list. This maintains the ownership rules.
1342  WERD_IT w_it(row()->row->word_list());
1343  if (!input_word->combination) {
1344  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1345  WERD* word = w_it.data();
1346  if (word == input_word->word)
1347  break;
1348  }
1349  // w_it is now set to the input_word's word.
1350  ASSERT_HOST(!w_it.cycled_list());
1351  }
1352  // Insert into the appropriate place in the ROW_RES.
1353  WERD_RES_IT wr_it(&row()->word_res_list);
1354  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1355  WERD_RES* word = wr_it.data();
1356  if (word == input_word)
1357  break;
1358  }
1359  ASSERT_HOST(!wr_it.cycled_list());
1360  // Since we only have an estimate of the bounds between blobs, use the blob
1361  // x-middle as the determiner of where to put the blobs
1362  C_BLOB_IT src_b_it(input_word->word->cblob_list());
1363  src_b_it.sort(&C_BLOB::SortByXMiddle);
1364  C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
1365  rej_b_it.sort(&C_BLOB::SortByXMiddle);
1366  for (int w = 0; w < words->size(); ++w) {
1367  WERD_RES* word_w = (*words)[w];
1368  // Compute blob boundaries.
1369  GenericVector<int> blob_ends;
1370  C_BLOB_LIST* next_word_blobs =
1371  w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : NULL;
1372  ComputeBlobEnds(*word_w, next_word_blobs, &blob_ends);
1373  // Delete the fake blobs on the current word.
1374  word_w->word->cblob_list()->clear();
1375  C_BLOB_IT dest_it(word_w->word->cblob_list());
1376  // Build the box word as we move the blobs.
1377  tesseract::BoxWord* box_word = new tesseract::BoxWord;
1378  for (int i = 0; i < blob_ends.size(); ++i) {
1379  int end_x = blob_ends[i];
1380  TBOX blob_box;
1381  // Add the blobs up to end_x.
// Blobs from the main list and from the reject list are both moved into
// the same destination list.
1382  while (!src_b_it.empty() &&
1383  src_b_it.data()->bounding_box().x_middle() < end_x) {
1384  blob_box += src_b_it.data()->bounding_box();
1385  dest_it.add_after_then_move(src_b_it.extract());
1386  src_b_it.forward();
1387  }
1388  while (!rej_b_it.empty() &&
1389  rej_b_it.data()->bounding_box().x_middle() < end_x) {
1390  blob_box += rej_b_it.data()->bounding_box();
1391  dest_it.add_after_then_move(rej_b_it.extract());
1392  rej_b_it.forward();
1393  }
1394  // Clip to the previously computed bounds. Although imperfectly accurate,
1395  // it is good enough, and much more complicated to determine where else
1396  // to clip.
1397  if (i > 0 && blob_box.left() < blob_ends[i - 1])
1398  blob_box.set_left(blob_ends[i - 1]);
1399  if (blob_box.right() > end_x)
1400  blob_box.set_right(end_x);
1401  box_word->InsertBox(i, blob_box);
1402  }
1403  // Fix empty boxes. If a very joined blob sits over multiple characters,
1404  // then we will have some empty boxes from using the middle, so look for
1405  // overlaps.
1406  for (int i = 0; i < box_word->length(); ++i) {
1407  TBOX box = box_word->BlobBox(i);
1408  if (box.null_box()) {
1409  // Nothing has its middle in the bounds of this blob, so use anything
1410  // that overlaps.
// Only the first overlapping blob found is used, clipped to the bounds of
// character i.
1411  for (dest_it.mark_cycle_pt(); !dest_it.cycled_list();
1412  dest_it.forward()) {
1413  TBOX blob_box = dest_it.data()->bounding_box();
1414  if (blob_box.left() < blob_ends[i] &&
1415  (i == 0 || blob_box.right() >= blob_ends[i - 1])) {
1416  if (i > 0 && blob_box.left() < blob_ends[i - 1])
1417  blob_box.set_left(blob_ends[i - 1]);
1418  if (blob_box.right() > blob_ends[i])
1419  blob_box.set_right(blob_ends[i]);
1420  box_word->ChangeBox(i, blob_box);
1421  break;
1422  }
1423  }
1424  }
1425  }
1426  delete word_w->box_word;
1427  word_w->box_word = box_word;
1428  if (!input_word->combination) {
1429  // Insert word_w->word into the ROW. It doesn't own its word, so the
1430  // ROW needs to own it.
1431  w_it.add_before_stay_put(word_w->word);
1432  word_w->combination = false;
1433  }
1434  (*words)[w] = NULL; // We are taking ownership.
1435  wr_it.add_before_stay_put(word_w);
1436  }
1437  // We have taken ownership of the words.
1438  words->clear();
1439  // Delete the current word, which has been replaced. We could just call
1440  // DeleteCurrentWord, but that would iterate both lists again, and we know
1441  // we are already in the right place.
1442  if (!input_word->combination)
1443  delete w_it.extract();
1444  delete wr_it.extract();
1446 }
1447 
1448 // Deletes the current WERD_RES and its underlying WERD.
// Asserts that the current word is not a part_of_combo and that it is found
// in the lists it is removed from. word_res is set to NULL once found.
// NOTE(review): listing lines 1449 (the function header) and 1477 are missing
// from this extract; the symbol index at the end of the listing gives the
// header as void DeleteCurrentWord().
1450  // Check that this word is as we expect. part_of_combos are NEVER iterated
1451  // by the normal iterator, so we should never be trying to delete them.
1452  ASSERT_HOST(!word_res->part_of_combo);
1453  if (!word_res->combination) {
1454  // Combinations own their own word, so we won't find the word on the
1455  // row's word_list, but it is legitimate to try to delete them.
1456  // Delete word from the ROW when not a combination.
1457  WERD_IT w_it(row()->row->word_list());
1458  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1459  if (w_it.data() == word_res->word) {
1460  break;
1461  }
1462  }
1463  ASSERT_HOST(!w_it.cycled_list());
1464  delete w_it.extract();
1465  }
1466  // Remove the WERD_RES for the new_word.
1467  // Remove the WORD_RES from the ROW_RES.
1468  WERD_RES_IT wr_it(&row()->word_res_list);
1469  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1470  if (wr_it.data() == word_res) {
1471  word_res = NULL;
1472  break;
1473  }
1474  }
1475  ASSERT_HOST(!wr_it.cycled_list());
1476  delete wr_it.extract();
1478 }
1479 
1480 // Makes the current word a fuzzy space if not already fuzzy. Updates
1481 // corresponding part of combo if required.
// A word counts as already fuzzy when either W_FUZZY_SP or W_FUZZY_NON is
// set; in that case nothing is changed.
// NOTE(review): the function header (listing line 1482) is missing from this
// extract; the symbol index at the end of the listing gives it as
// void MakeCurrentWordFuzzy().
1483  WERD* real_word = word_res->word;
1484  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
1485  real_word->set_flag(W_FUZZY_SP, true);
1486  if (word_res->combination) {
1487  // The next word should be the corresponding part of combo, but we have
1488  // already stepped past it, so find it by search.
1489  WERD_RES_IT wr_it(&row()->word_res_list);
1490  for (wr_it.mark_cycle_pt();
1491  !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1492  }
1493  wr_it.forward();
1494  ASSERT_HOST(wr_it.data()->part_of_combo);
1495  real_word = wr_it.data()->word;
1496  ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
1497  !real_word->flag(W_FUZZY_NON));
1498  real_word->set_flag(W_FUZZY_SP, true);
1499  }
1500  }
1501 }
1502 
1503 /*************************************************************************
1504  * PAGE_RES_IT::start_page
1505  *
1506  * Set things up at the start of the page
 *
 * Resets the block iterator and every prev/current/next pointer, then calls
 * internal_forward twice and returns the result of the second call.
 * NOTE(review): the banner originally named this restart_page. The header
 * (listing line 1509) is missing from this extract, but the symbol index at
 * the end of the listing gives it as WERD_RES * start_page(bool empty_ok),
 * which matches the empty_ok used in the body.
1507  *************************************************************************/
1508 
1510  block_res_it.set_to_list(&page_res->block_res_list);
1511  block_res_it.mark_cycle_pt();
1512  prev_block_res = NULL;
1513  prev_row_res = NULL;
1514  prev_word_res = NULL;
1515  block_res = NULL;
1516  row_res = NULL;
1517  word_res = NULL;
1518  next_block_res = NULL;
1519  next_row_res = NULL;
1520  next_word_res = NULL;
// The first call starts a new block (new_block is true).
1521  internal_forward(true, empty_ok);
1522  return internal_forward(false, empty_ok);
1523 }
1524 
1525 // Recovers from operations on the current word, such as in InsertCloneWord
1526 // and DeleteCurrentWord.
1527 // Resets the word_res_it so that it is one past the next_word_res, as
1528 // it should be after internal_forward. If next_row_res != row_res,
1529 // then the next_word_res is in the next row, so there is no need to do
1530 // anything to word_res_it, but it is still a good idea to reset the pointers
1531 // word_res and prev_word_res, which are still in the current row.
// In both branches part_of_combo entries are skipped when updating word_res
// and prev_word_res, and prev_word_res is only updated while prev_row_res is
// the current row.
// NOTE(review): the function header (listing line 1532) is missing from this
// extract; the symbol index at the end of the listing gives it as
// void ResetWordIterator().
1533  if (row_res == next_row_res) {
1534  // Reset the member iterator so it can move forward and detect the
1535  // cycled_list state correctly.
1536  word_res_it.move_to_first();
1537  for (word_res_it.mark_cycle_pt();
1538  !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1539  word_res_it.forward()) {
1540  if (!word_res_it.data()->part_of_combo) {
1541  if (prev_row_res == row_res) prev_word_res = word_res;
1542  word_res = word_res_it.data();
1543  }
1544  }
1545  ASSERT_HOST(!word_res_it.cycled_list());
1546  word_res_it.forward();
1547  } else {
1548  // word_res_it is OK, but reset word_res and prev_word_res if needed.
1549  WERD_RES_IT wr_it(&row_res->word_res_list);
1550  for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1551  if (!wr_it.data()->part_of_combo) {
1552  if (prev_row_res == row_res) prev_word_res = word_res;
1553  word_res = wr_it.data();
1554  }
1555  }
1556  }
1557 }
1558 
1559 /*************************************************************************
1560  * PAGE_RES_IT::internal_forward
1561  *
1562  * Find the next word on the page. If empty_ok is true, then non-text blocks
1563  * and text blocks with no text are visited as if they contain a single
1564  * imaginary word in a single imaginary row. (word() and row() both return NULL
1565  * in such a block and the return value is NULL.)
1566  * If empty_ok is false, the old behaviour is maintained. Each real word
1567  * is visited and empty and non-text blocks and rows are skipped.
1568  * new_block is used to initialize the iterators for a new block.
1569  * The iterator maintains pointers to block, row and word for the previous,
1570  * current and next words. These are correct, regardless of block/row
1571  * boundaries. NULL values denote start and end of the page.
 *
 * Returns the new current word_res, which is the value next_word_res held
 * on entry.
 * NOTE(review): listing lines 1624-1625 are missing from this extract, so
 * the statement around listing line 1626 is incomplete as shown.
1572  *************************************************************************/
1573 
1574 WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
1575  bool new_row = false;
1576 
// Shift the window: current becomes previous, next becomes current.
1577  prev_block_res = block_res;
1578  prev_row_res = row_res;
1579  prev_word_res = word_res;
1580  block_res = next_block_res;
1581  row_res = next_row_res;
1582  word_res = next_word_res;
1583  next_block_res = NULL;
1584  next_row_res = NULL;
1585  next_word_res = NULL;
1586 
// Search for the new next word, moving on to further rows and blocks as
// each list is exhausted.
1587  while (!block_res_it.cycled_list()) {
1588  if (new_block) {
1589  new_block = false;
1590  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
1591  row_res_it.mark_cycle_pt();
1592  if (row_res_it.empty() && empty_ok) {
1593  next_block_res = block_res_it.data();
1594  break;
1595  }
1596  new_row = true;
1597  }
1598  while (!row_res_it.cycled_list()) {
1599  if (new_row) {
1600  new_row = false;
1601  word_res_it.set_to_list(&row_res_it.data()->word_res_list);
1602  word_res_it.mark_cycle_pt();
1603  }
1604  // Skip any part_of_combo words.
1605  while (!word_res_it.cycled_list() && word_res_it.data()->part_of_combo)
1606  word_res_it.forward();
1607  if (!word_res_it.cycled_list()) {
1608  next_block_res = block_res_it.data();
1609  next_row_res = row_res_it.data();
1610  next_word_res = word_res_it.data();
1611  word_res_it.forward();
1612  goto foundword;
1613  }
1614  // end of row reached
1615  row_res_it.forward();
1616  new_row = true;
1617  }
1618  // end of block reached
1619  block_res_it.forward();
1620  new_block = true;
1621  }
1622  foundword:
1623  // Update prev_word_best_choice pointer.
1626  (new_block || prev_word_res == NULL) ? NULL : prev_word_res->best_choice;
1627  }
1628  return word_res;
1629 }
1630 
1631 /*************************************************************************
1632  * PAGE_RES_IT::restart_row()
1633  *
1634  * Move to the beginning (leftmost word) of the current row.
 *
 * Returns NULL without moving when there is no current row. Otherwise the
 * page is restarted and stepped forward until the same row is reached again,
 * and the word at that position is returned.
 * NOTE(review): the function header (listing line 1636) is missing from this
 * extract.
1635  *************************************************************************/
1637  ROW_RES *row = this->row();
1638  if (!row) return NULL;
1639  for (restart_page(); this->row() != row; forward()) {
1640  // pass
1641  }
1642  return word();
1643 }
1644 
1645 /*************************************************************************
1646  * PAGE_RES_IT::forward_paragraph
1647  *
1648  * Move to the beginning of the next paragraph, allowing empty blocks.
 *
 * Keeps stepping while the next word is in the same block and its row has
 * the same para() as the current row, then steps once more and returns that
 * result.
 * NOTE(review): the function header (listing line 1651) is missing from this
 * extract.
1649  *************************************************************************/
1650 
1652  while (block_res == next_block_res &&
1653  (next_row_res != NULL && next_row_res->row != NULL &&
1654  row_res->row->para() == next_row_res->row->para())) {
1655  internal_forward(false, true);
1656  }
1657  return internal_forward(false, true);
1658 }
1659 
1660 /*************************************************************************
1661  * PAGE_RES_IT::forward_block
1662  *
1663  * Move to the beginning of the next block, allowing empty blocks.
 *
 * Keeps stepping while the next word is still in the current block, then
 * steps once more and returns that result.
 * NOTE(review): the function header (listing line 1666) is missing from this
 * extract.
1664  *************************************************************************/
1665 
1667  while (block_res == next_block_res) {
1668  internal_forward(false, true);
1669  }
1670  return internal_forward(false, true);
1671 }
1672 
// Adds the character count and reject count of the current word, both taken
// from its reject_map, to the page, block and row totals. When every
// character of the word is rejected, the count is also added to the
// whole_word_rej_count of the row.
// NOTE(review): the function header (listing line 1673) is missing from this
// extract; the symbol index at the end of the listing gives it as
// void rej_stat_word().
1674  inT16 chars_in_word;
1675  inT16 rejects_in_word = 0;
1676 
1677  chars_in_word = word_res->reject_map.length ();
1678  page_res->char_count += chars_in_word;
1679  block_res->char_count += chars_in_word;
1680  row_res->char_count += chars_in_word;
1681 
1682  rejects_in_word = word_res->reject_map.reject_count ();
1683 
1684  page_res->rej_count += rejects_in_word;
1685  block_res->rej_count += rejects_in_word;
1686  row_res->rej_count += rejects_in_word;
1687  if (chars_in_word == rejects_in_word)
1688  row_res->whole_word_rej_count += rejects_in_word;
1689 }
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:343
bool StatesAllValid()
Definition: pageres.cpp:449
BOOL8 tess_accepted
Definition: pageres.h:280
Definition: blobs.h:261
void SetScriptPositions()
Definition: pageres.cpp:853
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:757
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:471
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:356
BLOB_CHOICE_LIST * blob_choices(int index, MATRIX *ratings) const
Definition: ratngs.cpp:268
virtual R Run(A1, A2)=0
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596
void rej_stat_word()
Definition: pageres.cpp:1673
int size() const
Definition: genericvector.h:72
WERD_RES_LIST word_res_list
Definition: pageres.h:131
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
void truncate(int size)
tesseract::BoxWord * box_word
Definition: pageres.h:250
void remove_unichar_id(int index)
Definition: ratngs.h:481
void ClearResults()
Definition: pageres.cpp:1140
BLOCK_RES_LIST block_res_list
Definition: pageres.h:62
float rating() const
Definition: ratngs.h:324
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void SetScriptPositions(bool small_caps, TWERD *word)
Definition: ratngs.cpp:528
void RebuildBestState()
Definition: pageres.cpp:800
inT32 length() const
Definition: rejctmap.h:237
int length() const
Definition: genericvector.h:79
bool HasAnySplits() const
Definition: seam.h:67
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:787
void ClearWordChoices()
Definition: pageres.cpp:1173
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1509
MATRIX * ratings
Definition: pageres.h:215
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:553
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
void ResetWordIterator()
Definition: pageres.cpp:1532
int push_back(T object)
REJMAP reject_map
Definition: pageres.h:271
T get(int column, int row) const
Definition: matrix.h:171
#define ELISTIZE(CLASSNAME)
Definition: elst.h:994
float ascenders() const
Definition: ocrrow.h:79
TWERD * chopped_word
Definition: pageres.h:201
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1321
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:449
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
inT32 char_count
Definition: pageres.h:60
inT8 bold
Definition: pageres.h:286
#define tprintf(...)
Definition: tprintf.h:31
void SetupBlamerBundle()
Definition: pageres.cpp:384
inT32 whole_word_rej_count
Definition: pageres.h:130
const TBOX & BlobBox(int index) const
Definition: boxword.h:88
T & back() const
WERD_RES & operator=(const WERD_RES &source)
Definition: pageres.cpp:178
BOOL8 reject_spaces
Definition: pageres.h:317
void set_right(int x)
Definition: rect.h:78
void MergeBoxes(int start, int end)
Definition: boxword.cpp:134
void set_permuter(uinT8 perm)
Definition: ratngs.h:372
void MergeBlobs(int start, int end)
Definition: blobs.cpp:892
float caps_height
Definition: pageres.h:296
PAGE_RES * page_res
Definition: pageres.h:658
const double kMaxWordSizeRatio
Definition: pageres.cpp:44
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:334
float x_height() const
Definition: ocrrow.h:61
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:375
TBOX bounding_box() const
Definition: werd.cpp:160
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:269
BOOL8 font_assigned
Definition: pageres.h:105
GenericVector< STRING > correct_text
Definition: pageres.h:259
void operator=(const ELIST_LINK &)
Definition: elst.h:101
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:685
const FontInfo * fontinfo
Definition: pageres.h:288
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:119
void fix_quotes()
Definition: pageres.cpp:1012
void ComputeBoundingBoxes()
Definition: blobs.cpp:875
bool IsText() const
Definition: polyblk.h:52
bool PrepareToInsertSeam(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int insert_index, bool modify)
Definition: seam.cpp:82
bool small_caps
Definition: pageres.h:283
void DeleteCurrentWord()
Definition: pageres.cpp:1449
inT16 right() const
Definition: rect.h:75
bool null_box() const
Definition: rect.h:46
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:160
void Init()
Definition: pageres.h:75
void set_left(int x)
Definition: rect.h:71
void put(int column, int row, const T &thing)
Definition: matrix.h:166
float x_height
Definition: pageres.h:295
int dimension() const
Definition: matrix.h:247
void start_seam_list(TWERD *word, GenericVector< SEAM * > *seam_array)
Definition: seam.cpp:269
BLOCK * block
Definition: pageres.h:99
BOOL8 tess_would_adapt
Definition: pageres.h:281
bool dangerous_ambig_found() const
Definition: ratngs.h:360
#define ASSERT_HOST(x)
Definition: errcode.h:84
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:793
Definition: ocrrow.h:32
UNICHAR_ID BothHyphens(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1024
bool HyphenBoxesOverlap(const TBOX &box1, const TBOX &box2)
Definition: pageres.cpp:1035
void ClipToOriginalWord(const BLOCK *block, WERD *original_word)
Definition: boxword.cpp:95
const STRING & unichar_string() const
Definition: ratngs.h:524
Definition: werd.h:35
float x_height
Definition: pageres.h:104
void InitPointers()
Definition: pageres.cpp:1115
void delete_matrix_pointers()
Definition: matrix.h:191
BOOL8 part_of_combo
Definition: pageres.h:316
void DebugTopChoice(const char *msg) const
Definition: pageres.cpp:490
BOOL8 combination
Definition: pageres.h:315
WERD_RES * forward()
Definition: pageres.h:713
int NumBlobs() const
Definition: blobs.h:425
int GetBlobsGap(int blob_index)
Definition: pageres.cpp:732
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:280
float rating() const
Definition: ratngs.h:79
inT8 fontinfo_id_count
Definition: pageres.h:290
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1482
WERD_RES * restart_page()
Definition: pageres.h:680
int state(int index) const
Definition: ratngs.h:316
void insert(T t, int index)
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:872
Definition: werd.h:36
BOOL8 italic
Definition: pageres.h:108
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:440
inT16 left() const
Definition: rect.h:68
void CloneChoppedToRebuild()
Definition: pageres.cpp:828
float certainty() const
Definition: ratngs.h:327
ROW_RES()
Definition: pageres.h:133
TWERD * rebuild_word
Definition: pageres.h:244
bool script_has_xheight() const
Definition: unicharset.h:849
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
inT32 char_count
Definition: pageres.h:100
void delete_data_pointers()
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
Definition: ocrblock.h:30
const double kMaxLineSizeRatio
Definition: pageres.cpp:46
const UNICHARSET * uch_set
Definition: pageres.h:192
UNICHAR_ID BothSpaces(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1053
void InitNonPointers()
Definition: pageres.cpp:1088
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:504
int latin_sid() const
Definition: unicharset.h:833
ROW_RES * row() const
Definition: pageres.h:736
DENORM denorm
Definition: pageres.h:190
GenericVector< int > blob_gaps
Definition: pageres.h:208
bool odd_size
Definition: pageres.h:284
WERD_CHOICE * raw_choice
Definition: pageres.h:224
WERD_RES * forward_block()
Definition: pageres.cpp:1666
WERD_RES * restart_row()
Definition: pageres.cpp:1636
bool Valid(const MATRIX &m) const
Definition: matrix.h:327
void SetupBlobWidthsAndGaps()
Definition: pageres.cpp:391
inT32 rej_count
Definition: pageres.h:61
tesseract::Tesseract * tesseract
Definition: pageres.h:266
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1072
void PrintBestChoices() const
Definition: pageres.cpp:709
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
int cmp(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1200
int UNICHAR_ID
Definition: unichar.h:33
float baseline_shift
Definition: pageres.h:297
PARA * para() const
Definition: ocrrow.h:115
void remove_pos(inT16 pos)
Definition: rejctmap.cpp:365
BOOL8 guessed_x_ht
Definition: pageres.h:292
const FontInfo * fontinfo2
Definition: pageres.h:289
#define MAX_INT32
Definition: host.h:120
void set_script_id(int id)
Definition: werd.h:113
inT32 rej_count
Definition: pageres.h:129
Definition: werd.h:60
inT32 char_count
Definition: pageres.h:128
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:409
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:216
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
Definition: werd.h:44
BOOL8 done
Definition: pageres.h:282
inT8 fontinfo_id2_count
Definition: pageres.h:291
int GetBlobsWidth(int start_blob, int last_blob)
Definition: pageres.cpp:722
WERD * word
Definition: pageres.h:175
bool empty() const
Definition: genericvector.h:84
inT32 rej_count
Definition: pageres.h:101
inT16 height() const
Definition: rect.h:104
ROW_RES_LIST row_res_list
Definition: pageres.h:110
BLOCK_RES()
Definition: pageres.h:112
void add_str_int(const char *str, int number)
Definition: strngs.cpp:376
void remove(int index)
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1268
inT16 font_class
Definition: pageres.h:102
inT16 width() const
Definition: rect.h:111
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:187
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
ROW * blob_row
Definition: pageres.h:186
const int length() const
Definition: boxword.h:85
#define FALSE
Definition: capi.h:29
WERD_CHOICE * ep_choice
Definition: pageres.h:270
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:430
tesseract::BoxWord * bln_boxes
Definition: pageres.h:184
bool ConditionalBlobMerge(TessResultCallback2< UNICHAR_ID, UNICHAR_ID, UNICHAR_ID > *class_cb, TessResultCallback2< bool, const TBOX &, const TBOX & > *box_cb)
Definition: pageres.cpp:932
void reserve(int size)
BOOL8 tess_failed
Definition: pageres.h:272
int count(LIST var_list)
Definition: oldlist.cpp:108
float descenders() const
Definition: ocrrow.h:82
void fix_hyphens()
Definition: pageres.cpp:1041
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:826
ROW * row
Definition: pageres.h:127
inT16 reject_count()
Definition: rejctmap.h:243
float adjust_factor() const
Definition: ratngs.h:303
Definition: rect.h:30
MATRIX * ConsumeAndMakeBigger(int ind)
Definition: matrix.cpp:58
#define TRUE
Definition: capi.h:28
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
const int kWordrecMaxNumJoinChunks
Definition: pageres.cpp:41
int default_sid() const
Definition: unicharset.h:839
void CopyResults(const BlamerBundle &other)
Definition: blamer.h:194
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:860
void InsertBox(int index, const TBOX &box)
Definition: boxword.cpp:151
void SetupBoxWord()
Definition: pageres.cpp:843
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:968
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
PAGE_RES()
Definition: pageres.h:83
static BoxWord * CopyFromNormalized(TWERD *tessword)
Definition: boxword.cpp:59
Definition: matrix.h:289
void print() const
Definition: ratngs.h:563
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
Definition: strngs.h:44
void initialise(inT16 length)
Definition: rejctmap.cpp:318
void move(GenericVector< T > *from)
const BLOCK * block() const
Definition: normalis.h:275
GenericVector< int > best_state
Definition: pageres.h:255
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
inT16 row_count
Definition: pageres.h:103
#define NULL
Definition: host.h:144
bool IsAmbiguous()
Definition: pageres.cpp:443
GenericVector< int > blob_widths
Definition: pageres.h:205
void SetupNormTruthWord(const DENORM &denorm)
Definition: blamer.cpp:145
UNICHAR_ID BothQuotes(UNICHAR_ID id1, UNICHAR_ID id2)
Definition: pageres.cpp:1002
bool operator==(const PAGE_RES_IT &other) const
Definition: pageres.cpp:1194
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:742
WERD_CHOICE ** prev_word_best_choice
Definition: pageres.h:66
Definition: blobs.h:395
Definition: seam.h:44
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: ratngs.cpp:609
TBOX bounding_box() const
Definition: blobs.cpp:482
const double kMaxWordGapRatio
Definition: pageres.cpp:48
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
void copy_on(WERD_RES *word_res)
Definition: pageres.h:641
inT8 italic
Definition: pageres.h:285
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:194
const char * string() const
Definition: strngs.cpp:193
WERD_RES * forward_paragraph()
Definition: pageres.cpp:1651
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
BOOL8 bold
Definition: pageres.h:107
int TotalOfStates() const
Definition: ratngs.cpp:697
void ClearResults()
Definition: blamer.h:173
float certainty() const
Definition: ratngs.h:82
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
void merge_tess_fails()
Definition: pageres.cpp:1061
void FakeWordFromRatings()
Definition: pageres.cpp:892
void BestChoiceToCorrectText()
Definition: pageres.cpp:917
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:751
BlamerBundle * blamer_bundle
Definition: pageres.h:230
void Clear()
Definition: pageres.cpp:1130
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:144
void BLNormalize(const BLOCK *block, const ROW *row, Pix *pix, bool inverse, float x_height, float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint, const TBOX *norm_box, DENORM *word_denorm)
Definition: blobs.cpp:807
BOOL8 guessed_caps_ht
Definition: pageres.h:293
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD_LIST * word_list()
Definition: ocrrow.h:52
WERD_RES * word() const
Definition: pageres.h:733
short inT16
Definition: host.h:100
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
void ClearRatings()
Definition: pageres.cpp:1185
float body_size() const
Definition: ocrrow.h:70
void CopySimpleFields(const WERD_RES &source)
Definition: pageres.cpp:241