All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
chopper.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: chopper.c (Formerly chopper.c)
5  * Description:
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Tue Jul 30 16:18:52 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  **************************************************************************/
25 
26 /*----------------------------------------------------------------------
27  I n c l u d e s
28 ----------------------------------------------------------------------*/
29 
30 #include <math.h>
31 
32 #include "chopper.h"
33 
34 #include "assert.h"
35 #include "associate.h"
36 #include "blobs.h"
37 #include "callcpp.h"
38 #include "const.h"
39 #include "findseam.h"
40 #include "freelist.h"
41 #include "globals.h"
42 #include "render.h"
43 #include "pageres.h"
44 #include "seam.h"
45 #include "stopper.h"
46 #include "structures.h"
47 #include "unicharset.h"
48 #include "wordrec.h"
49 
50 // Include automatically generated configuration file if running autoconf.
51 #ifdef HAVE_CONFIG_H
52 #include "config_auto.h"
53 #endif
54 
55 // Even though the limit on the number of chunks may now be removed, keep
56 // the same limit for repeatable behavior, and it may be a speed advantage.
57 static const int kMaxNumChunks = 64;
58 
59 /*----------------------------------------------------------------------
60  F u n c t i o n s
61 ----------------------------------------------------------------------*/
67 void preserve_outline(EDGEPT *start) {
68  EDGEPT *srcpt;
69 
70  if (start == NULL)
71  return;
72  srcpt = start;
73  do {
74  srcpt->flags[1] = 1;
75  srcpt = srcpt->next;
76  }
77  while (srcpt != start);
78  srcpt->flags[1] = 2;
79 }
80 
81 
82 /**************************************************************************/
84  TESSLINE *outline;
85 
86  for (outline = srcline; outline != NULL; outline = outline->next) {
87  preserve_outline (outline->loop);
88  }
89 }
90 
91 
98  EDGEPT *srcpt;
99  EDGEPT *real_start;
100 
101  if (start == NULL)
102  return NULL;
103  srcpt = start;
104  do {
105  if (srcpt->flags[1] == 2)
106  break;
107  srcpt = srcpt->next;
108  }
109  while (srcpt != start);
110  real_start = srcpt;
111  do {
112  srcpt = srcpt->next;
113  if (srcpt->prev->flags[1] == 0) {
114  remove_edgept(srcpt->prev);
115  }
116  }
117  while (srcpt != real_start);
118  return real_start;
119 }
120 
121 
122 /******************************************************************************/
124  TESSLINE *outline;
125 
126  for (outline = srcline; outline != NULL; outline = outline->next) {
127  outline->loop = restore_outline (outline->loop);
128  outline->start = outline->loop->pos;
129  }
130 }
131 
132 // Helper runs all the checks on a seam to make sure it is valid.
133 // Returns the seam if OK, otherwise deletes the seam and returns NULL.
134 static SEAM* CheckSeam(int debug_level, inT32 blob_number, TWERD* word,
135  TBLOB* blob, TBLOB* other_blob,
136  const GenericVector<SEAM*>& seams, SEAM* seam) {
137  if (seam == NULL || blob->outlines == NULL || other_blob->outlines == NULL ||
138  total_containment(blob, other_blob) || check_blob(other_blob) ||
139  !seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) ||
140  any_shared_split_points(seams, seam) ||
141  !seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) {
142  word->blobs.remove(blob_number + 1);
143  if (seam) {
144  seam->UndoSeam(blob, other_blob);
145  delete seam;
146  seam = NULL;
147 #ifndef GRAPHICS_DISABLED
148  if (debug_level) {
149  if (debug_level >2)
150  display_blob(blob, Red);
151  tprintf("\n** seam being removed ** \n");
152  }
153 #endif
154  } else {
155  delete other_blob;
156  }
157  return NULL;
158  }
159  return seam;
160 }
161 
162 
169 namespace tesseract {
// Tries to split `blob`, which sits at index blob_number of word->blobs, in
// two. A shallow copy of the blob is inserted at blob_number + 1, a seam is
// chosen (a SEAM built from the location reported by divisible_blob() when
// prioritize_division is set, otherwise whatever pick_good_seam() returns),
// applied, and then vetted by CheckSeam().
// Returns the finalized seam, or NULL when no acceptable seam was found.
// NOTE(review): the embedded listing numbers jump 172->175 and 200->204
// inside this definition, and the closing brace at listing line 214 has no
// visible opener, so some original lines appear to be missing from this
// listing - confirm against the real chopper.cpp.
170 SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number,
171  bool italic_blob,
172  const GenericVector<SEAM*>& seams) {
175  TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
176  // Insert it into the word.
177  word->blobs.insert(other_blob, blob_number + 1);
178 
179  SEAM *seam = NULL;
180  if (prioritize_division) {
181  TPOINT location;
182  if (divisible_blob(blob, italic_blob, &location)) {
183  seam = new SEAM(0.0f, location);
184  }
185  }
// No division seam was made, so fall back to pick_good_seam().
186  if (seam == NULL)
187  seam = pick_good_seam(blob);
188  if (chop_debug) {
189  if (seam != NULL)
190  seam->Print("Good seam picked=");
191  else
192  tprintf("\n** no seam picked *** \n");
193  }
194  if (seam) {
195  seam->ApplySeam(italic_blob, blob, other_blob);
196  }
197 
// On rejection CheckSeam() removes the entry at blob_number + 1 from
// word->blobs again and returns NULL.
198  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
199  seams, seam);
200  if (seam == NULL) {
204  // If the blob can simply be divided into outlines, then do that.
205  TPOINT location;
206  if (divisible_blob(blob, italic_blob, &location)) {
// The rejected attempt took the previous copy out of the word, so a new
// copy is made and inserted for this second attempt.
207  other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
208  word->blobs.insert(other_blob, blob_number + 1);
209  seam = new SEAM(0.0f, location);
210  seam->ApplySeam(italic_blob, blob, other_blob);
211  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
212  seams, seam);
213  }
214  }
215  }
216  if (seam != NULL) {
217  // Make sure this seam doesn't get chopped again.
218  seam->Finalize();
219  }
220  return seam;
221 }
222 
223 
// Convenience wrapper: looks up the blob stored at word->blobs[blob_number]
// and forwards everything to attempt_blob_chop(), returning its result.
// NOTE(review): the first line of this definition (listing line 224) is
// absent from this listing; the index at the end of the listing records the
// signature as `SEAM * chop_numbered_blob(TWERD *word, inT32 blob_number,
// bool italic_blob, const GenericVector< SEAM * > &seams)`.
225  bool italic_blob,
226  const GenericVector<SEAM*>& seams) {
227  return attempt_blob_chop(word, word->blobs[blob_number], blob_number,
228  italic_blob, seams);
229 }
230 
231 
// Scans the blobs of word_res->chopped_word in order and tries to chop the
// first suitable one. A blob is a candidate when divisible_blob() accepts it,
// or when its denormalized bounding box overlaps more than one entry of
// `boxes` and is not almost equal to any of them.
// On success returns the seam, with *blob_number holding the index of the
// chopped blob. If no blob could be chopped, sets *blob_number to -1 and
// returns NULL.
// NOTE(review): the first line of this definition (listing line 232) is
// absent from this listing; the index at the end of the listing records the
// signature as `SEAM * chop_overlapping_blob(const GenericVector< TBOX >
// &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)`.
233  bool italic_blob, WERD_RES *word_res,
234  int *blob_number) {
235  TWERD *word = word_res->chopped_word;
236  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
237  TBLOB *blob = word->blobs[*blob_number];
238  TPOINT topleft, botright;
239  topleft.x = blob->bounding_box().left();
240  topleft.y = blob->bounding_box().top();
241  botright.x = blob->bounding_box().right();
242  botright.y = blob->bounding_box().bottom();
243 
// Map both corners through word_res->denorm before comparing with `boxes`.
244  TPOINT original_topleft, original_botright;
245  word_res->denorm.DenormTransform(NULL, topleft, &original_topleft);
246  word_res->denorm.DenormTransform(NULL, botright, &original_botright);
247 
248  TBOX original_box = TBOX(original_topleft.x, original_botright.y,
249  original_botright.x, original_topleft.y);
250 
// Count the boxes overlapped by a fraction above 0.125 and note whether any
// box matches within a tolerance of 3.
251  bool almost_equal_box = false;
252  int num_overlap = 0;
253  for (int i = 0; i < boxes.size(); i++) {
254  if (original_box.overlap_fraction(boxes[i]) > 0.125)
255  num_overlap++;
256  if (original_box.almost_equal(boxes[i], 3))
257  almost_equal_box = true;
258  }
259 
260  TPOINT location;
261  if (divisible_blob(blob, italic_blob, &location) ||
262  (!almost_equal_box && num_overlap > 1)) {
263  SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
264  italic_blob, word_res->seam_array);
265  if (seam != NULL)
266  return seam;
267  }
268  }
269 
// No blob produced a seam.
270  *blob_number = -1;
271  return NULL;
272 }
273 
274 } // namespace tesseract
275 
276 
283  int length;
284  int index;
285 
286  length = seams.size();
287  for (index = 0; index < length; index++)
288  if (seam->SharesPosition(*seams[index])) return TRUE;
289  return FALSE;
290 }
291 
292 
298 int check_blob(TBLOB *blob) {
299  TESSLINE *outline;
300  EDGEPT *edgept;
301 
302  for (outline = blob->outlines; outline != NULL; outline = outline->next) {
303  edgept = outline->loop;
304  do {
305  if (edgept == NULL)
306  break;
307  edgept = edgept->next;
308  }
309  while (edgept != outline->loop);
310  if (edgept == NULL)
311  return 1;
312  }
313  return 0;
314 }
315 
316 
317 namespace tesseract {
// Picks one blob and tries to chop it, retrying with other blobs on failure.
// Each round takes the blob index from select_blob_to_split_from_fixpt() when
// that yields one (clearing fixpt afterwards), and otherwise from
// select_blob_to_split() limited by the current rating ceiling.
// Returns the seam of the first successful chop, with *blob_number holding the
// chopped index. Returns NULL when no candidate is left (*blob_number is -1)
// or when the failed candidate has no BLOB_CHOICE.
// NOTE(review): the first line of this definition (listing line 330) is
// absent from this listing; the index at the end of the listing records the
// signature as `SEAM * improve_one_blob(const GenericVector< BLOB_CHOICE * >
// &blob_choices, DANGERR *fixpt, bool split_next_to_fragment,
// bool italic_blob, WERD_RES *word, int *blob_number)`.
331  DANGERR *fixpt,
332  bool split_next_to_fragment,
333  bool italic_blob,
334  WERD_RES* word,
335  int* blob_number) {
336  float rating_ceiling = MAX_FLOAT32;
337  SEAM *seam = NULL;
338  do {
339  *blob_number = select_blob_to_split_from_fixpt(fixpt);
340  if (chop_debug) tprintf("blob_number from fixpt = %d\n", *blob_number);
341  bool split_point_from_dict = (*blob_number != -1);
342  if (split_point_from_dict) {
343  fixpt->clear();
344  } else {
345  *blob_number = select_blob_to_split(blob_choices, rating_ceiling,
346  split_next_to_fragment);
347  }
348  if (chop_debug) tprintf("blob_number = %d\n", *blob_number);
349  if (*blob_number == -1)
350  return NULL;
351 
352  // TODO(rays) it may eventually help to allow italic_blob to be true,
353  seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob,
354  word->seam_array);
355  if (seam != NULL)
356  return seam; // Success!
357  if (blob_choices[*blob_number] == NULL)
358  return NULL;
359  if (!split_point_from_dict) {
360  // We chopped the worst rated blob, try something else next time.
361  rating_ceiling = blob_choices[*blob_number]->rating();
362  }
363  } while (true);
// Not reached: the loop above only exits through a return statement.
364  return seam;
365 }
366 
// Chops a single blob, choosing the strategy from prioritize_division:
// when set, delegates to chop_overlapping_blob() with italic_blob = true;
// otherwise delegates to improve_one_blob() with no fixpt,
// split_next_to_fragment = false and italic_blob = true.
// Returns whatever the delegate returns; *blob_number is set by the delegate.
// NOTE(review): the first line of this definition (listing line 374) is
// absent from this listing; the index at the end of the listing records the
// signature as `SEAM * chop_one_blob(const GenericVector< TBOX > &boxes,
// const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res,
// int *blob_number)`.
375  const GenericVector<BLOB_CHOICE*>& blob_choices,
376  WERD_RES* word_res,
377  int* blob_number) {
378  if (prioritize_division) {
379  return chop_overlapping_blob(boxes, true, word_res, blob_number);
380  } else {
381  return improve_one_blob(blob_choices, NULL, false, true, word_res,
382  blob_number);
383  }
384 }
385 
// Top-level driver for one word: makes sure the ratings matrix exists and is
// populated, runs SegSearch(), and post-processes the result (fallback word,
// best-state rebuild, hyphen bookkeeping, optional lattice fill, debug print,
// choice filtering).
// NOTE(review): the header line of this definition (listing line 394) is
// absent from this listing; the index at the end of the listing records the
// signature as `void chop_word_main(WERD_RES *word)`.
395  int num_blobs = word->chopped_word->NumBlobs();
// Create the ratings matrix on first use.
396  if (word->ratings == NULL) {
397  word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
398  }
// An empty (0, 0) cell is taken to mean nothing has been classified yet.
399  if (word->ratings->get(0, 0) == NULL) {
400  // Run initial classification.
401  for (int b = 0; b < num_blobs; ++b) {
402  BLOB_CHOICE_LIST* choices = classify_piece(word->seam_array, b, b,
403  "Initial:", word->chopped_word,
404  word->blamer_bundle);
405  word->ratings->put(b, b, choices);
406  }
407  } else {
408  // Blobs have been pre-classified. Set matrix cell for all blob choices
409  for (int col = 0; col < word->ratings->dimension(); ++col) {
// Only rows from col up to col + bandwidth() - 1 are visited.
410  for (int row = col; row < word->ratings->dimension() &&
411  row < col + word->ratings->bandwidth(); ++row) {
412  BLOB_CHOICE_LIST* choices = word->ratings->get(col, row);
413  if (choices != NULL) {
414  BLOB_CHOICE_IT bc_it(choices);
415  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
416  bc_it.data()->set_matrix_cell(col, row);
417  }
418  }
419  }
420  }
421  }
422 
423  // Run Segmentation Search.
424  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
425  SegSearch(word, &best_choice_bundle, word->blamer_bundle);
426 
427  if (word->best_choice == NULL) {
428  // SegSearch found no valid paths, so just use the leading diagonal.
429  word->FakeWordFromRatings();
430  }
431  word->RebuildBestState();
432  // If we finished without a hyphen at the end of the word, let the next word
433  // be found in the dictionary.
434  if (word->word->flag(W_EOL) &&
435  !getDict().has_hyphen_end(*word->best_choice)) {
436  getDict().reset_hyphen_vars(true);
437  }
438 
// The lattice is only filled when both a blamer bundle and a fill_lattice_
// callback are present.
439  if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) {
440  CallFillLattice(*word->ratings, word->best_choices,
441  *word->uch_set, word->blamer_bundle);
442  }
443  if (wordrec_debug_level > 0) {
444  tprintf("Final Ratings Matrix:\n");
445  word->ratings->print(getDict().getUnicharset());
446  }
447  word->FilterWordChoices(getDict().stopper_debug_level);
448 }
449 
// Repeatedly chops one blob at a time via improve_one_blob(). After each
// successful chop it inserts the seam into `word`, adds a matching entry to
// the beam and to `pending`, remaps the pain points, classifies the two new
// blobs, and updates the segmentation-search nodes.
// The loop stops when improve_one_blob() returns NULL, when
// language_model_->AcceptableChoiceFound() is true, or when the ratings
// matrix dimension reaches kMaxNumChunks.
// NOTE(review): listing lines 462, 505, 523, 526-527 and 530 are absent from
// this listing, so the last parameter, the opener of the block closed at
// listing line 510, and parts of the blame logic are not visible here. The
// index at the end of the listing records the last parameter as
// `GenericVector< SegSearchPending > *pending`. Confirm against the real
// chopper.cpp.
457 void Wordrec::improve_by_chopping(float rating_cert_scale,
458  WERD_RES* word,
459  BestChoiceBundle* best_choice_bundle,
460  BlamerBundle* blamer_bundle,
461  LMPainPoints* pain_points,
463  int blob_number;
464  do { // improvement loop.
465  // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
466  // one to chop.
467  GenericVector<BLOB_CHOICE*> blob_choices;
468  int num_blobs = word->ratings->dimension();
469  for (int i = 0; i < num_blobs; ++i) {
470  BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
471  if (choices == NULL || choices->empty()) {
472  blob_choices.push_back(NULL);
473  } else {
// Take the entry a freshly constructed iterator points at.
474  BLOB_CHOICE_IT bc_it(choices);
475  blob_choices.push_back(bc_it.data());
476  }
477  }
478  SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
479  false, false, word, &blob_number);
480  if (seam == NULL) break;
481  // A chop has been made. We have to correct all the data structures to
482  // take into account the extra bottom-level blob.
483  // Put the seam into the seam_array and correct everything else on the
484  // word: ratings matrix (including matrix location in the BLOB_CHOICES),
485  // states in WERD_CHOICEs, and blob widths.
486  word->InsertSeam(blob_number, seam);
487  // Insert a new entry in the beam array.
488  best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
489  // Fixpts are outdated, but will get recalculated.
490  best_choice_bundle->fixpt.clear();
491  // Remap existing pain points.
492  pain_points->RemapForSplit(blob_number);
493  // Insert a new pending at the chop point.
494  pending->insert(SegSearchPending(), blob_number);
495 
496  // Classify the two newly created blobs using ProcessSegSearchPainPoint,
497  // as that updates the pending correctly and adds new pain points.
498  MATRIX_COORD pain_point(blob_number, blob_number);
499  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
500  pain_points, blamer_bundle);
501  pain_point.col = blob_number + 1;
502  pain_point.row = blob_number + 1;
503  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
504  pain_points, blamer_bundle);
// NOTE(review): listing line 505 (presumably the condition guarding the
// reset below) is missing from this listing.
506  // N-gram evaluation depends on the number of blobs in a chunk, so we
507  // have to re-evaluate everything in the word.
508  ResetNGramSearch(word, best_choice_bundle, pending);
509  blob_number = 0;
510  }
511  // Run language model incrementally. (Except with the n-gram model on.)
512  UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
513  word, pain_points, best_choice_bundle, blamer_bundle);
514  } while (!language_model_->AcceptableChoiceFound() &&
515  word->ratings->dimension() < kMaxNumChunks);
516 
517  // If after running only the chopper best_choice is incorrect and no blame
518  // has been yet set, blame the classifier if best_choice is classifier's
519  // top choice and is a dictionary word (i.e. language model could not have
520  // helped). Otherwise blame the tradeoff between the classifier and
521  // the old language model (permuters).
// NOTE(review): the statement below is incomplete in this listing (listing
// lines 523, 526-527 and 530 are missing).
522  if (word->blamer_bundle != NULL &&
524  !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
525  bool valid_permuter = word->best_choice != NULL &&
528  getDict().getUnicharset(),
529  valid_permuter,
531  }
532 }
533 
534 
535 /**********************************************************************
536  * select_blob_to_split
537  *
538  * These are the results of the last classification. Find a likely
539  * place to apply splits. If none, return -1.
540  **********************************************************************/
// Chooses the index of the blob to split next.
//  - The index of the first NULL entry in blob_choices is returned at once.
//  - Otherwise only entries with rating() < rating_ceiling and
//    certainty() < tessedit_certainty_threshold are considered, and the one
//    with the highest rating() is the default pick.
//  - With split_next_to_fragment set, the highest-rated considered entry that
//    sits before a fragment which is not a beginning, or after a fragment
//    which is not an ending, takes precedence over the default pick.
// Returns -1 when no entry qualifies.
// NOTE(review): the first line of this definition (listing line 541) is
// absent from this listing; the index at the end of the listing records the
// signature as `int select_blob_to_split(const GenericVector< BLOB_CHOICE * >
// &blob_choices, float rating_ceiling, bool split_next_to_fragment)`.
542  const GenericVector<BLOB_CHOICE*>& blob_choices,
543  float rating_ceiling, bool split_next_to_fragment) {
544  BLOB_CHOICE *blob_choice;
545  int x;
546  float worst = -MAX_FLOAT32;
547  int worst_index = -1;
548  float worst_near_fragment = -MAX_FLOAT32;
549  int worst_index_near_fragment = -1;
550  const CHAR_FRAGMENT **fragments = NULL;
551 
552  if (chop_debug) {
553  if (rating_ceiling < MAX_FLOAT32)
554  tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
555  else
556  tprintf("rating_ceiling = No Limit\n");
557  }
558 
// The fragments array is only allocated when fragment-aware selection is
// requested; slot 0 is filled here and slot x + 1 is filled while the main
// loop visits x.
559  if (split_next_to_fragment && blob_choices.size() > 0) {
560  fragments = new const CHAR_FRAGMENT *[blob_choices.length()];
561  if (blob_choices[0] != NULL) {
562  fragments[0] = getDict().getUnicharset().get_fragment(
563  blob_choices[0]->unichar_id());
564  } else {
565  fragments[0] = NULL;
566  }
567  }
568 
569  for (x = 0; x < blob_choices.size(); ++x) {
570  if (blob_choices[x] == NULL) {
// A blob without any choice is selected immediately; release the scratch
// array before leaving.
571  if (fragments != NULL) {
572  delete[] fragments;
573  }
574  return x;
575  } else {
576  blob_choice = blob_choices[x];
577  // Populate fragments for the following position.
578  if (split_next_to_fragment && x+1 < blob_choices.size()) {
579  if (blob_choices[x + 1] != NULL) {
580  fragments[x + 1] = getDict().getUnicharset().get_fragment(
581  blob_choices[x + 1]->unichar_id());
582  } else {
583  fragments[x + 1] = NULL;
584  }
585  }
586  if (blob_choice->rating() < rating_ceiling &&
587  blob_choice->certainty() < tessedit_certainty_threshold) {
588  // Update worst and worst_index.
589  if (blob_choice->rating() > worst) {
590  worst_index = x;
591  worst = blob_choice->rating();
592  }
593  if (split_next_to_fragment) {
594  // Update worst_near_fragment and worst_index_near_fragment.
595  bool expand_following_fragment =
596  (x + 1 < blob_choices.size() &&
597  fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
598  bool expand_preceding_fragment =
599  (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
600  if ((expand_following_fragment || expand_preceding_fragment) &&
601  blob_choice->rating() > worst_near_fragment) {
602  worst_index_near_fragment = x;
603  worst_near_fragment = blob_choice->rating();
604  if (chop_debug) {
605  tprintf("worst_index_near_fragment=%d"
606  " expand_following_fragment=%d"
607  " expand_preceding_fragment=%d\n",
608  worst_index_near_fragment,
609  expand_following_fragment,
610  expand_preceding_fragment);
611  }
612  }
613  }
614  }
615  }
616  }
617  if (fragments != NULL) {
618  delete[] fragments;
619  }
620  // TODO(daria): maybe a threshold of badness for
621  // worst_near_fragment would be useful.
622  return worst_index_near_fragment != -1 ?
623  worst_index_near_fragment : worst_index;
624 }
625 
626 /**********************************************************************
627  * select_blob_to_split_from_fixpt
628  *
629  * Given the fix point from a dictionary search, if there is a single
630  * dangerous blob that maps to multiple characters, return that blob
631  * index as a place we need to split. If none, return -1.
632  **********************************************************************/
// Returns the `begin` index of the first fixpt entry that spans exactly one
// blob (begin + 1 == end) and has both `dangerous` and `correct_is_ngram`
// set. Returns -1 when fixpt is NULL or holds no such entry.
// NOTE(review): the header line of this definition (listing line 633) is
// absent from this listing; the index at the end of the listing records the
// signature as `int select_blob_to_split_from_fixpt(DANGERR *fixpt)`.
634  if (!fixpt)
635  return -1;
636  for (int i = 0; i < fixpt->size(); i++) {
637  if ((*fixpt)[i].begin + 1 == (*fixpt)[i].end &&
638  (*fixpt)[i].dangerous &&
639  (*fixpt)[i].correct_is_ngram) {
640  return (*fixpt)[i].begin;
641  }
642  }
643  return -1;
644 }
645 
646 
647 } // namespace tesseract
648 
649 
650 /**********************************************************************
651  * total_containment
652  *
653  * Check to see if one of these outlines is totally contained within
654  * the bounding box of the other.
655  **********************************************************************/
657  TBOX box1 = blob1->bounding_box();
658  TBOX box2 = blob2->bounding_box();
659  return box1.contains(box2) || box2.contains(box1);
660 }
Definition: blobs.h:261
int select_blob_to_split(const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment)
Definition: chopper.cpp:541
void preserve_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:83
int size() const
Definition: genericvector.h:72
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:934
int repair_unchopped_blobs
Definition: wordrec.h:137
void RebuildBestState()
Definition: pageres.cpp:800
int length() const
Definition: genericvector.h:79
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:111
MATRIX * ratings
Definition: pageres.h:215
void chop_word_main(WERD_RES *word)
Definition: chopper.cpp:394
WERD_CHOICE * best_choice
Definition: pageres.h:219
int push_back(T object)
T get(int column, int row) const
Definition: matrix.h:171
inT16 total_containment(TBLOB *blob1, TBLOB *blob2)
Definition: chopper.cpp:656
EDGEPT * restore_outline(EDGEPT *start)
Definition: chopper.cpp:97
TWERD * chopped_word
Definition: pageres.h:201
EDGEPT * prev
Definition: blobs.h:170
#define tprintf(...)
Definition: tprintf.h:31
TPOINT start
Definition: blobs.h:255
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
void Print(const char *label) const
Definition: seam.cpp:160
SEAM * improve_one_blob(const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
Definition: chopper.cpp:330
void remove_edgept(EDGEPT *point)
Definition: split.cpp:203
inT16 y
Definition: blobs.h:72
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
bool prioritize_division
Definition: classify.h:387
void preserve_outline(EDGEPT *start)
Definition: chopper.cpp:67
TESSLINE * next
Definition: blobs.h:258
double tessedit_certainty_threshold
Definition: wordrec.h:138
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:262
bool PrepareToInsertSeam(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int insert_index, bool modify)
Definition: seam.cpp:82
void DenormTransform(const DENORM *last_denorm, const TPOINT &pt, TPOINT *original) const
Definition: normalis.cpp:389
void ApplySeam(bool italic_blob, TBLOB *blob, TBLOB *other_blob) const
Definition: seam.cpp:124
inT16 right() const
Definition: rect.h:75
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
void put(int column, int row, const T &thing)
Definition: matrix.h:166
int dimension() const
Definition: matrix.h:247
int wordrec_max_join_chunks
Definition: wordrec.h:164
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
Definition: blamer.cpp:369
Definition: callcpp.h:35
bool wordrec_debug_blamer
Definition: wordrec.h:167
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:37
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
int NumBlobs() const
Definition: blobs.h:425
int check_blob(TBLOB *blob)
Definition: chopper.cpp:298
float rating() const
Definition: ratngs.h:79
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:447
void insert(T t, int index)
SEAM * pick_good_seam(TBLOB *blob)
Definition: findseam.cpp:216
Definition: werd.h:36
inT16 left() const
Definition: rect.h:68
LanguageModel * language_model_
Definition: wordrec.h:411
const UNICHARSET * uch_set
Definition: pageres.h:192
SEAM * chop_overlapping_blob(const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:232
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:504
Definition: blobs.h:50
DENORM denorm
Definition: pageres.h:190
uinT8 permuter() const
Definition: ratngs.h:343
EDGEPT * next
Definition: blobs.h:169
bool ContainedByBlob(const TBLOB &blob) const
Definition: seam.h:79
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:57
Dict & getDict()
Definition: classify.h:65
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
inT16 x
Definition: blobs.h:71
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
Definition: segsearch.cpp:325
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
Definition: chopper.cpp:170
SEAM * chop_numbered_blob(TWERD *word, inT32 blob_number, bool italic_blob, const GenericVector< SEAM * > &seams)
Definition: chopper.cpp:224
void UndoSeam(TBLOB *blob, TBLOB *other_blob) const
Definition: seam.cpp:140
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:409
bool is_ending() const
Definition: unicharset.h:102
inT16 bottom() const
Definition: rect.h:61
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
Definition: lm_state.h:231
void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:195
WERD * word
Definition: pageres.h:175
int bandwidth() const
Definition: matrix.h:249
int any_shared_split_points(const GenericVector< SEAM * > &seams, SEAM *seam)
Definition: chopper.cpp:282
PointerVector< LanguageModelState > beam
Definition: lm_state.h:235
void remove(int index)
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:194
void display_blob(TBLOB *blob, C_COL color)
Definition: render.cpp:64
void RemapForSplit(int index)
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
char flags[EDGEPTFLAGS]
Definition: blobs.h:168
bool SharesPosition(const SEAM &other) const
Definition: seam.h:95
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
#define FALSE
Definition: capi.h:29
TPOINT pos
Definition: blobs.h:163
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
static TBLOB * ShallowCopy(const TBLOB &src)
Definition: blobs.cpp:352
Definition: blobs.h:76
Definition: rect.h:30
#define TRUE
Definition: capi.h:28
void improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
Definition: chopper.cpp:457
#define MAX_FLOAT32
Definition: host.h:124
int wordrec_debug_level
Definition: wordrec.h:162
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
Definition: matrix.h:289
bool contains(const FCOORD pt) const
Definition: rect.h:323
Struct to store information maintained by various language model components.
Definition: lm_state.h:197
#define NULL
Definition: host.h:144
void Finalize()
Definition: seam.h:116
Definition: blobs.h:395
Definition: seam.h:44
TBOX bounding_box() const
Definition: blobs.cpp:482
TESSLINE * outlines
Definition: blobs.h:377
bool allow_blob_division
Definition: classify.h:382
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:142
EDGEPT * loop
Definition: blobs.h:257
inT16 top() const
Definition: rect.h:54
float certainty() const
Definition: ratngs.h:82
void restore_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:123
void FakeWordFromRatings()
Definition: pageres.cpp:892
int select_blob_to_split_from_fixpt(DANGERR *fixpt)
Definition: chopper.cpp:633
BlamerBundle * blamer_bundle
Definition: pageres.h:230
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:374
Bundle together all the things pertaining to the best choice/state.
Definition: lm_state.h:219
short inT16
Definition: host.h:100
int inT32
Definition: host.h:102
bool is_beginning() const
Definition: unicharset.h:99