tesseract  4.00.00dev
ratngs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: ratngs.cpp (Formerly ratings.c)
3  * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes.
4  * Author: Ray Smith
5  * Created: Thu Apr 23 13:23:29 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #include "ratngs.h"
26 
27 #include <string>
28 #include "blobs.h"
29 #include "callcpp.h"
30 #include "genericvector.h"
31 #include "matrix.h"
32 #include "normalis.h" // kBlnBaselineOffset.
33 #include "unicharset.h"
34 
36 
39 
40 const float WERD_CHOICE::kBadRating = 100000.0;
41 // Min offset in baseline-normalized coords to make a character a subscript.
42 const int kMinSubscriptOffset = 20;
43 // Min offset in baseline-normalized coords to make a character a superscript.
44 const int kMinSuperscriptOffset = 20;
45 // Max y of bottom of a drop-cap blob.
46 const int kMaxDropCapBottom = -128;
47 // Max fraction of x-height to use as denominator in measuring x-height overlap.
48 const double kMaxOverlapDenominator = 0.125;
49 // Min fraction of x-height range that should be in agreement for matching
50 // x-heights.
51 const double kMinXHeightMatch = 0.5;
52 // Max tolerance on baseline position as a fraction of x-height for matching
53 // baselines.
54 const double kMaxBaselineDrift = 0.0625;
55 
// Human-readable names of the permuter types.
static const char kPermuterTypeNoPerm[] = "None";
static const char kPermuterTypePuncPerm[] = "Punctuation";
static const char kPermuterTypeTopPerm[] = "Top Choice";
static const char kPermuterTypeLowerPerm[] = "Top Lower Case";
static const char kPermuterTypeUpperPerm[] = "Top Upper Case";
static const char kPermuterTypeNgramPerm[] = "Ngram";
static const char kPermuterTypeNumberPerm[] = "Number";
static const char kPermuterTypeUserPatPerm[] = "User Pattern";
static const char kPermuterTypeSysDawgPerm[] = "System Dictionary";
static const char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
static const char kPermuterTypeUserDawgPerm[] = "User Dictionary";
static const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
static const char kPermuterTypeCompoundPerm[] = "Compound";

// Lookup table from a numeric permuter code to its name; the position of an
// entry is the code it describes.
// NOTE(review): presumably the order mirrors the permuter enum declared in
// ratngs.h -- confirm before inserting or reordering entries.
static const char * const kPermuterTypeNames[] = {
  kPermuterTypeNoPerm,        // 0
  kPermuterTypePuncPerm,      // 1
  kPermuterTypeTopPerm,       // 2
  kPermuterTypeLowerPerm,     // 3
  kPermuterTypeUpperPerm,     // 4
  kPermuterTypeNgramPerm,     // 5
  kPermuterTypeNumberPerm,    // 6
  kPermuterTypeUserPatPerm,   // 7
  kPermuterTypeSysDawgPerm,   // 8
  kPermuterTypeDocDawgPerm,   // 9
  kPermuterTypeUserDawgPerm,  // 10
  kPermuterTypeFreqDawgPerm,  // 11
  kPermuterTypeCompoundPerm   // 12
};
85 
91 BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
92  float src_rating, // rating
93  float src_cert, // certainty
94  int src_script_id, // script
95  float min_xheight, // min xheight allowed
96  float max_xheight, // max xheight by this char
97  float yshift, // yshift out of position
98  BlobChoiceClassifier c) { // adapted match or other
99  unichar_id_ = src_unichar_id;
100  rating_ = src_rating;
101  certainty_ = src_cert;
102  fontinfo_id_ = -1;
103  fontinfo_id2_ = -1;
104  script_id_ = src_script_id;
105  min_xheight_ = min_xheight;
106  max_xheight_ = max_xheight;
107  yshift_ = yshift;
108  classifier_ = c;
109 }
110 
// Copies every field listed below from `other` into *this: unichar id,
// rating, certainty, both font-info ids, script id, matrix cell, x-height
// limits, y-shift, classifier and fonts.
// NOTE(review): the line carrying this definition's signature is absent from
// this listing; presumably this is the BLOB_CHOICE copy constructor -- confirm.
117  unichar_id_ = other.unichar_id();
118  rating_ = other.rating();
119  certainty_ = other.certainty();
120  fontinfo_id_ = other.fontinfo_id();
121  fontinfo_id2_ = other.fontinfo_id2();
122  script_id_ = other.script_id();
123  matrix_cell_ = other.matrix_cell_;
124  min_xheight_ = other.min_xheight_;
125  max_xheight_ = other.max_xheight_;
126  yshift_ = other.yshift();
127  classifier_ = other.classifier_;
128  fonts_ = other.fonts_;
129 }
130 
131 // Returns true if *this and other agree on the baseline and x-height
132 // to within some tolerance based on a given estimate of the x-height.
133 bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
134  bool debug) const {
135  double baseline_diff = fabs(yshift() - other.yshift());
136  if (baseline_diff > kMaxBaselineDrift * x_height) {
137  if (debug) {
138  tprintf("Baseline diff %g for %d v %d\n",
139  baseline_diff, unichar_id_, other.unichar_id_);
140  }
141  return false;
142  }
143  double this_range = max_xheight() - min_xheight();
144  double other_range = other.max_xheight() - other.min_xheight();
145  double denominator = ClipToRange(MIN(this_range, other_range),
146  1.0, kMaxOverlapDenominator * x_height);
147  double overlap = MIN(max_xheight(), other.max_xheight()) -
148  MAX(min_xheight(), other.min_xheight());
149  overlap /= denominator;
150  if (debug) {
151  tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n",
152  unichar_id_, other.unichar_id_, baseline_diff,
153  this_range, other_range, denominator, overlap);
154  }
155 
156  return overlap >= kMinXHeightMatch;
157 }
158 
159 // Helper to find the BLOB_CHOICE in the bc_list that matches the given
160 // unichar_id, or NULL if there is no match.
162  BLOB_CHOICE_LIST* bc_list) {
163  // Find the corresponding best BLOB_CHOICE.
164  BLOB_CHOICE_IT choice_it(bc_list);
165  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
166  choice_it.forward()) {
167  BLOB_CHOICE* choice = choice_it.data();
168  if (choice->unichar_id() == char_id) {
169  return choice;
170  }
171  }
172  return NULL;
173 }
174 
175 const char *WERD_CHOICE::permuter_name(uinT8 permuter) {
176  return kPermuterTypeNames[permuter];
177 }
178 
179 namespace tesseract {
180 
181 const char *ScriptPosToString(enum ScriptPos script_pos) {
182  switch (script_pos) {
183  case SP_NORMAL: return "NORM";
184  case SP_SUBSCRIPT: return "SUB";
185  case SP_SUPERSCRIPT: return "SUPER";
186  case SP_DROPCAP: return "DROPC";
187  }
188  return "SP_UNKNOWN";
189 }
190 
191 } // namespace tesseract.
192 
199 WERD_CHOICE::WERD_CHOICE(const char *src_string,
200  const UNICHARSET &unicharset)
201  : unicharset_(&unicharset){
202  GenericVector<UNICHAR_ID> encoding;
203  GenericVector<char> lengths;
204  string cleaned = unicharset.CleanupString(src_string);
205  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
206  NULL)) {
207  lengths.push_back('\0');
208  STRING src_lengths = &lengths[0];
209  this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
210  } else { // There must have been an invalid unichar in the string.
211  this->init(8);
212  this->make_bad();
213  }
214 }
215 
226 void WERD_CHOICE::init(const char *src_string,
227  const char *src_lengths,
228  float src_rating,
229  float src_certainty,
230  uinT8 src_permuter) {
231  int src_string_len = strlen(src_string);
232  if (src_string_len == 0) {
233  this->init(8);
234  } else {
235  this->init(src_lengths ? strlen(src_lengths): src_string_len);
236  length_ = reserved_;
237  int offset = 0;
238  for (int i = 0; i < length_; ++i) {
239  int unichar_length = src_lengths ? src_lengths[i] : 1;
240  unichar_ids_[i] =
241  unicharset_->unichar_to_id(src_string+offset, unichar_length);
242  state_[i] = 1;
243  certainties_[i] = src_certainty;
244  offset += unichar_length;
245  }
246  }
247  adjust_factor_ = 1.0f;
248  rating_ = src_rating;
249  certainty_ = src_certainty;
250  permuter_ = src_permuter;
251  dangerous_ambig_found_ = false;
252 }
253 
258  delete[] unichar_ids_;
259  delete[] script_pos_;
260  delete[] state_;
261  delete[] certainties_;
262 }
263 
264 const char *WERD_CHOICE::permuter_name() const {
265  return kPermuterTypeNames[permuter_];
266 }
267 
268 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
269 // taken from the appropriate cell in the ratings MATRIX.
270 // Borrowed pointer, so do not delete.
271 BLOB_CHOICE_LIST* WERD_CHOICE::blob_choices(int index, MATRIX* ratings) const {
272  MATRIX_COORD coord = MatrixCoord(index);
273  BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row);
274  if (result == NULL) {
275  result = new BLOB_CHOICE_LIST;
276  ratings->put(coord.col, coord.row, result);
277  }
278  return result;
279 }
280 
281 // Returns the MATRIX_COORD corresponding to the location in the ratings
282 // MATRIX for the given index into the word.
284  int col = 0;
285  for (int i = 0; i < index; ++i)
286  col += state_[i];
287  int row = col + state_[index] - 1;
288  return MATRIX_COORD(col, row);
289 }
290 
291 // Sets the entries for the given index from the BLOB_CHOICE, assuming
292 // unit fragment lengths, but setting the state for this index to blob_count.
293 void WERD_CHOICE::set_blob_choice(int index, int blob_count,
294  const BLOB_CHOICE* blob_choice) {
295  unichar_ids_[index] = blob_choice->unichar_id();
296  script_pos_[index] = tesseract::SP_NORMAL;
297  state_[index] = blob_count;
298  certainties_[index] = blob_choice->certainty();
299 }
300 
301 
308  for (int i = 0; i < length_; ++i) {
309  if (unichar_ids_[i] == unichar_id) {
310  return true;
311  }
312  }
313  return false;
314 }
315 
323 void WERD_CHOICE::remove_unichar_ids(int start, int num) {
324  ASSERT_HOST(start >= 0 && start + num <= length_);
325  // Accumulate the states to account for the merged blobs.
326  for (int i = 0; i < num; ++i) {
327  if (start > 0)
328  state_[start - 1] += state_[start + i];
329  else if (start + num < length_)
330  state_[start + num] += state_[start + i];
331  }
332  for (int i = start; i + num < length_; ++i) {
333  unichar_ids_[i] = unichar_ids_[i + num];
334  script_pos_[i] = script_pos_[i + num];
335  state_[i] = state_[i + num];
336  certainties_[i] = certainties_[i + num];
337  }
338  length_ -= num;
339 }
340 
347  for (int i = 0; i < length_ / 2; ++i) {
348  UNICHAR_ID tmp_id = unichar_ids_[i];
349  unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
350  unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
351  }
352  if (length_ % 2 != 0) {
353  unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]);
354  }
355 }
356 
364 void WERD_CHOICE::punct_stripped(int *start, int *end) const {
365  *start = 0;
366  *end = length() - 1;
367  while (*start < length() &&
368  unicharset()->get_ispunctuation(unichar_id(*start))) {
369  (*start)++;
370  }
371  while (*end > -1 &&
372  unicharset()->get_ispunctuation(unichar_id(*end))) {
373  (*end)--;
374  }
375  (*end)++;
376 }
377 
// Computes a span [*pstart, *pend) of the word by trimming qualifying
// unichars first from the end and then from the start. As visible here, a
// unichar qualifies only if it is a digit.
// NOTE(review): both `while` conditions end in `&&` with no final operand in
// this listing (source lines 382 and 388 are absent). Judging by the function
// name, the missing operand presumably tests for a superscript position --
// confirm against the real file.
378 void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
379  int end = length();
380  while (end > 0 &&
381  unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
383  end--;
384  }
385  int start = 0;
386  while (start < end &&
387  unicharset_->get_isdigit(unichar_ids_[start]) &&
389  start++;
390  }
391  *pstart = start;
392  *pend = end;
393 }
394 
// Returns a new WERD_CHOICE built from the characters in [start, end) of
// this word. Both bounds are asserted to lie in [0, length_]; an inverted
// range (end < start) is treated as empty.
// NOTE(review): the first line of the statement inside the loop (source line
// 401) is absent from this listing; only its argument list is visible. The
// arguments are the unichar id, state, a literal 0.0f and the certainty of
// character i.
395 WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
396  ASSERT_HOST(start >= 0 && start <= length_);
397  ASSERT_HOST(end >= 0 && end <= length_);
398  if (end < start) { end = start; }
399  WERD_CHOICE retval(unicharset_, end - start);
400  for (int i = start; i < end; i++) {
402  unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
403  }
404  return retval;
405 }
406 
// Scans the word's unichars and returns true as soon as one has a direction
// accepted by the test below; returns false if none does.
// NOTE(review): the signature line (source line 412) and the second operand
// of the direction test (source line 417) are absent from this listing. Only
// the U_RIGHT_TO_LEFT comparison is visible here.
413  int i;
414  for (i = 0; i < length_; ++i) {
415  UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
416  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
418  return true;
419  }
420  }
421  return false;
422 }
423 
431  STRING *word_lengths_str) const {
432  *word_str = "";
433  if (word_lengths_str != NULL) *word_lengths_str = "";
434  for (int i = 0; i < length_; ++i) {
435  const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
436  *word_str += ch;
437  if (word_lengths_str != NULL) {
438  *word_lengths_str += strlen(ch);
439  }
440  }
441 }
442 
450  UNICHAR_ID unichar_id, int blob_count,
451  float rating, float certainty) {
452  if (length_ == reserved_) {
453  this->double_the_size();
454  }
455  this->append_unichar_id_space_allocated(unichar_id, blob_count,
456  rating, certainty);
457 }
458 
467  ASSERT_HOST(unicharset_ == second.unicharset_);
468  while (reserved_ < length_ + second.length()) {
469  this->double_the_size();
470  }
471  const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
472  for (int i = 0; i < second.length(); ++i) {
473  unichar_ids_[length_ + i] = other_unichar_ids[i];
474  state_[length_ + i] = second.state_[i];
475  certainties_[length_ + i] = second.certainties_[i];
476  script_pos_[length_ + i] = second.BlobPosition(i);
477  }
478  length_ += second.length();
479  if (second.adjust_factor_ > adjust_factor_)
480  adjust_factor_ = second.adjust_factor_;
481  rating_ += second.rating(); // add ratings
482  if (second.certainty() < certainty_) // take min
483  certainty_ = second.certainty();
484  if (second.dangerous_ambig_found_)
485  dangerous_ambig_found_ = true;
486  if (permuter_ == NO_PERM) {
487  permuter_ = second.permuter();
488  } else if (second.permuter() != NO_PERM &&
489  second.permuter() != permuter_) {
490  permuter_ = COMPOUND_PERM;
491  }
492  return *this;
493 }
494 
495 
503  while (reserved_ < source.length()) {
504  this->double_the_size();
505  }
506 
507  unicharset_ = source.unicharset_;
508  const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
509  for (int i = 0; i < source.length(); ++i) {
510  unichar_ids_[i] = other_unichar_ids[i];
511  state_[i] = source.state_[i];
512  certainties_[i] = source.certainties_[i];
513  script_pos_[i] = source.BlobPosition(i);
514  }
515  length_ = source.length();
516  adjust_factor_ = source.adjust_factor_;
517  rating_ = source.rating();
518  certainty_ = source.certainty();
519  min_x_height_ = source.min_x_height();
520  max_x_height_ = source.max_x_height();
521  permuter_ = source.permuter();
522  dangerous_ambig_found_ = source.dangerous_ambig_found_;
523  return *this;
524 }
525 
526 // Sets up the script_pos_ member using the blobs of word to get the bln
527 // bounding boxes, *this to get the unichars, and this->unicharset
528 // to get the target positions. If small_caps is true, sub/super are not
529 // considered, but dropcaps are.
530 // NOTE: word should be the chopped_word blobs. (Fully segmented.)
// Every position is first reset to SP_NORMAL; nothing more is done when word
// has no blobs or its blob count differs from TotalOfStates().
531 void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD* word) {
532  // Since WERD_CHOICE isn't supposed to depend on a Tesseract,
533  // we don't have easy access to the flags Tesseract stores. Therefore, debug
534  // for this module is hard compiled in.
535  int debug = 0;
536 
537  // Initialize to normal.
538  for (int i = 0; i < length_; ++i)
539  script_pos_[i] = tesseract::SP_NORMAL;
540  if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
541  return;
542  }
543 
  // One counter per script position, indexed directly by the ScriptPos value.
  // NOTE(review): assumes ScriptPos has at most four values -- confirm.
544  int position_counts[4];
545  for (int i = 0; i < 4; i++) {
546  position_counts[i] = 0;
547  }
548 
  // blob_index walks the characters of the word; chunk_index walks the blobs
  // of word, several of which may belong to one character.
549  int chunk_index = 0;
550  for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
551  TBLOB* tblob = word->blobs[chunk_index];
552  int uni_id = unichar_id(blob_index);
553  TBOX blob_box = tblob->bounding_box();
  // Extend the box over the remaining blobs that make up this character.
554  if (state_ != NULL) {
555  for (int i = 1; i < state_[blob_index]; ++i) {
556  ++chunk_index;
557  tblob = word->blobs[chunk_index];
558  blob_box += tblob->bounding_box();
559  }
560  }
561  script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
562  uni_id);
563  if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
564  script_pos_[blob_index] = tesseract::SP_NORMAL;
565  }
566  position_counts[script_pos_[blob_index]]++;
567  }
568  // If almost everything looks like a superscript or subscript,
569  // we most likely just got the baseline wrong.
570  if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
571  position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
572  if (debug >= 2) {
573  tprintf("Most characters of %s are subscript or superscript.\n"
574  "That seems wrong, so I'll assume we got the baseline wrong\n",
575  unichar_string().string());
576  }
577  for (int i = 0; i < length_; i++) {
578  ScriptPos sp = script_pos_[i];
  // NOTE(review): source line 579 is absent from this listing. The brace
  // count shows it opens a block, presumably an `if` on sp -- confirm.
580  position_counts[sp]--;
581  position_counts[tesseract::SP_NORMAL]++;
582  script_pos_[i] = tesseract::SP_NORMAL;
583  }
584  }
585  }
586 
  // Debug output only; unreachable while debug is hard-coded to 0.
587  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
588  debug >= 2) {
589  tprintf("SetScriptPosition on %s\n", unichar_string().string());
590  int chunk_index = 0;
591  for (int blob_index = 0; blob_index < length_; ++blob_index) {
592  if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
593  TBLOB* tblob = word->blobs[chunk_index];
594  ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
595  unichar_id(blob_index));
596  }
597  chunk_index += state_ != NULL ? state_[blob_index] : 1;
598  }
599  }
600 }
601 // Sets the script_pos_ member from some source positions with a given length.
// `length` must equal length_ (asserted). When `positions` is a different
// array from script_pos_, script_pos_ is freed, reallocated with exactly
// `length` entries and filled with memcpy; otherwise nothing is done.
// NOTE(review): the first line of the signature (source line 602) is absent
// from this listing.
// NOTE(review): the new array holds `length` entries rather than reserved_.
// If reserved_ is larger, later writes that rely on reserved_ capacity could
// overrun script_pos_ -- confirm against double_the_size() and the append
// paths.
603  int length) {
604  ASSERT_HOST(length == length_);
605  if (positions != script_pos_) {
606  delete [] script_pos_;
607  script_pos_ = new ScriptPos[length];
608  memcpy(script_pos_, positions, sizeof(positions[0]) * length);
609  }
610 }
611 // Sets all the script_pos_ positions to the given position.
613  for (int i = 0; i < length_; ++i)
614  script_pos_[i] = position;
615 }
616 
// Classifies a blob's script position from its baseline-normalized bounding
// box and the expected top/bottom ranges of unichar_id in unicharset.
// Tests are applied in this order:
//   - bottom <= kMaxDropCapBottom gives SP_DROPCAP
//   - top and bottom both below the subscript thresholds gives SP_SUBSCRIPT
//   - bottom above the superscript threshold gives SP_SUPERSCRIPT
// Otherwise retval keeps its initial value. If print_debug is true, the
// decision and the thresholds are printed with tprintf.
// NOTE(review): source lines 618, 621 and 622 are absent from this listing:
// the start of the signature, its last parameter, and the declaration and
// initial value of retval. Presumably retval starts as SP_NORMAL -- confirm.
617 /* static */
619  const UNICHARSET& unicharset,
620  const TBOX& blob_box,
623  int top = blob_box.top();
624  int bottom = blob_box.bottom();
625  int min_bottom, max_bottom, min_top, max_top;
626  unicharset.get_top_bottom(unichar_id,
627  &min_bottom, &max_bottom,
628  &min_top, &max_top);
629 
  // Thresholds: the subscript tests use the expected minimum top and the
  // baseline offset; the superscript test uses the expected maximum bottom.
630  int sub_thresh_top = min_top - kMinSubscriptOffset;
631  int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
632  int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
633  if (bottom <= kMaxDropCapBottom) {
634  retval = tesseract::SP_DROPCAP;
635  } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
636  retval = tesseract::SP_SUBSCRIPT;
637  } else if (bottom > sup_thresh_bot) {
638  retval = tesseract::SP_SUPERSCRIPT;
639  }
640 
641  if (print_debug) {
642  const char *pos = ScriptPosToString(retval);
643  tprintf("%s Character %s[bot:%d top: %d] "
644  "bot_range[%d,%d] top_range[%d, %d] "
645  "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
646  pos, unicharset.id_to_unichar(unichar_id),
647  bottom, top,
648  min_bottom, max_bottom, min_top, max_top,
649  sub_thresh_bot, sub_thresh_top,
650  sup_thresh_bot);
651  }
652  return retval;
653 }
654 
655 // Returns the script-id (eg Han) of the dominant script in the word.
657  int max_script = unicharset_->get_script_table_size();
658  int *sid = new int[max_script];
659  int x;
660  for (x = 0; x < max_script; x++) sid[x] = 0;
661  for (x = 0; x < length_; ++x) {
662  int script_id = unicharset_->get_script(unichar_id(x));
663  sid[script_id]++;
664  }
665  if (unicharset_->han_sid() != unicharset_->null_sid()) {
666  // Add the Hiragana & Katakana counts to Han and zero them out.
667  if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
668  sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
669  sid[unicharset_->hiragana_sid()] = 0;
670  }
671  if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
672  sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
673  sid[unicharset_->katakana_sid()] = 0;
674  }
675  }
676  // Note that high script ID overrides lower one on a tie, thus biasing
677  // towards non-Common script (if sorted that way in unicharset file).
678  int max_sid = 0;
679  for (x = 1; x < max_script; x++)
680  if (sid[x] >= sid[max_sid]) max_sid = x;
681  if (sid[max_sid] < length_ / 2)
682  max_sid = unicharset_->null_sid();
683  delete[] sid;
684  return max_sid;
685 }
686 
687 // Fixes the state_ for a chop at the given blob_posiiton.
688 void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
689  int total_chunks = 0;
690  for (int i = 0; i < length_; ++i) {
691  total_chunks += state_[i];
692  if (total_chunks > blob_position) {
693  ++state_[i];
694  return;
695  }
696  }
697 }
698 
699 // Returns the sum of all the state elements, being the total number of blobs.
701  int total_chunks = 0;
702  for (int i = 0; i < length_; ++i) {
703  total_chunks += state_[i];
704  }
705  return total_chunks;
706 }
707 
713 void WERD_CHOICE::print(const char *msg) const {
714  tprintf("%s : ", msg);
715  for (int i = 0; i < length_; ++i) {
716  tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
717  }
718  tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
719  rating_, certainty_, adjust_factor_, permuter_,
720  min_x_height_, max_x_height_, dangerous_ambig_found_);
721  tprintf("pos");
722  for (int i = 0; i < length_; ++i) {
723  tprintf("\t%s", ScriptPosToString(script_pos_[i]));
724  }
725  tprintf("\nstr");
726  for (int i = 0; i < length_; ++i) {
727  tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
728  }
729  tprintf("\nstate:");
730  for (int i = 0; i < length_; ++i) {
731  tprintf("\t%d ", state_[i]);
732  }
733  tprintf("\nC");
734  for (int i = 0; i < length_; ++i) {
735  tprintf("\t%.3f", certainties_[i]);
736  }
737  tprintf("\n");
738 }
739 
740 // Prints the segmentation state with an introductory message.
741 void WERD_CHOICE::print_state(const char *msg) const {
742  tprintf("%s", msg);
743  for (int i = 0; i < length_; ++i)
744  tprintf(" %d", state_[i]);
745  tprintf("\n");
746 }
747 
748 // Displays the segmentation state of *this (if not the same as the last
749 // one displayed) and waits for a click in the window.
751 #ifndef GRAPHICS_DISABLED
752  // Number of different colors to draw with.
753  const int kNumColors = 6;
754  static ScrollView *segm_window = NULL;
755  // Check the state against the static prev_drawn_state.
756  static GenericVector<int> prev_drawn_state;
757  bool already_done = prev_drawn_state.size() == length_;
758  if (!already_done) prev_drawn_state.init_to_size(length_, 0);
759  for (int i = 0; i < length_; ++i) {
760  if (prev_drawn_state[i] != state_[i]) {
761  already_done = false;
762  }
763  prev_drawn_state[i] = state_[i];
764  }
765  if (already_done || word->blobs.empty()) return;
766 
767  // Create the window if needed.
768  if (segm_window == NULL) {
769  segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
770  2000.0, 256.0, true);
771  } else {
772  segm_window->Clear();
773  }
774 
775  TBOX bbox;
776  int blob_index = 0;
777  for (int c = 0; c < length_; ++c) {
778  ScrollView::Color color =
779  static_cast<ScrollView::Color>(c % kNumColors + 3);
780  for (int i = 0; i < state_[c]; ++i, ++blob_index) {
781  TBLOB* blob = word->blobs[blob_index];
782  bbox += blob->bounding_box();
783  blob->plot(segm_window, color, color);
784  }
785  }
786  segm_window->ZoomToRectangle(bbox.left(), bbox.top(),
787  bbox.right(), bbox.bottom());
788  segm_window->Update();
789  window_wait(segm_window);
790 #endif
791 }
792 
793 
795  const WERD_CHOICE &word2) {
796  const UNICHARSET *uchset = word1.unicharset();
797  if (word2.unicharset() != uchset) return false;
798  int w1start, w1end;
799  word1.punct_stripped(&w1start, &w1end);
800  int w2start, w2end;
801  word2.punct_stripped(&w2start, &w2end);
802  if (w1end - w1start != w2end - w2start) return false;
803  for (int i = 0; i < w1end - w1start; i++) {
804  if (uchset->to_lower(word1.unichar_id(w1start + i)) !=
805  uchset->to_lower(word2.unichar_id(w2start + i))) {
806  return false;
807  }
808  }
809  return true;
810 }
811 
822 void print_ratings_list(const char *msg,
823  BLOB_CHOICE_LIST *ratings,
824  const UNICHARSET &current_unicharset) {
825  if (ratings->length() == 0) {
826  tprintf("%s:<none>\n", msg);
827  return;
828  }
829  if (*msg != '\0') {
830  tprintf("%s\n", msg);
831  }
832  BLOB_CHOICE_IT c_it;
833  c_it.set_to_list(ratings);
834  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
835  c_it.data()->print(&current_unicharset);
836  if (!c_it.at_last()) tprintf("\n");
837  }
838  tprintf("\n");
839  fflush(stdout);
840 }
uinT8 permuter() const
Definition: ratngs.h:342
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:283
bool empty() const
Definition: genericvector.h:91
#define MIN(x, y)
Definition: ndminx.h:28
static const float kBadRating
Definition: ratngs.h:271
int script_id() const
Definition: ratngs.h:111
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:383
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:296
void remove_unichar_ids(int index, int num)
Definition: ratngs.cpp:323
int han_sid() const
Definition: unicharset.h:887
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:567
void put(ICOORD pos, const T &thing)
Definition: matrix.h:219
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:256
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:822
#define MAX(x, y)
Definition: ndminx.h:24
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
float yshift() const
Definition: ratngs.h:129
void init(int reserved)
Definition: ratngs.h:405
int GetTopScriptID() const
Definition: ratngs.cpp:656
float min_xheight() const
Definition: ratngs.h:123
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:161
void print_state(const char *msg) const
Definition: ratngs.cpp:741
static void Update()
Definition: scrollview.cpp:715
const UNICHARSET * unicharset() const
Definition: ratngs.h:296
int katakana_sid() const
Definition: unicharset.h:889
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:688
void SetScriptPositions(bool small_caps, TWERD *word)
Definition: ratngs.cpp:531
float rating() const
Definition: ratngs.h:323
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:122
float certainty() const
Definition: ratngs.h:326
int size() const
Definition: genericvector.h:72
~WERD_CHOICE()
Definition: ratngs.cpp:257
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
void print() const
Definition: ratngs.h:576
static string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: ratngs.cpp:612
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:449
inT16 fontinfo_id2() const
Definition: ratngs.h:88
#define tprintf(...)
Definition: tprintf.h:31
int hiragana_sid() const
Definition: unicharset.h:888
int TotalOfStates() const
Definition: ratngs.cpp:700
uint8_t uinT8
Definition: host.h:35
float min_x_height() const
Definition: ratngs.h:332
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
const char * permuter_name() const
Definition: ratngs.cpp:264
WERD_CHOICE(const UNICHARSET *unicharset)
Definition: ratngs.h:274
T get(ICOORD pos) const
Definition: matrix.h:227
const char * string() const
Definition: strngs.cpp:198
Definition: blobs.h:395
bool has_rtl_unichar_id() const
Definition: ratngs.cpp:412
inT16 top() const
Definition: rect.h:54
void Clear()
Definition: scrollview.cpp:595
BlobChoiceClassifier
Definition: ratngs.h:40
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:308
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:689
const double kMinXHeightMatch
Definition: ratngs.cpp:51
BLOB_CHOICE_LIST * blob_choices(int index, MATRIX *ratings) const
Definition: ratngs.cpp:271
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:448
inT16 bottom() const
Definition: rect.h:61
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
UNICHARSET unicharset_
Definition: strngs.h:45
Definition: rect.h:30
Definition: matrix.h:570
inT16 left() const
Definition: rect.h:68
Definition: blobs.h:261
WERD_CHOICE & operator+=(const WERD_CHOICE &second)
Definition: ratngs.cpp:466
void reverse_and_mirror_unichar_ids()
Definition: ratngs.cpp:346
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:318
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:307
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:703
char window_wait(ScrollView *win)
Definition: callcpp.cpp:111
inT16 fontinfo_id() const
Definition: ratngs.h:85
int NumBlobs() const
Definition: blobs.h:425
const int kMaxDropCapBottom
Definition: ratngs.cpp:46
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:133
TBOX bounding_box() const
Definition: blobs.cpp:482
float rating() const
Definition: ratngs.h:79
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:524
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:293
#define ELISTIZE(CLASSNAME)
Definition: elst.h:961
const int kMinSuperscriptOffset
Definition: ratngs.cpp:44
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:430
const int kBlnBaselineOffset
Definition: normalis.h:29
int length() const
Definition: ratngs.h:299
float max_xheight() const
Definition: ratngs.h:126
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:364
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:378
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:765
static tesseract::ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
Definition: ratngs.cpp:618
inT16 right() const
Definition: rect.h:75
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:439
int get_script_table_size() const
Definition: unicharset.h:848
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:794
const int kMinSubscriptOffset
Definition: ratngs.cpp:42
void init_to_size(int size, T t)
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:696
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:750
WERD_CHOICE & operator=(const WERD_CHOICE &source)
Definition: ratngs.cpp:502
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:395
float certainty() const
Definition: ratngs.h:82
BLOB_CHOICE()
Definition: ratngs.h:51
int null_sid() const
Definition: unicharset.h:882
float max_x_height() const
Definition: ratngs.h:335
const double kMaxBaselineDrift
Definition: ratngs.cpp:54
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:662
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288
int UNICHAR_ID
Definition: unichar.h:35
const double kMaxOverlapDenominator
Definition: ratngs.cpp:48
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:181
const STRING & unichar_string() const
Definition: ratngs.h:537