All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
adaptmatch.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: adaptmatch.cpp
3  ** Purpose: High level adaptive matcher.
4  ** Author: Dan Johnson
5  ** History: Mon Mar 11 10:00:10 1991, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 
19 /*-----------------------------------------------------------------------------
20  Include Files and Type Defines
21 -----------------------------------------------------------------------------*/
22 #ifdef HAVE_CONFIG_H
23 #include "config_auto.h"
24 #endif
25 
26 #include <ctype.h>
27 #include "shapeclassifier.h"
28 #include "ambigs.h"
29 #include "blobclass.h"
30 #include "blobs.h"
31 #include "callcpp.h"
32 #include "classify.h"
33 #include "const.h"
34 #include "dict.h"
35 #include "efio.h"
36 #include "emalloc.h"
37 #include "featdefs.h"
38 #include "float2int.h"
39 #include "genericvector.h"
40 #include "globals.h"
41 #include "helpers.h"
42 #include "intfx.h"
43 #include "intproto.h"
44 #include "mfoutline.h"
45 #include "ndminx.h"
46 #include "normfeat.h"
47 #include "normmatch.h"
48 #include "outfeat.h"
49 #include "pageres.h"
50 #include "params.h"
51 #include "picofeat.h"
52 #include "shapetable.h"
53 #include "tessclassifier.h"
54 #include "trainingsample.h"
55 #include "unicharset.h"
56 #include "werd.h"
57 
58 #include <stdio.h>
59 #include <string.h>
60 #include <stdlib.h>
61 #include <math.h>
62 #ifdef __UNIX__
63 #include <assert.h>
64 #endif
65 
66 #define ADAPT_TEMPLATE_SUFFIX ".a"
67 
68 #define MAX_MATCHES 10
69 #define UNLIKELY_NUM_FEAT 200
70 #define NO_DEBUG 0
71 #define MAX_ADAPTABLE_WERD_SIZE 40
72 
73 #define ADAPTABLE_WERD_ADJUSTMENT (0.05)
74 
75 #define Y_DIM_OFFSET (Y_SHIFT - BASELINE_Y_SHIFT)
76 
77 #define WORST_POSSIBLE_RATING (0.0f)
78 
81 
82 struct ADAPT_RESULTS {
90 
93  inline void Initialize() {
94  BlobLength = MAX_INT32;
95  HasNonfragment = false;
96  ComputeBest();
97  }
98  // Computes best_unichar_id, best_match_index and best_rating.
99  void ComputeBest() {
100  best_unichar_id = INVALID_UNICHAR_ID;
101  best_match_index = -1;
102  best_rating = WORST_POSSIBLE_RATING;
103  for (int i = 0; i < match.size(); ++i) {
104  if (match[i].rating > best_rating) {
105  best_rating = match[i].rating;
106  best_unichar_id = match[i].unichar_id;
107  best_match_index = i;
108  }
109  }
110  }
111 };
112 
113 struct PROTO_KEY {
116  int ConfigId;
117 };
118 
119 /*-----------------------------------------------------------------------------
120  Private Macros
121 -----------------------------------------------------------------------------*/
// Returns true when the match error (1 - confidence) is strictly greater
// than matcher_great_threshold, i.e. the match is only marginal.
inline bool MarginalMatch(float confidence, float matcher_great_threshold) {
  const float match_error = 1.0f - confidence;
  return matcher_great_threshold < match_error;
}
125 
126 /*-----------------------------------------------------------------------------
127  Private Function Prototypes
128 -----------------------------------------------------------------------------*/
129 // Returns the index of the given id in results, if present, or the size of the
130 // vector (index it will go at) if not present.
131 static int FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS& results) {
132  for (int i = 0; i < results.match.size(); i++) {
133  if (results.match[i].unichar_id == id)
134  return i;
135  }
136  return results.match.size();
137 }
138 
139 // Returns the current rating for a unichar id if we have rated it, defaulting
140 // to WORST_POSSIBLE_RATING.
141 static float ScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS& results) {
142  int index = FindScoredUnichar(id, results);
143  if (index >= results.match.size()) return WORST_POSSIBLE_RATING;
144  return results.match[index].rating;
145 }
146 
147 void InitMatcherRatings(register FLOAT32 *Rating);
148 
149 int MakeTempProtoPerm(void *item1, void *item2);
150 
151 void SetAdaptiveThreshold(FLOAT32 Threshold);
152 
153 
154 /*-----------------------------------------------------------------------------
155  Public Code
156 -----------------------------------------------------------------------------*/
157 /*---------------------------------------------------------------------------*/
158 namespace tesseract {
185 void Classify::AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices) {
186  assert(Choices != NULL);
187  ADAPT_RESULTS *Results = new ADAPT_RESULTS;
188  Results->Initialize();
189 
191 
192  DoAdaptiveMatch(Blob, Results);
193 
194  RemoveBadMatches(Results);
196  RemoveExtraPuncs(Results);
197  Results->ComputeBest();
198  ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results,
199  Choices);
200 
201  // TODO(rays) Move to before ConvertMatchesToChoices!
202  if (LargeSpeckle(*Blob) || Choices->length() == 0)
203  AddLargeSpeckleTo(Results->BlobLength, Choices);
204 
205  if (matcher_debug_level >= 1) {
206  tprintf("AD Matches = ");
207  PrintAdaptiveMatchResults(*Results);
208  }
209 
210 #ifndef GRAPHICS_DISABLED
212  DebugAdaptiveClassifier(Blob, Results);
213 #endif
214 
215  delete Results;
216 } /* AdaptiveClassifier */
217 
218 // If *win is NULL, sets it to a new ScrollView() object with title msg.
219 // Clears the window and draws baselines.
220 void Classify::RefreshDebugWindow(ScrollView **win, const char *msg,
221  int y_offset, const TBOX &wbox) {
222  #ifndef GRAPHICS_DISABLED
223  const int kSampleSpaceWidth = 500;
224  if (*win == NULL) {
225  *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200,
226  kSampleSpaceWidth * 2, 200, true);
227  }
228  (*win)->Clear();
229  (*win)->Pen(64, 64, 64);
230  (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset,
231  kSampleSpaceWidth, kBlnBaselineOffset);
232  (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset,
233  kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset);
234  (*win)->ZoomToRectangle(wbox.left(), wbox.top(),
235  wbox.right(), wbox.bottom());
236  #endif // GRAPHICS_DISABLED
237 }
238 
239 // Learns the given word using its chopped_word, seam_array, denorm,
240 // box_word, best_state, and correct_text to learn both correctly and
241 // incorrectly segmented blobs. If fontname is not NULL, then LearnBlob
242 // is called and the data will be saved in an internal buffer.
243 // Otherwise AdaptToBlob is called for adaption within a document.
244 void Classify::LearnWord(const char* fontname, WERD_RES* word) {
245  int word_len = word->correct_text.size();
246  if (word_len == 0) return;
247 
248  float* thresholds = NULL;
249  if (fontname == NULL) {
250  // Adaption mode.
251  if (!EnableLearning || word->best_choice == NULL)
252  return; // Can't or won't adapt.
253 
255  tprintf("\n\nAdapting to word = %s\n",
256  word->best_choice->debug_string().string());
257  thresholds = new float[word_len];
261  matcher_rating_margin, thresholds);
262  }
263  int start_blob = 0;
264 
265  #ifndef GRAPHICS_DISABLED
267  if (learn_fragmented_word_debug_win_ != NULL) {
268  window_wait(learn_fragmented_word_debug_win_);
269  }
270  RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
271  word->chopped_word->bounding_box());
272  RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
273  word->chopped_word->bounding_box());
274  word->chopped_word->plot(learn_fragmented_word_debug_win_);
276  }
277  #endif // GRAPHICS_DISABLED
278 
279  for (int ch = 0; ch < word_len; ++ch) {
281  tprintf("\nLearning %s\n", word->correct_text[ch].string());
282  }
283  if (word->correct_text[ch].length() > 0) {
284  float threshold = thresholds != NULL ? thresholds[ch] : 0.0f;
285 
286  LearnPieces(fontname, start_blob, word->best_state[ch], threshold,
287  CST_WHOLE, word->correct_text[ch].string(), word);
288 
289  if (word->best_state[ch] > 1 && !disable_character_fragments) {
290  // Check that the character breaks into meaningful fragments
291  // that each match a whole character with at least
292  // classify_character_fragments_garbage_certainty_threshold
293  bool garbage = false;
294  int frag;
295  for (frag = 0; frag < word->best_state[ch]; ++frag) {
296  TBLOB* frag_blob = word->chopped_word->blobs[start_blob + frag];
298  garbage |= LooksLikeGarbage(frag_blob);
299  }
300  }
301  // Learn the fragments.
302  if (!garbage) {
303  bool pieces_all_natural = word->PiecesAllNatural(start_blob,
304  word->best_state[ch]);
305  if (pieces_all_natural || !prioritize_division) {
306  for (frag = 0; frag < word->best_state[ch]; ++frag) {
307  GenericVector<STRING> tokens;
308  word->correct_text[ch].split(' ', &tokens);
309 
310  tokens[0] = CHAR_FRAGMENT::to_string(
311  tokens[0].string(), frag, word->best_state[ch],
312  pieces_all_natural);
313 
314  STRING full_string;
315  for (int i = 0; i < tokens.size(); i++) {
316  full_string += tokens[i];
317  if (i != tokens.size() - 1)
318  full_string += ' ';
319  }
320  LearnPieces(fontname, start_blob + frag, 1, threshold,
321  CST_FRAGMENT, full_string.string(), word);
322  }
323  }
324  }
325  }
326 
327  // TODO(rays): re-enable this part of the code when we switch to the
328  // new classifier that needs to see examples of garbage.
329  /*
330  if (word->best_state[ch] > 1) {
331  // If the next blob is good, make junk with the rightmost fragment.
332  if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
333  LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
334  word->best_state[ch + 1] + 1,
335  threshold, CST_IMPROPER, INVALID_UNICHAR, word);
336  }
337  // If the previous blob is good, make junk with the leftmost fragment.
338  if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
339  LearnPieces(fontname, start_blob - word->best_state[ch - 1],
340  word->best_state[ch - 1] + 1,
341  threshold, CST_IMPROPER, INVALID_UNICHAR, word);
342  }
343  }
344  // If the next blob is good, make a join with it.
345  if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
346  STRING joined_text = word->correct_text[ch];
347  joined_text += word->correct_text[ch + 1];
348  LearnPieces(fontname, start_blob,
349  word->best_state[ch] + word->best_state[ch + 1],
350  threshold, CST_NGRAM, joined_text.string(), word);
351  }
352  */
353  }
354  start_blob += word->best_state[ch];
355  }
356  delete [] thresholds;
357 } // LearnWord.
358 
359 // Builds a blob of length fragments, from the word, starting at start,
360 // and then learns it, as having the given correct_text.
361 // If fontname is not NULL, then LearnBlob is called and the data will be
362 // saved in an internal buffer for static training.
363 // Otherwise AdaptToBlob is called for adaption within a document.
364 // threshold is a magic number required by AdaptToChar and generated by
365 // ComputeAdaptionThresholds.
366 // Although it can be partly inferred from the string, segmentation is
367 // provided to explicitly clarify the character segmentation.
368 void Classify::LearnPieces(const char* fontname, int start, int length,
369  float threshold, CharSegmentationType segmentation,
370  const char* correct_text, WERD_RES* word) {
371  // TODO(daria) Remove/modify this if/when we want
372  // to train and/or adapt to n-grams.
373  if (segmentation != CST_WHOLE &&
374  (segmentation != CST_FRAGMENT || disable_character_fragments))
375  return;
376 
377  if (length > 1) {
378  SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start,
379  start + length - 1);
380  }
381  TBLOB* blob = word->chopped_word->blobs[start];
382  // Rotate the blob if needed for classification.
383  TBLOB* rotated_blob = blob->ClassifyNormalizeIfNeeded();
384  if (rotated_blob == NULL)
385  rotated_blob = blob;
386 
387  #ifndef GRAPHICS_DISABLED
388  // Draw debug windows showing the blob that is being learned if needed.
389  if (strcmp(classify_learn_debug_str.string(), correct_text) == 0) {
390  RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600,
391  word->chopped_word->bounding_box());
392  rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
393  learn_debug_win_->Update();
394  window_wait(learn_debug_win_);
395  }
396  if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
397  ASSERT_HOST(learn_fragments_debug_win_ != NULL); // set up in LearnWord
398  blob->plot(learn_fragments_debug_win_,
400  learn_fragments_debug_win_->Update();
401  }
402  #endif // GRAPHICS_DISABLED
403 
404  if (fontname != NULL) {
405  classify_norm_method.set_value(character); // force char norm spc 30/11/93
406  tess_bn_matching.set_value(false); // turn it off
407  tess_cn_matching.set_value(false);
408  DENORM bl_denorm, cn_denorm;
409  INT_FX_RESULT_STRUCT fx_info;
411  &bl_denorm, &cn_denorm, &fx_info);
412  LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
413  } else if (unicharset.contains_unichar(correct_text)) {
414  UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
415  int font_id = word->fontinfo != NULL
416  ? fontinfo_table_.get_id(*word->fontinfo)
417  : 0;
419  tprintf("Adapting to char = %s, thr= %g font_id= %d\n",
420  unicharset.id_to_unichar(class_id), threshold, font_id);
421  // If filename is not NULL we are doing recognition
422  // (as opposed to training), so we must have already set word fonts.
423  AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates);
424  if (BackupAdaptedTemplates != NULL) {
425  // Adapt the backup templates too. They will be used if the primary gets
426  // too full.
427  AdaptToChar(rotated_blob, class_id, font_id, threshold,
429  }
430  } else if (classify_debug_level >= 1) {
431  tprintf("Can't adapt to %s not in unicharset\n", correct_text);
432  }
433  if (rotated_blob != blob) {
434  delete rotated_blob;
435  }
436 
437  SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start,
438  start + length - 1);
439 } // LearnPieces.
440 
441 /*---------------------------------------------------------------------------*/
457  STRING Filename;
458  FILE *File;
459 
460  if (AdaptedTemplates != NULL &&
462  Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
463  File = fopen (Filename.string(), "wb");
464  if (File == NULL)
465  cprintf ("Unable to save adapted templates to %s!\n", Filename.string());
466  else {
467  cprintf ("\nSaving adapted templates to %s ...", Filename.string());
468  fflush(stdout);
470  cprintf ("\n");
471  fclose(File);
472  }
473  }
474 
475  if (AdaptedTemplates != NULL) {
478  }
479  if (BackupAdaptedTemplates != NULL) {
482  }
483 
484  if (PreTrainedTemplates != NULL) {
487  }
489  FreeNormProtos();
490  if (AllProtosOn != NULL) {
495  AllProtosOn = NULL;
496  AllConfigsOn = NULL;
499  }
500  delete shape_table_;
501  shape_table_ = NULL;
502  if (static_classifier_ != NULL) {
503  delete static_classifier_;
504  static_classifier_ = NULL;
505  }
506 } /* EndAdaptiveClassifier */
507 
508 
509 /*---------------------------------------------------------------------------*/
527 void Classify::InitAdaptiveClassifier(bool load_pre_trained_templates) {
529  return;
530  if (AllProtosOn != NULL)
531  EndAdaptiveClassifier(); // Don't leak with multiple inits.
532 
533  // If there is no language_data_path_prefix, the classifier will be
534  // adaptive only.
535  if (language_data_path_prefix.length() > 0 &&
536  load_pre_trained_templates) {
540  if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded inttemp\n");
541 
546  tprintf("Error loading shape table!\n");
547  delete shape_table_;
548  shape_table_ = NULL;
549  } else if (tessdata_manager.DebugLevel() > 0) {
550  tprintf("Successfully loaded shape table!\n");
551  }
552  }
553 
558  CharNormCutoffs);
559  if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded pffmtable\n");
560 
562  NormProtos =
565  if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded normproto\n");
566  static_classifier_ = new TessClassifier(false, this);
567  }
568 
570  InitIntegerFX();
571 
579 
580  for (int i = 0; i < MAX_NUM_CLASSES; i++) {
581  BaselineCutoffs[i] = 0;
582  }
583 
585  FILE *File;
586  STRING Filename;
587 
588  Filename = imagefile;
589  Filename += ADAPT_TEMPLATE_SUFFIX;
590  File = fopen(Filename.string(), "rb");
591  if (File == NULL) {
593  } else {
594  cprintf("\nReading pre-adapted templates from %s ...\n",
595  Filename.string());
596  fflush(stdout);
598  cprintf("\n");
599  fclose(File);
601 
602  for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
603  BaselineCutoffs[i] = CharNormCutoffs[i];
604  }
605  }
606  } else {
607  if (AdaptedTemplates != NULL)
610  }
611 } /* InitAdaptiveClassifier */
612 
615  tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n",
616  NumAdaptationsFailed);
617  }
623  NumAdaptationsFailed = 0;
624 }
625 
626 // If there are backup adapted templates, switches to those, otherwise resets
627 // the main adaptive classifier (because it is full.)
629  if (BackupAdaptedTemplates == NULL) {
631  return;
632  }
634  tprintf("Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\n",
635  NumAdaptationsFailed);
636  }
640  NumAdaptationsFailed = 0;
641 }
642 
643 // Resets the backup adaptive classifier to empty.
648 }
649 
650 /*---------------------------------------------------------------------------*/
672 
674 
675 } /* SettupPass1 */
676 
677 
678 /*---------------------------------------------------------------------------*/
693 
694 } /* SettupPass2 */
695 
696 
697 /*---------------------------------------------------------------------------*/
718  CLASS_ID ClassId,
719  int FontinfoId,
720  ADAPT_CLASS Class,
721  ADAPT_TEMPLATES Templates) {
722  FEATURE_SET Features;
723  int Fid, Pid;
724  FEATURE Feature;
725  int NumFeatures;
726  TEMP_PROTO TempProto;
727  PROTO Proto;
728  INT_CLASS IClass;
730 
731  classify_norm_method.set_value(baseline);
732  Features = ExtractOutlineFeatures(Blob);
733  NumFeatures = Features->NumFeatures;
734  if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
735  FreeFeatureSet(Features);
736  return;
737  }
738 
739  Config = NewTempConfig(NumFeatures - 1, FontinfoId);
740  TempConfigFor(Class, 0) = Config;
741 
742  /* this is a kludge to construct cutoffs for adapted templates */
743  if (Templates == AdaptedTemplates)
744  BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
745 
746  IClass = ClassForClassId (Templates->Templates, ClassId);
747 
748  for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
749  Pid = AddIntProto (IClass);
750  assert (Pid != NO_PROTO);
751 
752  Feature = Features->Features[Fid];
753  TempProto = NewTempProto ();
754  Proto = &(TempProto->Proto);
755 
756  /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
757  ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
758  instead of the -0.25 to 0.75 used in baseline normalization */
759  Proto->Angle = Feature->Params[OutlineFeatDir];
760  Proto->X = Feature->Params[OutlineFeatX];
761  Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
762  Proto->Length = Feature->Params[OutlineFeatLength];
763  FillABC(Proto);
764 
765  TempProto->ProtoId = Pid;
766  SET_BIT (Config->Protos, Pid);
767 
768  ConvertProto(Proto, Pid, IClass);
769  AddProtoToProtoPruner(Proto, Pid, IClass,
771 
772  Class->TempProtos = push (Class->TempProtos, TempProto);
773  }
774  FreeFeatureSet(Features);
775 
776  AddIntConfig(IClass);
777  ConvertConfig (AllProtosOn, 0, IClass);
778 
780  tprintf("Added new class '%s' with class id %d and %d protos.\n",
781  unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
783  DisplayAdaptedChar(Blob, IClass);
784  }
785 
786  if (IsEmptyAdaptedClass(Class))
787  (Templates->NumNonEmptyClasses)++;
788 } /* InitAdaptedClass */
789 
790 
791 /*---------------------------------------------------------------------------*/
813  INT_FEATURE_ARRAY IntFeatures,
814  FEATURE_SET *FloatFeatures) {
815  FEATURE_SET Features;
816  int NumFeatures;
817 
818  classify_norm_method.set_value(baseline);
819  Features = ExtractPicoFeatures(Blob);
820 
821  NumFeatures = Features->NumFeatures;
822  if (NumFeatures > UNLIKELY_NUM_FEAT) {
823  FreeFeatureSet(Features);
824  return 0;
825  }
826 
827  ComputeIntFeatures(Features, IntFeatures);
828  *FloatFeatures = Features;
829 
830  return NumFeatures;
831 } /* GetAdaptiveFeatures */
832 
833 
834 /*-----------------------------------------------------------------------------
835  Private Code
836 -----------------------------------------------------------------------------*/
837 /*---------------------------------------------------------------------------*/
851  if (word->best_choice == NULL) return false;
852  int BestChoiceLength = word->best_choice->length();
853  float adaptable_score =
855  return // rules that apply in general - simplest to compute first
856  BestChoiceLength > 0 &&
857  BestChoiceLength == word->rebuild_word->NumBlobs() &&
858  BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE &&
859  // This basically ensures that the word is at least a dictionary match
860  // (freq word, user word, system dawg word, etc).
861  // Since all the other adjustments will make adjust factor higher
862  // than adaptable_score=1.1+0.05=1.15
863  // Since these are other flags that ensure that the word is dict word,
864  // this check could be at times redundant.
865  word->best_choice->adjust_factor() <= adaptable_score &&
866  // Make sure that alternative choices are not dictionary words.
867  word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score);
868 }
869 
870 /*---------------------------------------------------------------------------*/
886 void Classify::AdaptToChar(TBLOB* Blob, CLASS_ID ClassId, int FontinfoId,
887  FLOAT32 Threshold,
888  ADAPT_TEMPLATES adaptive_templates) {
889  int NumFeatures;
890  INT_FEATURE_ARRAY IntFeatures;
891  UnicharRating int_result;
892  INT_CLASS IClass;
893  ADAPT_CLASS Class;
894  TEMP_CONFIG TempConfig;
895  FEATURE_SET FloatFeatures;
896  int NewTempConfigId;
897 
898  if (!LegalClassId (ClassId))
899  return;
900 
901  int_result.unichar_id = ClassId;
902  Class = adaptive_templates->Class[ClassId];
903  assert(Class != NULL);
904  if (IsEmptyAdaptedClass(Class)) {
905  InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates);
906  } else {
907  IClass = ClassForClassId(adaptive_templates->Templates, ClassId);
908 
909  NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
910  if (NumFeatures <= 0)
911  return;
912 
913  // Only match configs with the matching font.
914  BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
915  for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
916  if (GetFontinfoId(Class, cfg) == FontinfoId) {
917  SET_BIT(MatchingFontConfigs, cfg);
918  } else {
919  reset_bit(MatchingFontConfigs, cfg);
920  }
921  }
922  im_.Match(IClass, AllProtosOn, MatchingFontConfigs,
923  NumFeatures, IntFeatures,
926  FreeBitVector(MatchingFontConfigs);
927 
928  SetAdaptiveThreshold(Threshold);
929 
930  if (1.0f - int_result.rating <= Threshold) {
931  if (ConfigIsPermanent(Class, int_result.config)) {
933  tprintf("Found good match to perm config %d = %4.1f%%.\n",
934  int_result.config, int_result.rating * 100.0);
935  FreeFeatureSet(FloatFeatures);
936  return;
937  }
938 
939  TempConfig = TempConfigFor(Class, int_result.config);
940  IncreaseConfidence(TempConfig);
941  if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
942  Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
943  }
945  tprintf("Increasing reliability of temp config %d to %d.\n",
946  int_result.config, TempConfig->NumTimesSeen);
947 
948  if (TempConfigReliable(ClassId, TempConfig)) {
949  MakePermanent(adaptive_templates, ClassId, int_result.config, Blob);
950  UpdateAmbigsGroup(ClassId, Blob);
951  }
952  } else {
954  tprintf("Found poor match to temp config %d = %4.1f%%.\n",
955  int_result.config, int_result.rating * 100.0);
957  DisplayAdaptedChar(Blob, IClass);
958  }
959  NewTempConfigId =
960  MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId,
961  NumFeatures, IntFeatures, FloatFeatures);
962  if (NewTempConfigId >= 0 &&
963  TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
964  MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob);
965  UpdateAmbigsGroup(ClassId, Blob);
966  }
967 
968 #ifndef GRAPHICS_DISABLED
970  DisplayAdaptedChar(Blob, IClass);
971  }
972 #endif
973  }
974  FreeFeatureSet(FloatFeatures);
975  }
976 } /* AdaptToChar */
977 
979 #ifndef GRAPHICS_DISABLED
980  INT_FX_RESULT_STRUCT fx_info;
984  &bl_features);
985  if (sample == NULL) return;
986 
987  UnicharRating int_result;
988  im_.Match(int_class, AllProtosOn, AllConfigsOn,
989  bl_features.size(), &bl_features[0],
992  tprintf("Best match to temp config %d = %4.1f%%.\n",
993  int_result.config, int_result.rating * 100.0);
995  uinT32 ConfigMask;
996  ConfigMask = 1 << int_result.config;
998  im_.Match(int_class, AllProtosOn, (BIT_VECTOR)&ConfigMask,
999  bl_features.size(), &bl_features[0],
1000  &int_result, classify_adapt_feature_threshold,
1001  6 | 0x19, matcher_debug_separate_windows);
1003  }
1004 #endif
1005 }
1006 
1007 
1008 
1029 void Classify::AddNewResult(const UnicharRating& new_result,
1030  ADAPT_RESULTS *results) {
1031  int old_match = FindScoredUnichar(new_result.unichar_id, *results);
1032 
1033  if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
1034  (old_match < results->match.size() &&
1035  new_result.rating <= results->match[old_match].rating))
1036  return; // New one not good enough.
1037 
1038  if (!unicharset.get_fragment(new_result.unichar_id))
1039  results->HasNonfragment = true;
1040 
1041  if (old_match < results->match.size()) {
1042  results->match[old_match].rating = new_result.rating;
1043  } else {
1044  results->match.push_back(new_result);
1045  }
1046 
1047  if (new_result.rating > results->best_rating &&
1048  // Ensure that fragments do not affect best rating, class and config.
1049  // This is needed so that at least one non-fragmented character is
1050  // always present in the results.
1051  // TODO(daria): verify that this helps accuracy and does not
1052  // hurt performance.
1053  !unicharset.get_fragment(new_result.unichar_id)) {
1054  results->best_match_index = old_match;
1055  results->best_rating = new_result.rating;
1056  results->best_unichar_id = new_result.unichar_id;
1057  }
1058 } /* AddNewResult */
1059 
1060 
1061 /*---------------------------------------------------------------------------*/
1084  const GenericVector<INT_FEATURE_STRUCT>& int_features,
1085  const INT_FX_RESULT_STRUCT& fx_info,
1086  const TBLOB *blob,
1087  INT_TEMPLATES templates,
1088  ADAPT_CLASS *classes,
1089  UNICHAR_ID *ambiguities,
1090  ADAPT_RESULTS *results) {
1091  if (int_features.empty()) return;
1092  uinT8* CharNormArray = new uinT8[unicharset.size()];
1093  UnicharRating int_result;
1094 
1095  results->BlobLength = GetCharNormFeature(fx_info, templates, NULL,
1096  CharNormArray);
1097  bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1098  if (debug)
1099  tprintf("AM Matches = ");
1100 
1101  int top = blob->bounding_box().top();
1102  int bottom = blob->bounding_box().bottom();
1103  while (*ambiguities >= 0) {
1104  CLASS_ID class_id = *ambiguities;
1105 
1106  int_result.unichar_id = class_id;
1107  im_.Match(ClassForClassId(templates, class_id),
1109  int_features.size(), &int_features[0],
1110  &int_result,
1113 
1114  ExpandShapesAndApplyCorrections(NULL, debug, class_id, bottom, top, 0,
1115  results->BlobLength,
1117  CharNormArray, &int_result, results);
1118  ambiguities++;
1119  }
1120  delete [] CharNormArray;
1121 } /* AmbigClassifier */
1122 
1123 /*---------------------------------------------------------------------------*/
1127  inT16 num_features,
1128  const INT_FEATURE_STRUCT* features,
1129  const uinT8* norm_factors,
1130  ADAPT_CLASS* classes,
1131  int debug,
1132  int matcher_multiplier,
1133  const TBOX& blob_box,
1134  const GenericVector<CP_RESULT_STRUCT>& results,
1135  ADAPT_RESULTS* final_results) {
1136  int top = blob_box.top();
1137  int bottom = blob_box.bottom();
1138  UnicharRating int_result;
1139  for (int c = 0; c < results.size(); c++) {
1140  CLASS_ID class_id = results[c].Class;
1141  BIT_VECTOR protos = classes != NULL ? classes[class_id]->PermProtos
1142  : AllProtosOn;
1143  BIT_VECTOR configs = classes != NULL ? classes[class_id]->PermConfigs
1144  : AllConfigsOn;
1145 
1146  int_result.unichar_id = class_id;
1147  im_.Match(ClassForClassId(templates, class_id),
1148  protos, configs,
1149  num_features, features,
1150  &int_result, classify_adapt_feature_threshold, debug,
1152  bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1153  ExpandShapesAndApplyCorrections(classes, debug, class_id, bottom, top,
1154  results[c].Rating,
1155  final_results->BlobLength,
1156  matcher_multiplier, norm_factors,
1157  &int_result, final_results);
1158  }
1159 }
1160 
1161 // Converts configs to fonts, and if the result is not adapted, and a
1162 // shape_table_ is present, the shape is expanded to include all
1163 // unichar_ids represented, before applying a set of corrections to the
1164 // distance rating in int_result, (see ComputeCorrectedRating.)
1165 // The results are added to the final_results output.
1167  ADAPT_CLASS* classes, bool debug, int class_id, int bottom, int top,
1168  float cp_rating, int blob_length, int matcher_multiplier,
1169  const uinT8* cn_factors,
1170  UnicharRating* int_result, ADAPT_RESULTS* final_results) {
1171  if (classes != NULL) {
1172  // Adapted result. Convert configs to fontinfo_ids.
1173  int_result->adapted = true;
1174  for (int f = 0; f < int_result->fonts.size(); ++f) {
1175  int_result->fonts[f].fontinfo_id =
1176  GetFontinfoId(classes[class_id], int_result->fonts[f].fontinfo_id);
1177  }
1178  } else {
1179  // Pre-trained result. Map fonts using font_sets_.
1180  int_result->adapted = false;
1181  for (int f = 0; f < int_result->fonts.size(); ++f) {
1182  int_result->fonts[f].fontinfo_id =
1184  int_result->fonts[f].fontinfo_id);
1185  }
1186  if (shape_table_ != NULL) {
1187  // Two possible cases:
1188  // 1. Flat shapetable. All unichar-ids of the shapes referenced by
1189  // int_result->fonts are the same. In this case build a new vector of
1190  // mapped fonts and replace the fonts in int_result.
1191  // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
1192  // by int_result. In this case, build a vector of UnicharRating to
1193  // gather together different font-ids for each unichar. Also covers case1.
1194  GenericVector<UnicharRating> mapped_results;
1195  for (int f = 0; f < int_result->fonts.size(); ++f) {
1196  int shape_id = int_result->fonts[f].fontinfo_id;
1197  const Shape& shape = shape_table_->GetShape(shape_id);
1198  for (int c = 0; c < shape.size(); ++c) {
1199  int unichar_id = shape[c].unichar_id;
1200  if (!unicharset.get_enabled(unichar_id)) continue;
1201  // Find the mapped_result for unichar_id.
1202  int r = 0;
1203  for (r = 0; r < mapped_results.size() &&
1204  mapped_results[r].unichar_id != unichar_id; ++r) {}
1205  if (r == mapped_results.size()) {
1206  mapped_results.push_back(*int_result);
1207  mapped_results[r].unichar_id = unichar_id;
1208  mapped_results[r].fonts.truncate(0);
1209  }
1210  for (int i = 0; i < shape[c].font_ids.size(); ++i) {
1211  mapped_results[r].fonts.push_back(
1212  ScoredFont(shape[c].font_ids[i], int_result->fonts[f].score));
1213  }
1214  }
1215  }
1216  for (int m = 0; m < mapped_results.size(); ++m) {
1217  mapped_results[m].rating =
1218  ComputeCorrectedRating(debug, mapped_results[m].unichar_id,
1219  cp_rating, int_result->rating,
1220  int_result->feature_misses, bottom, top,
1221  blob_length, matcher_multiplier, cn_factors);
1222  AddNewResult(mapped_results[m], final_results);
1223  }
1224  return;
1225  }
1226  }
1227  if (unicharset.get_enabled(class_id)) {
1228  int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating,
1229  int_result->rating,
1230  int_result->feature_misses,
1231  bottom, top, blob_length,
1232  matcher_multiplier, cn_factors);
1233  AddNewResult(*int_result, final_results);
1234  }
1235 }
1236 
1237 // Applies a set of corrections to the confidence im_rating,
1238 // including the cn_correction, miss penalty and additional penalty
1239 // for non-alnums being vertical misfits. Returns the corrected confidence.
1240 double Classify::ComputeCorrectedRating(bool debug, int unichar_id,
1241  double cp_rating, double im_rating,
1242  int feature_misses,
1243  int bottom, int top,
1244  int blob_length, int matcher_multiplier,
1245  const uinT8* cn_factors) {
1246  // Compute class feature corrections.
1247  double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length,
1248  cn_factors[unichar_id],
1249  matcher_multiplier);
1250  double miss_penalty = tessedit_class_miss_scale * feature_misses;
1251  double vertical_penalty = 0.0;
1252  // Penalize non-alnums for being vertical misfits.
1253  if (!unicharset.get_isalpha(unichar_id) &&
1254  !unicharset.get_isdigit(unichar_id) &&
1255  cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
1256  int min_bottom, max_bottom, min_top, max_top;
1257  unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom,
1258  &min_top, &max_top);
1259  if (debug) {
1260  tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n",
1261  top, min_top, max_top, bottom, min_bottom, max_bottom);
1262  }
1263  if (top < min_top || top > max_top ||
1264  bottom < min_bottom || bottom > max_bottom) {
1265  vertical_penalty = classify_misfit_junk_penalty;
1266  }
1267  }
1268  double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
1269  if (result < WORST_POSSIBLE_RATING)
1270  result = WORST_POSSIBLE_RATING;
1271  if (debug) {
1272  tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
1273  unicharset.id_to_unichar(unichar_id),
1274  result * 100.0,
1275  cp_rating * 100.0,
1276  (1.0 - im_rating) * 100.0,
1277  (cn_corrected - (1.0 - im_rating)) * 100.0,
1278  cn_factors[unichar_id],
1279  miss_penalty * 100.0,
1280  vertical_penalty * 100.0);
1281  }
1282  return result;
1283 }
1284 
1285 /*---------------------------------------------------------------------------*/
1306  TBLOB *Blob, const GenericVector<INT_FEATURE_STRUCT>& int_features,
1307  const INT_FX_RESULT_STRUCT& fx_info,
1308  ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results) {
1309  if (int_features.empty()) return NULL;
1310  uinT8* CharNormArray = new uinT8[unicharset.size()];
1311  ClearCharNormArray(CharNormArray);
1312 
1314  PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0],
1315  CharNormArray, BaselineCutoffs, &Results->CPResults);
1316 
1317  if (matcher_debug_level >= 2 || classify_debug_level > 1)
1318  tprintf("BL Matches = ");
1319 
1320  MasterMatcher(Templates->Templates, int_features.size(), &int_features[0],
1321  CharNormArray,
1322  Templates->Class, matcher_debug_flags, 0,
1323  Blob->bounding_box(), Results->CPResults, Results);
1324 
1325  delete [] CharNormArray;
1326  CLASS_ID ClassId = Results->best_unichar_id;
1327  if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0)
1328  return NULL;
1329 
1330  return Templates->Class[ClassId]->
1331  Config[Results->match[Results->best_match_index].config].Perm->Ambigs;
1332 } /* BaselineClassifier */
1333 
1334 
1335 /*---------------------------------------------------------------------------*/
1355  const TrainingSample& sample,
1356  ADAPT_RESULTS *adapt_results) {
1357  // This is the length that is used for scaling ratings vs certainty.
1358  adapt_results->BlobLength =
1360  GenericVector<UnicharRating> unichar_results;
1361  static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0,
1362  -1, &unichar_results);
1363  // Convert results to the format used internally by AdaptiveClassifier.
1364  for (int r = 0; r < unichar_results.size(); ++r) {
1365  AddNewResult(unichar_results[r], adapt_results);
1366  }
1367  return sample.num_features();
1368 } /* CharNormClassifier */
1369 
1370 // As CharNormClassifier, but operates on a TrainingSample and outputs to
1371 // a GenericVector of UnicharRating without conversion to classes.
1373  int keep_this,
1374  const TrainingSample& sample,
1375  GenericVector<UnicharRating>* results) {
1376  results->clear();
1377  ADAPT_RESULTS* adapt_results = new ADAPT_RESULTS();
1378  adapt_results->Initialize();
1379  // Compute the bounding box of the features.
1380  int num_features = sample.num_features();
1381  // Only the top and bottom of the blob_box are used by MasterMatcher, so
1382  // fabricate right and left using top and bottom.
1383  TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
1384  sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
1385  // Compute the char_norm_array from the saved cn_feature.
1386  FEATURE norm_feature = sample.GetCNFeature();
1387  uinT8* char_norm_array = new uinT8[unicharset.size()];
1388  int num_pruner_classes = MAX(unicharset.size(),
1390  uinT8* pruner_norm_array = new uinT8[num_pruner_classes];
1391  adapt_results->BlobLength =
1392  static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5);
1393  ComputeCharNormArrays(norm_feature, PreTrainedTemplates, char_norm_array,
1394  pruner_norm_array);
1395 
1396  PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(),
1397  pruner_norm_array,
1398  shape_table_ != NULL ? &shapetable_cutoffs_[0] : CharNormCutoffs,
1399  &adapt_results->CPResults);
1400  delete [] pruner_norm_array;
1401  if (keep_this >= 0) {
1402  adapt_results->CPResults[0].Class = keep_this;
1403  adapt_results->CPResults.truncate(1);
1404  }
1405  if (pruner_only) {
1406  // Convert pruner results to output format.
1407  for (int i = 0; i < adapt_results->CPResults.size(); ++i) {
1408  int class_id = adapt_results->CPResults[i].Class;
1409  results->push_back(
1410  UnicharRating(class_id, 1.0f - adapt_results->CPResults[i].Rating));
1411  }
1412  } else {
1413  MasterMatcher(PreTrainedTemplates, num_features, sample.features(),
1414  char_norm_array,
1417  blob_box, adapt_results->CPResults, adapt_results);
1418  // Convert master matcher results to output format.
1419  for (int i = 0; i < adapt_results->match.size(); i++) {
1420  results->push_back(adapt_results->match[i]);
1421  }
1423  }
1424  delete [] char_norm_array;
1425  delete adapt_results;
1426  return num_features;
1427 } /* CharNormTrainingSample */
1428 
1429 
1430 /*---------------------------------------------------------------------------*/
1446  float rating = results->BlobLength / matcher_avg_noise_size;
1447  rating *= rating;
1448  rating /= 1.0 + rating;
1449 
1450  AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);
1451 } /* ClassifyAsNoise */
1452 
// Converts the entries of Results->match, in order, into newly allocated
// BLOB_CHOICE objects that are appended to the end of Choices.
//   denorm, box - used to query the x-height range and y-shift for each
//                 result's unichar_id via denorm.XHeightRange.
//   Results     - source matches; its match vector is truncated to the number
//                 of choices produced before returning.
//   Choices     - output list; must not be NULL (asserted).
// At most max_matches choices are produced. Entries may be skipped: a fragment
// that would fill the last slot while no non-fragment has been added yet, and
// adapted results whose certainty falls too far behind the best certainty.
1459 void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
1460  ADAPT_RESULTS *Results,
1461  BLOB_CHOICE_LIST *Choices) {
1462  assert(Choices != NULL);
1463  FLOAT32 Rating;
1464  FLOAT32 Certainty;
1465  BLOB_CHOICE_IT temp_it;
1466  bool contains_nonfrag = false;
1467  temp_it.set_to_list(Choices);
1468  int choices_length = 0;
1469  // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
1470  // number of returned results, but with a shape_table_ we want to have room
1471  // for at least the biggest shape (which might contain hundreds of Indic
1472  // grapheme fragments) and more, so use double the size of the biggest shape
1473  // if that is more than the default.
1474  int max_matches = MAX_MATCHES;
1475  if (shape_table_ != NULL) {
1476  max_matches = shape_table_->MaxNumUnichars() * 2;
1477  if (max_matches < MAX_MATCHES)
1478  max_matches = MAX_MATCHES;
1479  }
1480 
  // Running reference certainty used to prune weak adapted results below.
1481  float best_certainty = -MAX_FLOAT32;
1482  for (int i = 0; i < Results->match.size(); i++) {
1483  const UnicharRating& result = Results->match[i];
1484  bool adapted = result.adapted;
1485  bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != NULL);
1486  if (temp_it.length()+1 == max_matches &&
1487  !contains_nonfrag && current_is_frag) {
1488  continue; // look for a non-fragmented character to fill the
1489  // last spot in Choices if only fragments are present
1490  }
1491  // BlobLength can never be legally 0, this means recognition failed.
1492  // But we must return a classification result because some invoking
1493  // functions (chopper/permuter) do not anticipate a null blob choice.
1494  // So we need to assign a poor, but not infinitely bad score.
1495  if (Results->BlobLength == 0) {
1496  Certainty = -20;
1497  Rating = 100; // should be -certainty * real_blob_length
1498  } else {
  // Both values start from (1 - rating); Rating is then scaled by
  // rating_scale and the blob length, Certainty by the negated
  // dictionary certainty_scale.
1499  Rating = Certainty = (1.0f - result.rating);
1500  Rating *= rating_scale * Results->BlobLength;
1501  Certainty *= -(getDict().certainty_scale);
1502  }
1503  // Adapted results, by their very nature, should have good certainty.
1504  // Those that don't are at best misleading, and often lead to errors,
1505  // so don't accept adapted results that are too far behind the best result,
1506  // whether adapted or static.
1507  // TODO(rays) find some way of automatically tuning these constants.
  // The stored best certainty is capped at classify_adapted_pruning_threshold,
  // so a very confident result does not raise the bar above that value.
1508  if (Certainty > best_certainty) {
1509  best_certainty = MIN(Certainty, classify_adapted_pruning_threshold);
1510  } else if (adapted &&
1511  Certainty / classify_adapted_pruning_factor < best_certainty) {
1512  continue; // Don't accept bad adapted results.
1513  }
1514 
1515  float min_xheight, max_xheight, yshift;
1516  denorm.XHeightRange(result.unichar_id, unicharset, box,
1517  &min_xheight, &max_xheight, &yshift);
1518  BLOB_CHOICE* choice =
1519  new BLOB_CHOICE(result.unichar_id, Rating, Certainty,
1521  min_xheight, max_xheight, yshift,
1522  adapted ? BCC_ADAPTED_CLASSIFIER
1524  choice->set_fonts(result.fonts);
1525  temp_it.add_to_end(choice);
1526  contains_nonfrag |= !current_is_frag; // update contains_nonfrag
1527  choices_length++;
1528  if (choices_length >= max_matches) break;
1529  }
  // NOTE(review): this keeps the first choices_length entries of match. Entries
  // skipped by the `continue` statements above are not removed first, so the
  // retained entries are not necessarily the ones converted - confirm intended.
1530  Results->match.truncate(choices_length);
1531 } // ConvertMatchesToChoices
1532 
1533 
1534 /*---------------------------------------------------------------------------*/
1535 #ifndef GRAPHICS_DISABLED
1536 
1547  ADAPT_RESULTS *Results) {
1548  if (static_classifier_ == NULL) return;
1549  INT_FX_RESULT_STRUCT fx_info;
1552  BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
1553  if (sample == NULL) return;
1554  static_classifier_->DebugDisplay(*sample, blob->denorm().pix(),
1555  Results->best_unichar_id);
1556 } /* DebugAdaptiveClassifier */
1557 #endif
1558 
1559 /*---------------------------------------------------------------------------*/
1583  UNICHAR_ID *Ambiguities;
1584 
1585  INT_FX_RESULT_STRUCT fx_info;
1589  &bl_features);
1590  if (sample == NULL) return;
1591 
1593  tess_cn_matching) {
1594  CharNormClassifier(Blob, *sample, Results);
1595  } else {
1596  Ambiguities = BaselineClassifier(Blob, bl_features, fx_info,
1597  AdaptedTemplates, Results);
1598  if ((!Results->match.empty() &&
1599  MarginalMatch(Results->best_rating,
1601  !tess_bn_matching) ||
1602  Results->match.empty()) {
1603  CharNormClassifier(Blob, *sample, Results);
1604  } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
1605  AmbigClassifier(bl_features, fx_info, Blob,
1608  Ambiguities,
1609  Results);
1610  }
1611  }
1612 
1613  // Force the blob to be classified as noise
1614  // if the results contain only fragments.
1615  // TODO(daria): verify that this is better than
1616  // just adding a NULL classification.
1617  if (!Results->HasNonfragment || Results->match.empty())
1618  ClassifyAsNoise(Results);
1619  delete sample;
1620 } /* DoAdaptiveMatch */
1621 
1622 /*---------------------------------------------------------------------------*/
1640  CLASS_ID CorrectClass) {
1641  ADAPT_RESULTS *Results = new ADAPT_RESULTS();
1642  UNICHAR_ID *Ambiguities;
1643  int i;
1644 
1645  Results->Initialize();
1646  INT_FX_RESULT_STRUCT fx_info;
1650  &bl_features);
1651  if (sample == NULL) {
1652  delete Results;
1653  return NULL;
1654  }
1655 
1656  CharNormClassifier(Blob, *sample, Results);
1657  delete sample;
1658  RemoveBadMatches(Results);
1660 
1661  /* copy the class id's into an string of ambiguities - don't copy if
1662  the correct class is the only class id matched */
1663  Ambiguities = new UNICHAR_ID[Results->match.size() + 1];
1664  if (Results->match.size() > 1 ||
1665  (Results->match.size() == 1 &&
1666  Results->match[0].unichar_id != CorrectClass)) {
1667  for (i = 0; i < Results->match.size(); i++)
1668  Ambiguities[i] = Results->match[i].unichar_id;
1669  Ambiguities[i] = -1;
1670  } else {
1671  Ambiguities[0] = -1;
1672  }
1673 
1674  delete Results;
1675  return Ambiguities;
1676 } /* GetAmbiguities */
1677 
1678 // Returns true if the given blob looks too dissimilar to any character
1679 // present in the classifier templates.
1681  BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST();
1682  AdaptiveClassifier(blob, ratings);
1683  BLOB_CHOICE_IT ratings_it(ratings);
1686  print_ratings_list("======================\nLooksLikeGarbage() got ",
1687  ratings, unicharset);
1688  }
1689  for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list();
1690  ratings_it.forward()) {
1691  if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != NULL) {
1692  continue;
1693  }
1694  float certainty = ratings_it.data()->certainty();
1695  delete ratings;
1696  return certainty <
1698  }
1699  delete ratings;
1700  return true; // no whole characters in ratings
1701 }
1702 
1703 /*---------------------------------------------------------------------------*/
1728  INT_TEMPLATES templates,
1729  uinT8* pruner_norm_array,
1730  uinT8* char_norm_array) {
1731  FEATURE norm_feature = NewFeature(&CharNormDesc);
1732  float baseline = kBlnBaselineOffset;
1733  float scale = MF_SCALE_FACTOR;
1734  norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;
1735  norm_feature->Params[CharNormLength] =
1736  fx_info.Length * scale / LENGTH_COMPRESSION;
1737  norm_feature->Params[CharNormRx] = fx_info.Rx * scale;
1738  norm_feature->Params[CharNormRy] = fx_info.Ry * scale;
1739  // Deletes norm_feature.
1740  ComputeCharNormArrays(norm_feature, templates, char_norm_array,
1741  pruner_norm_array);
1742  return IntCastRounded(fx_info.Length / kStandardFeatureLength);
1743 } /* GetCharNormFeature */
1744 
1745 // Computes the char_norm_array for the unicharset and, if not NULL, the
1746 // pruner_array as appropriate according to the existence of the shape_table.
1748  INT_TEMPLATES_STRUCT* templates,
1749  uinT8* char_norm_array,
1750  uinT8* pruner_array) {
1751  ComputeIntCharNormArray(*norm_feature, char_norm_array);
1752  if (pruner_array != NULL) {
1753  if (shape_table_ == NULL) {
1754  ComputeIntCharNormArray(*norm_feature, pruner_array);
1755  } else {
1756  memset(pruner_array, MAX_UINT8,
1757  templates->NumClasses * sizeof(pruner_array[0]));
1758  // Each entry in the pruner norm array is the MIN of all the entries of
1759  // the corresponding unichars in the CharNormArray.
1760  for (int id = 0; id < templates->NumClasses; ++id) {
1761  int font_set_id = templates->Class[id]->font_set_id;
1762  const FontSet &fs = fontset_table_.get(font_set_id);
1763  for (int config = 0; config < fs.size; ++config) {
1764  const Shape& shape = shape_table_->GetShape(fs.configs[config]);
1765  for (int c = 0; c < shape.size(); ++c) {
1766  if (char_norm_array[shape[c].unichar_id] < pruner_array[id])
1767  pruner_array[id] = char_norm_array[shape[c].unichar_id];
1768  }
1769  }
1770  }
1771  }
1772  }
1773  FreeFeature(norm_feature);
1774 }
1775 
1776 /*---------------------------------------------------------------------------*/
1792  CLASS_ID ClassId,
1793  int FontinfoId,
1794  int NumFeatures,
1795  INT_FEATURE_ARRAY Features,
1796  FEATURE_SET FloatFeatures) {
1797  INT_CLASS IClass;
1798  ADAPT_CLASS Class;
1799  PROTO_ID OldProtos[MAX_NUM_PROTOS];
1800  FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
1801  int NumOldProtos;
1802  int NumBadFeatures;
1803  int MaxProtoId, OldMaxProtoId;
1804  int BlobLength = 0;
1805  int MaskSize;
1806  int ConfigId;
1808  int i;
1809  int debug_level = NO_DEBUG;
1810 
1812  debug_level =
1814 
1815  IClass = ClassForClassId(Templates->Templates, ClassId);
1816  Class = Templates->Class[ClassId];
1817 
1818  if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
1819  ++NumAdaptationsFailed;
1821  cprintf("Cannot make new temporary config: maximum number exceeded.\n");
1822  return -1;
1823  }
1824 
1825  OldMaxProtoId = IClass->NumProtos - 1;
1826 
1827  NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff,
1828  BlobLength, NumFeatures, Features,
1829  OldProtos, classify_adapt_proto_threshold,
1830  debug_level);
1831 
1832  MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
1833  zero_all_bits(TempProtoMask, MaskSize);
1834  for (i = 0; i < NumOldProtos; i++)
1835  SET_BIT(TempProtoMask, OldProtos[i]);
1836 
1837  NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn,
1838  BlobLength, NumFeatures, Features,
1839  BadFeatures,
1841  debug_level);
1842 
1843  MaxProtoId = MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures,
1844  IClass, Class, TempProtoMask);
1845  if (MaxProtoId == NO_PROTO) {
1846  ++NumAdaptationsFailed;
1848  cprintf("Cannot make new temp protos: maximum number exceeded.\n");
1849  return -1;
1850  }
1851 
1852  ConfigId = AddIntConfig(IClass);
1853  ConvertConfig(TempProtoMask, ConfigId, IClass);
1854  Config = NewTempConfig(MaxProtoId, FontinfoId);
1855  TempConfigFor(Class, ConfigId) = Config;
1857 
1859  cprintf("Making new temp config %d fontinfo id %d"
1860  " using %d old and %d new protos.\n",
1861  ConfigId, Config->FontinfoId,
1862  NumOldProtos, MaxProtoId - OldMaxProtoId);
1863 
1864  return ConfigId;
1865 } /* MakeNewTemporaryConfig */
1866 
1867 /*---------------------------------------------------------------------------*/
1889  int NumBadFeat,
1890  FEATURE_ID BadFeat[],
1891  INT_CLASS IClass,
1892  ADAPT_CLASS Class,
1893  BIT_VECTOR TempProtoMask) {
1894  FEATURE_ID *ProtoStart;
1895  FEATURE_ID *ProtoEnd;
1896  FEATURE_ID *LastBad;
1897  TEMP_PROTO TempProto;
1898  PROTO Proto;
1899  FEATURE F1, F2;
1900  FLOAT32 X1, X2, Y1, Y2;
1901  FLOAT32 A1, A2, AngleDelta;
1902  FLOAT32 SegmentLength;
1903  PROTO_ID Pid;
1904 
1905  for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat;
1906  ProtoStart < LastBad; ProtoStart = ProtoEnd) {
1907  F1 = Features->Features[*ProtoStart];
1908  X1 = F1->Params[PicoFeatX];
1909  Y1 = F1->Params[PicoFeatY];
1910  A1 = F1->Params[PicoFeatDir];
1911 
1912  for (ProtoEnd = ProtoStart + 1,
1913  SegmentLength = GetPicoFeatureLength();
1914  ProtoEnd < LastBad;
1915  ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
1916  F2 = Features->Features[*ProtoEnd];
1917  X2 = F2->Params[PicoFeatX];
1918  Y2 = F2->Params[PicoFeatY];
1919  A2 = F2->Params[PicoFeatDir];
1920 
1921  AngleDelta = fabs(A1 - A2);
1922  if (AngleDelta > 0.5)
1923  AngleDelta = 1.0 - AngleDelta;
1924 
1925  if (AngleDelta > matcher_clustering_max_angle_delta ||
1926  fabs(X1 - X2) > SegmentLength ||
1927  fabs(Y1 - Y2) > SegmentLength)
1928  break;
1929  }
1930 
1931  F2 = Features->Features[*(ProtoEnd - 1)];
1932  X2 = F2->Params[PicoFeatX];
1933  Y2 = F2->Params[PicoFeatY];
1934  A2 = F2->Params[PicoFeatDir];
1935 
1936  Pid = AddIntProto(IClass);
1937  if (Pid == NO_PROTO)
1938  return (NO_PROTO);
1939 
1940  TempProto = NewTempProto();
1941  Proto = &(TempProto->Proto);
1942 
1943  /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
1944  ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
1945  instead of the -0.25 to 0.75 used in baseline normalization */
1946  Proto->Length = SegmentLength;
1947  Proto->Angle = A1;
1948  Proto->X = (X1 + X2) / 2.0;
1949  Proto->Y = (Y1 + Y2) / 2.0 - Y_DIM_OFFSET;
1950  FillABC(Proto);
1951 
1952  TempProto->ProtoId = Pid;
1953  SET_BIT(TempProtoMask, Pid);
1954 
1955  ConvertProto(Proto, Pid, IClass);
1956  AddProtoToProtoPruner(Proto, Pid, IClass,
1958 
1959  Class->TempProtos = push(Class->TempProtos, TempProto);
1960  }
1961  return IClass->NumProtos - 1;
1962 } /* MakeNewTempProtos */
1963 
1964 /*---------------------------------------------------------------------------*/
1978  CLASS_ID ClassId,
1979  int ConfigId,
1980  TBLOB *Blob) {
1981  UNICHAR_ID *Ambigs;
1983  ADAPT_CLASS Class;
1984  PROTO_KEY ProtoKey;
1985 
1986  Class = Templates->Class[ClassId];
1987  Config = TempConfigFor(Class, ConfigId);
1988 
1989  MakeConfigPermanent(Class, ConfigId);
1990  if (Class->NumPermConfigs == 0)
1991  Templates->NumPermClasses++;
1992  Class->NumPermConfigs++;
1993 
1994  // Initialize permanent config.
1995  Ambigs = GetAmbiguities(Blob, ClassId);
1997  "PERM_CONFIG_STRUCT");
1998  Perm->Ambigs = Ambigs;
1999  Perm->FontinfoId = Config->FontinfoId;
2000 
2001  // Free memory associated with temporary config (since ADAPTED_CONFIG
2002  // is a union we need to clean up before we record permanent config).
2003  ProtoKey.Templates = Templates;
2004  ProtoKey.ClassId = ClassId;
2005  ProtoKey.ConfigId = ConfigId;
2006  Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);
2007  FreeTempConfig(Config);
2008 
2009  // Record permanent config.
2010  PermConfigFor(Class, ConfigId) = Perm;
2011 
2012  if (classify_learning_debug_level >= 1) {
2013  tprintf("Making config %d for %s (ClassId %d) permanent:"
2014  " fontinfo id %d, ambiguities '",
2015  ConfigId, getDict().getUnicharset().debug_str(ClassId).string(),
2016  ClassId, PermConfigFor(Class, ConfigId)->FontinfoId);
2017  for (UNICHAR_ID *AmbigsPointer = Ambigs;
2018  *AmbigsPointer >= 0; ++AmbigsPointer)
2019  tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
2020  tprintf("'.\n");
2021  }
2022 } /* MakePermanent */
2023 } // namespace tesseract
2024 
2025 /*---------------------------------------------------------------------------*/
// Callback that decides whether one temporary proto belongs to the config
// being made permanent and, if so, converts it to a permanent proto.
//   item1 - the TEMP_PROTO under consideration.
//   item2 - a PROTO_KEY* naming the adapted templates, class id and config id.
// Returns FALSE, leaving the proto untouched, when the proto's id exceeds the
// config's MaxProtoId or its bit is not set in the config's proto mask.
// Otherwise marks the proto permanent in the class, adds it to the class
// pruner of the templates, frees the TEMP_PROTO and returns TRUE.
// MakePermanent passes this function to delete_d over Class->TempProtos.
2040 int MakeTempProtoPerm(void *item1, void *item2) {
2041  ADAPT_CLASS Class;
2043  TEMP_PROTO TempProto;
2044  PROTO_KEY *ProtoKey;
2045 
  // Recover the typed arguments from the untyped callback parameters.
2046  TempProto = (TEMP_PROTO) item1;
2047  ProtoKey = (PROTO_KEY *) item2;
2048 
2049  Class = ProtoKey->Templates->Class[ProtoKey->ClassId];
2050  Config = TempConfigFor(Class, ProtoKey->ConfigId);
2051 
  // Not one of this config's protos: report "no match" and change nothing.
2052  if (TempProto->ProtoId > Config->MaxProtoId ||
2053  !test_bit (Config->Protos, TempProto->ProtoId))
2054  return FALSE;
2055 
2056  MakeProtoPermanent(Class, TempProto->ProtoId);
2057  AddProtoToClassPruner(&(TempProto->Proto), ProtoKey->ClassId,
2058  ProtoKey->Templates->Templates);
  // The temporary wrapper is released here, after the calls above that used it.
2059  FreeTempProto(TempProto);
2060 
2061  return TRUE;
2062 } /* MakeTempProtoPerm */
2063 
2064 /*---------------------------------------------------------------------------*/
2065 namespace tesseract {
2077  for (int i = 0; i < results.match.size(); ++i) {
2078  tprintf("%s ", unicharset.debug_str(results.match[i].unichar_id).string());
2079  results.match[i].Print();
2080  }
2081 } /* PrintAdaptiveMatchResults */
2082 
2083 /*---------------------------------------------------------------------------*/
2100  int Next, NextGood;
2101  FLOAT32 BadMatchThreshold;
2102  static const char* romans = "i v x I V X";
2103  BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;
2104 
2106  UNICHAR_ID unichar_id_one = unicharset.contains_unichar("1") ?
2107  unicharset.unichar_to_id("1") : -1;
2108  UNICHAR_ID unichar_id_zero = unicharset.contains_unichar("0") ?
2109  unicharset.unichar_to_id("0") : -1;
2110  float scored_one = ScoredUnichar(unichar_id_one, *Results);
2111  float scored_zero = ScoredUnichar(unichar_id_zero, *Results);
2112 
2113  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2114  const UnicharRating& match = Results->match[Next];
2115  if (match.rating >= BadMatchThreshold) {
2116  if (!unicharset.get_isalpha(match.unichar_id) ||
2117  strstr(romans,
2118  unicharset.id_to_unichar(match.unichar_id)) != NULL) {
2119  } else if (unicharset.eq(match.unichar_id, "l") &&
2120  scored_one < BadMatchThreshold) {
2121  Results->match[Next].unichar_id = unichar_id_one;
2122  } else if (unicharset.eq(match.unichar_id, "O") &&
2123  scored_zero < BadMatchThreshold) {
2124  Results->match[Next].unichar_id = unichar_id_zero;
2125  } else {
2126  Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy.
2127  }
2128  if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {
2129  if (NextGood == Next) {
2130  ++NextGood;
2131  } else {
2132  Results->match[NextGood++] = Results->match[Next];
2133  }
2134  }
2135  }
2136  }
2137  } else {
2138  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2139  if (Results->match[Next].rating >= BadMatchThreshold) {
2140  if (NextGood == Next) {
2141  ++NextGood;
2142  } else {
2143  Results->match[NextGood++] = Results->match[Next];
2144  }
2145  }
2146  }
2147  }
2148  Results->match.truncate(NextGood);
2149 } /* RemoveBadMatches */
2150 
2151 /*----------------------------------------------------------------------------*/
2162  int Next, NextGood;
2163  int punc_count; /*no of garbage characters */
2164  int digit_count;
2165  /*garbage characters */
2166  static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
2167  static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";
2168 
2169  punc_count = 0;
2170  digit_count = 0;
2171  for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2172  const UnicharRating& match = Results->match[Next];
2173  bool keep = true;
2174  if (strstr(punc_chars,
2175  unicharset.id_to_unichar(match.unichar_id)) != NULL) {
2176  if (punc_count >= 2)
2177  keep = false;
2178  punc_count++;
2179  } else {
2180  if (strstr(digit_chars,
2181  unicharset.id_to_unichar(match.unichar_id)) != NULL) {
2182  if (digit_count >= 1)
2183  keep = false;
2184  digit_count++;
2185  }
2186  }
2187  if (keep) {
2188  if (NextGood == Next) {
2189  ++NextGood;
2190  } else {
2191  Results->match[NextGood++] = match;
2192  }
2193  }
2194  }
2195  Results->match.truncate(NextGood);
2196 } /* RemoveExtraPuncs */
2197 
2198 /*---------------------------------------------------------------------------*/
2213  Threshold = (Threshold == matcher_good_threshold) ? 0.9: (1.0 - Threshold);
2215  ClipToRange<int>(255 * Threshold, 0, 255));
2217  ClipToRange<int>(255 * Threshold, 0, 255));
2218 } /* SetAdaptiveThreshold */
2219 
2220 /*---------------------------------------------------------------------------*/
// Debug helper that reports on the pre-trained template for shape_id against
// the given integer features and shows the match display.
//   shape_id     - class/shape index into PreTrainedTemplates.
//   features     - integer features to match.
//   num_features - number of entries in features; must be positive.
// The whole body is compiled only when GRAPHICS_DISABLED is not defined;
// otherwise the function does nothing. It prints a message and returns early
// if PreTrainedTemplates has no class for shape_id or num_features <= 0.
2233 void Classify::ShowBestMatchFor(int shape_id,
2234  const INT_FEATURE_STRUCT* features,
2235  int num_features) {
2236 #ifndef GRAPHICS_DISABLED
2237  uinT32 config_mask;
2238  if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
2239  tprintf("No built-in templates for class/shape %d\n", shape_id);
2240  return;
2241  }
2242  if (num_features <= 0) {
2243  tprintf("Illegal blob (char norm features)!\n");
2244  return;
2245  }
2246  UnicharRating cn_result;
  // Sets the classify_norm_method parameter to `character`; it is not
  // restored within the lines visible here.
2247  classify_norm_method.set_value(character);
2250  num_features, features, &cn_result,
2253  tprintf("\n");
  // One-bit mask selecting only the config recorded in cn_result; its address
  // is passed below, reinterpreted as a BIT_VECTOR.
2254  config_mask = 1 << cn_result.config;
2255 
2256  tprintf("Static Shape ID: %d\n", shape_id);
2257  ShowMatchDisplay();
2259  AllProtosOn, reinterpret_cast<BIT_VECTOR>(&config_mask),
2260  num_features, features, &cn_result,
2265 #endif // GRAPHICS_DISABLED
2266 } /* ShowBestMatchFor */
2267 
2268 // Returns a string for the classifier class_id: either the corresponding
2269 // unicharset debug_str or the shape_table_ debug str.
2271  int class_id, int config_id) const {
2272  STRING class_string;
2273  if (templates == PreTrainedTemplates && shape_table_ != NULL) {
2274  int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
2275  class_string = shape_table_->DebugStr(shape_id);
2276  } else {
2277  class_string = unicharset.debug_str(class_id);
2278  }
2279  return class_string;
2280 }
2281 
2282 // Converts a classifier class_id index to a shape_table_ index
2284  int int_result_config) const {
2285  int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
2286  // Older inttemps have no font_ids.
2287  if (font_set_id < 0)
2288  return kBlankFontinfoId;
2289  const FontSet &fs = fontset_table_.get(font_set_id);
2290  ASSERT_HOST(int_result_config >= 0 && int_result_config < fs.size);
2291  return fs.configs[int_result_config];
2292 }
2293 
2294 // Converts a shape_table_ index to a classifier class_id index (not a
2295 // unichar-id!). Uses a search, so not fast.
2296 int Classify::ShapeIDToClassID(int shape_id) const {
2297  for (int id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
2298  int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
2299  ASSERT_HOST(font_set_id >= 0);
2300  const FontSet &fs = fontset_table_.get(font_set_id);
2301  for (int config = 0; config < fs.size; ++config) {
2302  if (fs.configs[config] == shape_id)
2303  return id;
2304  }
2305  }
2306  tprintf("Shape %d not found\n", shape_id);
2307  return -1;
2308 }
2309 
2310 // Returns true if the given TEMP_CONFIG is good enough to make it
2311 // a permanent config.
2313  const TEMP_CONFIG &config) {
2314  if (classify_learning_debug_level >= 1) {
2315  tprintf("NumTimesSeen for config of %s is %d\n",
2316  getDict().getUnicharset().debug_str(class_id).string(),
2317  config->NumTimesSeen);
2318  }
2320  return true;
2321  } else if (config->NumTimesSeen < matcher_min_examples_for_prototyping) {
2322  return false;
2323  } else if (use_ambigs_for_adaption) {
2324  // Go through the ambigs vector and see whether we have already seen
2325  // enough times all the characters represented by the ambigs vector.
2326  const UnicharIdVector *ambigs =
2328  int ambigs_size = (ambigs == NULL) ? 0 : ambigs->size();
2329  for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2330  ADAPT_CLASS ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];
2331  assert(ambig_class != NULL);
2332  if (ambig_class->NumPermConfigs == 0 &&
2333  ambig_class->MaxNumTimesSeen <
2335  if (classify_learning_debug_level >= 1) {
2336  tprintf("Ambig %s has not been seen enough times,"
2337  " not making config for %s permanent\n",
2338  getDict().getUnicharset().debug_str(
2339  (*ambigs)[ambig]).string(),
2340  getDict().getUnicharset().debug_str(class_id).string());
2341  }
2342  return false;
2343  }
2344  }
2345  }
2346  return true;
2347 }
2348 
2350  const UnicharIdVector *ambigs =
2352  int ambigs_size = (ambigs == NULL) ? 0 : ambigs->size();
2353  if (classify_learning_debug_level >= 1) {
2354  tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
2355  getDict().getUnicharset().debug_str(class_id).string(), class_id);
2356  }
2357  for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2358  CLASS_ID ambig_class_id = (*ambigs)[ambig];
2359  const ADAPT_CLASS ambigs_class = AdaptedTemplates->Class[ambig_class_id];
2360  for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
2361  if (ConfigIsPermanent(ambigs_class, cfg)) continue;
2362  const TEMP_CONFIG config =
2363  TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
2364  if (config != NULL && TempConfigReliable(ambig_class_id, config)) {
2365  if (classify_learning_debug_level >= 1) {
2366  tprintf("Making config %d of %s permanent\n", cfg,
2367  getDict().getUnicharset().debug_str(
2368  ambig_class_id).string());
2369  }
2370  MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);
2371  }
2372  }
2373  }
2374 }
2375 
2376 } // namespace tesseract
bool matcher_debug_separate_windows
Definition: classify.h:458
FILE * GetDataFilePtr() const
const int kBlnXHeight
Definition: normalis.h:28
Definition: blobs.h:261
#define zero_all_bits(array, length)
Definition: bitvec.h:33
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:477
void ClearCharNormArray(uinT8 *char_norm_array)
Definition: float2int.cpp:48
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318
STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results)
int size() const
Definition: shapetable.h:202
int size() const
Definition: genericvector.h:72
int MakeTempProtoPerm(void *item1, void *item2)
#define PRINT_PROTO_MATCHES
Definition: intproto.h:194
void truncate(int size)
virtual int UnicharClassifySample(const TrainingSample &sample, Pix *page_pix, int debug, UNICHAR_ID keep_this, GenericVector< UnicharRating > *results)
int classify_integer_matcher_multiplier
Definition: classify.h:469
void AddProtoToProtoPruner(PROTO Proto, int ProtoId, INT_CLASS Class, bool debug)
Definition: intproto.cpp:389
bool classify_bln_numeric_mode
Definition: classify.h:500
void FreeBitVector(BIT_VECTOR BitVector)
Definition: bitvec.cpp:55
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
void ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset, CLASS_CUTOFF_ARRAY Cutoffs)
Definition: cutoffs.cpp:52
#define WordsInVectorOfSize(NumBits)
Definition: bitvec.h:63
#define PermConfigFor(Class, ConfigId)
Definition: adaptive.h:105
int length() const
Definition: genericvector.h:79
int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
float FLOAT32
Definition: host.h:111
#define MAX(x, y)
Definition: ndminx.h:24
ADAPT_TEMPLATES Templates
Definition: adaptmatch.cpp:114
inT16 PROTO_ID
Definition: matchdefs.h:41
void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices)
Definition: classify.cpp:212
virtual void DebugDisplay(const TrainingSample &sample, Pix *page_pix, UNICHAR_ID unichar_id)
INT_CLASS Class[MAX_NUM_CLASSES]
Definition: intproto.h:124
void EndDangerousAmbigs()
Definition: stopper.cpp:368
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
#define PRINT_MATCH_SUMMARY
Definition: intproto.h:190
float ApplyCNCorrection(float rating, int blob_length, int normalization_factor, int matcher_multiplier)
void ComputeAdaptionThresholds(float certainty_scale, float min_rating, float max_rating, float rating_margin, float *thresholds)
Definition: pageres.cpp:553
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:656
int length() const
Definition: ratngs.h:300
WERD_CHOICE * best_choice
Definition: pageres.h:219
const INT_FEATURE_STRUCT * features() const
int push_back(T object)
BIT_VECTOR PermConfigs
Definition: adaptive.h:69
inT32 BlobLength
Definition: adaptmatch.cpp:83
int FindBadFeatures(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, uinT16 BlobLength, inT16 NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_ID *FeatureArray, int AdaptFeatureThreshold, int Debug)
Definition: intmatcher.cpp:625
GenericVector< CP_RESULT_STRUCT > CPResults
Definition: adaptmatch.cpp:89
#define MAX_NUM_CONFIGS
Definition: intproto.h:46
bool classify_enable_adaptive_matcher
Definition: classify.h:409
TWERD * chopped_word
Definition: pageres.h:201
CLASS_ID ClassId
Definition: adaptmatch.cpp:115
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:167
#define reset_bit(array, bit)
Definition: bitvec.h:59
static void Update()
Definition: scrollview.cpp:715
GenericVector< UnicharRating > match
Definition: adaptmatch.cpp:88
uinT32 * BIT_VECTOR
Definition: bitvec.h:28
#define tprintf(...)
Definition: tprintf.h:31
#define MIN(x, y)
Definition: ndminx.h:28
double matcher_reliable_adaptive_result
Definition: classify.h:421
double tessedit_class_miss_scale
Definition: classify.h:439
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)
FEATURE NewFeature(const FEATURE_DESC_STRUCT *FeatureDesc)
Definition: ocrfeatures.cpp:96
double matcher_good_threshold
Definition: classify.h:420
BIT_VECTOR NewBitVector(int NumBits)
Definition: bitvec.cpp:90
TEMP_PROTO NewTempProto()
Definition: adaptive.cpp:254
UNICHARSET unicharset
Definition: ccutil.h:72
void MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results)
double segment_penalty_dict_case_ok
Definition: dict.h:574
TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId)
Definition: adaptive.cpp:223
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:469
bool prioritize_division
Definition: classify.h:387
#define ADAPTABLE_WERD_ADJUSTMENT
Definition: adaptmatch.cpp:73
int CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File)
Definition: adaptive.cpp:369
PERM_CONFIG_STRUCT * PERM_CONFIG
Definition: adaptive.h:55
inT64 GetEndOffset(TessdataType tessdata_type) const
uinT16 ProtoId
Definition: adaptive.h:30
#define set_all_bits(array, length)
Definition: bitvec.h:41
GenericVector< STRING > correct_text
Definition: pageres.h:259
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:522
inT32 length() const
Definition: strngs.cpp:188
#define IsEmptyAdaptedClass(Class)
Definition: adaptive.h:90
bool DeSerialize(bool swap, FILE *fp)
Definition: shapetable.cpp:256
void AddProtoToClassPruner(PROTO Proto, CLASS_ID ClassId, INT_TEMPLATES Templates)
Definition: intproto.cpp:346
const FontInfo * fontinfo
Definition: pageres.h:288
bool classify_save_adapted_templates
Definition: classify.h:413
#define ConfigIsPermanent(Class, ConfigId)
Definition: adaptive.h:93
double classify_character_fragments_garbage_certainty_threshold
Definition: classify.h:453
bool LargeSpeckle(const TBLOB &blob)
Definition: classify.cpp:235
double classify_adapted_pruning_factor
Definition: classify.h:441
STRING imagefile
Definition: ccutil.h:74
TessdataManager tessdata_manager
Definition: ccutil.h:71
CharSegmentationType
Definition: classify.h:54
int matcher_min_examples_for_prototyping
Definition: classify.h:428
#define test_bit(array, bit)
Definition: bitvec.h:61
int ShapeIDToClassID(int shape_id) const
FEATURE Features[1]
Definition: ocrfeatures.h:72
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:35
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob)
Definition: picofeat.cpp:67
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:488
UNICHAR_ID best_unichar_id
Definition: adaptmatch.cpp:85
inT16 right() const
Definition: rect.h:75
uinT8 ProtoVectorSize
Definition: adaptive.h:42
BIT_VECTOR AllProtosOn
Definition: classify.h:480
void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
Definition: float2int.cpp:69
void ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
int AddIntConfig(INT_CLASS Class)
Definition: intproto.cpp:272
UNICHAR_ID * GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass)
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:505
int classify_learning_debug_level
Definition: classify.h:419
#define ASSERT_HOST(x)
Definition: errcode.h:84
double matcher_perfect_threshold
Definition: classify.h:422
#define UNLIKELY_NUM_FEAT
Definition: adaptmatch.cpp:69
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:524
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
int best_match_index
Definition: adaptmatch.cpp:86
int MaxNumUnichars() const
Definition: shapetable.cpp:465
int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uinT8 *pruner_norm_array, uinT8 *char_norm_array)
void ComputeBest()
Definition: adaptmatch.cpp:99
#define MAX_ADAPTABLE_WERD_SIZE
Definition: adaptmatch.cpp:71
FLOAT32 ActualOutlineLength(FEATURE Feature)
Definition: normfeat.cpp:32
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
double matcher_rating_margin
Definition: classify.h:424
void plot(ScrollView *window)
Definition: blobs.cpp:918
bool classify_nonlinear_norm
Definition: classify.h:416
int NumBlobs() const
Definition: blobs.h:425
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:370
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
bool HasNonfragment
Definition: adaptmatch.cpp:84
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
Definition: float2int.cpp:100
ADAPT_CLASS Class[MAX_NUM_CLASSES]
Definition: adaptive.h:81
BIT_VECTOR AllConfigsOff
Definition: classify.h:482
ShapeTable * shape_table_
Definition: classify.h:512
int AddIntProto(INT_CLASS Class)
Definition: intproto.cpp:295
uinT8 NumPermConfigs
Definition: adaptive.h:65
void LearnPieces(const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
Definition: adaptmatch.cpp:368
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uinT8 *cn_factors)
Pix * pix() const
Definition: normalis.h:248
FLOAT32 X
Definition: protos.h:47
#define LegalClassId(c)
Definition: intproto.h:179
PROTO_STRUCT Proto
Definition: adaptive.h:32
void SetAdaptiveThreshold(FLOAT32 Threshold)
#define LENGTH_COMPRESSION
Definition: normfeat.h:26
int classify_adapt_proto_threshold
Definition: classify.h:445
uinT8 MaxNumTimesSeen
Definition: adaptive.h:66
FEATURE_STRUCT * GetCNFeature() const
#define MakeProtoPermanent(Class, ProtoId)
Definition: adaptive.h:99
unsigned int uinT32
Definition: host.h:103
int matcher_permanent_classes_min
Definition: classify.h:426
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:102
GenericVector< ScoredFont > fonts
Definition: shapetable.h:88
double certainty_scale
Definition: classify.h:437
inT16 left() const
Definition: rect.h:68
STRING to_string() const
Definition: unicharset.h:73
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:611
TWERD * rebuild_word
Definition: pageres.h:244
const FEATURE_DESC_STRUCT CharNormDesc
FLOAT32 Angle
Definition: protos.h:49
void Match(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, inT16 NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:472
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:185
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS Class)
Definition: intproto.cpp:493
char * classify_learn_debug_str
Definition: classify.h:459
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:102
TEMP_PROTO_STRUCT * TEMP_PROTO
Definition: adaptive.h:37
INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES]
Definition: intproto.h:155
void AmbigClassifier(const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
bool classify_use_pre_adapted_templates
Definition: classify.h:411
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob)
void Init(tesseract::IntParam *classify_debug_level)
Definition: intmatcher.cpp:677
void SetAdaptiveThreshold(FLOAT32 Threshold)
#define NO_DEBUG
Definition: adaptmatch.cpp:70
const STRING debug_string() const
Definition: ratngs.h:502
static int SortDescendingRating(const void *t1, const void *t2)
Definition: shapetable.h:56
INT_TEMPLATES ReadIntTemplates(FILE *File)
Definition: intproto.cpp:770
#define NO_PROTO
Definition: matchdefs.h:42
double matcher_avg_noise_size
Definition: classify.h:425
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates)
Definition: adaptmatch.cpp:886
bool PiecesAllNatural(int start, int count) const
Definition: pageres.cpp:1072
FLOAT32 best_rating
Definition: adaptmatch.cpp:87
Dict & getDict()
Definition: classify.h:65
#define MAX_NUM_INT_FEATURES
Definition: intproto.h:132
#define GetPicoFeatureLength()
Definition: picofeat.h:59
#define SET_BIT(array, bit)
Definition: bitvec.h:57
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
void RemoveExtraPuncs(ADAPT_RESULTS *Results)
const int kBlnBaselineOffset
Definition: normalis.h:29
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:374
#define MAX_UINT8
Definition: host.h:121
void RemoveBadMatches(ADAPT_RESULTS *Results)
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:526
void ExpandShapesAndApplyCorrections(ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uinT8 *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
uinT8 FEATURE_ID
Definition: matchdefs.h:47
int UNICHAR_ID
Definition: unichar.h:33
uinT8 NumTimesSeen
Definition: adaptive.h:41
#define ADAPT_TEMPLATE_SUFFIX
Definition: adaptmatch.cpp:66
const DENORM & denorm() const
Definition: blobs.h:340
void UpdateMatchDisplay()
Definition: intproto.cpp:473
#define MAX_INT32
Definition: host.h:120
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:628
TBLOB * ClassifyNormalizeIfNeeded() const
Definition: blobs.cpp:363
#define UnusedClassIdIn(T, c)
Definition: intproto.h:180
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:216
STRING language_data_path_prefix
Definition: ccutil.h:70
static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
Definition: intfx.cpp:133
int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId)
Definition: adaptive.cpp:190
double matcher_bad_match_pad
Definition: classify.h:423
void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class)
Definition: adaptmatch.cpp:978
inT16 bottom() const
Definition: rect.h:61
char window_wait(ScrollView *win)
Definition: callcpp.cpp:111
TBOX bounding_box() const
Definition: blobs.cpp:881
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, GenericVector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:81
#define ClassForClassId(T, c)
Definition: intproto.h:181
INT_TEMPLATES Templates
Definition: adaptive.h:77
bool classify_debug_character_fragments
Definition: classify.h:455
bool empty() const
Definition: genericvector.h:84
CLUSTERCONFIG Config
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
Definition: intmatcher.cpp:409
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:273
void Initialize()
Definition: adaptmatch.cpp:93
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:291
FLOAT32 Params[1]
Definition: ocrfeatures.h:65
void * alloc_struct(inT32 count, const char *)
Definition: memry.cpp:39
#define Y_DIM_OFFSET
Definition: adaptmatch.cpp:75
#define PRINT_FEATURE_MATCHES
Definition: intproto.h:193
void FillABC(PROTO Proto)
Definition: protos.cpp:198
FLOAT32 Length
Definition: protos.h:50
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
const UnicharIdVector * ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:200
void free_int_templates(INT_TEMPLATES templates)
Definition: intproto.cpp:748
#define FALSE
Definition: capi.h:29
PROTO_ID MaxProtoId
Definition: adaptive.h:43
bool AlternativeChoiceAdjustmentsWorseThan(float threshold) const
Definition: pageres.cpp:430
const UNICHARSET & getUnicharset() const
Definition: dict.h:96
bool SeekToStart(TessdataType tessdata_type)
void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results)
#define MakeConfigPermanent(Class, ConfigId)
Definition: adaptive.h:96
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config)
void InitIntegerFX()
Definition: intfx.cpp:55
int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
Definition: adaptmatch.cpp:812
void FreeFeature(FEATURE Feature)
Definition: ocrfeatures.cpp:60
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:826
Definition: cluster.h:32
int matcher_sufficient_examples_for_prototyping
Definition: classify.h:430
int IntCastRounded(double x)
Definition: helpers.h:172
double classify_adapted_pruning_threshold
Definition: classify.h:443
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
IntegerMatcher im_
Definition: classify.h:503
#define WORST_POSSIBLE_RATING
Definition: adaptmatch.cpp:77
float adjust_factor() const
Definition: ratngs.h:303
void FreeTempProto(void *arg)
Definition: adaptive.cpp:90
void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
Definition: adaptmatch.cpp:717
void LearnBlob(const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:69
Definition: rect.h:30
void set_fonts(const GenericVector< tesseract::ScoredFont > &fonts)
Definition: ratngs.h:94
bool LooksLikeGarbage(TBLOB *blob)
const UnicharIdVector * AmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:191
#define TRUE
Definition: capi.h:28
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob)
Definition: outfeat.cpp:47
#define IncreaseConfidence(TempConfig)
Definition: adaptive.h:108
LIST delete_d(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:125
uinT8 NumConfigs
Definition: intproto.h:110
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:220
#define MAX_FLOAT32
Definition: host.h:124
int FindGoodProtos(INT_CLASS ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, uinT16 BlobLength, inT16 NumFeatures, INT_FEATURE_ARRAY Features, PROTO_ID *ProtoArray, int AdaptProtoThreshold, int Debug)
Definition: intmatcher.cpp:554
void free_adapted_templates(ADAPT_TEMPLATES templates)
Definition: adaptive.cpp:199
#define MAX_MATCHES
Definition: adaptmatch.cpp:68
bool MarginalMatch(float confidence, float matcher_great_threshold)
Definition: adaptmatch.cpp:122
UnicityTable< FontSet > fontset_table_
Definition: classify.h:496
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:850
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
bool disable_character_fragments
Definition: classify.h:450
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:456
UNICHAR_ID * Ambigs
Definition: adaptive.h:52
Definition: strngs.h:44
void MasterMatcher(INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
bool classify_enable_learning
Definition: classify.h:389
void cprintf(const char *format,...)
Definition: callcpp.cpp:40
void InitMatcherRatings(register FLOAT32 *Rating)
void XHeightRange(int unichar_id, const UNICHARSET &unicharset, const TBOX &bbox, float *min_xht, float *max_xht, float *yshift) const
Definition: normalis.cpp:428
GenericVector< int > best_state
Definition: pageres.h:255
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:613
#define NULL
Definition: host.h:144
void InitAdaptiveClassifier(bool load_pre_trained_templates)
Definition: adaptmatch.cpp:527
bool use_ambigs_for_adaption
Definition: ccutil.h:93
BIT_VECTOR TempProtoMask
Definition: classify.h:483
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
#define MAX_NUM_PROTOS
Definition: intproto.h:47
TBOX bounding_box() const
Definition: blobs.cpp:482
int size() const
Definition: unicharset.h:297
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:644
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:194
BIT_VECTOR Protos
Definition: adaptive.h:45
const char * string() const
Definition: strngs.cpp:193
const double kStandardFeatureLength
Definition: intfx.h:46
BIT_VECTOR AllConfigsOn
Definition: classify.h:481
bool classify_enable_adaptive_debugger
Definition: classify.h:414
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:323
inT16 top() const
Definition: rect.h:54
UNICHAR_ID * BaselineClassifier(TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
double classify_misfit_junk_penalty
Definition: classify.h:435
#define copy_all_bits(source, dest, length)
Definition: bitvec.h:49
double matcher_clustering_max_angle_delta
Definition: classify.h:432
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:78
void ClassifyAsNoise(ADAPT_RESULTS *Results)
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:473
int classify_adapt_feature_threshold
Definition: classify.h:447
uinT16 NumProtos
Definition: intproto.h:108
NORM_PROTOS * ReadNormProtos(FILE *File, inT64 end_offset)
Definition: normmatch.cpp:245
BIT_VECTOR PermProtos
Definition: adaptive.h:68
void FreeTempConfig(TEMP_CONFIG Config)
Definition: adaptive.cpp:80
int geo_feature(int index) const
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
#define MF_SCALE_FACTOR
Definition: mfoutline.h:63
NORM_PROTOS * NormProtos
Definition: classify.h:486
short inT16
Definition: host.h:100
double certainty_scale
Definition: dict.h:601
int inT32
Definition: host.h:102
FLOAT32 Y
Definition: protos.h:48
unsigned char uinT8
Definition: host.h:99