All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
classify.h
Go to the documentation of this file.
1 // File: classify.h
3 // Description: classify class.
4 // Author: Samuel Charron
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #ifndef TESSERACT_CLASSIFY_CLASSIFY_H__
20 #define TESSERACT_CLASSIFY_CLASSIFY_H__
21 
22 #include "adaptive.h"
23 #include "ccstruct.h"
24 #include "classify.h"
25 #include "dict.h"
26 #include "featdefs.h"
27 #include "fontinfo.h"
28 #include "imagedata.h"
29 #include "intfx.h"
30 #include "intmatcher.h"
31 #include "normalis.h"
32 #include "ratngs.h"
33 #include "ocrfeatures.h"
34 #include "unicity_table.h"
35 
36 class ScrollView;
37 class WERD_CHOICE;
38 class WERD_RES;
39 struct ADAPT_RESULTS;
40 struct NORM_PROTOS;
41 
42 static const int kUnknownFontinfoId = -1;
43 static const int kBlankFontinfoId = -2;
44 
45 namespace tesseract {
46 
47 class ShapeClassifier;
48 struct ShapeRating;
49 class ShapeTable;
50 struct UnicharRating;
51 
52 // How segmented is a blob. In this enum, character refers to a classifiable
53 // unit, but that is too long and character is usually easier to understand.
54 enum CharSegmentationType {
55  CST_FRAGMENT, // A partial character.
56  CST_WHOLE, // A correctly segmented character.
57  CST_IMPROPER, // More than one but less than 2 characters.
58  CST_NGRAM // Multiple characters.
59 };
60 
61 class Classify : public CCStruct {
62  public:
63  Classify();
64  virtual ~Classify();
65  Dict& getDict() {
66  return dict_;
67  }
68 
69  const ShapeTable* shape_table() const {
70  return shape_table_;
71  }
72 
73  // Takes ownership of the given classifier, and uses it for future calls
74  // to CharNormClassifier.
75  void SetStaticClassifier(ShapeClassifier* static_classifier);
76 
77  // Adds a noise classification result that is a bit worse than the worst
78  // current result, or the worst possible result if no current results.
79  void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices);
80 
81  // Returns true if the blob is small enough to be a large speckle.
82  bool LargeSpeckle(const TBLOB &blob);
83 
84  /* adaptive.cpp ************************************************************/
85  ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset);
86  int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId);
87  // Runs the class pruner from int_templates on the given features, returning
88  // the number of classes output in results.
89  // int_templates Class pruner tables
90  // num_features Number of features in blob
91  // features Array of features
92  // normalization_factors (input) Array of int_templates->NumClasses fudge
93  // factors from blob normalization process.
94  // (Indexed by CLASS_INDEX)
95  // expected_num_features (input) Array of int_templates->NumClasses
96  // expected number of features for each class.
97  // (Indexed by CLASS_INDEX)
98  // results (output) Sorted Array of pruned classes.
99  // Array must be sized to take the maximum possible
100  // number of outputs : int_templates->NumClasses.
101  int PruneClasses(const INT_TEMPLATES_STRUCT* int_templates, int num_features,
102  int keep_this, const INT_FEATURE_STRUCT* features,
103  const uinT8* normalization_factors,
104  const uinT16* expected_num_features,
105  GenericVector<CP_RESULT_STRUCT>* results);
106  void ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset,
107  CLASS_CUTOFF_ARRAY Cutoffs);
108  void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
109  void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates);
110  ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File);
111  /* normmatch.cpp ************************************************************/
112  FLOAT32 ComputeNormMatch(CLASS_ID ClassId,
113  const FEATURE_STRUCT& feature, BOOL8 DebugMatch);
114  void FreeNormProtos();
115  NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset);
116  /* protos.cpp ***************************************************************/
117  void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class);
118  INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos,
119  const UNICHARSET& target_unicharset);
120  /* adaptmatch.cpp ***********************************************************/
121 
122  // Learns the given word using its chopped_word, seam_array, denorm,
123  // box_word, best_state, and correct_text to learn both correctly and
124  // incorrectly segmented blobs. If fontname is not NULL, then LearnBlob
125  // is called and the data will be saved in an internal buffer.
126  // Otherwise AdaptToBlob is called for adaption within a document.
127  void LearnWord(const char* fontname, WERD_RES* word);
128 
129  // Builds a blob of length fragments, from the word, starting at start,
130  // and then learns it, as having the given correct_text.
131  // If fontname is not NULL, then LearnBlob is called and the data will be
132  // saved in an internal buffer for static training.
133  // Otherwise AdaptToBlob is called for adaption within a document.
134  // threshold is a magic number required by AdaptToChar and generated by
135  // ComputeAdaptionThresholds.
136  // Although it can be partly inferred from the string, segmentation is
137  // provided to explicitly clarify the character segmentation.
138  void LearnPieces(const char* fontname, int start, int length, float threshold,
139  CharSegmentationType segmentation, const char* correct_text,
140  WERD_RES* word);
141  void InitAdaptiveClassifier(bool load_pre_trained_templates);
142  void InitAdaptedClass(TBLOB *Blob,
143  CLASS_ID ClassId,
144  int FontinfoId,
145  ADAPT_CLASS Class,
146  ADAPT_TEMPLATES Templates);
147  void AmbigClassifier(const GenericVector<INT_FEATURE_STRUCT>& int_features,
148  const INT_FX_RESULT_STRUCT& fx_info,
149  const TBLOB *blob,
150  INT_TEMPLATES templates,
151  ADAPT_CLASS *classes,
152  UNICHAR_ID *ambiguities,
153  ADAPT_RESULTS *results);
154  void MasterMatcher(INT_TEMPLATES templates,
155  inT16 num_features,
156  const INT_FEATURE_STRUCT* features,
157  const uinT8* norm_factors,
158  ADAPT_CLASS* classes,
159  int debug,
160  int matcher_multiplier,
161  const TBOX& blob_box,
162  const GenericVector<CP_RESULT_STRUCT>& results,
163  ADAPT_RESULTS* final_results);
164  // Converts configs to fonts, and if the result is not adapted, and a
165  // shape_table_ is present, the shape is expanded to include all
166  // unichar_ids represented, before applying a set of corrections to the
167  // distance rating in int_result, (see ComputeCorrectedRating.)
168  // The results are added to the final_results output.
169  void ExpandShapesAndApplyCorrections(ADAPT_CLASS* classes,
170  bool debug,
171  int class_id,
172  int bottom, int top,
173  float cp_rating,
174  int blob_length,
175  int matcher_multiplier,
176  const uinT8* cn_factors,
177  UnicharRating* int_result,
178  ADAPT_RESULTS* final_results);
179  // Applies a set of corrections to the distance im_rating,
180  // including the cn_correction, miss penalty and additional penalty
181  // for non-alnums being vertical misfits. Returns the corrected distance.
182  double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating,
183  double im_rating, int feature_misses,
184  int bottom, int top,
185  int blob_length, int matcher_multiplier,
186  const uinT8* cn_factors);
187  void ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box,
188  ADAPT_RESULTS *Results,
189  BLOB_CHOICE_LIST *Choices);
190  void AddNewResult(const UnicharRating& new_result, ADAPT_RESULTS *results);
191  int GetAdaptiveFeatures(TBLOB *Blob,
192  INT_FEATURE_ARRAY IntFeatures,
193  FEATURE_SET *FloatFeatures);
194 
195 #ifndef GRAPHICS_DISABLED
196  void DebugAdaptiveClassifier(TBLOB *Blob,
197  ADAPT_RESULTS *Results);
198 #endif
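199  PROTO_ID MakeNewTempProtos(FEATURE_SET Features,
200  int NumBadFeat,
201  FEATURE_ID BadFeat[],
202  INT_CLASS IClass,
203  ADAPT_CLASS Class,
204  BIT_VECTOR TempProtoMask);
205  int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates,
206  CLASS_ID ClassId,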
207  int FontinfoId,
208  int NumFeatures,
209  INT_FEATURE_ARRAY Features,
210  FEATURE_SET FloatFeatures);
211  void MakePermanent(ADAPT_TEMPLATES Templates,
212  CLASS_ID ClassId,
213  int ConfigId,
214  TBLOB *Blob);
215  void PrintAdaptiveMatchResults(const ADAPT_RESULTS& results);
216  void RemoveExtraPuncs(ADAPT_RESULTS *Results);
217  void RemoveBadMatches(ADAPT_RESULTS *Results);
218  void SetAdaptiveThreshold(FLOAT32 Threshold);
219  void ShowBestMatchFor(int shape_id,
220  const INT_FEATURE_STRUCT* features,
221  int num_features);
222  // Returns a string for the classifier class_id: either the corresponding
223  // unicharset debug_str or the shape_table_ debug str.
224  STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT* templates,
225  int class_id, int config_id) const;
226  // Converts a classifier class_id index with a config ID to:
227  // shape_table_ present: a shape_table_ index OR
228  // No shape_table_: a font ID.
229  // Without shape training, each class_id, config pair represents a single
230  // unichar id/font combination, so this function looks up the corresponding
231  // font id.
232  // With shape training, each class_id, config pair represents a single
233  // shape table index, so the fontset_table stores the shape table index,
234  // and the shape_table_ must be consulted to obtain the actual unichar_id/
235  // font combinations that the shape represents.
236  int ClassAndConfigIDToFontOrShapeID(int class_id,
237  int int_result_config) const;
238  // Converts a shape_table_ index to a classifier class_id index (not a
239  // unichar-id!). Uses a search, so not fast.
240  int ShapeIDToClassID(int shape_id) const;
241  UNICHAR_ID *BaselineClassifier(
242  TBLOB *Blob, const GenericVector<INT_FEATURE_STRUCT>& int_features,
243  const INT_FX_RESULT_STRUCT& fx_info,
244  ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results);
245  int CharNormClassifier(TBLOB *blob,
246  const TrainingSample& sample,
247  ADAPT_RESULTS *adapt_results);
248 
249  // As CharNormClassifier, but operates on a TrainingSample and outputs to
250  // a GenericVector of ShapeRating without conversion to classes.
251  int CharNormTrainingSample(bool pruner_only, int keep_this,
252  const TrainingSample& sample,
253  GenericVector<UnicharRating>* results);
254  UNICHAR_ID *GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass);
255  void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results);
256  void AdaptToChar(TBLOB* Blob, CLASS_ID ClassId, int FontinfoId,
257  FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates);
258  void DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class);
259  bool AdaptableWord(WERD_RES* word);
260  void EndAdaptiveClassifier();
261  void SettupPass1();
262  void SettupPass2();
263  void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices);
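264  void ClassifyAsNoise(ADAPT_RESULTS *Results);
265  void ResetAdaptiveClassifierInternal();
266  void SwitchAdaptiveClassifier();
267  void StartBackupAdaptiveClassifier();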
268 
269  int GetCharNormFeature(const INT_FX_RESULT_STRUCT& fx_info,
270  INT_TEMPLATES templates,
271  uinT8* pruner_norm_array,
272  uinT8* char_norm_array);
273  // Computes the char_norm_array for the unicharset and, if not NULL, the
274  // pruner_array as appropriate according to the existence of the shape_table.
275  // The norm_feature is deleted as it is almost certainly no longer needed.
276  void ComputeCharNormArrays(FEATURE_STRUCT* norm_feature,
277  INT_TEMPLATES_STRUCT* templates,
278  uinT8* char_norm_array,
279  uinT8* pruner_array);
280 
281  bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config);
282  void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob);
283 
284  bool AdaptiveClassifierIsFull() const { return NumAdaptationsFailed > 0; }
285  bool AdaptiveClassifierIsEmpty() const {
286  return AdaptedTemplates->NumPermClasses == 0;
287  }
288  bool LooksLikeGarbage(TBLOB *blob);
289  void RefreshDebugWindow(ScrollView **win, const char *msg,
290  int y_offset, const TBOX &wbox);
291  // intfx.cpp
292  // Computes the DENORMS for bl(baseline) and cn(character) normalization
293  // during feature extraction. The input denorm describes the current state
294  // of the blob, which is usually a baseline-normalized word.
295  // The Transforms setup are as follows:
296  // Baseline Normalized (bl) Output:
297  // We center the grapheme by aligning the x-coordinate of its centroid with
298  // x=128 and leaving the already-baseline-normalized y as-is.
299  //
300  // Character Normalized (cn) Output:
301  // We align the grapheme's centroid at the origin and scale it
302  // asymmetrically in x and y so that the 2nd moments are a standard value
303  // (51.2) ie the result is vaguely square.
304  // If classify_nonlinear_norm is true:
305  // A non-linear normalization is setup that attempts to evenly distribute
306  // edges across x and y.
307  //
308  // Some of the fields of fx_info are also setup:
309  // Length: Total length of outline.
310  // Rx: Rounded y second moment. (Reversed by convention.)
311  // Ry: Rounded x second moment.
312  // Xmean: Rounded x center of mass of the blob.
313  // Ymean: Rounded y center of mass of the blob.
314  static void SetupBLCNDenorms(const TBLOB& blob, bool nonlinear_norm,
315  DENORM* bl_denorm, DENORM* cn_denorm,
316  INT_FX_RESULT_STRUCT* fx_info);
317 
318  // Extracts sets of 3-D features of length kStandardFeatureLength (=12.8), as
319  // (x,y) position and angle as measured counterclockwise from the vector
320  // <-1, 0>, from blob using two normalizations defined by bl_denorm and
321  // cn_denorm. See SetupBLCNDenorms for definitions.
322  // If outline_cn_counts is not NULL, on return it contains the cumulative
323  // number of cn features generated for each outline in the blob (in order).
324  // Thus after the first outline, there were (*outline_cn_counts)[0] features,
325  // after the second outline, there were (*outline_cn_counts)[1] features etc.
326  static void ExtractFeatures(const TBLOB& blob,
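327  bool nonlinear_norm,
328  GenericVector<INT_FEATURE_STRUCT>* bl_features,
329  GenericVector<INT_FEATURE_STRUCT>* cn_features,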
330  INT_FX_RESULT_STRUCT* results,
331  GenericVector<int>* outline_cn_counts);
332  /* float2int.cpp ************************************************************/
333  void ClearCharNormArray(uinT8* char_norm_array);
334  void ComputeIntCharNormArray(const FEATURE_STRUCT& norm_feature,
335  uinT8* char_norm_array);
336  void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures);
337  /* intproto.cpp *************************************************************/
338  INT_TEMPLATES ReadIntTemplates(FILE *File);
339  void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
340  const UNICHARSET& target_unicharset);
341  CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on,
342  bool* pretrained_on, int* shape_id);
343  void ShowMatchDisplay();
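344  /* font detection ***********************************************************/
345  UnicityTable<FontInfo>& get_fontinfo_table() {
346  return fontinfo_table_;
347  }
348  const UnicityTable<FontInfo>& get_fontinfo_table() const {
349  return fontinfo_table_;
350  }
351  UnicityTable<FontSet>& get_fontset_table() {
352  return fontset_table_;
353  }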
354  /* mfoutline.cpp ***********************************************************/
355  void NormalizeOutlines(LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale);
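356  /* outfeat.cpp ***********************************************************/
357  FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob);
358  /* picofeat.cpp ***********************************************************/
359  FEATURE_SET ExtractPicoFeatures(TBLOB *Blob);
360  FEATURE_SET ExtractIntCNFeatures(const TBLOB& blob,
361  const INT_FX_RESULT_STRUCT& fx_info);
362  FEATURE_SET ExtractIntGeoFeatures(const TBLOB& blob,
363  const INT_FX_RESULT_STRUCT& fx_info);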
364  /* blobclass.cpp ***********************************************************/
365  // Extracts features from the given blob and saves them in the tr_file_data_
366  // member variable.
367  // fontname: Name of font that this blob was printed in.
368  // cn_denorm: Character normalization transformation to apply to the blob.
369  // fx_info: Character normalization parameters computed with cn_denorm.
370  // blob_text: Ground truth text for the blob.
371  void LearnBlob(const STRING& fontname, TBLOB* Blob, const DENORM& cn_denorm,
372  const INT_FX_RESULT_STRUCT& fx_info, const char* blob_text);
373  // Writes stored training data to a .tr file based on the given filename.
374  // Returns false on error.
375  bool WriteTRFile(const STRING& filename);
376 
377  // Member variables.
378 
379  // Parameters.
380  // Set during training (in lang.config) to indicate whether the divisible
381  // blobs chopper should be used (true for latin script.)
382  BOOL_VAR_H(allow_blob_division, true, "Use divisible blobs chopping");
383  // Set during training (in lang.config) to indicate whether the divisible
384  // blobs chopper should be used in preference to chopping. Set to true for
385  // southern Indic scripts.
387  "Prioritize blob division over chopping");
388  INT_VAR_H(tessedit_single_match, FALSE, "Top choice only from CP");
389  BOOL_VAR_H(classify_enable_learning, true, "Enable adaptive classifier");
390  INT_VAR_H(classify_debug_level, 0, "Classify debug level");
391 
392  /* mfoutline.cpp ***********************************************************/
393  /* control knobs used to control normalization of outlines */
394  INT_VAR_H(classify_norm_method, character, "Normalization Method ...");
396  "Character Normalization Range ...");
397  double_VAR_H(classify_min_norm_scale_x, 0.0, "Min char x-norm scale ...");
398  double_VAR_H(classify_max_norm_scale_x, 0.325, "Max char x-norm scale ...");
399  double_VAR_H(classify_min_norm_scale_y, 0.0, "Min char y-norm scale ...");
400  double_VAR_H(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...");
402  "Veto ratio between classifier ratings");
404  "Veto difference between classifier certainties");
405 
406  /* adaptmatch.cpp ***********************************************************/
407  BOOL_VAR_H(tess_cn_matching, 0, "Character Normalized Matching");
408  BOOL_VAR_H(tess_bn_matching, 0, "Baseline Normalized Matching");
409  BOOL_VAR_H(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier");
411  "Use pre-adapted classifier templates");
413  "Save adapted templates to a file");
414  BOOL_VAR_H(classify_enable_adaptive_debugger, 0, "Enable match debugger");
416  "Non-linear stroke-density normalization");
417  INT_VAR_H(matcher_debug_level, 0, "Matcher Debug Level");
418  INT_VAR_H(matcher_debug_flags, 0, "Matcher Debug Flags");
419  INT_VAR_H(classify_learning_debug_level, 0, "Learning Debug Level: ");
420  double_VAR_H(matcher_good_threshold, 0.125, "Good Match (0-1)");
421  double_VAR_H(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)");
422  double_VAR_H(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)");
423  double_VAR_H(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)");
424  double_VAR_H(matcher_rating_margin, 0.1, "New template margin (0-1)");
425  double_VAR_H(matcher_avg_noise_size, 12.0, "Avg. noise blob length: ");
426  INT_VAR_H(matcher_permanent_classes_min, 1, "Min # of permanent classes");
428  "Reliable Config Threshold");
430  "Enable adaption even if the ambiguities have not been seen");
432  "Maximum angle delta for prototype clustering");
434  "Penalty to apply when a non-alnum is vertically out of "
435  "its expected textline position");
436  double_VAR_H(rating_scale, 1.5, "Rating scaling factor");
437  double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
439  "Scale factor for features not used");
441  "Prune poor adapted results this much worse than best result");
443  "Threshold at which classify_adapted_pruning_factor starts");
445  "Threshold for good protos during adaptive 0-255");
447  "Threshold for good features during adaptive 0-255");
449  "Do not include character fragments in the"
450  " results of the classifier");
452  "Exclude fragments that do not match any whole character"
453  " with at least this certainty");
455  "Bring up graphical debugging windows for fragments training");
457  "Use two different windows for debugging the matching: "
458  "One for the protos and one for the features.");
459  STRING_VAR_H(classify_learn_debug_str, "", "Class str to debug learning");
460 
461  /* intmatcher.cpp **********************************************************/
463  "Class Pruner Threshold 0-255");
465  "Class Pruner Multiplier 0-255: ");
467  "Class Pruner CutoffStrength: ");
469  "Integer Matcher Multiplier 0-255: ");
470 
471  // Use class variables to hold onto built-in templates and adapted templates.
474  // The backup adapted templates are created from the previous page (only)
475  // so they are always ready and reasonably well trained if the primary
476  // adapted templates become full.
478 
479  // Create dummy proto and config masks for use with the built-in templates.
485  /* normmatch.cpp */
487  /* font detection ***********************************************************/
489  // Without shape training, each class_id, config pair represents a single
490  // unichar id/font combination, so each fontset_table_ entry holds font ids
491  // for each config in the class.
492  // With shape training, each class_id, config pair represents a single
493  // shape_table_ index, so the fontset_table_ stores the shape_table_ index,
494  // and the shape_table_ must be consulted to obtain the actual unichar_id/
495  // font combinations that the shape represents.
497 
498  INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word");
500  "Assume the input is numbers [0-9].");
501  double_VAR_H(speckle_large_max_size, 0.30, "Max large speckle size");
503  "Penalty to add to worst rating for noise");
504 
505  protected:
508  // If a shape_table_ is present, it is used to remap classifier output in
509  // ExpandShapesAndApplyCorrections. font_ids referenced by configs actually
510  // mean an index to the shape_table_ and the choices returned are *all* the
511  // shape_table_ entries at that index.
513 
514  private:
515  Dict dict_;
516  // The currently active static classifier.
517  ShapeClassifier* static_classifier_;
518 
519  /* variables used to hold performance statistics */
520  int NumAdaptationsFailed;
521 
522  // Training data gathered here for all the images in a document.
523  STRING tr_file_data_;
524 
525  // Expected number of features in the class pruner, used to penalize
526  // unknowns that have too few features (like a c being classified as e) so
527  // it doesn't recognize everything as '@' or '#'.
528  // CharNormCutoffs is for the static classifier (with no shapetable).
529  // BaselineCutoffs gets a copy of CharNormCutoffs as an estimate of the real
530  // value in the adaptive classifier. Both are indexed by unichar_id.
531  // shapetable_cutoffs_ provides a similar value for each shape in the
532  // shape_table_
533  uinT16* CharNormCutoffs;
534  uinT16* BaselineCutoffs;
535  GenericVector<uinT16> shapetable_cutoffs_;
536  ScrollView* learn_debug_win_;
537  ScrollView* learn_fragmented_word_debug_win_;
538  ScrollView* learn_fragments_debug_win_;
539 };
540 } // namespace tesseract
541 
542 #endif // TESSERACT_CLASSIFY_CLASSIFY_H__
bool matcher_debug_separate_windows
Definition: classify.h:458
Definition: blobs.h:261
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const
const UnicityTable< FontInfo > & get_fontinfo_table() const
Definition: classify.h:348
ADAPT_TEMPLATES BackupAdaptedTemplates
Definition: classify.h:477
void ClearCharNormArray(uinT8 *char_norm_array)
Definition: float2int.cpp:48
STRING ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results)
int classify_integer_matcher_multiplier
Definition: classify.h:469
bool classify_bln_numeric_mode
Definition: classify.h:500
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
void ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset, CLASS_CUTOFF_ARRAY Cutoffs)
Definition: cutoffs.cpp:52
int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
float FLOAT32
Definition: host.h:111
inT16 PROTO_ID
Definition: matchdefs.h:41
void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices)
Definition: classify.cpp:212
double classify_min_norm_scale_y
Definition: classify.h:399
const ShapeTable * shape_table() const
Definition: classify.h:69
bool classify_enable_adaptive_matcher
Definition: classify.h:409
ADAPT_TEMPLATES NewAdaptedTemplates(bool InitFromUnicharset)
Definition: adaptive.cpp:167
bool AdaptiveClassifierIsFull() const
Definition: classify.h:284
#define INT_VAR_H(name, val, comment)
Definition: params.h:265
uinT32 * BIT_VECTOR
Definition: bitvec.h:28
double matcher_reliable_adaptive_result
Definition: classify.h:421
double tessedit_class_miss_scale
Definition: classify.h:439
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)
double matcher_good_threshold
Definition: classify.h:420
void MakePermanent(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results)
INT_TEMPLATES PreTrainedTemplates
Definition: classify.h:469
bool prioritize_division
Definition: classify.h:387
int CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File)
Definition: adaptive.cpp:369
unsigned char BOOL8
Definition: host.h:113
void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class)
Definition: intproto.cpp:522
bool classify_save_adapted_templates
Definition: classify.h:413
double classify_character_fragments_garbage_certainty_threshold
Definition: classify.h:453
bool LargeSpeckle(const TBLOB &blob)
Definition: classify.cpp:235
double classify_adapted_pruning_factor
Definition: classify.h:441
CharSegmentationType
Definition: classify.h:54
int matcher_min_examples_for_prototyping
Definition: classify.h:428
int ShapeIDToClassID(int shape_id) const
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:35
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob)
Definition: picofeat.cpp:67
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:488
double speckle_rating_penalty
Definition: classify.h:503
BIT_VECTOR AllProtosOn
Definition: classify.h:480
void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
Definition: float2int.cpp:69
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:564
void ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
UNICHAR_ID * GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass)
#define STRING_VAR_H(name, val, comment)
Definition: params.h:271
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:505
int classify_learning_debug_level
Definition: classify.h:419
double matcher_perfect_threshold
Definition: classify.h:422
int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uinT8 *pruner_norm_array, uinT8 *char_norm_array)
virtual ~Classify()
Definition: classify.cpp:192
double matcher_rating_margin
Definition: classify.h:424
UnicityTable< FontSet > & get_fontset_table()
Definition: classify.h:351
bool classify_nonlinear_norm
Definition: classify.h:416
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
Definition: float2int.cpp:100
double speckle_large_max_size
Definition: classify.h:501
BIT_VECTOR AllConfigsOff
Definition: classify.h:482
ShapeTable * shape_table_
Definition: classify.h:512
void LearnPieces(const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
Definition: adaptmatch.cpp:368
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uinT8 *cn_factors)
double classify_max_norm_scale_x
Definition: classify.h:398
int classify_adapt_proto_threshold
Definition: classify.h:445
int matcher_permanent_classes_min
Definition: classify.h:426
double certainty_scale
Definition: classify.h:437
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:185
char * classify_learn_debug_str
Definition: classify.h:459
void SetStaticClassifier(ShapeClassifier *static_classifier)
Definition: classify.cpp:204
int classify_class_pruner_multiplier
Definition: classify.h:465
int classify_class_pruner_threshold
Definition: classify.h:463
INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES]
Definition: intproto.h:155
void AmbigClassifier(const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
bool classify_use_pre_adapted_templates
Definition: classify.h:411
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob)
void SetAdaptiveThreshold(FLOAT32 Threshold)
FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:262
INT_TEMPLATES ReadIntTemplates(FILE *File)
Definition: intproto.cpp:770
double matcher_avg_noise_size
Definition: classify.h:425
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates)
Definition: adaptmatch.cpp:886
Dict & getDict()
Definition: classify.h:65
void RemoveExtraPuncs(ADAPT_RESULTS *Results)
void RemoveBadMatches(ADAPT_RESULTS *Results)
#define double_VAR_H(name, val, comment)
Definition: params.h:274
void ExpandShapesAndApplyCorrections(ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uinT8 *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
uinT8 FEATURE_ID
Definition: matchdefs.h:47
int UNICHAR_ID
Definition: unichar.h:33
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:628
static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
Definition: intfx.cpp:133
int GetFontinfoId(ADAPT_CLASS Class, uinT8 ConfigId)
Definition: adaptive.cpp:190
double matcher_bad_match_pad
Definition: classify.h:423
void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class)
Definition: adaptmatch.cpp:978
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
double classify_max_norm_scale_y
Definition: classify.h:400
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:285
bool classify_debug_character_fragments
Definition: classify.h:455
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
Definition: intmatcher.cpp:409
double classify_max_rating_ratio
Definition: classify.h:402
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates)
Definition: adaptive.cpp:273
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:507
FLOAT32 ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
Definition: normmatch.cpp:88
double classify_char_norm_range
Definition: classify.h:396
double classify_min_norm_scale_x
Definition: classify.h:397
#define FALSE
Definition: capi.h:29
void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results)
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG &config)
int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
Definition: adaptmatch.cpp:812
Definition: cluster.h:32
int matcher_sufficient_examples_for_prototyping
Definition: classify.h:430
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
Definition: intfx.cpp:445
double classify_adapted_pruning_threshold
Definition: classify.h:443
double classify_max_certainty_margin
Definition: classify.h:404
IntegerMatcher im_
Definition: classify.h:503
void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
Definition: adaptmatch.cpp:717
void LearnBlob(const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:69
Definition: rect.h:30
bool LooksLikeGarbage(TBLOB *blob)
#define TRUE
Definition: capi.h:28
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob)
Definition: outfeat.cpp:47
CLASS_ID GetClassToDebug(const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
Definition: intproto.cpp:1405
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:220
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:1138
UnicityTable< FontSet > fontset_table_
Definition: classify.h:496
bool WriteTRFile(const STRING &filename)
Definition: blobclass.cpp:97
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:850
bool disable_character_fragments
Definition: classify.h:450
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:456
Definition: strngs.h:44
void MasterMatcher(INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
bool classify_enable_learning
Definition: classify.h:389
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:613
void InitAdaptiveClassifier(bool load_pre_trained_templates)
Definition: adaptmatch.cpp:527
BIT_VECTOR TempProtoMask
Definition: classify.h:483
int MakeNewTemporaryConfig(ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
bool allow_blob_division
Definition: classify.h:382
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:644
BIT_VECTOR AllConfigsOn
Definition: classify.h:481
bool classify_enable_adaptive_debugger
Definition: classify.h:414
UNICHAR_ID * BaselineClassifier(TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
double classify_misfit_junk_penalty
Definition: classify.h:435
double matcher_clustering_max_angle_delta
Definition: classify.h:432
void ClassifyAsNoise(ADAPT_RESULTS *Results)
uinT16 CLASS_CUTOFF_ARRAY[MAX_NUM_CLASSES]
Definition: cutoffs.h:26
int classify_cp_cutoff_strength
Definition: classify.h:467
#define BOOL_VAR_H(name, val, comment)
Definition: params.h:268
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:473
int classify_adapt_feature_threshold
Definition: classify.h:447
NORM_PROTOS * ReadNormProtos(FILE *File, inT64 end_offset)
Definition: normmatch.cpp:245
FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:230
void NormalizeOutlines(LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale)
Definition: mfoutline.cpp:300
unsigned short uinT16
Definition: host.h:101
NORM_PROTOS * NormProtos
Definition: classify.h:486
short inT16
Definition: host.h:100
unsigned char uinT8
Definition: host.h:99
long long int inT64
Definition: host.h:108