tesseract v5.3.3.20231005
tesseract::Classify Class Reference

#include <classify.h>

Inheritance diagram for tesseract::Classify:
tesseract::CCStruct tesseract::CCUtil tesseract::Wordrec tesseract::Tesseract

Public Member Functions

 Classify ()
 
 ~Classify () override
 
virtual Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
int GetFontinfoId (ADAPT_CLASS_STRUCT *Class, uint8_t ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, std::vector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, uint16_t *Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES_STRUCT *Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES_STRUCT *Templates)
 
ADAPT_TEMPLATES_STRUCT * ReadAdaptedTemplates (TFile *File)
 
void ConvertProto (PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class)
 
INT_TEMPLATES_STRUCT * CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class, ADAPT_TEMPLATES_STRUCT *Templates)
 
void AmbigClassifier (const std::vector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES_STRUCT *templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier, const TBOX &blob_box, const std::vector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS_STRUCT **classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (float Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
std::string ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const std::vector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, std::vector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES_STRUCT *adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uint8_t *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES_STRUCT * ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES_STRUCT *Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, float *XScale, float *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const std::string &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const char *filename)
 
 BOOL_VAR_H (allow_blob_division)
 
 BOOL_VAR_H (prioritize_division)
 
 BOOL_VAR_H (classify_enable_learning)
 
 INT_VAR_H (classify_debug_level)
 
 INT_VAR_H (classify_norm_method)
 
 double_VAR_H (classify_char_norm_range)
 
 double_VAR_H (classify_max_rating_ratio)
 
 double_VAR_H (classify_max_certainty_margin)
 
 BOOL_VAR_H (tess_cn_matching)
 
 BOOL_VAR_H (tess_bn_matching)
 
 BOOL_VAR_H (classify_enable_adaptive_matcher)
 
 BOOL_VAR_H (classify_use_pre_adapted_templates)
 
 BOOL_VAR_H (classify_save_adapted_templates)
 
 BOOL_VAR_H (classify_enable_adaptive_debugger)
 
 BOOL_VAR_H (classify_nonlinear_norm)
 
 INT_VAR_H (matcher_debug_level)
 
 INT_VAR_H (matcher_debug_flags)
 
 INT_VAR_H (classify_learning_debug_level)
 
 double_VAR_H (matcher_good_threshold)
 
 double_VAR_H (matcher_reliable_adaptive_result)
 
 double_VAR_H (matcher_perfect_threshold)
 
 double_VAR_H (matcher_bad_match_pad)
 
 double_VAR_H (matcher_rating_margin)
 
 double_VAR_H (matcher_avg_noise_size)
 
 INT_VAR_H (matcher_permanent_classes_min)
 
 INT_VAR_H (matcher_min_examples_for_prototyping)
 
 INT_VAR_H (matcher_sufficient_examples_for_prototyping)
 
 double_VAR_H (matcher_clustering_max_angle_delta)
 
 double_VAR_H (classify_misfit_junk_penalty)
 
 double_VAR_H (rating_scale)
 
 double_VAR_H (tessedit_class_miss_scale)
 
 double_VAR_H (classify_adapted_pruning_factor)
 
 double_VAR_H (classify_adapted_pruning_threshold)
 
 INT_VAR_H (classify_adapt_proto_threshold)
 
 INT_VAR_H (classify_adapt_feature_threshold)
 
 BOOL_VAR_H (disable_character_fragments)
 
 double_VAR_H (classify_character_fragments_garbage_certainty_threshold)
 
 BOOL_VAR_H (classify_debug_character_fragments)
 
 BOOL_VAR_H (matcher_debug_separate_windows)
 
 STRING_VAR_H (classify_learn_debug_str)
 
 INT_VAR_H (classify_class_pruner_threshold)
 
 INT_VAR_H (classify_class_pruner_multiplier)
 
 INT_VAR_H (classify_cp_cutoff_strength)
 
 INT_VAR_H (classify_integer_matcher_multiplier)
 
 BOOL_VAR_H (classify_bln_numeric_mode)
 
 double_VAR_H (speckle_large_max_size)
 
 double_VAR_H (speckle_rating_penalty)
 
NormEvidenceOf

Return the new type of evidence number corresponding to this normalization adjustment. The equation that represents the transform is: 1 / (1 + (NormAdj / midpoint) ^ curl)

float ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const std::string &argv0, const std::string &basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 
 INT_VAR_H (ambigs_debug_level)
 
 BOOL_VAR_H (use_ambigs_for_adaption)
 

Static Public Member Functions

static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, std::vector< INT_FEATURE_STRUCT > *bl_features, std::vector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, std::vector< int > *outline_cn_counts)
 

Public Attributes

INT_TEMPLATES_STRUCT * PreTrainedTemplates = nullptr
 
ADAPT_TEMPLATES_STRUCT * AdaptedTemplates = nullptr
 
ADAPT_TEMPLATES_STRUCT * BackupAdaptedTemplates = nullptr
 
BIT_VECTOR AllProtosOn = nullptr
 
BIT_VECTOR AllConfigsOn = nullptr
 
BIT_VECTOR AllConfigsOff = nullptr
 
BIT_VECTOR TempProtoMask = nullptr
 
NORM_PROTOS * NormProtos = nullptr
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
bool EnableLearning = true
 
- Public Attributes inherited from tesseract::CCUtil
std::string datadir
 
std::string imagebasename
 
std::string lang
 
std::string language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
std::string imagefile
 
std::string directory
 

Protected Attributes

IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_ = nullptr
 

Additional Inherited Members

- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 

Detailed Description

Definition at line 94 of file classify.h.

Constructor & Destructor Documentation

◆ Classify()

tesseract::Classify::Classify ( )

Definition at line 60 of file classify.cpp.

61 : BOOL_MEMBER(allow_blob_division, true, "Use divisible blobs chopping", this->params())
62 , BOOL_MEMBER(prioritize_division, false, "Prioritize blob division over chopping",
63 this->params())
64 , BOOL_MEMBER(classify_enable_learning, true, "Enable adaptive classifier", this->params())
65 , INT_MEMBER(classify_debug_level, 0, "Classify debug level", this->params())
66 , INT_MEMBER(classify_norm_method, character, "Normalization Method ...", this->params())
67 , double_MEMBER(classify_char_norm_range, 0.2, "Character Normalization Range ...",
68 this->params())
69 , double_MEMBER(classify_max_rating_ratio, 1.5, "Veto ratio between classifier ratings",
70 this->params())
71 , double_MEMBER(classify_max_certainty_margin, 5.5,
72 "Veto difference between classifier certainties", this->params())
73 , BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching", this->params())
74 , BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching", this->params())
75 , BOOL_MEMBER(classify_enable_adaptive_matcher, 1, "Enable adaptive classifier", this->params())
76 , BOOL_MEMBER(classify_use_pre_adapted_templates, 0, "Use pre-adapted classifier templates",
77 this->params())
78 , BOOL_MEMBER(classify_save_adapted_templates, 0, "Save adapted templates to a file",
79 this->params())
80 , BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger", this->params())
81 , BOOL_MEMBER(classify_nonlinear_norm, 0, "Non-linear stroke-density normalization",
82 this->params())
83 , INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params())
84 , INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params())
85 , INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ", this->params())
86 , double_MEMBER(matcher_good_threshold, 0.125, "Good Match (0-1)", this->params())
87 , double_MEMBER(matcher_reliable_adaptive_result, 0.0, "Great Match (0-1)", this->params())
88 , double_MEMBER(matcher_perfect_threshold, 0.02, "Perfect Match (0-1)", this->params())
89 , double_MEMBER(matcher_bad_match_pad, 0.15, "Bad Match Pad (0-1)", this->params())
90 , double_MEMBER(matcher_rating_margin, 0.1, "New template margin (0-1)", this->params())
91 , double_MEMBER(matcher_avg_noise_size, 12.0, "Avg. noise blob length", this->params())
92 , INT_MEMBER(matcher_permanent_classes_min, 1, "Min # of permanent classes", this->params())
93 , INT_MEMBER(matcher_min_examples_for_prototyping, 3, "Reliable Config Threshold",
94 this->params())
95 , INT_MEMBER(matcher_sufficient_examples_for_prototyping, 5,
96 "Enable adaption even if the ambiguities have not been seen", this->params())
97 , double_MEMBER(matcher_clustering_max_angle_delta, 0.015,
98 "Maximum angle delta for prototype clustering", this->params())
99 , double_MEMBER(classify_misfit_junk_penalty, 0.0,
100 "Penalty to apply when a non-alnum is vertically out of "
101 "its expected textline position",
102 this->params())
103 , double_MEMBER(rating_scale, 1.5, "Rating scaling factor", this->params())
104 , double_MEMBER(tessedit_class_miss_scale, 0.00390625, "Scale factor for features not used",
105 this->params())
106 , double_MEMBER(classify_adapted_pruning_factor, 2.5,
107 "Prune poor adapted results this much worse than best result", this->params())
108 , double_MEMBER(classify_adapted_pruning_threshold, -1.0,
109 "Threshold at which classify_adapted_pruning_factor starts", this->params())
110 , INT_MEMBER(classify_adapt_proto_threshold, 230,
111 "Threshold for good protos during adaptive 0-255", this->params())
112 , INT_MEMBER(classify_adapt_feature_threshold, 230,
113 "Threshold for good features during adaptive 0-255", this->params())
114 , BOOL_MEMBER(disable_character_fragments, true,
115 "Do not include character fragments in the"
116 " results of the classifier",
117 this->params())
118 , double_MEMBER(classify_character_fragments_garbage_certainty_threshold, -3.0,
119 "Exclude fragments that do not look like whole"
120 " characters from training and adaption",
121 this->params())
122 , BOOL_MEMBER(classify_debug_character_fragments, false,
123 "Bring up graphical debugging windows for fragments training", this->params())
124 , BOOL_MEMBER(matcher_debug_separate_windows, false,
125 "Use two different windows for debugging the matching: "
126 "One for the protos and one for the features.",
127 this->params())
128 , STRING_MEMBER(classify_learn_debug_str, "", "Class str to debug learning", this->params())
129 , INT_MEMBER(classify_class_pruner_threshold, 229, "Class Pruner Threshold 0-255",
130 this->params())
131 , INT_MEMBER(classify_class_pruner_multiplier, 15,
132 "Class Pruner Multiplier 0-255: ", this->params())
133 , INT_MEMBER(classify_cp_cutoff_strength, 7,
134 "Class Pruner CutoffStrength: ", this->params())
135 , INT_MEMBER(classify_integer_matcher_multiplier, 10,
136 "Integer Matcher Multiplier 0-255: ", this->params())
137 , BOOL_MEMBER(classify_bln_numeric_mode, 0, "Assume the input is numbers [0-9].",
138 this->params())
139 , double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size", this->params())
140 , double_MEMBER(speckle_rating_penalty, 10.0, "Penalty to add to worst rating for noise",
141 this->params())
142 , im_(&classify_debug_level)
143 , dict_(this) {
144 using namespace std::placeholders; // for _1, _2
145 fontinfo_table_.set_clear_callback(std::bind(FontInfoDeleteCallback, _1));
146
147 InitFeatureDefs(&feature_defs_);
148}
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:369
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:375
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:373
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:371
#define classify_enable_adaptive_matcher
Definition: adaptmatch.cpp:78
void FontInfoDeleteCallback(FontInfo f)
Definition: fontinfo.cpp:129
@ character
Definition: mfoutline.h:53
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:87
ParamsVectors * params()
Definition: ccutil.h:53
IntegerMatcher im_
Definition: classify.h:445
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:446
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:434

◆ ~Classify()

tesseract::Classify::~Classify ( )
override

Definition at line 150 of file classify.cpp.

150 {
151 EndAdaptiveClassifier();
152#ifndef GRAPHICS_DISABLED
153 delete learn_debug_win_;
154 delete learn_fragmented_word_debug_win_;
155 delete learn_fragments_debug_win_;
156#endif
157}
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:464

Member Function Documentation

◆ AdaptableWord()

bool tesseract::Classify::AdaptableWord ( WERD_RES * word)

Return true if the specified word is acceptable for adaptation.

Globals: none

Parameters
word: current word
Returns
true or false

Definition at line 811 of file adaptmatch.cpp.

811 {
812 if (word->best_choice == nullptr) {
813 return false;
814 }
815 auto BestChoiceLength = word->best_choice->length();
816 float adaptable_score = getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT;
817 return // rules that apply in general - simplest to compute first
818 BestChoiceLength > 0 && BestChoiceLength == word->rebuild_word->NumBlobs() &&
819 BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE &&
820 // This basically ensures that the word is at least a dictionary match
821 // (freq word, user word, system dawg word, etc).
822 // Since all the other adjustments will make adjust factor higher
823 // than higher than adaptable_score=1.1+0.05=1.15
824 // Since these are other flags that ensure that the word is dict word,
825 // this check could be at times redundant.
826 word->best_choice->adjust_factor() <= adaptable_score &&
827 // Make sure that alternative choices are not dictionary words.
828 word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score);
829}
#define MAX_ADAPTABLE_WERD_SIZE
Definition: adaptmatch.cpp:85
#define ADAPTABLE_WERD_ADJUSTMENT
Definition: adaptmatch.cpp:87
virtual Dict & getDict()
Definition: classify.h:98

◆ AdaptiveClassifier()

void tesseract::Classify::AdaptiveClassifier ( TBLOB * Blob,
BLOB_CHOICE_LIST *  Choices 
)

This routine calls the adaptive matcher which returns (in an array) the class id of each class matched.

It also returns the number of classes matched. For each class matched it places the best rating found for that class into the Ratings array.

Bad matches are then removed so that they don't need to be sorted. The remaining good matches are then sorted and converted to choices.

This routine also performs some simple speckle filtering.

Parameters
Blob: blob to be classified
[out] Choices: List of choices found by adaptive matcher. Filled on return with the choices found by the class pruner and the ratings therefrom. Also contains the detailed results of the integer matcher.

Definition at line 202 of file adaptmatch.cpp.

202 {
203 assert(Choices != nullptr);
204 auto *Results = new ADAPT_RESULTS;
205 Results->Initialize();
206
207 ASSERT_HOST(AdaptedTemplates != nullptr);
208
209 DoAdaptiveMatch(Blob, Results);
210
211 RemoveBadMatches(Results);
212 std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating);
213 RemoveExtraPuncs(Results);
214 Results->ComputeBest();
215 ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results, Choices);
216
217 // TODO(rays) Move to before ConvertMatchesToChoices!
218 if (LargeSpeckle(*Blob) || Choices->empty()) {
219 AddLargeSpeckleTo(Results->BlobLength, Choices);
220 }
221
222 if (matcher_debug_level >= 1) {
223 tprintf("AD Matches = ");
224 PrintAdaptiveMatchResults(*Results);
225 }
226
227#ifndef GRAPHICS_DISABLED
228 if (classify_enable_adaptive_debugger) {
229 DebugAdaptiveClassifier(Blob, Results);
230 }
231#endif
232
233 delete Results;
234} /* AdaptiveClassifier */
#define ASSERT_HOST(x)
Definition: errcode.h:54
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void RemoveBadMatches(ADAPT_RESULTS *Results)
ADAPT_TEMPLATES_STRUCT * AdaptedTemplates
Definition: classify.h:420
bool LargeSpeckle(const TBLOB &blob)
Definition: classify.cpp:190
void DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results)
void RemoveExtraPuncs(ADAPT_RESULTS *Results)
void ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
void DebugAdaptiveClassifier(TBLOB *Blob, ADAPT_RESULTS *Results)
void AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices)
Definition: classify.cpp:169
void PrintAdaptiveMatchResults(const ADAPT_RESULTS &results)

◆ AdaptiveClassifierIsEmpty()

bool tesseract::Classify::AdaptiveClassifierIsEmpty ( ) const
inline

Definition at line 268 of file classify.h.

268 {
269 return AdaptedTemplates->NumPermClasses == 0;
270 }

◆ AdaptiveClassifierIsFull()

bool tesseract::Classify::AdaptiveClassifierIsFull ( ) const
inline

Definition at line 265 of file classify.h.

265 {
266 return NumAdaptationsFailed > 0;
267 }

◆ AdaptToChar()

void tesseract::Classify::AdaptToChar ( TBLOB * Blob,
CLASS_ID  ClassId,
int  FontinfoId,
float  Threshold,
ADAPT_TEMPLATES_STRUCT * adaptive_templates 
)
Parameters
Blob: blob to add to templates for ClassId
ClassId: class to add blob to
FontinfoId: font information from pre-trained templates
Threshold: minimum match rating to existing template
adaptive_templates: current set of adapted templates

Globals:

  • AllProtosOn dummy mask to match against all protos
  • AllConfigsOn dummy mask to match against all configs

Definition at line 843 of file adaptmatch.cpp.

844 {
845 int NumFeatures;
846 INT_FEATURE_ARRAY IntFeatures;
847 UnicharRating int_result;
848 INT_CLASS_STRUCT *IClass;
849 ADAPT_CLASS_STRUCT *Class;
850 TEMP_CONFIG_STRUCT *TempConfig;
851 FEATURE_SET FloatFeatures;
852 int NewTempConfigId;
853
854 if (!LegalClassId(ClassId)) {
855 return;
856 }
857
858 int_result.unichar_id = ClassId;
859 Class = adaptive_templates->Class[ClassId];
860 assert(Class != nullptr);
861 if (IsEmptyAdaptedClass(Class)) {
862 InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates);
863 } else {
864 IClass = ClassForClassId(adaptive_templates->Templates, ClassId);
865
866 NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
867 if (NumFeatures <= 0) {
868 return; // Features already freed by GetAdaptiveFeatures.
869 }
870
871 // Only match configs with the matching font.
872 BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
873 for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
874 if (GetFontinfoId(Class, cfg) == FontinfoId) {
875 SET_BIT(MatchingFontConfigs, cfg);
876 } else {
877 reset_bit(MatchingFontConfigs, cfg);
878 }
879 }
880 im_.Match(IClass, AllProtosOn, MatchingFontConfigs, NumFeatures, IntFeatures, &int_result,
881 classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows);
882 FreeBitVector(MatchingFontConfigs);
883
884 SetAdaptiveThreshold(Threshold);
885
886 if (1.0f - int_result.rating <= Threshold) {
887 if (ConfigIsPermanent(Class, int_result.config)) {
888 if (classify_learning_debug_level >= 1) {
889 tprintf("Found good match to perm config %d = %4.1f%%.\n", int_result.config,
890 int_result.rating * 100.0);
891 }
892 delete FloatFeatures;
893 return;
894 }
895
896 TempConfig = TempConfigFor(Class, int_result.config);
897 IncreaseConfidence(TempConfig);
898 if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
899 Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
900 }
901 if (classify_learning_debug_level >= 1) {
902 tprintf("Increasing reliability of temp config %d to %d.\n", int_result.config,
903 TempConfig->NumTimesSeen);
904 }
905
906 if (TempConfigReliable(ClassId, TempConfig)) {
907 MakePermanent(adaptive_templates, ClassId, int_result.config, Blob);
908 UpdateAmbigsGroup(ClassId, Blob);
909 }
910 } else {
911 if (classify_learning_debug_level >= 1) {
912 tprintf("Found poor match to temp config %d = %4.1f%%.\n", int_result.config,
913 int_result.rating * 100.0);
914#ifndef GRAPHICS_DISABLED
915 if (classify_learning_debug_level > 2) {
916 DisplayAdaptedChar(Blob, IClass);
917 }
918#endif
919 }
920 NewTempConfigId = MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId, NumFeatures,
921 IntFeatures, FloatFeatures);
922 if (NewTempConfigId >= 0 &&
923 TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
924 MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob);
925 UpdateAmbigsGroup(ClassId, Blob);
926 }
927
928#ifndef GRAPHICS_DISABLED
929 if (classify_learning_debug_level > 1) {
930 DisplayAdaptedChar(Blob, IClass);
931 }
932#endif
933 }
934 delete FloatFeatures;
935 }
936} /* AdaptToChar */
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
#define reset_bit(array, bit)
Definition: bitvec.h:57
#define SET_BIT(array, bit)
Definition: bitvec.h:55
#define MAX_NUM_PROTOS
Definition: intproto.h:48
#define ClassForClassId(T, c)
Definition: intproto.h:156
#define LegalClassId(c)
Definition: intproto.h:154
#define NO_DEBUG
Definition: adaptmatch.cpp:84
#define IsEmptyAdaptedClass(Class)
Definition: adaptive.h:83
#define ConfigIsPermanent(Class, ConfigId)
Definition: adaptive.h:85
#define IncreaseConfidence(TempConfig)
Definition: adaptive.h:95
#define TempConfigFor(Class, ConfigId)
Definition: adaptive.h:91
FEATURE_SET_STRUCT * FEATURE_SET
Definition: ocrfeatures.h:87
INT_FEATURE_STRUCT INT_FEATURE_ARRAY[MAX_NUM_INT_FEATURES]
Definition: intproto.h:137
BIT_VECTOR AllProtosOn
Definition: classify.h:427
void DisplayAdaptedChar(TBLOB *blob, INT_CLASS_STRUCT *int_class)
Definition: adaptmatch.cpp:940
int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
Definition: adaptmatch.cpp:778
void UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob)
void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class, ADAPT_TEMPLATES_STRUCT *Templates)
Definition: adaptmatch.cpp:686
bool TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config)
int MakeNewTemporaryConfig(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
void SetAdaptiveThreshold(float Threshold)
int GetFontinfoId(ADAPT_CLASS_STRUCT *Class, uint8_t ConfigId)
Definition: adaptive.cpp:118
void MakePermanent(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
void Match(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, const INT_FEATURE_STRUCT *Features, tesseract::UnicharRating *Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows)
Definition: intmatcher.cpp:482

◆ AddLargeSpeckleTo()

void tesseract::Classify::AddLargeSpeckleTo ( int  blob_length,
BLOB_CHOICE_LIST *  choices 
)

Definition at line 169 of file classify.cpp.

169 {
170 BLOB_CHOICE_IT bc_it(choices);
171 // If there is no classifier result, we will use the worst possible certainty
172 // and corresponding rating.
173 float certainty = -getDict().certainty_scale;
174 float rating = rating_scale * blob_length;
175 if (!choices->empty() && blob_length > 0) {
176 bc_it.move_to_last();
177 BLOB_CHOICE *worst_choice = bc_it.data();
178 // Add speckle_rating_penalty to worst rating, matching old value.
179 rating = worst_choice->rating() + speckle_rating_penalty;
180 // Compute the rating to correspond to the certainty. (Used to be kept
181 // the same, but that messes up the language model search.)
182 certainty = -rating * getDict().certainty_scale / (rating_scale * blob_length);
183 }
184 auto *blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty, -1, 0.0f, FLT_MAX, 0,
185 BCC_SPECKLE_CLASSIFIER);
186 bc_it.add_to_end(blob_choice);
187}
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ BCC_SPECKLE_CLASSIFIER
Definition: ratngs.h:51

◆ AddNewResult()

void tesseract::Classify::AddNewResult ( const UnicharRating & new_result,
ADAPT_RESULTS * results 
)

This routine adds the result of a classification into Results. If the new rating is much worse than the current best rating, it is not entered into results because it would end up being stripped later anyway. If the new rating is better than the old rating for the class, it replaces the old rating. If this is the first rating for the class, the class is added to the list of matched classes in Results. If the new rating is better than the best so far, it becomes the best so far.

Globals:

  • #matcher_bad_match_pad defines limits of an acceptable match
Parameters
new_result: new result to add
[out] results: results to add new result to

Definition at line 986 of file adaptmatch.cpp.

986 {
987 auto old_match = FindScoredUnichar(new_result.unichar_id, *results);
988
989 if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
990 (old_match < results->match.size() &&
991 new_result.rating <= results->match[old_match].rating)) {
992 return; // New one not good enough.
993 }
994
995 if (!unicharset.get_fragment(new_result.unichar_id)) {
996 results->HasNonfragment = true;
997 }
998
999 if (old_match < results->match.size()) {
1000 results->match[old_match].rating = new_result.rating;
1001 } else {
1002 results->match.push_back(new_result);
1003 }
1004
1005 if (new_result.rating > results->best_rating &&
1006 // Ensure that fragments do not affect best rating, class and config.
1007 // This is needed so that at least one non-fragmented character is
1008 // always present in the results.
1009 // TODO(daria): verify that this helps accuracy and does not
1010 // hurt performance.
1011 !unicharset.get_fragment(new_result.unichar_id)) {
1012 results->best_match_index = old_match;
1013 results->best_rating = new_result.rating;
1014 results->best_unichar_id = new_result.unichar_id;
1015 }
1016} /* AddNewResult */
UNICHARSET unicharset
Definition: ccutil.h:61
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:768

◆ AmbigClassifier()

void tesseract::Classify::AmbigClassifier ( const std::vector< INT_FEATURE_STRUCT > &  int_features,
const INT_FX_RESULT_STRUCT & fx_info,
const TBLOB * blob,
INT_TEMPLATES_STRUCT * templates,
ADAPT_CLASS_STRUCT **  classes,
UNICHAR_ID * ambiguities,
ADAPT_RESULTS * results 
)

This routine is identical to CharNormClassifier() except that it does no class pruning. It simply matches the unknown blob against the classes listed in Ambiguities.

Globals:

Parameters
blob: blob to be classified
templates: built-in templates to classify against
classes: adapted class templates
ambiguities: array of unichar id's to match against
[out] results: place to put match results
int_features
fx_info

Definition at line 1037 of file adaptmatch.cpp.

1040 {
1041 if (int_features.empty()) {
1042 return;
1043 }
1044 auto *CharNormArray = new uint8_t[unicharset.size()];
1045 UnicharRating int_result;
1046
1047 results->BlobLength = GetCharNormFeature(fx_info, templates, nullptr, CharNormArray);
1048 bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1049 if (debug) {
1050 tprintf("AM Matches = ");
1051 }
1052
1053 int top = blob->bounding_box().top();
1054 int bottom = blob->bounding_box().bottom();
1055 while (*ambiguities >= 0) {
1056 CLASS_ID class_id = *ambiguities;
1057
1058 int_result.unichar_id = class_id;
1059 im_.Match(ClassForClassId(templates, class_id), AllProtosOn, AllConfigsOn, int_features.size(),
1060 &int_features[0], &int_result, classify_adapt_feature_threshold, NO_DEBUG,
1061 matcher_debug_separate_windows);
1062
1063 ExpandShapesAndApplyCorrections(nullptr, debug, class_id, bottom, top, 0, results->BlobLength,
1064 classify_integer_matcher_multiplier, CharNormArray, &int_result,
1065 results);
1066 ambiguities++;
1067 }
1068 delete[] CharNormArray;
1069} /* AmbigClassifier */
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
size_t size() const
Definition: unicharset.h:355
int GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array)
void ExpandShapesAndApplyCorrections(ADAPT_CLASS_STRUCT **classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
BIT_VECTOR AllConfigsOn
Definition: classify.h:428

◆ BaselineClassifier()

UNICHAR_ID * tesseract::Classify::BaselineClassifier ( TBLOB *  Blob,
const std::vector< INT_FEATURE_STRUCT > &  int_features,
const INT_FX_RESULT_STRUCT &  fx_info,
ADAPT_TEMPLATES_STRUCT *  Templates,
ADAPT_RESULTS *  Results 
)

This routine extracts baseline normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Globals:

  • BaselineCutoffs expected num features for each class
Parameters
Blob – blob to be classified
Templates – current set of adapted templates
Results – place to put match results
int_features
fx_info
Returns
Array of possible ambiguous chars that should be checked.

Definition at line 1224 of file adaptmatch.cpp.

1227 {
1228 if (int_features.empty()) {
1229 return nullptr;
1230 }
1231 auto *CharNormArray = new uint8_t[unicharset.size()];
1232 ClearCharNormArray(CharNormArray);
1233
1234 Results->BlobLength = IntCastRounded(fx_info.Length / kStandardFeatureLength);
1235 PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0], CharNormArray,
1236 BaselineCutoffs, &Results->CPResults);
1237
1238 if (matcher_debug_level >= 2 || classify_debug_level > 1) {
1239 tprintf("BL Matches = ");
1240 }
1241
1242 MasterMatcher(Templates->Templates, int_features.size(), &int_features[0], CharNormArray,
1243 Templates->Class, matcher_debug_flags, 0, Blob->bounding_box(), Results->CPResults,
1244 Results);
1245
1246 delete[] CharNormArray;
1247 CLASS_ID ClassId = Results->best_unichar_id;
1248 if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0) {
1249 return nullptr;
1250 }
1251
1252 return Templates->Class[ClassId]
1253 ->Config[Results->match[Results->best_match_index].config]
1254 .Perm->Ambigs;
1255} /* BaselineClassifier */
const double kStandardFeatureLength
Definition: intfx.h:44
int IntCastRounded(double x)
Definition: helpers.h:170
int PruneClasses(const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, std::vector< CP_RESULT_STRUCT > *results)
Definition: intmatcher.cpp:427
void ClearCharNormArray(uint8_t *char_norm_array)
Definition: float2int.cpp:41
void MasterMatcher(INT_TEMPLATES_STRUCT *templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier, const TBOX &blob_box, const std::vector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)

◆ BOOL_VAR_H() [1/14]

tesseract::Classify::BOOL_VAR_H ( allow_blob_division  )

◆ BOOL_VAR_H() [2/14]

tesseract::Classify::BOOL_VAR_H ( classify_bln_numeric_mode  )

◆ BOOL_VAR_H() [3/14]

tesseract::Classify::BOOL_VAR_H ( classify_debug_character_fragments  )

◆ BOOL_VAR_H() [4/14]

tesseract::Classify::BOOL_VAR_H ( classify_enable_adaptive_debugger  )

◆ BOOL_VAR_H() [5/14]

tesseract::Classify::BOOL_VAR_H ( classify_enable_adaptive_matcher  )

◆ BOOL_VAR_H() [6/14]

tesseract::Classify::BOOL_VAR_H ( classify_enable_learning  )

◆ BOOL_VAR_H() [7/14]

tesseract::Classify::BOOL_VAR_H ( classify_nonlinear_norm  )

◆ BOOL_VAR_H() [8/14]

tesseract::Classify::BOOL_VAR_H ( classify_save_adapted_templates  )

◆ BOOL_VAR_H() [9/14]

tesseract::Classify::BOOL_VAR_H ( classify_use_pre_adapted_templates  )

◆ BOOL_VAR_H() [10/14]

tesseract::Classify::BOOL_VAR_H ( disable_character_fragments  )

◆ BOOL_VAR_H() [11/14]

tesseract::Classify::BOOL_VAR_H ( matcher_debug_separate_windows  )

◆ BOOL_VAR_H() [12/14]

tesseract::Classify::BOOL_VAR_H ( prioritize_division  )

◆ BOOL_VAR_H() [13/14]

tesseract::Classify::BOOL_VAR_H ( tess_bn_matching  )

◆ BOOL_VAR_H() [14/14]

tesseract::Classify::BOOL_VAR_H ( tess_cn_matching  )

◆ CharNormClassifier()

int tesseract::Classify::CharNormClassifier ( TBLOB *  blob,
const TrainingSample &  sample,
ADAPT_RESULTS *  adapt_results 
)

This routine extracts character normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Parameters
blob – blob to be classified
sample – templates to classify unknown against
adapt_results – place to put match results

Globals:

  • CharNormCutoffs expected num features for each class
  • AllProtosOn mask that enables all protos
  • AllConfigsOn mask that enables all configs

Definition at line 1273 of file adaptmatch.cpp.

1274 {
1275 // This is the length that is used for scaling ratings vs certainty.
1276 adapt_results->BlobLength = IntCastRounded(sample.outline_length() / kStandardFeatureLength);
1277 std::vector<UnicharRating> unichar_results;
1278 static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0, -1, &unichar_results);
1279 // Convert results to the format used internally by AdaptiveClassifier.
1280 for (auto &r : unichar_results) {
1281 AddNewResult(r, adapt_results);
1282 }
1283 return sample.num_features();
1284} /* CharNormClassifier */
void AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results)
Definition: adaptmatch.cpp:986
virtual int UnicharClassifySample(const TrainingSample &sample, Image page_pix, int debug, UNICHAR_ID keep_this, std::vector< UnicharRating > *results)

◆ CharNormTrainingSample()

int tesseract::Classify::CharNormTrainingSample ( bool  pruner_only,
int  keep_this,
const TrainingSample &  sample,
std::vector< UnicharRating > *  results 
)

Definition at line 1288 of file adaptmatch.cpp.

1289 {
1290 results->clear();
1291 std::unique_ptr<ADAPT_RESULTS> adapt_results(new ADAPT_RESULTS());
1292 adapt_results->Initialize();
1293 // Compute the bounding box of the features.
1294 uint32_t num_features = sample.num_features();
1295 // Only the top and bottom of the blob_box are used by MasterMatcher, so
1296 // fabricate right and left using top and bottom.
1297 TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
1298 sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
1299 // Compute the char_norm_array from the saved cn_feature.
1300 FEATURE norm_feature = sample.GetCNFeature();
1301 std::vector<uint8_t> char_norm_array(unicharset.size());
1302 auto num_pruner_classes = std::max(static_cast<unsigned>(unicharset.size()), PreTrainedTemplates->NumClasses);
1303 std::vector<uint8_t> pruner_norm_array(num_pruner_classes);
1304 adapt_results->BlobLength = static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5f);
1305 ComputeCharNormArrays(norm_feature, PreTrainedTemplates, &char_norm_array[0], &pruner_norm_array[0]);
1306
1307 PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(), &pruner_norm_array[0],
1308 shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs,
1309 &adapt_results->CPResults);
1310 if (keep_this >= 0) {
1311 adapt_results->CPResults[0].Class = keep_this;
1312 adapt_results->CPResults.resize(1);
1313 }
1314 if (pruner_only) {
1315 // Convert pruner results to output format.
1316 for (auto &it : adapt_results->CPResults) {
1317 int class_id = it.Class;
1318 results->push_back(UnicharRating(class_id, 1.0f - it.Rating));
1319 }
1320 } else {
1321 MasterMatcher(PreTrainedTemplates, num_features, sample.features(), &char_norm_array[0], nullptr,
1322 matcher_debug_flags, classify_integer_matcher_multiplier, blob_box,
1323 adapt_results->CPResults, adapt_results.get());
1324 // Convert master matcher results to output format.
1325 for (auto &i : adapt_results->match) {
1326 results->push_back(i);
1327 }
1328 if (results->size() > 1) {
1329 std::sort(results->begin(), results->end(), SortDescendingRating);
1330 }
1331 }
1332 return num_features;
1333} /* CharNormTrainingSample */
@ TBOX
FEATURE_STRUCT * FEATURE
Definition: ocrfeatures.h:68
@ GeoTop
Definition: picofeat.h:37
@ GeoBottom
Definition: picofeat.h:36
float ActualOutlineLength(FEATURE Feature)
Definition: normfeat.cpp:27
ShapeTable * shape_table_
Definition: classify.h:451
INT_TEMPLATES_STRUCT * PreTrainedTemplates
Definition: classify.h:419
void ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array)

◆ ClassAndConfigIDToFontOrShapeID()

int tesseract::Classify::ClassAndConfigIDToFontOrShapeID ( int  class_id,
int  int_result_config 
) const

Definition at line 2109 of file adaptmatch.cpp.

2109 {
2110 int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
2111 // Older inttemps have no font_ids.
2112 if (font_set_id < 0) {
2113 return kBlankFontinfoId;
2114 }
2115 const FontSet &fs = fontset_table_.at(font_set_id);
2116 return fs.at(int_result_config);
2117}
std::vector< int > FontSet
Definition: fontinfo.h:154
const T & at(int id) const
Return the object from an id.
Definition: unicity_table.h:56
UnicityTable< FontSet > fontset_table_
Definition: classify.h:442
INT_CLASS_STRUCT * Class[MAX_NUM_CLASSES]
Definition: intproto.h:111

◆ ClassIDToDebugStr()

std::string tesseract::Classify::ClassIDToDebugStr ( const INT_TEMPLATES_STRUCT *  templates,
int  class_id,
int  config_id 
) const

Definition at line 2096 of file adaptmatch.cpp.

2097 {
2098 std::string class_string;
2099 if (templates == PreTrainedTemplates && shape_table_ != nullptr) {
2100 int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
2101 class_string = shape_table_->DebugStr(shape_id);
2102 } else {
2103 class_string = unicharset.debug_str(class_id);
2104 }
2105 return class_string;
2106}
std::string debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:331
int ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const
std::string DebugStr(unsigned shape_id) const
Definition: shapetable.cpp:292

◆ ClassifyAsNoise()

void tesseract::Classify::ClassifyAsNoise ( ADAPT_RESULTS *  results )

This routine computes a rating which reflects the likelihood that the blob being classified is a noise blob. NOTE: assumes that the blob length has already been computed and placed into Results.

Parameters
results – results to add noise classification to

Globals:

  • matcher_avg_noise_size avg. length of a noise blob

Definition at line 1347 of file adaptmatch.cpp.

1347 {
1348 float rating = results->BlobLength / matcher_avg_noise_size;
1349 rating *= rating;
1350 rating /= 1 + rating;
1351
1352 AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);
1353} /* ClassifyAsNoise */

◆ ClearCharNormArray()

void tesseract::Classify::ClearCharNormArray ( uint8_t *  char_norm_array)

For each class in the unicharset, clears the corresponding entry in char_norm_array. char_norm_array is indexed by unichar_id.

Globals:

  • none
Parameters
char_norm_array – array to be cleared

Definition at line 41 of file float2int.cpp.

41 {
42 memset(char_norm_array, 0, sizeof(*char_norm_array) * unicharset.size());
43} /* ClearCharNormArray */

◆ ComputeCharNormArrays()

void tesseract::Classify::ComputeCharNormArrays ( FEATURE_STRUCT *  norm_feature,
INT_TEMPLATES_STRUCT *  templates,
uint8_t *  char_norm_array,
uint8_t *  pruner_array 
)

Definition at line 1629 of file adaptmatch.cpp.

1630 {
1631 ComputeIntCharNormArray(*norm_feature, char_norm_array);
1632 //if (pruner_array != nullptr) {
1633 if (shape_table_ == nullptr) {
1634 ComputeIntCharNormArray(*norm_feature, pruner_array);
1635 } else {
1636 memset(&pruner_array[0], UINT8_MAX, templates->NumClasses * sizeof(pruner_array[0]));
1637 // Each entry in the pruner norm array is the MIN of all the entries of
1638 // the corresponding unichars in the CharNormArray.
1639 for (unsigned id = 0; id < templates->NumClasses; ++id) {
1640 int font_set_id = templates->Class[id]->font_set_id;
1641 const FontSet &fs = fontset_table_.at(font_set_id);
1642 for (auto f : fs) {
1643 const Shape &shape = shape_table_->GetShape(f);
1644 for (int c = 0; c < shape.size(); ++c) {
1645 if (char_norm_array[shape[c].unichar_id] < pruner_array[id]) {
1646 pruner_array[id] = char_norm_array[shape[c].unichar_id];
1647 }
1648 }
1649 }
1650 }
1651 }
1652 //}
1653 delete norm_feature;
1654}
void ComputeIntCharNormArray(const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array)
Definition: float2int.cpp:58
const Shape & GetShape(unsigned shape_id) const
Definition: shapetable.h:292

◆ ComputeCorrectedRating()

double tesseract::Classify::ComputeCorrectedRating ( bool  debug,
int  unichar_id,
double  cp_rating,
double  im_rating,
int  feature_misses,
int  bottom,
int  top,
int  blob_length,
int  matcher_multiplier,
const uint8_t *  cn_factors 
)

Definition at line 1171 of file adaptmatch.cpp.

1174 {
1175 // Compute class feature corrections.
1176 double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length, cn_factors[unichar_id],
1177 matcher_multiplier);
1178 double miss_penalty = tessedit_class_miss_scale * feature_misses;
1179 double vertical_penalty = 0.0;
1180 // Penalize non-alnums for being vertical misfits.
1181 if (!unicharset.get_isalpha(unichar_id) && !unicharset.get_isdigit(unichar_id) &&
1182 cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
1183 int min_bottom, max_bottom, min_top, max_top;
1184 unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top);
1185 if (debug) {
1186 tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n", top, min_top, max_top, bottom,
1187 min_bottom, max_bottom);
1188 }
1189 if (top < min_top || top > max_top || bottom < min_bottom || bottom > max_bottom) {
1190 vertical_penalty = classify_misfit_junk_penalty;
1191 }
1192 }
1193 double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
1194 if (result < WORST_POSSIBLE_RATING) {
1195 result = WORST_POSSIBLE_RATING;
1196 }
1197 if (debug) {
1198 tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
1199 unicharset.id_to_unichar(unichar_id), result * 100.0, cp_rating * 100.0,
1200 (1.0 - im_rating) * 100.0, (cn_corrected - (1.0 - im_rating)) * 100.0,
1201 cn_factors[unichar_id], miss_penalty * 100.0, vertical_penalty * 100.0);
1202 }
1203 return result;
1204}
#define WORST_POSSIBLE_RATING
Definition: adaptmatch.cpp:91
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:586
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
float ApplyCNCorrection(float rating, int blob_length, int normalization_factor, int matcher_multiplier)

◆ ComputeIntCharNormArray()

void tesseract::Classify::ComputeIntCharNormArray ( const FEATURE_STRUCT &  norm_feature,
uint8_t *  char_norm_array 
)

For each class in unicharset, computes the match between norm_feature and the normalization protos for that class. Converts this number to the range from 0 - 255 and stores it into char_norm_array. CharNormArray is indexed by unichar_id.

Globals:

  • PreTrainedTemplates current set of built-in templates
Parameters
norm_feature – character normalization feature
[out] char_norm_array – place to put results of size unicharset.size()

Definition at line 58 of file float2int.cpp.

59 {
60 for (unsigned i = 0; i < unicharset.size(); i++) {
61 if (i < PreTrainedTemplates->NumClasses) {
62 int norm_adjust =
63 static_cast<int>(INT_CHAR_NORM_RANGE * ComputeNormMatch(i, norm_feature, false));
64 char_norm_array[i] = ClipToRange(norm_adjust, 0, MAX_INT_CHAR_NORM);
65 } else {
66 // Classes with no templates (eg. ambigs & ligatures) default
67 // to worst match.
68 char_norm_array[i] = MAX_INT_CHAR_NORM;
69 }
70 }
71} /* ComputeIntCharNormArray */
#define INT_CHAR_NORM_RANGE
Definition: intproto.h:117
#define MAX_INT_CHAR_NORM
Definition: float2int.cpp:27
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:105
float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
Definition: normmatch.cpp:94

◆ ComputeIntFeatures()

void tesseract::Classify::ComputeIntFeatures ( FEATURE_SET  Features,
INT_FEATURE_ARRAY  IntFeatures 
)

This routine converts each floating point pico-feature in Features into integer format and saves it into IntFeatures.

Globals:

  • none
Parameters
Features – floating point pico-features to be converted
[out] IntFeatures – array to put converted features into

Definition at line 85 of file float2int.cpp.

85 {
86 float YShift;
87
88 if (classify_norm_method == baseline) {
89 YShift = BASELINE_Y_SHIFT;
90 } else {
91 YShift = Y_SHIFT;
92 }
93
94 for (int Fid = 0; Fid < Features->NumFeatures; Fid++) {
95 FEATURE Feature = Features->Features[Fid];
96
97 IntFeatures[Fid].X = Bucket8For(Feature->Params[PicoFeatX], X_SHIFT, INT_FEAT_RANGE);
98 IntFeatures[Fid].Y = Bucket8For(Feature->Params[PicoFeatY], YShift, INT_FEAT_RANGE);
99 IntFeatures[Fid].Theta =
100 CircBucketFor(Feature->Params[PicoFeatDir], ANGLE_SHIFT, INT_FEAT_RANGE);
101 IntFeatures[Fid].CP_misses = 0;
102 }
103} /* ComputeIntFeatures */
#define ANGLE_SHIFT
Definition: intproto.h:40
#define X_SHIFT
Definition: intproto.h:41
#define Y_SHIFT
Definition: intproto.h:42
#define BASELINE_Y_SHIFT
Definition: float2int.h:28
#define INT_FEAT_RANGE
Definition: float2int.h:27
uint8_t Bucket8For(float param, float offset, int num_buckets)
Definition: intproto.cpp:385
@ PicoFeatDir
Definition: picofeat.h:43
@ PicoFeatX
Definition: picofeat.h:43
@ PicoFeatY
Definition: picofeat.h:43
@ baseline
Definition: mfoutline.h:53
uint8_t CircBucketFor(float param, float offset, int num_buckets)
Definition: intproto.cpp:399

◆ ComputeNormMatch()

float tesseract::Classify::ComputeNormMatch ( CLASS_ID  ClassId,
const FEATURE_STRUCT &  feature,
bool  DebugMatch 
)

This routine compares Features against each character normalization proto for ClassId and returns the match rating of the best match.

Parameters
ClassId – id of class to match against
feature – character normalization feature
DebugMatch – controls dump of debug info

Globals: NormProtos character normalization prototypes

Returns
Best match rating for Feature against protos of ClassId.

Definition at line 94 of file normmatch.cpp.

94 {
95 if (ClassId >= NormProtos->NumProtos) {
96 ClassId = NO_CLASS;
97 }
98
99 /* handle requests for classification as noise */
100 if (ClassId == NO_CLASS) {
101 /* kludge - clean up constants and make into control knobs later */
102 float Match = (feature.Params[CharNormLength] * feature.Params[CharNormLength] * 500.0f +
103 feature.Params[CharNormRx] * feature.Params[CharNormRx] * 8000.0f +
104 feature.Params[CharNormRy] * feature.Params[CharNormRy] * 8000.0f);
105 return (1.0f - NormEvidenceOf(Match));
106 }
107
108 float BestMatch = FLT_MAX;
109 LIST Protos = NormProtos->Protos[ClassId];
110
111 if (DebugMatch) {
112 tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
113 }
114
115 int ProtoId = 0;
116 iterate(Protos) {
117 auto Proto = reinterpret_cast<PROTOTYPE *>(Protos->first_node());
118 float Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
119 float Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
120 if (DebugMatch) {
121 tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormY], Delta,
122 Proto->Weight.Elliptical[CharNormY], Match);
123 }
124 Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
125 Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
126 if (DebugMatch) {
127 tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormRx], Delta,
128 Proto->Weight.Elliptical[CharNormRx], Match);
129 }
130 // Ry is width! See intfx.cpp.
131 Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
132 if (DebugMatch) {
133 tprintf("Width: Proto=%g, Delta=%g, Var=%g\n", Proto->Mean[CharNormRy], Delta,
134 Proto->Weight.Elliptical[CharNormRy]);
135 }
136 Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
137 Delta *= kWidthErrorWeighting;
138 Match += Delta;
139 if (DebugMatch) {
140 tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n", Match,
141 Match / classify_norm_adj_midpoint, NormEvidenceOf(Match),
142 256 * (1 - NormEvidenceOf(Match)));
143 }
144
145 if (Match < BestMatch) {
146 BestMatch = Match;
147 }
148
149 ProtoId++;
150 }
151 return 1.0 - NormEvidenceOf(BestMatch);
152} /* ComputeNormMatch */
#define NO_CLASS
Definition: matchdefs.h:35
#define iterate(l)
Definition: oldlist.h:91
list_rec * LIST
Definition: oldlist.h:125
const double kWidthErrorWeighting
Definition: normmatch.cpp:76
double classify_norm_adj_midpoint
Definition: normmatch.cpp:73
@ CharNormLength
Definition: normfeat.h:30
@ CharNormRy
Definition: normfeat.h:30
@ CharNormY
Definition: normfeat.h:30
@ CharNormRx
Definition: normfeat.h:30
NORM_PROTOS * NormProtos
Definition: classify.h:432
std::vector< LIST > Protos
Definition: normmatch.cpp:41

◆ ConvertMatchesToChoices()

void tesseract::Classify::ConvertMatchesToChoices ( const DENORM &  denorm,
const TBOX &  box,
ADAPT_RESULTS *  Results,
BLOB_CHOICE_LIST *  Choices 
)

The function converts the given match ratings to the list of blob choices with ratings and certainties (used by the context checkers). If character fragments are present in the results, this function also makes sure that there is at least one non-fragmented classification included. For each classification result check the unicharset for "definite" ambiguities and modify the resulting Choices accordingly.

Definition at line 1361 of file adaptmatch.cpp.

1362 {
1363 assert(Choices != nullptr);
1364 float Rating;
1365 float Certainty;
1366 BLOB_CHOICE_IT temp_it;
1367 bool contains_nonfrag = false;
1368 temp_it.set_to_list(Choices);
1369 int choices_length = 0;
1370 // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
1371 // number of returned results, but with a shape_table_ we want to have room
1372 // for at least the biggest shape (which might contain hundreds of Indic
1373 // grapheme fragments) and more, so use double the size of the biggest shape
1374 // if that is more than the default.
1375 int max_matches = MAX_MATCHES;
1376 if (shape_table_ != nullptr) {
1377 max_matches = shape_table_->MaxNumUnichars() * 2;
1378 if (max_matches < MAX_MATCHES) {
1379 max_matches = MAX_MATCHES;
1380 }
1381 }
1382
1383 float best_certainty = -FLT_MAX;
1384 for (auto &it : Results->match) {
1385 const UnicharRating &result = it;
1386 bool adapted = result.adapted;
1387 bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != nullptr);
1388 if (temp_it.length() + 1 == max_matches && !contains_nonfrag && current_is_frag) {
1389 continue; // look for a non-fragmented character to fill the
1390 // last spot in Choices if only fragments are present
1391 }
1392 // BlobLength can never be legally 0, this means recognition failed.
1393 // But we must return a classification result because some invoking
1394 // functions (chopper/permuter) do not anticipate a null blob choice.
1395 // So we need to assign a poor, but not infinitely bad score.
1396 if (Results->BlobLength == 0) {
1397 Certainty = -20;
1398 Rating = 100; // should be -certainty * real_blob_length
1399 } else {
1400 Rating = Certainty = (1.0f - result.rating);
1401 Rating *= rating_scale * Results->BlobLength;
1402 Certainty *= -(getDict().certainty_scale);
1403 }
1404 // Adapted results, by their very nature, should have good certainty.
1405 // Those that don't are at best misleading, and often lead to errors,
1406 // so don't accept adapted results that are too far behind the best result,
1407 // whether adapted or static.
1408 // TODO(rays) find some way of automatically tuning these constants.
1409 if (Certainty > best_certainty) {
1410 best_certainty = std::min(Certainty, static_cast<float>(classify_adapted_pruning_threshold));
1411 } else if (adapted && Certainty / classify_adapted_pruning_factor < best_certainty) {
1412 continue; // Don't accept bad adapted results.
1413 }
1414
1415 float min_xheight, max_xheight, yshift;
1416 denorm.XHeightRange(result.unichar_id, unicharset, box, &min_xheight, &max_xheight, &yshift);
1417 auto *choice = new BLOB_CHOICE(
1418 result.unichar_id, Rating, Certainty, unicharset.get_script(result.unichar_id), min_xheight,
1419 max_xheight, yshift, adapted ? BCC_ADAPTED_CLASSIFIER : BCC_STATIC_CLASSIFIER);
1420 choice->set_fonts(result.fonts);
1421 temp_it.add_to_end(choice);
1422 contains_nonfrag |= !current_is_frag; // update contains_nonfrag
1423 choices_length++;
1424 if (choices_length >= max_matches) {
1425 break;
1426 }
1427 }
1428 Results->match.resize(choices_length);
1429} // ConvertMatchesToChoices
#define MAX_MATCHES
Definition: adaptmatch.cpp:82
@ BCC_STATIC_CLASSIFIER
Definition: ratngs.h:49
@ BCC_ADAPTED_CLASSIFIER
Definition: ratngs.h:50
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:681
int MaxNumUnichars() const
Definition: shapetable.cpp:472

◆ ConvertProto()

void tesseract::Classify::ConvertProto ( PROTO_STRUCT *  Proto,
int  ProtoId,
INT_CLASS_STRUCT *  Class 
)

This routine converts Proto to integer format and installs it as ProtoId in Class.

Parameters
Proto – floating-pt proto to be converted to integer format
ProtoId – id of proto
Class – integer class to add converted proto to

Definition at line 452 of file intproto.cpp.

452 {
453 assert(ProtoId < Class->NumProtos);
454
455 INT_PROTO_STRUCT *P = ProtoForProtoId(Class, ProtoId);
456
457 float Param = Proto->A * 128;
458 P->A = TruncateParam(Param, -128, 127);
459
460 Param = -Proto->B * 256;
461 P->B = TruncateParam(Param, 0, 255);
462
463 Param = Proto->C * 128;
464 P->C = TruncateParam(Param, -128, 127);
465
466 Param = Proto->Angle * 256;
467 if (Param < 0 || Param >= 256) {
468 P->Angle = 0;
469 } else {
470 P->Angle = static_cast<uint8_t>(Param);
471 }
472
473 /* round proto length to nearest integer number of pico-features */
474 Param = (Proto->Length / GetPicoFeatureLength()) + 0.5;
475 Class->ProtoLengths[ProtoId] = TruncateParam(Param, 1, 255);
476 if (classify_learning_debug_level >= 2) {
477 tprintf("Converted ffeat to (A=%d,B=%d,C=%d,L=%d)", P->A, P->B, P->C,
478 Class->ProtoLengths[ProtoId]);
479 }
480} /* ConvertProto */
#define ProtoForProtoId(C, P)
Definition: intproto.h:148
#define GetPicoFeatureLength()
Definition: picofeat.h:56

◆ CreateIntTemplates()

INT_TEMPLATES_STRUCT * tesseract::Classify::CreateIntTemplates ( CLASSES  FloatProtos,
const UNICHARSET &  target_unicharset 
)

This routine converts from the old floating point format to the new integer format.

Parameters
FloatProtos – prototypes in old floating pt format
target_unicharset – the UNICHARSET to use
Returns
New set of training templates in integer format.
Note
Globals: none

Definition at line 490 of file intproto.cpp.

491 {
492 CLASS_TYPE FClass;
493 INT_CLASS_STRUCT *IClass;
494 int ProtoId;
495 int ConfigId;
496
497 auto IntTemplates = new INT_TEMPLATES_STRUCT;
498
499 for (unsigned ClassId = 0; ClassId < target_unicharset.size(); ClassId++) {
500 FClass = &(FloatProtos[ClassId]);
501 if (FClass->NumProtos == 0 && FClass->NumConfigs == 0 &&
502 strcmp(target_unicharset.id_to_unichar(ClassId), " ") != 0) {
503 tprintf("Warning: no protos/configs for %s in CreateIntTemplates()\n",
504 target_unicharset.id_to_unichar(ClassId));
505 }
506 assert(UnusedClassIdIn(IntTemplates, ClassId));
507 IClass = new INT_CLASS_STRUCT(FClass->NumProtos, FClass->NumConfigs);
508 unsigned fs_size = FClass->font_set.size();
509 FontSet fs;
510 fs.reserve(fs_size);
511 for (unsigned i = 0; i < fs_size; ++i) {
512 fs.push_back(FClass->font_set[i]);
513 }
514 IClass->font_set_id = this->fontset_table_.push_back(fs);
515 AddIntClass(IntTemplates, ClassId, IClass);
516
517 for (ProtoId = 0; ProtoId < FClass->NumProtos; ProtoId++) {
518 AddIntProto(IClass);
519 ConvertProto(ProtoIn(FClass, ProtoId), ProtoId, IClass);
520 AddProtoToProtoPruner(ProtoIn(FClass, ProtoId), ProtoId, IClass,
521 classify_learning_debug_level >= 2);
522 AddProtoToClassPruner(ProtoIn(FClass, ProtoId), ClassId, IntTemplates);
523 }
524
525 for (ConfigId = 0; ConfigId < FClass->NumConfigs; ConfigId++) {
526 AddIntConfig(IClass);
527 ConvertConfig(FClass->Configurations[ConfigId], ConfigId, IClass);
528 }
529 }
530 return (IntTemplates);
531} /* CreateIntTemplates */
#define UnusedClassIdIn(T, c)
Definition: intproto.h:155
#define ProtoIn(Class, Pid)
Definition: protos.h:70
void AddIntClass(INT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, INT_CLASS_STRUCT *Class)
Definition: intproto.cpp:220
void AddProtoToProtoPruner(PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class, bool debug)
Definition: intproto.cpp:344
void ConvertConfig(BIT_VECTOR Config, int ConfigId, INT_CLASS_STRUCT *Class)
Definition: intproto.cpp:430
CLASS_STRUCT * CLASS_TYPE
Definition: protos.h:49
void AddProtoToClassPruner(PROTO_STRUCT *Proto, CLASS_ID ClassId, INT_TEMPLATES_STRUCT *Templates)
Definition: intproto.cpp:306
int AddIntConfig(INT_CLASS_STRUCT *Class)
Definition: intproto.cpp:250
int AddIntProto(INT_CLASS_STRUCT *Class)
Definition: intproto.cpp:270
int push_back(T object)
Add an element in the table.
Definition: unicity_table.h:80
void ConvertProto(PROTO_STRUCT *Proto, int ProtoId, INT_CLASS_STRUCT *Class)
Definition: intproto.cpp:452

◆ DebugAdaptiveClassifier()

void tesseract::Classify::DebugAdaptiveClassifier ( TBLOB *  blob,
ADAPT_RESULTS *  Results 
)
Parameters
blob – blob whose classification is being debugged
Results – results of match being debugged

Globals: none

Definition at line 1440 of file adaptmatch.cpp.

1440 {
1441 if (static_classifier_ == nullptr) {
1442 return;
1443 }
1444 INT_FX_RESULT_STRUCT fx_info;
1445 std::vector<INT_FEATURE_STRUCT> bl_features;
1446 TrainingSample *sample = BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
1447 if (sample == nullptr) {
1448 return;
1449 }
1450 static_classifier_->DebugDisplay(*sample, blob->denorm().pix(), Results->best_unichar_id);
1451} /* DebugAdaptiveClassifier */
TrainingSample * BlobToTrainingSample(const TBLOB &blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT *fx_info, std::vector< INT_FEATURE_STRUCT > *bl_features)
Definition: intfx.cpp:79
void DebugDisplay(const TrainingSample &sample, Image page_pix, UNICHAR_ID unichar_id)

◆ DisplayAdaptedChar()

void tesseract::Classify::DisplayAdaptedChar ( TBLOB blob,
INT_CLASS_STRUCT int_class 
)

Definition at line 940 of file adaptmatch.cpp.

940 {
941 INT_FX_RESULT_STRUCT fx_info;
942 std::vector<INT_FEATURE_STRUCT> bl_features;
943 TrainingSample *sample =
944 BlobToTrainingSample(*blob, classify_nonlinear_norm, &fx_info, &bl_features);
945 if (sample == nullptr) {
946 return;
947 }
948
949 UnicharRating int_result;
950 im_.Match(int_class, AllProtosOn, AllConfigsOn, bl_features.size(), &bl_features[0], &int_result,
951 classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows);
952 tprintf("Best match to temp config %d = %4.1f%%.\n", int_result.config,
953 int_result.rating * 100.0);
954 if (classify_learning_debug_level >= 2) {
955 uint32_t ConfigMask;
956 ConfigMask = 1 << int_result.config;
958 im_.Match(int_class, AllProtosOn, static_cast<BIT_VECTOR>(&ConfigMask), bl_features.size(),
959 &bl_features[0], &int_result, classify_adapt_feature_threshold, 6 | 0x19,
960 matcher_debug_separate_windows);
962 }
963
964 delete sample;
965}
void UpdateMatchDisplay()
Definition: intproto.cpp:413

◆ DoAdaptiveMatch()

void tesseract::Classify::DoAdaptiveMatch ( TBLOB Blob,
ADAPT_RESULTS Results 
)

This routine performs an adaptive classification. If we have not yet adapted to enough classes, a simple classification to the pre-trained templates is performed. Otherwise, we match the blob against the adapted templates. If the adapted templates do not match well, we try a match against the pre-trained templates. If an adapted template match is found, we do a match to any pre-trained templates which could be ambiguous. The results from all of these classifications are merged together into Results.

Parameters
Blob: blob to be classified
Results: place to put match results

Globals:

  • PreTrainedTemplates: built-in training templates
  • AdaptedTemplates: templates adapted for this page
  • matcher_reliable_adaptive_result: rating limit for a great match

Definition at line 1474 of file adaptmatch.cpp.

1474 {
1475 UNICHAR_ID *Ambiguities;
1476
1477 INT_FX_RESULT_STRUCT fx_info;
1478 std::vector<INT_FEATURE_STRUCT> bl_features;
1479 TrainingSample *sample =
1480 BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info, &bl_features);
1481 if (sample == nullptr) {
1482 return;
1483 }
1484
1485 // TODO: With LSTM, static_classifier_ is nullptr.
1486 // Return to avoid crash in CharNormClassifier.
1487 if (static_classifier_ == nullptr) {
1488 delete sample;
1489 return;
1490 }
1491
1492 if (AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min || tess_cn_matching) {
1493 CharNormClassifier(Blob, *sample, Results);
1494 } else {
1495 Ambiguities = BaselineClassifier(Blob, bl_features, fx_info, AdaptedTemplates, Results);
1496 if ((!Results->match.empty() &&
1497 MarginalMatch(Results->best_rating, matcher_reliable_adaptive_result) &&
1498 !tess_bn_matching) ||
1499 Results->match.empty()) {
1500 CharNormClassifier(Blob, *sample, Results);
1501 } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
1502 AmbigClassifier(bl_features, fx_info, Blob, PreTrainedTemplates, AdaptedTemplates->Class,
1503 Ambiguities, Results);
1504 }
1505 }
1506
1507 // Force the blob to be classified as noise
1508 // if the results contain only fragments.
1509 // TODO(daria): verify that this is better than
1510 // just adding a nullptr classification.
1511 if (!Results->HasNonfragment || Results->match.empty()) {
1512 ClassifyAsNoise(Results);
1513 }
1514 delete sample;
1515} /* DoAdaptiveMatch */
bool MarginalMatch(float confidence, float matcher_great_threshold)
Definition: adaptmatch.cpp:142
int UNICHAR_ID
Definition: unichar.h:34
ADAPT_CLASS_STRUCT * Class[MAX_NUM_CLASSES]
Definition: adaptive.h:75
UNICHAR_ID * BaselineClassifier(TBLOB *Blob, const std::vector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_RESULTS *Results)
int CharNormClassifier(TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
void ClassifyAsNoise(ADAPT_RESULTS *Results)
void AmbigClassifier(const std::vector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)

◆ double_VAR_H() [1/18]

tesseract::Classify::double_VAR_H ( classify_adapted_pruning_factor  )

◆ double_VAR_H() [2/18]

tesseract::Classify::double_VAR_H ( classify_adapted_pruning_threshold  )

◆ double_VAR_H() [3/18]

tesseract::Classify::double_VAR_H ( classify_char_norm_range  )

◆ double_VAR_H() [4/18]

tesseract::Classify::double_VAR_H ( classify_character_fragments_garbage_certainty_threshold  )

◆ double_VAR_H() [5/18]

tesseract::Classify::double_VAR_H ( classify_max_certainty_margin  )

◆ double_VAR_H() [6/18]

tesseract::Classify::double_VAR_H ( classify_max_rating_ratio  )

◆ double_VAR_H() [7/18]

tesseract::Classify::double_VAR_H ( classify_misfit_junk_penalty  )

◆ double_VAR_H() [8/18]

tesseract::Classify::double_VAR_H ( matcher_avg_noise_size  )

◆ double_VAR_H() [9/18]

tesseract::Classify::double_VAR_H ( matcher_bad_match_pad  )

◆ double_VAR_H() [10/18]

tesseract::Classify::double_VAR_H ( matcher_clustering_max_angle_delta  )

◆ double_VAR_H() [11/18]

tesseract::Classify::double_VAR_H ( matcher_good_threshold  )

◆ double_VAR_H() [12/18]

tesseract::Classify::double_VAR_H ( matcher_perfect_threshold  )

◆ double_VAR_H() [13/18]

tesseract::Classify::double_VAR_H ( matcher_rating_margin  )

◆ double_VAR_H() [14/18]

tesseract::Classify::double_VAR_H ( matcher_reliable_adaptive_result  )

◆ double_VAR_H() [15/18]

tesseract::Classify::double_VAR_H ( rating_scale  )

◆ double_VAR_H() [16/18]

tesseract::Classify::double_VAR_H ( speckle_large_max_size  )

◆ double_VAR_H() [17/18]

tesseract::Classify::double_VAR_H ( speckle_rating_penalty  )

◆ double_VAR_H() [18/18]

tesseract::Classify::double_VAR_H ( tessedit_class_miss_scale  )

◆ EndAdaptiveClassifier()

void tesseract::Classify::EndAdaptiveClassifier ( )

This routine performs cleanup operations on the adaptive classifier. It should be called before the program is terminated. Its main function is to save the adapted templates to a file.

Globals:

Definition at line 464 of file adaptmatch.cpp.

464 {
465 std::string Filename;
466 FILE *File;
467
469 classify_save_adapted_templates) {
470 Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
471 File = fopen(Filename.c_str(), "wb");
472 if (File == nullptr) {
473 tprintf("Unable to save adapted templates to %s!\n", Filename.c_str());
474 } else {
475 tprintf("\nSaving adapted templates to %s ...", Filename.c_str());
476 fflush(stdout);
478 tprintf("\n");
479 fclose(File);
480 }
481 }
482
483 delete AdaptedTemplates;
484 AdaptedTemplates = nullptr;
486 BackupAdaptedTemplates = nullptr;
487
488 if (PreTrainedTemplates != nullptr) {
489 delete PreTrainedTemplates;
490 PreTrainedTemplates = nullptr;
491 }
494 if (AllProtosOn != nullptr) {
495 FreeBitVector(AllProtosOn);
496 FreeBitVector(AllConfigsOn);
497 FreeBitVector(AllConfigsOff);
498 FreeBitVector(TempProtoMask);
499 AllProtosOn = nullptr;
500 AllConfigsOn = nullptr;
501 AllConfigsOff = nullptr;
502 TempProtoMask = nullptr;
503 }
504 delete shape_table_;
505 shape_table_ = nullptr;
506 delete static_classifier_;
507 static_classifier_ = nullptr;
508} /* EndAdaptiveClassifier */
#define ADAPT_TEMPLATE_SUFFIX
Definition: adaptmatch.cpp:80
std::string imagefile
Definition: ccutil.h:65
BIT_VECTOR TempProtoMask
Definition: classify.h:430
void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates)
Definition: adaptive.cpp:345
BIT_VECTOR AllConfigsOff
Definition: classify.h:429
ADAPT_TEMPLATES_STRUCT * BackupAdaptedTemplates
Definition: classify.h:424
void EndDangerousAmbigs()
Definition: stopper.cpp:358

◆ ExpandShapesAndApplyCorrections()

void tesseract::Classify::ExpandShapesAndApplyCorrections ( ADAPT_CLASS_STRUCT **  classes,
bool  debug,
int  class_id,
int  bottom,
int  top,
float  cp_rating,
int  blob_length,
int  matcher_multiplier,
const uint8_t *  cn_factors,
UnicharRating int_result,
ADAPT_RESULTS final_results 
)

Definition at line 1102 of file adaptmatch.cpp.

1106 {
1107 if (classes != nullptr) {
1108 // Adapted result. Convert configs to fontinfo_ids.
1109 int_result->adapted = true;
1110 for (auto &font : int_result->fonts) {
1111 font.fontinfo_id = GetFontinfoId(classes[class_id], font.fontinfo_id);
1112 }
1113 } else {
1114 // Pre-trained result. Map fonts using font_sets_.
1115 int_result->adapted = false;
1116 for (auto &font : int_result->fonts) {
1117 font.fontinfo_id = ClassAndConfigIDToFontOrShapeID(class_id, font.fontinfo_id);
1118 }
1119 if (shape_table_ != nullptr) {
1120 // Two possible cases:
1121 // 1. Flat shapetable. All unichar-ids of the shapes referenced by
1122 // int_result->fonts are the same. In this case build a new vector of
1123 // mapped fonts and replace the fonts in int_result.
1124 // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
1125 // by int_result. In this case, build a vector of UnicharRating to
1126 // gather together different font-ids for each unichar. Also covers case1.
1127 std::vector<UnicharRating> mapped_results;
1128 for (auto &f : int_result->fonts) {
1129 int shape_id = f.fontinfo_id;
1130 const Shape &shape = shape_table_->GetShape(shape_id);
1131 for (int c = 0; c < shape.size(); ++c) {
1132 int unichar_id = shape[c].unichar_id;
1133 if (!unicharset.get_enabled(unichar_id)) {
1134 continue;
1135 }
1136 // Find the mapped_result for unichar_id.
1137 unsigned r = 0;
1138 for (r = 0; r < mapped_results.size() && mapped_results[r].unichar_id != unichar_id;
1139 ++r) {
1140 }
1141 if (r == mapped_results.size()) {
1142 mapped_results.push_back(*int_result);
1143 mapped_results[r].unichar_id = unichar_id;
1144 mapped_results[r].fonts.clear();
1145 }
1146 for (int font_id : shape[c].font_ids) {
1147 mapped_results[r].fonts.emplace_back(font_id, f.score);
1148 }
1149 }
1150 }
1151 for (auto &m : mapped_results) {
1152 m.rating = ComputeCorrectedRating(debug, m.unichar_id, cp_rating, int_result->rating,
1153 int_result->feature_misses, bottom, top, blob_length,
1154 matcher_multiplier, cn_factors);
1155 AddNewResult(m, final_results);
1156 }
1157 return;
1158 }
1159 }
1160 if (unicharset.get_enabled(class_id)) {
1161 int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating, int_result->rating,
1162 int_result->feature_misses, bottom, top,
1163 blob_length, matcher_multiplier, cn_factors);
1164 AddNewResult(*int_result, final_results);
1165 }
1166}
bool get_enabled(UNICHAR_ID unichar_id) const
Definition: unicharset.h:911
double ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors)

◆ ExtractFeatures()

void tesseract::Classify::ExtractFeatures ( const TBLOB blob,
bool  nonlinear_norm,
std::vector< INT_FEATURE_STRUCT > *  bl_features,
std::vector< INT_FEATURE_STRUCT > *  cn_features,
INT_FX_RESULT_STRUCT results,
std::vector< int > *  outline_cn_counts 
)
static

Definition at line 436 of file intfx.cpp.

440 {
441 DENORM bl_denorm, cn_denorm;
442 tesseract::Classify::SetupBLCNDenorms(blob, nonlinear_norm, &bl_denorm, &cn_denorm, results);
443 if (outline_cn_counts != nullptr) {
444 outline_cn_counts->clear();
445 }
446 // Iterate the outlines.
447 for (TESSLINE *ol = blob.outlines; ol != nullptr; ol = ol->next) {
448 // Iterate the polygon.
449 EDGEPT *loop_pt = ol->FindBestStartPt();
450 EDGEPT *pt = loop_pt;
451 if (pt == nullptr) {
452 continue;
453 }
454 do {
455 if (pt->IsHidden()) {
456 continue;
457 }
458 // Find a run of equal src_outline.
459 EDGEPT *last_pt = pt;
460 do {
461 last_pt = last_pt->next;
462 } while (last_pt != loop_pt && !last_pt->IsHidden() &&
463 last_pt->src_outline == pt->src_outline);
464 last_pt = last_pt->prev;
465 // Until the adaptive classifier can be weaned off polygon segments,
466 // we have to force extraction from the polygon for the bl_features.
467 ExtractFeaturesFromRun(pt, last_pt, bl_denorm, kStandardFeatureLength, true, bl_features);
468 ExtractFeaturesFromRun(pt, last_pt, cn_denorm, kStandardFeatureLength, false, cn_features);
469 pt = last_pt;
470 } while ((pt = pt->next) != loop_pt);
471 if (outline_cn_counts != nullptr) {
472 outline_cn_counts->push_back(cn_features->size());
473 }
474 }
475 results->NumBL = bl_features->size();
476 results->NumCN = cn_features->size();
477 results->YBottom = blob.bounding_box().bottom();
478 results->YTop = blob.bounding_box().top();
479 results->Width = blob.bounding_box().width();
480}
static void SetupBLCNDenorms(const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
Definition: intfx.cpp:129

◆ ExtractIntCNFeatures()

FEATURE_SET tesseract::Classify::ExtractIntCNFeatures ( const TBLOB blob,
const INT_FX_RESULT_STRUCT fx_info 
)
Parameters
blob: blob to extract features from
fx_info: INT_FX_RESULT_STRUCT that is copied into a local before being passed to BlobToTrainingSample
Returns
Integer character-normalized features for blob.

Definition at line 204 of file picofeat.cpp.

204 {
205 INT_FX_RESULT_STRUCT local_fx_info(fx_info);
206 std::vector<INT_FEATURE_STRUCT> bl_features;
208 tesseract::BlobToTrainingSample(blob, false, &local_fx_info, &bl_features);
209 if (sample == nullptr) {
210 return nullptr;
211 }
212
213 uint32_t num_features = sample->num_features();
214 const INT_FEATURE_STRUCT *features = sample->features();
215 auto feature_set = new FEATURE_SET_STRUCT(num_features);
216 for (uint32_t f = 0; f < num_features; ++f) {
217 auto feature = new FEATURE_STRUCT(&IntFeatDesc);
218 feature->Params[IntX] = features[f].X;
219 feature->Params[IntY] = features[f].Y;
220 feature->Params[IntDir] = features[f].Theta;
221 AddFeature(feature_set, feature);
222 }
223 delete sample;
224
225 return feature_set;
226} /* ExtractIntCNFeatures */
const FEATURE_DESC_STRUCT IntFeatDesc
bool AddFeature(FEATURE_SET FeatureSet, FEATURE Feature)
Definition: ocrfeatures.cpp:40
@ IntDir
Definition: picofeat.h:31
const INT_FEATURE_STRUCT * features() const
uint32_t num_features() const

◆ ExtractIntGeoFeatures()

FEATURE_SET tesseract::Classify::ExtractIntGeoFeatures ( const TBLOB blob,
const INT_FX_RESULT_STRUCT fx_info 
)
Parameters
blob: blob to extract features from
fx_info: INT_FX_RESULT_STRUCT that is copied into a local before being passed to BlobToTrainingSample
Returns
Geometric (top/bottom/width) features for blob.

Definition at line 234 of file picofeat.cpp.

235 {
236 INT_FX_RESULT_STRUCT local_fx_info(fx_info);
237 std::vector<INT_FEATURE_STRUCT> bl_features;
239 tesseract::BlobToTrainingSample(blob, false, &local_fx_info, &bl_features);
240 if (sample == nullptr) {
241 return nullptr;
242 }
243
244 auto feature_set = new FEATURE_SET_STRUCT(1);
245 auto feature = new FEATURE_STRUCT(&IntFeatDesc);
246
247 feature->Params[GeoBottom] = sample->geo_feature(GeoBottom);
248 feature->Params[GeoTop] = sample->geo_feature(GeoTop);
249 feature->Params[GeoWidth] = sample->geo_feature(GeoWidth);
250 AddFeature(feature_set, feature);
251 delete sample;
252
253 return feature_set;
254} /* ExtractIntGeoFeatures */
@ GeoWidth
Definition: picofeat.h:38
int geo_feature(int index) const

◆ ExtractOutlineFeatures()

FEATURE_SET tesseract::Classify::ExtractOutlineFeatures ( TBLOB Blob)

Convert each segment in the outline to a feature and return the features.

Parameters
Blob: blob to extract outline-features from
Returns
Outline-features for Blob.
Note
Globals: none

Definition at line 40 of file outfeat.cpp.

40 {
41 auto FeatureSet = new FEATURE_SET_STRUCT(MAX_OUTLINE_FEATURES);
42 if (Blob == nullptr) {
43 return (FeatureSet);
44 }
45
46 auto Outlines = ConvertBlob(Blob);
47
48 float XScale, YScale;
49 NormalizeOutlines(Outlines, &XScale, &YScale);
50 auto RemainingOutlines = Outlines;
51 iterate(RemainingOutlines) {
52 auto Outline = static_cast<MFOUTLINE>(RemainingOutlines->first_node());
53 ConvertToOutlineFeatures(Outline, FeatureSet);
54 }
55 if (classify_norm_method == baseline) {
56 NormalizeOutlineX(FeatureSet);
57 }
58 FreeOutlines(Outlines);
59 return (FeatureSet);
60} /* ExtractOutlineFeatures */
#define MAX_OUTLINE_FEATURES
Definition: outfeat.h:34
LIST MFOUTLINE
Definition: mfoutline.h:28
void FreeOutlines(LIST Outlines)
Definition: mfoutline.cpp:151
void NormalizeOutlineX(FEATURE_SET FeatureSet)
Definition: outfeat.cpp:134
void ConvertToOutlineFeatures(MFOUTLINE Outline, FEATURE_SET FeatureSet)
Definition: outfeat.cpp:97
LIST ConvertBlob(TBLOB *blob)
Definition: mfoutline.cpp:34
void NormalizeOutlines(LIST Outlines, float *XScale, float *YScale)
Definition: mfoutline.cpp:249

◆ ExtractPicoFeatures()

FEATURE_SET tesseract::Classify::ExtractPicoFeatures ( TBLOB Blob)

Operation: Dummy for now.

Globals:

  • classify_norm_method: normalization method currently specified

Parameters
Blob: blob to extract pico-features from
Returns
Pico-features for Blob.

Definition at line 60 of file picofeat.cpp.

60 {
61 auto FeatureSet = new FEATURE_SET_STRUCT(MAX_PICO_FEATURES);
62 auto Outlines = ConvertBlob(Blob);
63 float XScale, YScale;
64 NormalizeOutlines(Outlines, &XScale, &YScale);
65 auto RemainingOutlines = Outlines;
66 iterate(RemainingOutlines) {
67 auto Outline = static_cast<MFOUTLINE>(RemainingOutlines->first_node());
68 ConvertToPicoFeatures2(Outline, FeatureSet);
69 }
70 if (classify_norm_method == baseline) {
71 NormalizePicoX(FeatureSet);
72 }
73 FreeOutlines(Outlines);
74 return (FeatureSet);
75
76} /* ExtractPicoFeatures */
#define MAX_PICO_FEATURES
Definition: picofeat.h:45
void ConvertToPicoFeatures2(MFOUTLINE Outline, FEATURE_SET FeatureSet)
Definition: picofeat.cpp:144
void NormalizePicoX(FEATURE_SET FeatureSet)
Definition: picofeat.cpp:181

◆ FreeNormProtos()

void tesseract::Classify::FreeNormProtos ( )

Definition at line 154 of file normmatch.cpp.

154 {
155 if (NormProtos != nullptr) {
156 for (int i = 0; i < NormProtos->NumProtos; i++) {
158 }
159 delete[] NormProtos->ParamDesc;
160 delete NormProtos;
161 NormProtos = nullptr;
162 }
163}
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:1597
PARAM_DESC * ParamDesc
Definition: normmatch.cpp:40

◆ get_fontinfo_table() [1/2]

UnicityTable< FontInfo > & tesseract::Classify::get_fontinfo_table ( )
inline

Definition at line 324 of file classify.h.

324 {
325 return fontinfo_table_;
326 }

◆ get_fontinfo_table() [2/2]

const UnicityTable< FontInfo > & tesseract::Classify::get_fontinfo_table ( ) const
inline

Definition at line 327 of file classify.h.

327 {
328 return fontinfo_table_;
329 }

◆ get_fontset_table()

UnicityTable< FontSet > & tesseract::Classify::get_fontset_table ( )
inline

Definition at line 330 of file classify.h.

330 {
331 return fontset_table_;
332 }

◆ GetAdaptiveFeatures()

int tesseract::Classify::GetAdaptiveFeatures ( TBLOB Blob,
INT_FEATURE_ARRAY  IntFeatures,
FEATURE_SET FloatFeatures 
)

This routine sets up the feature extractor to extract baseline normalized pico-features.

The extracted pico-features are converted to integer form and placed in IntFeatures. The original floating-pt. features are returned in FloatFeatures.

Globals: none

Parameters
Blob: blob to extract features from
[out] IntFeatures: array to fill with integer features
[out] FloatFeatures: place to return actual floating-pt features
Returns
Number of pico-features returned (0 if an error occurred)

Definition at line 778 of file adaptmatch.cpp.

779 {
780 FEATURE_SET Features;
781 int NumFeatures;
782
783 classify_norm_method.set_value(baseline);
784 Features = ExtractPicoFeatures(Blob);
785
786 NumFeatures = Features->NumFeatures;
787 if (NumFeatures == 0 || NumFeatures > UNLIKELY_NUM_FEAT) {
788 delete Features;
789 return 0;
790 }
791
792 ComputeIntFeatures(Features, IntFeatures);
793 *FloatFeatures = Features;
794
795 return NumFeatures;
796} /* GetAdaptiveFeatures */
#define UNLIKELY_NUM_FEAT
Definition: adaptmatch.cpp:83
FEATURE_SET ExtractPicoFeatures(TBLOB *Blob)
Definition: picofeat.cpp:60
void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
Definition: float2int.cpp:85

◆ GetAmbiguities()

UNICHAR_ID * tesseract::Classify::GetAmbiguities ( TBLOB Blob,
CLASS_ID  CorrectClass 
)

This routine matches blob to the built-in templates to find out if there are any classes other than the correct class which are potential ambiguities.

Parameters
Blob: blob to get classification ambiguities for
CorrectClass: correct class for Blob

Globals:

  • CurrentRatings used by qsort compare routine
  • PreTrainedTemplates built-in templates
Returns
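Array of UNICHAR_IDs, allocated with new[] and terminated by -1, containing all possible ambiguous classes; the array holds only the -1 terminator when the correct class is the only match or there are no matches. Returns nullptr if no training sample could be built from Blob.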

Definition at line 1532 of file adaptmatch.cpp.

1532 {
1533 auto *Results = new ADAPT_RESULTS();
1534 UNICHAR_ID *Ambiguities;
1535
1536 Results->Initialize();
1537 INT_FX_RESULT_STRUCT fx_info;
1538 std::vector<INT_FEATURE_STRUCT> bl_features;
1539 TrainingSample *sample =
1540 BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info, &bl_features);
1541 if (sample == nullptr) {
1542 delete Results;
1543 return nullptr;
1544 }
1545
1546 CharNormClassifier(Blob, *sample, Results);
1547 delete sample;
1548 RemoveBadMatches(Results);
1549 std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating);
1550
1551 /* copy the class id's into an string of ambiguities - don't copy if
1552 the correct class is the only class id matched */
1553 Ambiguities = new UNICHAR_ID[Results->match.size() + 1];
1554 if (Results->match.size() > 1 ||
1555 (Results->match.size() == 1 && Results->match[0].unichar_id != CorrectClass)) {
1556 unsigned i;
1557 for (i = 0; i < Results->match.size(); i++) {
1558 Ambiguities[i] = Results->match[i].unichar_id;
1559 }
1560 Ambiguities[i] = -1;
1561 } else {
1562 Ambiguities[0] = -1;
1563 }
1564
1565 delete Results;
1566 return Ambiguities;
1567} /* GetAmbiguities */

◆ GetCharNormFeature()

int tesseract::Classify::GetCharNormFeature ( const INT_FX_RESULT_STRUCT fx_info,
INT_TEMPLATES_STRUCT templates,
uint8_t *  pruner_norm_array,
uint8_t *  char_norm_array 
)

This routine calls the integer (Hardware) feature extractor if it has not been called before for this blob.

The results from the feature extractor are placed into globals so that they can be used in other routines without re-extracting the features.

It then copies the char norm features into the IntFeatures array provided by the caller.

Parameters
templates: used to compute char norm adjustments
pruner_norm_array: array of factors from blob normalization process
char_norm_array: array to fill with dummy char norm adjustments
fx_info: supplies the Ymean, Length, Rx and Ry values used to build the char norm feature

Globals:

Returns
Number of features extracted or 0 if an error occurred.

Definition at line 1613 of file adaptmatch.cpp.

1614 {
1615 auto norm_feature = new FEATURE_STRUCT(&CharNormDesc);
1617 float scale = MF_SCALE_FACTOR;
1618 norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;
1619 norm_feature->Params[CharNormLength] = fx_info.Length * scale / LENGTH_COMPRESSION;
1620 norm_feature->Params[CharNormRx] = fx_info.Rx * scale;
1621 norm_feature->Params[CharNormRy] = fx_info.Ry * scale;
1622 // Deletes norm_feature.
1623 ComputeCharNormArrays(norm_feature, templates, char_norm_array, pruner_norm_array);
1624 return IntCastRounded(fx_info.Length / kStandardFeatureLength);
1625} /* GetCharNormFeature */
#define LENGTH_COMPRESSION
Definition: normfeat.h:26
const float MF_SCALE_FACTOR
Definition: mfoutline.h:61
const FEATURE_DESC_STRUCT CharNormDesc
const int kBlnBaselineOffset
Definition: normalis.h:34

◆ GetClassToDebug()

CLASS_ID tesseract::Classify::GetClassToDebug ( const char *  Prompt,
bool *  adaptive_on,
bool *  pretrained_on,
int *  shape_id 
)

This routine prompts the user with Prompt and waits for the user to enter something in the debug window.

Parameters
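Prompt: prompt to print while waiting for input from window
adaptive_on: [out] set when a popup entry is accepted: false for a shape-index or static command, true otherwise
pretrained_on: [out] set when a popup entry is accepted: false for an adaptive command, true otherwise
shape_id: [out] the shape index entered for a shape-index command; -1 for an adaptive command or when no shape table is loaded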
Returns
Character entered in the debug window.
Note
Globals: none

Definition at line 1165 of file intproto.cpp.

1166 {
1167 tprintf("%s\n", Prompt);
1168 SVEventType ev_type;
1169 int unichar_id = INVALID_UNICHAR_ID;
1170 // Wait until a click or popup event.
1171 do {
1172 auto ev = IntMatchWindow->AwaitEvent(SVET_ANY);
1173 ev_type = ev->type;
1174 if (ev_type == SVET_POPUP) {
1175 if (ev->command_id == IDA_SHAPE_INDEX) {
1176 if (shape_table_ != nullptr) {
1177 *shape_id = atoi(ev->parameter);
1178 *adaptive_on = false;
1179 *pretrained_on = true;
1180 if (*shape_id >= 0 && static_cast<unsigned>(*shape_id) < shape_table_->NumShapes()) {
1181 int font_id;
1182 shape_table_->GetFirstUnicharAndFont(*shape_id, &unichar_id, &font_id);
1183 tprintf("Shape %d, first unichar=%d, font=%d\n", *shape_id, unichar_id, font_id);
1184 return unichar_id;
1185 }
1186 tprintf("Shape index '%s' not found in shape table\n", ev->parameter);
1187 } else {
1188 tprintf("No shape table loaded!\n");
1189 }
1190 } else {
1191 if (unicharset.contains_unichar(ev->parameter)) {
1192 unichar_id = unicharset.unichar_to_id(ev->parameter);
1193 if (ev->command_id == IDA_ADAPTIVE) {
1194 *adaptive_on = true;
1195 *pretrained_on = false;
1196 *shape_id = -1;
1197 } else if (ev->command_id == IDA_STATIC) {
1198 *adaptive_on = false;
1199 *pretrained_on = true;
1200 } else {
1201 *adaptive_on = true;
1202 *pretrained_on = true;
1203 }
1204 if (ev->command_id == IDA_ADAPTIVE || shape_table_ == nullptr) {
1205 *shape_id = -1;
1206 return unichar_id;
1207 }
1208 for (unsigned s = 0; s < shape_table_->NumShapes(); ++s) {
1209 if (shape_table_->GetShape(s).ContainsUnichar(unichar_id)) {
1210 tprintf("%s\n", shape_table_->DebugStr(s).c_str());
1211 }
1212 }
1213 } else {
1214 tprintf("Char class '%s' not found in unicharset", ev->parameter);
1215 }
1216 }
1217 }
1218 } while (ev_type != SVET_CLICK);
1219 return 0;
1220} /* GetClassToDebug */
@ SVET_POPUP
Definition: scrollview.h:62
@ SVET_CLICK
Definition: scrollview.h:56
@ IDA_SHAPE_INDEX
Definition: intproto.h:139
@ IDA_ADAPTIVE
Definition: intproto.h:139
@ IDA_STATIC
Definition: intproto.h:139
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:150
unsigned NumShapes() const
Definition: shapetable.h:248
void GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:420
std::unique_ptr< SVEvent > AwaitEvent(SVEventType type)
Definition: scrollview.cpp:432

◆ getDict()

virtual Dict & tesseract::Classify::getDict ( )
inlinevirtual

Reimplemented in tesseract::Tesseract.

Definition at line 98 of file classify.h.

98 {
99 return dict_;
100 }

◆ GetFontinfoId()

int tesseract::Classify::GetFontinfoId ( ADAPT_CLASS_STRUCT Class,
uint8_t  ConfigId 
)

Definition at line 118 of file adaptive.cpp.

118 {
119 return (ConfigIsPermanent(Class, ConfigId) ? PermConfigFor(Class, ConfigId)->FontinfoId
120 : TempConfigFor(Class, ConfigId)->FontinfoId);
121}
#define PermConfigFor(Class, ConfigId)
Definition: adaptive.h:93

◆ InitAdaptedClass()

void tesseract::Classify::InitAdaptedClass ( TBLOB Blob,
CLASS_ID  ClassId,
int  FontinfoId,
ADAPT_CLASS_STRUCT Class,
ADAPT_TEMPLATES_STRUCT Templates 
)

This routine creates a new adapted class and uses Blob as the model for the first config in that class.

Parameters
Blob: blob to model new class after
ClassId: id of the class to be initialized
FontinfoId: font information inferred from pre-trained templates
Class: adapted class to be initialized
Templates: adapted templates to add new class to

Globals:

Definition at line 686 of file adaptmatch.cpp.

687 {
688 FEATURE_SET Features;
689 int Fid, Pid;
690 FEATURE Feature;
691 int NumFeatures;
692 PROTO_STRUCT *Proto;
693 INT_CLASS_STRUCT *IClass;
694 TEMP_CONFIG_STRUCT *Config;
695
696 classify_norm_method.set_value(baseline);
697 Features = ExtractOutlineFeatures(Blob);
698 NumFeatures = Features->NumFeatures;
699 if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
700 delete Features;
701 return;
702 }
703
704 Config = new TEMP_CONFIG_STRUCT(NumFeatures - 1, FontinfoId);
705 TempConfigFor(Class, 0) = Config;
706
707 /* this is a kludge to construct cutoffs for adapted templates */
708 if (Templates == AdaptedTemplates) {
709 BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
710 }
711
712 IClass = ClassForClassId(Templates->Templates, ClassId);
713
714 for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
715 Pid = AddIntProto(IClass);
716 assert(Pid != NO_PROTO);
717
718 Feature = Features->Features[Fid];
719 auto TempProto = new TEMP_PROTO_STRUCT;
720 Proto = &(TempProto->Proto);
721
722 /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
723 ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
724 instead of the -0.25 to 0.75 used in baseline normalization */
725 Proto->Angle = Feature->Params[OutlineFeatDir];
726 Proto->X = Feature->Params[OutlineFeatX];
727 Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
728 Proto->Length = Feature->Params[OutlineFeatLength];
729 FillABC(Proto);
730
731 TempProto->ProtoId = Pid;
732 SET_BIT(Config->Protos, Pid);
733
734 ConvertProto(Proto, Pid, IClass);
735 AddProtoToProtoPruner(Proto, Pid, IClass, classify_learning_debug_level >= 2);
736
737 Class->TempProtos = push(Class->TempProtos, TempProto);
738 }
739 delete Features;
740
741 AddIntConfig(IClass);
742 ConvertConfig(AllProtosOn, 0, IClass);
743
744 if (classify_learning_debug_level >= 1) {
745 tprintf("Added new class '%s' with class id %d and %d protos.\n",
746 unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
747#ifndef GRAPHICS_DISABLED
748 if (classify_learning_debug_level > 1) {
749 DisplayAdaptedChar(Blob, IClass);
750 }
751#endif
752 }
753
754 if (IsEmptyAdaptedClass(Class)) {
755 (Templates->NumNonEmptyClasses)++;
756 }
757} /* InitAdaptedClass */
#define NO_PROTO
Definition: matchdefs.h:41
#define Y_DIM_OFFSET
Definition: adaptmatch.cpp:89
@ OutlineFeatLength
Definition: outfeat.h:30
@ OutlineFeatY
Definition: outfeat.h:29
@ OutlineFeatX
Definition: outfeat.h:28
@ OutlineFeatDir
Definition: outfeat.h:31
CLUSTERCONFIG Config
LIST push(LIST list, void *element)
Definition: oldlist.cpp:178
void FillABC(PROTO_STRUCT *Proto)
Definition: protos.cpp:103
FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob)
Definition: outfeat.cpp:40

◆ InitAdaptiveClassifier()

void tesseract::Classify::InitAdaptiveClassifier ( TessdataManager mgr)

This routine reads in the training information needed by the adaptive classifier and saves it into global variables.

Parameters: load_pre_trained_templates - indicates whether the pre-trained templates (inttemp, normproto and pffmtable components) should be loaded. Should only be set to true if the necessary classifier components are present in the [lang].traineddata file. (Note: this name does not appear in the signature above, which takes a TessdataManager mgr; in the listing below these components are read when language_data_path_prefix is non-empty and mgr is not nullptr.)

Globals: BuiltInTemplatesFile - file to get built-in templates from; BuiltInCutoffsFile - file to get the average number of features per class from; classify_use_pre_adapted_templates - enables use of pre-adapted templates.

Definition at line 527 of file adaptmatch.cpp.

527 {
529 return;
530 }
531 if (AllProtosOn != nullptr) {
532 EndAdaptiveClassifier(); // Don't leak with multiple inits.
533 }
534
535 // If there is no language_data_path_prefix, the classifier will be
536 // adaptive only.
537 if (language_data_path_prefix.length() > 0 && mgr != nullptr) {
538 TFile fp;
539 ASSERT_HOST(mgr->GetComponent(TESSDATA_INTTEMP, &fp));
541
542 if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) {
543 shape_table_ = new ShapeTable(unicharset);
544 if (!shape_table_->DeSerialize(&fp)) {
545 tprintf("Error loading shape table!\n");
546 delete shape_table_;
547 shape_table_ = nullptr;
548 }
549 }
550
551 ASSERT_HOST(mgr->GetComponent(TESSDATA_PFFMTABLE, &fp));
552 ReadNewCutoffs(&fp, CharNormCutoffs);
553
554 ASSERT_HOST(mgr->GetComponent(TESSDATA_NORMPROTO, &fp));
556 static_classifier_ = new TessClassifier(false, this);
557 }
558
560
561 AllProtosOn = NewBitVector(MAX_NUM_PROTOS);
562 AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);
563 AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);
564 TempProtoMask = NewBitVector(MAX_NUM_PROTOS);
565 set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));
566 set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));
567 zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS));
568
569 for (uint16_t &BaselineCutoff : BaselineCutoffs) {
570 BaselineCutoff = 0;
571 }
572
573 if (classify_use_pre_adapted_templates) {
574 TFile fp;
575 std::string Filename = imagefile;
576 Filename += ADAPT_TEMPLATE_SUFFIX;
577 if (!fp.Open(Filename.c_str(), nullptr)) {
578 AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset);
579 } else {
580 tprintf("\nReading pre-adapted templates from %s ...\n", Filename.c_str());
581 fflush(stdout);
583 tprintf("\n");
585
586 for (unsigned i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
587 BaselineCutoffs[i] = CharNormCutoffs[i];
588 }
589 }
590 } else {
591 delete AdaptedTemplates;
592 AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset);
593 }
594} /* InitAdaptiveClassifier */
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
@ TESSDATA_SHAPE_TABLE
void InitIntegerFX()
Definition: intfx.cpp:54
std::string language_data_path_prefix
Definition: ccutil.h:60
INT_TEMPLATES_STRUCT * Templates
Definition: adaptive.h:72
INT_TEMPLATES_STRUCT * ReadIntTemplates(TFile *fp)
Definition: intproto.cpp:629
ADAPT_TEMPLATES_STRUCT * ReadAdaptedTemplates(TFile *File)
Definition: adaptive.cpp:235
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:173
void ReadNewCutoffs(TFile *fp, uint16_t *Cutoffs)
Definition: cutoffs.cpp:41
void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES_STRUCT *Templates)
Definition: adaptive.cpp:153
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:255

◆ INT_VAR_H() [1/14]

tesseract::Classify::INT_VAR_H ( classify_adapt_feature_threshold  )

◆ INT_VAR_H() [2/14]

tesseract::Classify::INT_VAR_H ( classify_adapt_proto_threshold  )

◆ INT_VAR_H() [3/14]

tesseract::Classify::INT_VAR_H ( classify_class_pruner_multiplier  )

◆ INT_VAR_H() [4/14]

tesseract::Classify::INT_VAR_H ( classify_class_pruner_threshold  )

◆ INT_VAR_H() [5/14]

tesseract::Classify::INT_VAR_H ( classify_cp_cutoff_strength  )

◆ INT_VAR_H() [6/14]

tesseract::Classify::INT_VAR_H ( classify_debug_level  )

◆ INT_VAR_H() [7/14]

tesseract::Classify::INT_VAR_H ( classify_integer_matcher_multiplier  )

◆ INT_VAR_H() [8/14]

tesseract::Classify::INT_VAR_H ( classify_learning_debug_level  )

◆ INT_VAR_H() [9/14]

tesseract::Classify::INT_VAR_H ( classify_norm_method  )

◆ INT_VAR_H() [10/14]

tesseract::Classify::INT_VAR_H ( matcher_debug_flags  )

◆ INT_VAR_H() [11/14]

tesseract::Classify::INT_VAR_H ( matcher_debug_level  )

◆ INT_VAR_H() [12/14]

tesseract::Classify::INT_VAR_H ( matcher_min_examples_for_prototyping  )

◆ INT_VAR_H() [13/14]

tesseract::Classify::INT_VAR_H ( matcher_permanent_classes_min  )

◆ INT_VAR_H() [14/14]

tesseract::Classify::INT_VAR_H ( matcher_sufficient_examples_for_prototyping  )

◆ LargeSpeckle()

bool tesseract::Classify::LargeSpeckle ( const TBLOB blob)

Definition at line 190 of file classify.cpp.

190 {
191 double speckle_size = kBlnXHeight * speckle_large_max_size;
192 TBOX bbox = blob.bounding_box();
193 return bbox.width() < speckle_size && bbox.height() < speckle_size;
194}
const int kBlnXHeight
Definition: normalis.h:33

◆ LearnBlob()

void tesseract::Classify::LearnBlob ( const std::string &  fontname,
TBLOB Blob,
const DENORM cn_denorm,
const INT_FX_RESULT_STRUCT fx_info,
const char *  blob_text 
)

Definition at line 35 of file blobclass.cpp.

36 {
37 std::unique_ptr<CHAR_DESC_STRUCT> CharDesc(new CHAR_DESC_STRUCT(feature_defs_));
38 CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
39 CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
40 CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
41 CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);
42
43 if (ValidCharDescription(feature_defs_, CharDesc.get())) {
44 // Label the features with a class name and font name.
45 tr_file_data_ += "\n";
46 tr_file_data_ += fontname;
47 tr_file_data_ += " ";
48 tr_file_data_ += blob_text;
49 tr_file_data_ += "\n";
50
51 // write micro-features to file and clean up
52 WriteCharDescription(feature_defs_, CharDesc.get(), tr_file_data_);
53 } else {
54 tprintf("Blob learned was invalid!\n");
55 }
56} // LearnBlob
bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC_STRUCT *CharDesc)
Definition: featdefs.cpp:131
void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC_STRUCT *CharDesc, std::string &str)
Definition: featdefs.cpp:109
FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM &cn_denorm)
Definition: mf.cpp:41
FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT &fx_info)
Definition: normfeat.cpp:56
FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:204
FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:234

◆ LearnPieces()

void tesseract::Classify::LearnPieces ( const char *  fontname,
int  start,
int  length,
float  threshold,
CharSegmentationType  segmentation,
const char *  correct_text,
WERD_RES word 
)

Definition at line 385 of file adaptmatch.cpp.

387 {
388 // TODO(daria) Remove/modify this if/when we want
389 // to train and/or adapt to n-grams.
390 if (segmentation != CST_WHOLE && (segmentation != CST_FRAGMENT || disable_character_fragments)) {
391 return;
392 }
393
394 if (length > 1) {
395 SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start, start + length - 1);
396 }
397 TBLOB *blob = word->chopped_word->blobs[start];
398 // Rotate the blob if needed for classification.
399 TBLOB *rotated_blob = blob->ClassifyNormalizeIfNeeded();
400 if (rotated_blob == nullptr) {
401 rotated_blob = blob;
402 }
403
404#ifndef GRAPHICS_DISABLED
405 // Draw debug windows showing the blob that is being learned if needed.
406 if (strcmp(classify_learn_debug_str.c_str(), correct_text) == 0) {
407 RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600, word->chopped_word->bounding_box());
408 rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
409 learn_debug_win_->Update();
410 learn_debug_win_->Wait();
411 }
412 if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
413 ASSERT_HOST(learn_fragments_debug_win_ != nullptr); // set up in LearnWord
414 blob->plot(learn_fragments_debug_win_, ScrollView::BLUE, ScrollView::BROWN);
415 learn_fragments_debug_win_->Update();
416 }
417#endif // !GRAPHICS_DISABLED
418
419 if (fontname != nullptr) {
420 classify_norm_method.set_value(character); // force char norm spc 30/11/93
421 tess_bn_matching.set_value(false); // turn it off
422 tess_cn_matching.set_value(false);
423 DENORM bl_denorm, cn_denorm;
424 INT_FX_RESULT_STRUCT fx_info;
425 SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm, &bl_denorm, &cn_denorm, &fx_info);
426 LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
427 } else if (unicharset.contains_unichar(correct_text)) {
428 UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
429 int font_id = word->fontinfo != nullptr ? fontinfo_table_.get_index(*word->fontinfo) : 0;
430 if (classify_learning_debug_level >= 1) {
431 tprintf("Adapting to char = %s, thr= %g font_id= %d\n", unicharset.id_to_unichar(class_id),
432 threshold, font_id);
433 }
434 // If filename is not nullptr we are doing recognition
435 // (as opposed to training), so we must have already set word fonts.
436 AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates);
437 if (BackupAdaptedTemplates != nullptr) {
438 // Adapt the backup templates too. They will be used if the primary gets
439 // too full.
440 AdaptToChar(rotated_blob, class_id, font_id, threshold, BackupAdaptedTemplates);
441 }
442 } else if (classify_debug_level >= 1) {
443 tprintf("Can't adapt to %s not in unicharset\n", correct_text);
444 }
445 if (rotated_blob != blob) {
446 delete rotated_blob;
447 }
448
449 SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start, start + length - 1);
450} // LearnPieces.
@ CST_WHOLE
Definition: classify.h:89
@ CST_FRAGMENT
Definition: classify.h:88
static void JoinPieces(const std::vector< SEAM * > &seams, const std::vector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:204
static void BreakPieces(const std::vector< SEAM * > &seams, const std::vector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:181
void LearnBlob(const std::string &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:35
void RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
Definition: adaptmatch.cpp:240
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES_STRUCT *adaptive_templates)
Definition: adaptmatch.cpp:843
static void Update()
Definition: scrollview.cpp:700

◆ LearnWord()

void tesseract::Classify::LearnWord ( const char *  fontname,
WERD_RES word 
)

Definition at line 262 of file adaptmatch.cpp.

262 {
263 int word_len = word->correct_text.size();
264 if (word_len == 0) {
265 return;
266 }
267
268 float *thresholds = nullptr;
269 if (fontname == nullptr) {
270 // Adaption mode.
271 if (!EnableLearning || word->best_choice == nullptr) {
272 return; // Can't or won't adapt.
273 }
274
275 if (classify_learning_debug_level >= 1) {
276 tprintf("\n\nAdapting to word = %s\n", word->best_choice->debug_string().c_str());
277 }
278 thresholds = new float[word_len];
279 word->ComputeAdaptionThresholds(getDict().certainty_scale, matcher_perfect_threshold,
280 matcher_good_threshold, matcher_rating_margin, thresholds);
281 }
282 int start_blob = 0;
283
284#ifndef GRAPHICS_DISABLED
285 if (classify_debug_character_fragments) {
286 if (learn_fragmented_word_debug_win_ != nullptr) {
287 learn_fragmented_word_debug_win_->Wait();
288 }
289 RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
290 word->chopped_word->bounding_box());
291 RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
292 word->chopped_word->bounding_box());
293 word->chopped_word->plot(learn_fragmented_word_debug_win_);
295 }
296#endif // !GRAPHICS_DISABLED
297
298 for (int ch = 0; ch < word_len; ++ch) {
299 if (classify_debug_character_fragments) {
300 tprintf("\nLearning %s\n", word->correct_text[ch].c_str());
301 }
302 if (word->correct_text[ch].length() > 0) {
303 float threshold = thresholds != nullptr ? thresholds[ch] : 0.0f;
304
305 LearnPieces(fontname, start_blob, word->best_state[ch], threshold, CST_WHOLE,
306 word->correct_text[ch].c_str(), word);
307
308 if (word->best_state[ch] > 1 && !disable_character_fragments) {
309 // Check that the character breaks into meaningful fragments
310 // that each match a whole character with at least
311 // classify_character_fragments_garbage_certainty_threshold
312 bool garbage = false;
313 int frag;
314 for (frag = 0; frag < word->best_state[ch]; ++frag) {
315 TBLOB *frag_blob = word->chopped_word->blobs[start_blob + frag];
316 if (classify_character_fragments_garbage_certainty_threshold < 0) {
317 garbage |= LooksLikeGarbage(frag_blob);
318 }
319 }
320 // Learn the fragments.
321 if (!garbage) {
322 bool pieces_all_natural = word->PiecesAllNatural(start_blob, word->best_state[ch]);
323 if (pieces_all_natural || !prioritize_division) {
324 for (frag = 0; frag < word->best_state[ch]; ++frag) {
325 std::vector<std::string> tokens = split(word->correct_text[ch], ' ');
326
327 tokens[0] = CHAR_FRAGMENT::to_string(tokens[0].c_str(), frag, word->best_state[ch],
328 pieces_all_natural);
329
330 std::string full_string;
331 for (unsigned i = 0; i < tokens.size(); i++) {
332 full_string += tokens[i];
333 if (i != tokens.size() - 1) {
334 full_string += ' ';
335 }
336 }
337 LearnPieces(fontname, start_blob + frag, 1, threshold, CST_FRAGMENT,
338 full_string.c_str(), word);
339 }
340 }
341 }
342 }
343
344 // TODO(rays): re-enable this part of the code when we switch to the
345 // new classifier that needs to see examples of garbage.
346 /*
347if (word->best_state[ch] > 1) {
348 // If the next blob is good, make junk with the rightmost fragment.
349 if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
350 LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
351 word->best_state[ch + 1] + 1,
352 threshold, CST_IMPROPER, INVALID_UNICHAR, word);
353 }
354 // If the previous blob is good, make junk with the leftmost fragment.
355 if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
356 LearnPieces(fontname, start_blob - word->best_state[ch - 1],
357 word->best_state[ch - 1] + 1,
358 threshold, CST_IMPROPER, INVALID_UNICHAR, word);
359 }
360}
361// If the next blob is good, make a join with it.
362if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
363 std::string joined_text = word->correct_text[ch];
364 joined_text += word->correct_text[ch + 1];
365 LearnPieces(fontname, start_blob,
366 word->best_state[ch] + word->best_state[ch + 1],
367 threshold, CST_NGRAM, joined_text.c_str(), word);
368}
369*/
370 }
371 start_blob += word->best_state[ch];
372 }
373 delete[] thresholds;
374} // LearnWord.
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:43
std::string to_string() const
Definition: unicharset.h:91
bool LooksLikeGarbage(TBLOB *blob)
void LearnPieces(const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
Definition: adaptmatch.cpp:385

◆ LooksLikeGarbage()

bool tesseract::Classify::LooksLikeGarbage ( TBLOB blob)

Definition at line 1571 of file adaptmatch.cpp.

1571 {
1572 auto *ratings = new BLOB_CHOICE_LIST();
1573 AdaptiveClassifier(blob, ratings);
1574 BLOB_CHOICE_IT ratings_it(ratings);
1575 const UNICHARSET &unicharset = getDict().getUnicharset();
1576 if (classify_debug_character_fragments) {
1577 print_ratings_list("======================\nLooksLikeGarbage() got ", ratings, unicharset);
1578 }
1579 for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list(); ratings_it.forward()) {
1580 if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != nullptr) {
1581 continue;
1582 }
1583 float certainty = ratings_it.data()->certainty();
1584 delete ratings;
1585 return certainty < classify_character_fragments_garbage_certainty_threshold;
1586 }
1587 delete ratings;
1588 return true; // no whole characters in ratings
1589}
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:804
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:202
const UNICHARSET & getUnicharset() const
Definition: dict.h:104

◆ MakeNewTemporaryConfig()

int tesseract::Classify::MakeNewTemporaryConfig ( ADAPT_TEMPLATES_STRUCT Templates,
CLASS_ID  ClassId,
int  FontinfoId,
int  NumFeatures,
INT_FEATURE_ARRAY  Features,
FEATURE_SET  FloatFeatures 
)
Parameters
Templates: adapted templates to add new config to
ClassId: class id to associate with new config
FontinfoId: font information inferred from pre-trained templates
NumFeatures: number of features in Features
Features: features describing model for new config
FloatFeatures: floating-point representation of features
Returns
The id of the new config created, a negative integer in case of error.

Definition at line 1669 of file adaptmatch.cpp.

1671 {
1672 INT_CLASS_STRUCT *IClass;
1673 ADAPT_CLASS_STRUCT *Class;
1674 PROTO_ID OldProtos[MAX_NUM_PROTOS];
1675 FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
1676 int NumOldProtos;
1677 int NumBadFeatures;
1678 int MaxProtoId, OldMaxProtoId;
1679 int MaskSize;
1680 int ConfigId;
1681 int i;
1682 int debug_level = NO_DEBUG;
1683
1684 if (classify_learning_debug_level >= 3) {
1686 }
1687
1688 IClass = ClassForClassId(Templates->Templates, ClassId);
1689 Class = Templates->Class[ClassId];
1690
1691 if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
1692 ++NumAdaptationsFailed;
1693 if (classify_learning_debug_level >= 1) {
1694 tprintf("Cannot make new temporary config: maximum number exceeded.\n");
1695 }
1696 return -1;
1697 }
1698
1699 OldMaxProtoId = IClass->NumProtos - 1;
1700
1701 NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff, NumFeatures, Features,
1702 OldProtos, classify_adapt_proto_threshold, debug_level);
1703
1704 MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
1705 zero_all_bits(TempProtoMask, MaskSize);
1706 for (i = 0; i < NumOldProtos; i++) {
1707 SET_BIT(TempProtoMask, OldProtos[i]);
1708 }
1709
1710 NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn, NumFeatures, Features,
1711 BadFeatures, classify_adapt_feature_threshold, debug_level);
1712
1713 MaxProtoId =
1714 MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures, IClass, Class, TempProtoMask);
1715 if (MaxProtoId == NO_PROTO) {
1716 ++NumAdaptationsFailed;
1717 if (classify_learning_debug_level >= 1) {
1718 tprintf("Cannot make new temp protos: maximum number exceeded.\n");
1719 }
1720 return -1;
1721 }
1722
1723 ConfigId = AddIntConfig(IClass);
1724 ConvertConfig(TempProtoMask, ConfigId, IClass);
1725 auto Config = new TEMP_CONFIG_STRUCT(MaxProtoId, FontinfoId);
1726 TempConfigFor(Class, ConfigId) = Config;
1727 copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);
1728
1729 if (classify_learning_debug_level >= 1) {
1730 tprintf(
1731 "Making new temp config %d fontinfo id %d"
1732 " using %d old and %d new protos.\n",
1733 ConfigId, Config->FontinfoId, NumOldProtos, MaxProtoId - OldMaxProtoId);
1734 }
1735
1736 return ConfigId;
1737} /* MakeNewTemporaryConfig */
#define MAX_NUM_INT_FEATURES
Definition: intproto.h:116
#define PRINT_MATCH_SUMMARY
Definition: intproto.h:165
#define PRINT_PROTO_MATCHES
Definition: intproto.h:169
#define PRINT_FEATURE_MATCHES
Definition: intproto.h:168
int16_t PROTO_ID
Definition: matchdefs.h:40
uint8_t FEATURE_ID
Definition: matchdefs.h:46
PROTO_ID MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class, BIT_VECTOR TempProtoMask)
int FindBadFeatures(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_ID *FeatureArray, int AdaptFeatureThreshold, int Debug)
Definition: intmatcher.cpp:619
int FindGoodProtos(INT_CLASS_STRUCT *ClassTemplate, BIT_VECTOR ProtoMask, BIT_VECTOR ConfigMask, int16_t NumFeatures, INT_FEATURE_ARRAY Features, PROTO_ID *ProtoArray, int AdaptProtoThreshold, int Debug)
Definition: intmatcher.cpp:555

◆ MakeNewTempProtos()

PROTO_ID tesseract::Classify::MakeNewTempProtos ( FEATURE_SET  Features,
int  NumBadFeat,
FEATURE_ID  BadFeat[],
INT_CLASS_STRUCT IClass,
ADAPT_CLASS_STRUCT Class,
BIT_VECTOR  TempProtoMask 
)

This routine finds sets of sequential bad features that all have the same angle and converts each set into a new temporary proto. The temp proto is added to the proto pruner for IClass, pushed onto the list of temp protos in Class, and added to TempProtoMask.

Parameters
Features: floating-point features describing new character
NumBadFeat: number of bad features to turn into protos
BadFeat: feature ids of bad features
IClass: integer class templates to add new protos to
Class: adapted class templates to add new protos to
TempProtoMask: proto mask to add new protos to

Globals: none

Returns
Max proto id in class after all protos have been added.

Definition at line 1758 of file adaptmatch.cpp.

1760 {
1761 FEATURE_ID *ProtoStart;
1762 FEATURE_ID *ProtoEnd;
1763 FEATURE_ID *LastBad;
1764 PROTO_STRUCT *Proto;
1765 FEATURE F1, F2;
1766 float X1, X2, Y1, Y2;
1767 float A1, A2, AngleDelta;
1768 float SegmentLength;
1769 PROTO_ID Pid;
1770
1771 for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat; ProtoStart < LastBad;
1772 ProtoStart = ProtoEnd) {
1773 F1 = Features->Features[*ProtoStart];
1774 X1 = F1->Params[PicoFeatX];
1775 Y1 = F1->Params[PicoFeatY];
1776 A1 = F1->Params[PicoFeatDir];
1777
1778 for (ProtoEnd = ProtoStart + 1, SegmentLength = GetPicoFeatureLength(); ProtoEnd < LastBad;
1779 ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
1780 F2 = Features->Features[*ProtoEnd];
1781 X2 = F2->Params[PicoFeatX];
1782 Y2 = F2->Params[PicoFeatY];
1783 A2 = F2->Params[PicoFeatDir];
1784
1785 AngleDelta = std::fabs(A1 - A2);
1786 if (AngleDelta > 0.5f) {
1787 AngleDelta = 1 - AngleDelta;
1788 }
1789
1790 if (AngleDelta > matcher_clustering_max_angle_delta || std::fabs(X1 - X2) > SegmentLength ||
1791 std::fabs(Y1 - Y2) > SegmentLength) {
1792 break;
1793 }
1794 }
1795
1796 F2 = Features->Features[*(ProtoEnd - 1)];
1797 X2 = F2->Params[PicoFeatX];
1798 Y2 = F2->Params[PicoFeatY];
1799 A2 = F2->Params[PicoFeatDir];
1800
1801 Pid = AddIntProto(IClass);
1802 if (Pid == NO_PROTO) {
1803 return (NO_PROTO);
1804 }
1805
1806 auto TempProto = new TEMP_PROTO_STRUCT;
1807 Proto = &(TempProto->Proto);
1808
1809 /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
1810 ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
1811 instead of the -0.25 to 0.75 used in baseline normalization */
1812 Proto->Length = SegmentLength;
1813 Proto->Angle = A1;
1814 Proto->X = (X1 + X2) / 2;
1815 Proto->Y = (Y1 + Y2) / 2 - Y_DIM_OFFSET;
1816 FillABC(Proto);
1817
1818 TempProto->ProtoId = Pid;
1819 SET_BIT(TempProtoMask, Pid);
1820
1821 ConvertProto(Proto, Pid, IClass);
1822 AddProtoToProtoPruner(Proto, Pid, IClass, classify_learning_debug_level >= 2);
1823
1824 Class->TempProtos = push(Class->TempProtos, TempProto);
1825 }
1826 return IClass->NumProtos - 1;
1827} /* MakeNewTempProtos */

◆ MakePermanent()

void tesseract::Classify::MakePermanent ( ADAPT_TEMPLATES_STRUCT Templates,
CLASS_ID  ClassId,
int  ConfigId,
TBLOB Blob 
)
Parameters
Templates: current set of adaptive templates
ClassId: class containing config to be made permanent
ConfigId: config to be made permanent
Blob: current blob being adapted to

Globals: none

Definition at line 1839 of file adaptmatch.cpp.

1840 {
1841 UNICHAR_ID *Ambigs;
1842 PROTO_KEY ProtoKey;
1843
1844 auto Class = Templates->Class[ClassId];
1845 auto Config = TempConfigFor(Class, ConfigId);
1846
1847 MakeConfigPermanent(Class, ConfigId);
1848 if (Class->NumPermConfigs == 0) {
1849 Templates->NumPermClasses++;
1850 }
1851 Class->NumPermConfigs++;
1852
1853 // Initialize permanent config.
1854 Ambigs = GetAmbiguities(Blob, ClassId);
1855 auto Perm = new PERM_CONFIG_STRUCT;
1856 Perm->Ambigs = Ambigs;
1857 Perm->FontinfoId = Config->FontinfoId;
1858
1859 // Free memory associated with temporary config (since ADAPTED_CONFIG
1860 // is a union we need to clean up before we record permanent config).
1861 ProtoKey.Templates = Templates;
1862 ProtoKey.ClassId = ClassId;
1863 ProtoKey.ConfigId = ConfigId;
1864 Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);
1865 delete Config;
1866
1867 // Record permanent config.
1868 PermConfigFor(Class, ConfigId) = Perm;
1869
1870 if (classify_learning_debug_level >= 1) {
1871 tprintf(
1872 "Making config %d for %s (ClassId %d) permanent:"
1873 " fontinfo id %d, ambiguities '",
1874 ConfigId, getDict().getUnicharset().debug_str(ClassId).c_str(), ClassId,
1875 PermConfigFor(Class, ConfigId)->FontinfoId);
1876 for (UNICHAR_ID *AmbigsPointer = Ambigs; *AmbigsPointer >= 0; ++AmbigsPointer) {
1877 tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
1878 }
1879 tprintf("'.\n");
1880 }
1881} /* MakePermanent */
#define MakeConfigPermanent(Class, ConfigId)
Definition: adaptive.h:87
LIST delete_d(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:88
int MakeTempProtoPerm(void *item1, void *item2)
UNICHAR_ID * GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass)

◆ MasterMatcher()

void tesseract::Classify::MasterMatcher ( INT_TEMPLATES_STRUCT templates,
int16_t  num_features,
const INT_FEATURE_STRUCT features,
const uint8_t *  norm_factors,
ADAPT_CLASS_STRUCT **  classes,
int  debug,
int  matcher_multiplier,
const TBOX blob_box,
const std::vector< CP_RESULT_STRUCT > &  results,
ADAPT_RESULTS final_results 
)

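Factored-out calls to IntegerMatcher based on class pruner results. The original comment states that integer matcher results are returned inside a CLASS_PRUNER_RESULTS structure; in the signature above the function returns void, and in the listing below each per-class match result is handed to ExpandShapesAndApplyCorrections together with final_results.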

Definition at line 1074 of file adaptmatch.cpp.

1078 {
1079 int top = blob_box.top();
1080 int bottom = blob_box.bottom();
1081 UnicharRating int_result;
1082 for (auto &&result : results) {
1083 CLASS_ID class_id = result.Class;
1084 BIT_VECTOR protos = classes != nullptr ? classes[class_id]->PermProtos : AllProtosOn;
1085 BIT_VECTOR configs = classes != nullptr ? classes[class_id]->PermConfigs : AllConfigsOn;
1086
1087 int_result.unichar_id = class_id;
1088 im_.Match(ClassForClassId(templates, class_id), protos, configs, num_features, features,
1089 &int_result, classify_adapt_feature_threshold, debug, matcher_debug_separate_windows);
1090 bool is_debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1091 ExpandShapesAndApplyCorrections(classes, is_debug, class_id, bottom, top, result.Rating,
1092 final_results->BlobLength, matcher_multiplier, norm_factors,
1093 &int_result, final_results);
1094 }
1095}

◆ NormalizeOutlines()

void tesseract::Classify::NormalizeOutlines ( LIST  Outlines,
float *  XScale,
float *  YScale 
)

This routine normalizes every outline in Outlines according to the currently selected normalization method. It also returns the scale factors that it used to do this scaling. The scale factors returned represent the x and y sizes in the normalized coordinate system that correspond to 1 pixel in the original coordinate system. Outlines are changed and XScale and YScale are updated.

Globals:

  • classify_norm_method method being used for normalization
  • classify_char_norm_range map radius of gyration to this value
Parameters
Outlines: list of outlines to be normalized
XScale: x-direction scale factor used by routine
YScale: y-direction scale factor used by routine

Definition at line 249 of file mfoutline.cpp.

249 {
250 MFOUTLINE Outline;
251
252 switch (classify_norm_method) {
253 case character:
254 ASSERT_HOST(!"How did NormalizeOutlines get called in character mode?");
255 break;
256
257 case baseline:
258 iterate(Outlines) {
259 Outline = static_cast<MFOUTLINE>(Outlines->first_node());
260 NormalizeOutline(Outline, 0.0);
261 }
262 *XScale = *YScale = MF_SCALE_FACTOR;
263 break;
264 }
265} /* NormalizeOutlines */
void NormalizeOutline(MFOUTLINE Outline, float XOrigin)
Definition: mfoutline.cpp:218

◆ PrintAdaptedTemplates()

void tesseract::Classify::PrintAdaptedTemplates ( FILE *  File,
ADAPT_TEMPLATES_STRUCT Templates 
)

This routine prints a summary of the adapted templates in Templates to File.

Parameters
File: open text file to print Templates to
Templates: adapted templates to print to File
Note
Globals: none

Definition at line 153 of file adaptive.cpp.

153 {
154 INT_CLASS_STRUCT *IClass;
155 ADAPT_CLASS_STRUCT *AClass;
156
157 fprintf(File, "\n\nSUMMARY OF ADAPTED TEMPLATES:\n\n");
158 fprintf(File, "Num classes = %d; Num permanent classes = %d\n\n", Templates->NumNonEmptyClasses,
159 Templates->NumPermClasses);
160 fprintf(File, " Id NC NPC NP NPP\n");
161 fprintf(File, "------------------------\n");
162
163 for (unsigned i = 0; i < (Templates->Templates)->NumClasses; i++) {
164 IClass = Templates->Templates->Class[i];
165 AClass = Templates->Class[i];
166 if (!IsEmptyAdaptedClass(AClass)) {
167 fprintf(File, "%5u %s %3d %3d %3d %3zd\n", i, unicharset.id_to_unichar(i), IClass->NumConfigs,
168 AClass->NumPermConfigs, IClass->NumProtos,
169 IClass->NumProtos - AClass->TempProtos->size());
170 }
171 }
172 fprintf(File, "\n");
173
174} /* PrintAdaptedTemplates */

◆ PrintAdaptiveMatchResults()

void tesseract::Classify::PrintAdaptiveMatchResults ( const ADAPT_RESULTS results)

This routine prints the matches in results using tprintf (the function takes no File argument).

Parameters
results: match results to print

Globals: none

Definition at line 1922 of file adaptmatch.cpp.

1922 {
1923 for (auto &it : results.match) {
1924 tprintf("%s ", unicharset.debug_str(it.unichar_id).c_str());
1925 it.Print();
1926 }
1927} /* PrintAdaptiveMatchResults */

◆ PruneClasses()

int tesseract::Classify::PruneClasses ( const INT_TEMPLATES_STRUCT int_templates,
int  num_features,
int  keep_this,
const INT_FEATURE_STRUCT features,
const uint8_t *  normalization_factors,
const uint16_t *  expected_num_features,
std::vector< CP_RESULT_STRUCT > *  results 
)

Runs the class pruner from int_templates on the given features, returning the number of classes output in results.

Parameters
int_templates: Class pruner tables
num_features: Number of features in blob
features: Array of features
normalization_factors: Array of fudge factors from blob normalization process (by CLASS_INDEX)
expected_num_features: Array of expected number of features for each class (by CLASS_INDEX)
results: Sorted Array of pruned classes. Must be an array of size at least int_templates->NumClasses.
keep_this: (no description given; the value is passed through to the pruner's PruneAndSort step)

Definition at line 427 of file intmatcher.cpp.

431 {
432 ClassPruner pruner(int_templates->NumClasses);
433 // Compute initial match scores for all classes.
434 pruner.ComputeScores(int_templates, num_features, features);
435 // Adjust match scores for number of expected features.
436 pruner.AdjustForExpectedNumFeatures(expected_num_features, classify_cp_cutoff_strength);
437 // Apply disabled classes in unicharset - only works without a shape_table.
438 if (shape_table_ == nullptr) {
439 pruner.DisableDisabledClasses(unicharset);
440 }
441 // If fragments are disabled, remove them, also only without a shape table.
442 if (disable_character_fragments && shape_table_ == nullptr) {
443 pruner.DisableFragments(unicharset);
444 }
445
446 // If we have good x-heights, apply the given normalization factors.
447 if (normalization_factors != nullptr) {
448 pruner.NormalizeForXheight(classify_class_pruner_multiplier, normalization_factors);
449 } else {
450 pruner.NoNormalization();
451 }
452 // Do the actual pruning and sort the short-list.
453 pruner.PruneAndSort(classify_class_pruner_threshold, keep_this, shape_table_ == nullptr,
454 unicharset);
455
456 if (classify_debug_level > 2) {
457 pruner.DebugMatch(*this, int_templates, features);
458 }
459 if (classify_debug_level > 1) {
460 pruner.SummarizeResult(*this, int_templates, expected_num_features,
461 classify_class_pruner_multiplier, normalization_factors);
462 }
463 // Convert to the expected output format.
464 return pruner.SetupResults(results);
465}

◆ ReadAdaptedTemplates()

ADAPT_TEMPLATES_STRUCT * tesseract::Classify::ReadAdaptedTemplates ( TFile *  fp)

Read a set of adapted templates from file and return a ptr to the templates.

Parameters
fp: open text file to read adapted templates from
Returns
Ptr to adapted templates read from file.
Note
Globals: none

Definition at line 235 of file adaptive.cpp.

235 {
236 auto Templates = new ADAPT_TEMPLATES_STRUCT;
237
238 /* first read the high level adaptive template struct */
239 fp->FRead(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1);
240
241 /* then read in the basic integer templates */
242 Templates->Templates = ReadIntTemplates(fp);
243
244 /* then read in the adaptive info for each class */
245 for (unsigned i = 0; i < (Templates->Templates)->NumClasses; i++) {
246 Templates->Class[i] = ReadAdaptedClass(fp);
247 }
248 return (Templates);
249
250} /* ReadAdaptedTemplates */
ADAPT_CLASS_STRUCT * ReadAdaptedClass(TFile *fp)
Definition: adaptive.cpp:186

◆ ReadIntTemplates()

INT_TEMPLATES_STRUCT * tesseract::Classify::ReadIntTemplates ( TFile *  fp)

This routine reads a set of integer templates from File. File must already be open and must be in the correct binary format.

Parameters
fp: open file to read templates from
Returns
Pointer to integer templates read from File.
Note
Globals: none

Definition at line 629 of file intproto.cpp.

629 {
630 int j, w, x, y, z;
631 INT_TEMPLATES_STRUCT *Templates;
632 CLASS_PRUNER_STRUCT *Pruner;
633 INT_CLASS_STRUCT *Class;
634
635 /* variables for conversion from older inttemp formats */
636 int b, bit_number, last_cp_bit_number, new_b, new_i, new_w;
637 CLASS_ID class_id, max_class_id;
638 std::vector<CLASS_ID> ClassIdFor(MAX_NUM_CLASSES);
639 std::vector<CLASS_PRUNER_STRUCT *> TempClassPruner(MAX_NUM_CLASS_PRUNERS);
640 uint32_t SetBitsForMask = // word with NUM_BITS_PER_CLASS
641 (1 << NUM_BITS_PER_CLASS) - 1; // set starting at bit 0
642 uint32_t Mask, NewMask, ClassBits;
643 unsigned MaxNumConfigs = MAX_NUM_CONFIGS;
644 unsigned WerdsPerConfigVec = WERDS_PER_CONFIG_VEC;
645
646 /* first read the high level template struct */
647 Templates = new INT_TEMPLATES_STRUCT;
648 // Read Templates in parts for 64 bit compatibility.
649 uint32_t unicharset_size;
650 if (fp->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1) != 1) {
651 tprintf("Bad read of inttemp!\n");
652 }
653 int32_t version_id = 0;
654 if (fp->FReadEndian(&version_id, sizeof(version_id), 1) != 1 ||
655 fp->FReadEndian(&Templates->NumClassPruners, sizeof(Templates->NumClassPruners), 1) != 1) {
656 tprintf("Bad read of inttemp!\n");
657 }
658 if (version_id < 0) {
659 // This file has a version id!
660 version_id = -version_id;
661 if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses), 1) != 1) {
662 tprintf("Bad read of inttemp!\n");
663 }
664 } else {
665 Templates->NumClasses = version_id;
666 }
667
668 if (version_id < 3) {
669 MaxNumConfigs = OLD_MAX_NUM_CONFIGS;
670 WerdsPerConfigVec = OLD_WERDS_PER_CONFIG_VEC;
671 }
672
673 if (version_id < 2) {
674 std::vector<int16_t> IndexFor(MAX_NUM_CLASSES);
675 if (fp->FReadEndian(&IndexFor[0], sizeof(IndexFor[0]), unicharset_size) != unicharset_size) {
676 tprintf("Bad read of inttemp!\n");
677 }
678 if (fp->FReadEndian(&ClassIdFor[0], sizeof(ClassIdFor[0]), Templates->NumClasses) !=
679 Templates->NumClasses) {
680 tprintf("Bad read of inttemp!\n");
681 }
682 }
683
684 /* then read in the class pruners */
685 const unsigned kNumBuckets = NUM_CP_BUCKETS * NUM_CP_BUCKETS * NUM_CP_BUCKETS * WERDS_PER_CP_VECTOR;
686 for (unsigned i = 0; i < Templates->NumClassPruners; i++) {
687 Pruner = new CLASS_PRUNER_STRUCT;
688 if (fp->FReadEndian(Pruner, sizeof(Pruner->p[0][0][0][0]), kNumBuckets) != kNumBuckets) {
689 tprintf("Bad read of inttemp!\n");
690 }
691 if (version_id < 2) {
692 TempClassPruner[i] = Pruner;
693 } else {
694 Templates->ClassPruners[i] = Pruner;
695 }
696 }
697
698 /* fix class pruners if they came from an old version of inttemp */
699 if (version_id < 2) {
700 // Allocate enough class pruners to cover all the class ids.
701 max_class_id = 0;
702 for (unsigned i = 0; i < Templates->NumClasses; i++) {
703 if (ClassIdFor[i] > max_class_id) {
704 max_class_id = ClassIdFor[i];
705 }
706 }
707 for (int i = 0; i <= CPrunerIdFor(max_class_id); i++) {
708 Templates->ClassPruners[i] = new CLASS_PRUNER_STRUCT;
709 memset(Templates->ClassPruners[i], 0, sizeof(CLASS_PRUNER_STRUCT));
710 }
711 // Convert class pruners from the old format (indexed by class index)
712 // to the new format (indexed by class id).
713 last_cp_bit_number = NUM_BITS_PER_CLASS * Templates->NumClasses - 1;
714 for (unsigned i = 0; i < Templates->NumClassPruners; i++) {
715 for (x = 0; x < NUM_CP_BUCKETS; x++) {
716 for (y = 0; y < NUM_CP_BUCKETS; y++) {
717 for (z = 0; z < NUM_CP_BUCKETS; z++) {
718 for (w = 0; w < WERDS_PER_CP_VECTOR; w++) {
719 if (TempClassPruner[i]->p[x][y][z][w] == 0) {
720 continue;
721 }
722 for (b = 0; b < BITS_PER_WERD; b += NUM_BITS_PER_CLASS) {
723 bit_number = i * BITS_PER_CP_VECTOR + w * BITS_PER_WERD + b;
724 if (bit_number > last_cp_bit_number) {
725 break; // the rest of the bits in this word are not used
726 }
727 class_id = ClassIdFor[bit_number / NUM_BITS_PER_CLASS];
728 // Single out NUM_BITS_PER_CLASS bits relating to class_id.
729 Mask = SetBitsForMask << b;
730 ClassBits = TempClassPruner[i]->p[x][y][z][w] & Mask;
731 // Move these bits to the new position in which they should
732 // appear (indexed corresponding to the class_id).
733 new_i = CPrunerIdFor(class_id);
734 new_w = CPrunerWordIndexFor(class_id);
735 new_b = CPrunerBitIndexFor(class_id) * NUM_BITS_PER_CLASS;
736 if (new_b > b) {
737 ClassBits <<= (new_b - b);
738 } else {
739 ClassBits >>= (b - new_b);
740 }
741 // Copy bits relating to class_id to the correct position
742 // in Templates->ClassPruner.
743 NewMask = SetBitsForMask << new_b;
744 Templates->ClassPruners[new_i]->p[x][y][z][new_w] &= ~NewMask;
745 Templates->ClassPruners[new_i]->p[x][y][z][new_w] |= ClassBits;
746 }
747 }
748 }
749 }
750 }
751 }
752 for (unsigned i = 0; i < Templates->NumClassPruners; i++) {
753 delete TempClassPruner[i];
754 }
755 }
756
757 /* then read in each class */
758 for (unsigned i = 0; i < Templates->NumClasses; i++) {
759 /* first read in the high level struct for the class */
760 Class = new INT_CLASS_STRUCT;
761 if (fp->FReadEndian(&Class->NumProtos, sizeof(Class->NumProtos), 1) != 1 ||
762 fp->FRead(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1) != 1 ||
763 fp->FRead(&Class->NumConfigs, sizeof(Class->NumConfigs), 1) != 1) {
764 tprintf("Bad read of inttemp!\n");
765 }
766 if (version_id == 0) {
767 // Only version 0 writes 5 pointless pointers to the file.
768 for (j = 0; j < 5; ++j) {
769 int32_t junk;
770 if (fp->FRead(&junk, sizeof(junk), 1) != 1) {
771 tprintf("Bad read of inttemp!\n");
772 }
773 }
774 }
775 unsigned num_configs = version_id < 4 ? MaxNumConfigs : Class->NumConfigs;
776 ASSERT_HOST(num_configs <= MaxNumConfigs);
777 if (fp->FReadEndian(Class->ConfigLengths, sizeof(uint16_t), num_configs) != num_configs) {
778 tprintf("Bad read of inttemp!\n");
779 }
780 if (version_id < 2) {
781 ClassForClassId(Templates, ClassIdFor[i]) = Class;
782 } else {
783 ClassForClassId(Templates, i) = Class;
784 }
785
786 /* then read in the proto lengths */
787 Class->ProtoLengths.clear();
788 if (MaxNumIntProtosIn(Class) > 0) {
789 Class->ProtoLengths.resize(MaxNumIntProtosIn(Class));
790 if (fp->FRead(&Class->ProtoLengths[0], sizeof(uint8_t), MaxNumIntProtosIn(Class)) !=
791 MaxNumIntProtosIn(Class)) {
792 tprintf("Bad read of inttemp!\n");
793 }
794 }
795
796 /* then read in the proto sets */
797 for (j = 0; j < Class->NumProtoSets; j++) {
798 auto ProtoSet = new PROTO_SET_STRUCT;
799 unsigned num_buckets = NUM_PP_PARAMS * NUM_PP_BUCKETS * WERDS_PER_PP_VECTOR;
800 if (fp->FReadEndian(&ProtoSet->ProtoPruner, sizeof(ProtoSet->ProtoPruner[0][0][0]),
801 num_buckets) != num_buckets) {
802 tprintf("Bad read of inttemp!\n");
803 }
804 for (x = 0; x < PROTOS_PER_PROTO_SET; x++) {
805 if (fp->FRead(&ProtoSet->Protos[x].A, sizeof(ProtoSet->Protos[x].A), 1) != 1 ||
806 fp->FRead(&ProtoSet->Protos[x].B, sizeof(ProtoSet->Protos[x].B), 1) != 1 ||
807 fp->FRead(&ProtoSet->Protos[x].C, sizeof(ProtoSet->Protos[x].C), 1) != 1 ||
808 fp->FRead(&ProtoSet->Protos[x].Angle, sizeof(ProtoSet->Protos[x].Angle), 1) != 1) {
809 tprintf("Bad read of inttemp!\n");
810 }
811 if (fp->FReadEndian(&ProtoSet->Protos[x].Configs, sizeof(ProtoSet->Protos[x].Configs[0]),
812 WerdsPerConfigVec) != WerdsPerConfigVec) {
813 tprintf("Bad read of inttemp!\n");
814 }
815 }
816 Class->ProtoSets[j] = ProtoSet;
817 }
818 if (version_id < 4) {
819 Class->font_set_id = -1;
820 } else {
821 fp->FReadEndian(&Class->font_set_id, sizeof(Class->font_set_id), 1);
822 }
823 }
824
825 if (version_id < 2) {
826 /* add an empty nullptr class with class id 0 */
827 assert(UnusedClassIdIn(Templates, 0));
828 ClassForClassId(Templates, 0) = new INT_CLASS_STRUCT(1, 1);
829 ClassForClassId(Templates, 0)->font_set_id = -1;
830 Templates->NumClasses++;
831 /* make sure the classes are contiguous */
832 for (unsigned i = 0; i < MAX_NUM_CLASSES; i++) {
833 if (i < Templates->NumClasses) {
834 if (ClassForClassId(Templates, i) == nullptr) {
835 fprintf(stderr, "Non-contiguous class ids in inttemp\n");
836 exit(1);
837 }
838 } else {
839 if (ClassForClassId(Templates, i) != nullptr) {
840 fprintf(stderr, "Class id %u exceeds NumClassesIn (Templates) %u\n", i,
841 Templates->NumClasses);
842 exit(1);
843 }
844 }
845 }
846 }
847 if (version_id >= 4) {
848 using namespace std::placeholders; // for _1, _2
849 this->fontinfo_table_.read(fp, std::bind(read_info, _1, _2));
850 if (version_id >= 5) {
851 this->fontinfo_table_.read(fp, std::bind(read_spacing_info, _1, _2));
852 }
853 this->fontset_table_.read(fp, [](auto *f, auto *fs) { return f->DeSerialize(*fs); } );
854 }
855
856 return (Templates);
857} /* ReadIntTemplates */
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
#define BITS_PER_CP_VECTOR
Definition: intproto.h:59
#define MaxNumIntProtosIn(C)
Definition: intproto.h:145
#define NUM_PP_PARAMS
Definition: intproto.h:51
#define WERDS_PER_PP_VECTOR
Definition: intproto.h:62
#define BITS_PER_WERD
Definition: intproto.h:45
#define WERDS_PER_CONFIG_VEC
Definition: intproto.h:65
#define CPrunerWordIndexFor(c)
Definition: intproto.h:160
#define CPrunerIdFor(c)
Definition: intproto.h:158
#define CPrunerBitIndexFor(c)
Definition: intproto.h:161
#define NUM_CP_BUCKETS
Definition: intproto.h:53
#define MAX_NUM_CLASS_PRUNERS
Definition: intproto.h:60
#define WERDS_PER_CP_VECTOR
Definition: intproto.h:61
#define PROTOS_PER_PROTO_SET
Definition: intproto.h:49
#define NUM_PP_BUCKETS
Definition: intproto.h:52
#define NUM_BITS_PER_CLASS
Definition: intproto.h:55
#define OLD_MAX_NUM_CONFIGS
Definition: intproto.cpp:95
#define OLD_WERDS_PER_CONFIG_VEC
Definition: intproto.cpp:96
Uncopyable z
const double y
const char * p
bool read_info(TFile *f, FontInfo *fi)
Definition: fontinfo.cpp:143
bool read_spacing_info(TFile *f, FontInfo *fi)
Definition: fontinfo.cpp:163
bool read(tesseract::TFile *f, const std::function< bool(tesseract::TFile *, T *)> &cb)

◆ ReadNewCutoffs()

void tesseract::Classify::ReadNewCutoffs ( TFile *  fp,
uint16_t *  Cutoffs 
)

Open file, read in all of the class-id/cutoff pairs and insert them into the Cutoffs array. Cutoffs are indexed in the array by class id. Unused entries in the array are set to an arbitrarily high cutoff value.

Parameters
fp: file containing cutoff definitions
Cutoffs: array to put cutoffs into

Definition at line 41 of file cutoffs.cpp.

41 {
42 int Cutoff;
43
44 if (shape_table_ != nullptr) {
45 if (!fp->DeSerialize(shapetable_cutoffs_)) {
46 tprintf("Error during read of shapetable pffmtable!\n");
47 }
48 }
49 for (int i = 0; i < MAX_NUM_CLASSES; i++) {
50 Cutoffs[i] = MAX_CUTOFF;
51 }
52
53 const int kMaxLineSize = 100;
54 char line[kMaxLineSize];
55 while (fp->FGets(line, kMaxLineSize) != nullptr) {
56 std::string Class;
57 CLASS_ID ClassId;
58 std::istringstream stream(line);
59 stream.imbue(std::locale::classic());
60 stream >> Class >> Cutoff;
61 if (stream.fail()) {
62 break;
63 }
64 if (Class.compare("NULL") == 0) {
65 ClassId = unicharset.unichar_to_id(" ");
66 } else {
67 ClassId = unicharset.unichar_to_id(Class.c_str());
68 }
69 ASSERT_HOST(ClassId >= 0 && ClassId < MAX_NUM_CLASSES);
70 Cutoffs[ClassId] = Cutoff;
71 }
72}
#define MAX_CUTOFF
Definition: cutoffs.cpp:30

◆ ReadNormProtos()

NORM_PROTOS * tesseract::Classify::ReadNormProtos ( TFile *  fp)

This routine allocates a new data structure to hold a set of character normalization protos. It then fills in the data structure by reading from the specified File.

Parameters
fp: open text file to read normalization protos from
Globals: none
Returns
Character normalization protos.

Definition at line 173 of file normmatch.cpp.

173 {
174 char unichar[2 * UNICHAR_LEN + 1];
175 UNICHAR_ID unichar_id;
176 LIST Protos;
177 int NumProtos;
178
179 /* allocate and initialization data structure */
180 auto NormProtos = new NORM_PROTOS(unicharset.size());
181
182 /* read file header and save in data structure */
185
186 /* read protos for each class into a separate list */
187 const int kMaxLineSize = 100;
188 char line[kMaxLineSize];
189 while (fp->FGets(line, kMaxLineSize) != nullptr) {
190 std::istringstream stream(line);
191 stream.imbue(std::locale::classic());
192 stream >> unichar >> NumProtos;
193 if (stream.fail()) {
194 continue;
195 }
196 if (unicharset.contains_unichar(unichar)) {
197 unichar_id = unicharset.unichar_to_id(unichar);
198 Protos = NormProtos->Protos[unichar_id];
199 for (int i = 0; i < NumProtos; i++) {
200 Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
201 }
202 NormProtos->Protos[unichar_id] = Protos;
203 } else {
204 tprintf("Error: unichar %s in normproto file is not in unichar set.\n", unichar);
205 for (int i = 0; i < NumProtos; i++) {
207 }
208 }
209 }
210 return NormProtos;
211} /* ReadNormProtos */
#define UNICHAR_LEN
Definition: unichar.h:31
uint16_t ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:114
void FreePrototype(void *arg)
Definition: cluster.cpp:1608
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:192
PROTOTYPE * ReadPrototype(TFile *fp, uint16_t N)
Definition: clusttool.cpp:168
PARAM_DESC * ReadParamDesc(TFile *fp, uint16_t N)
Definition: clusttool.cpp:134

◆ RefreshDebugWindow()

void tesseract::Classify::RefreshDebugWindow ( ScrollView **  win,
const char *  msg,
int  y_offset,
const TBOX wbox 
)

Definition at line 240 of file adaptmatch.cpp.

241 {
242 const int kSampleSpaceWidth = 500;
243 if (*win == nullptr) {
244 *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200, kSampleSpaceWidth * 2,
245 200, true);
246 }
247 (*win)->Clear();
248 (*win)->Pen(64, 64, 64);
249 (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset, kSampleSpaceWidth, kBlnBaselineOffset);
250 (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset, kSampleSpaceWidth,
252 (*win)->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());
253}

◆ RemoveBadMatches()

void tesseract::Classify::RemoveBadMatches ( ADAPT_RESULTS *  Results)

This routine steps through each match in Results and removes it from the match list if its rating falls below the best rating minus a pad (best_rating - matcher_bad_match_pad). In other words, all good matches get moved to the front of the match list, which is then truncated to just those matches.

Parameters
Results: contains matches to be filtered

Globals:

  • matcher_bad_match_pad defines a "bad match"

Definition at line 1942 of file adaptmatch.cpp.

1942 {
1943 unsigned Next, NextGood;
1944 float BadMatchThreshold;
1945 static const char *romans = "i v x I V X";
1946 BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;
1947
1948 if (classify_bln_numeric_mode) {
1949 UNICHAR_ID unichar_id_one =
1951 UNICHAR_ID unichar_id_zero =
1953 float scored_one = ScoredUnichar(unichar_id_one, *Results);
1954 float scored_zero = ScoredUnichar(unichar_id_zero, *Results);
1955
1956 for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
1957 const UnicharRating &match = Results->match[Next];
1958 if (match.rating >= BadMatchThreshold) {
1959 if (!unicharset.get_isalpha(match.unichar_id) ||
1960 strstr(romans, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
1961 } else if (unicharset.eq(match.unichar_id, "l") && scored_one < BadMatchThreshold) {
1962 Results->match[Next].unichar_id = unichar_id_one;
1963 } else if (unicharset.eq(match.unichar_id, "O") && scored_zero < BadMatchThreshold) {
1964 Results->match[Next].unichar_id = unichar_id_zero;
1965 } else {
1966 Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy.
1967 }
1968 if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {
1969 if (NextGood == Next) {
1970 ++NextGood;
1971 } else {
1972 Results->match[NextGood++] = Results->match[Next];
1973 }
1974 }
1975 }
1976 }
1977 } else {
1978 for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
1979 if (Results->match[Next].rating >= BadMatchThreshold) {
1980 if (NextGood == Next) {
1981 ++NextGood;
1982 } else {
1983 Results->match[NextGood++] = Results->match[Next];
1984 }
1985 }
1986 }
1987 }
1988 Results->match.resize(NextGood);
1989} /* RemoveBadMatches */
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:713

◆ RemoveExtraPuncs()

void tesseract::Classify::RemoveExtraPuncs ( ADAPT_RESULTS *  Results)

This routine discards extra digits or punctuation from the results. We keep only the top 2 punctuation answers and the top 1 digit answer if present.

Parameters
Results: contains matches to be filtered

Definition at line 1999 of file adaptmatch.cpp.

1999 {
2000 unsigned Next, NextGood;
2001 int punc_count; /*no of garbage characters */
2002 int digit_count;
2003 /*garbage characters */
2004 static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
2005 static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";
2006
2007 punc_count = 0;
2008 digit_count = 0;
2009 for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2010 const UnicharRating &match = Results->match[Next];
2011 bool keep = true;
2012 if (strstr(punc_chars, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2013 if (punc_count >= 2) {
2014 keep = false;
2015 }
2016 punc_count++;
2017 } else {
2018 if (strstr(digit_chars, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2019 if (digit_count >= 1) {
2020 keep = false;
2021 }
2022 digit_count++;
2023 }
2024 }
2025 if (keep) {
2026 if (NextGood == Next) {
2027 ++NextGood;
2028 } else {
2029 Results->match[NextGood++] = match;
2030 }
2031 }
2032 }
2033 Results->match.resize(NextGood);
2034} /* RemoveExtraPuncs */

◆ ResetAdaptiveClassifierInternal()

void tesseract::Classify::ResetAdaptiveClassifierInternal ( )

Definition at line 596 of file adaptmatch.cpp.

596 {
597 if (classify_learning_debug_level > 0) {
598 tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n", NumAdaptationsFailed);
599 }
600 delete AdaptedTemplates;
601 AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset);
603 BackupAdaptedTemplates = nullptr;
604 NumAdaptationsFailed = 0;
605}

◆ SetAdaptiveThreshold()

void tesseract::Classify::SetAdaptiveThreshold ( float  Threshold)

This routine resets the internal thresholds inside the integer matcher to correspond to the specified threshold.

Parameters
Threshold: threshold for creating new templates

Globals:

  • matcher_good_threshold default good match rating

Definition at line 2047 of file adaptmatch.cpp.

2047 {
2048 Threshold = (Threshold == matcher_good_threshold) ? 0.9f : (1 - Threshold);
2049 classify_adapt_proto_threshold.set_value(ClipToRange<int>(255 * Threshold, 0, 255));
2050 classify_adapt_feature_threshold.set_value(ClipToRange<int>(255 * Threshold, 0, 255));
2051} /* SetAdaptiveThreshold */

◆ SetStaticClassifier()

void tesseract::Classify::SetStaticClassifier ( ShapeClassifier static_classifier)

Definition at line 161 of file classify.cpp.

161 {
162 delete static_classifier_;
163 static_classifier_ = static_classifier;
164}

◆ SettupPass1()

void tesseract::Classify::SettupPass1 ( )

This routine prepares the adaptive matcher for the start of the first pass. Learning is enabled (unless it is disabled for the whole program).

Note
This is somewhat redundant: it simply says that if learning is enabled then it will remain enabled on the first pass, and if it is disabled then it will remain disabled. It is only put here to make it very clear that learning is controlled directly by the global setting of EnableLearning.

Globals:

Definition at line 647 of file adaptmatch.cpp.

647 {
648 EnableLearning = classify_enable_learning;
649
651
652} /* SettupPass1 */
void SettupStopperPass1()
Sets up stopper variables in preparation for the first pass.
Definition: stopper.cpp:362

◆ SettupPass2()

void tesseract::Classify::SettupPass2 ( )

This routine prepares the adaptive matcher for the start of the second pass. Further learning is disabled.

Globals:

Definition at line 663 of file adaptmatch.cpp.

663 {
664 EnableLearning = false;
666
667} /* SettupPass2 */
void SettupStopperPass2()
Sets up stopper variables in preparation for the second pass.
Definition: stopper.cpp:366

◆ SetupBLCNDenorms()

void tesseract::Classify::SetupBLCNDenorms ( const TBLOB &  blob,
bool  nonlinear_norm,
DENORM *  bl_denorm,
DENORM *  cn_denorm,
INT_FX_RESULT_STRUCT *  fx_info 
)
static

Definition at line 129 of file intfx.cpp.

130 {
131 // Compute 1st and 2nd moments of the original outline.
132 FCOORD center, second_moments;
133 int length = blob.ComputeMoments(&center, &second_moments);
134 if (fx_info != nullptr) {
135 fx_info->Length = length;
136 fx_info->Rx = IntCastRounded(second_moments.y());
137 fx_info->Ry = IntCastRounded(second_moments.x());
138
139 fx_info->Xmean = IntCastRounded(center.x());
140 fx_info->Ymean = IntCastRounded(center.y());
141 }
142 // Setup the denorm for Baseline normalization.
143 bl_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), 128.0f, 1.0f, 1.0f,
144 128.0f, 128.0f);
145 // Setup the denorm for character normalization.
146 if (nonlinear_norm) {
147 std::vector<std::vector<int>> x_coords;
148 std::vector<std::vector<int>> y_coords;
149 TBOX box;
150 blob.GetPreciseBoundingBox(&box);
151 box.pad(1, 1);
152 blob.GetEdgeCoords(box, x_coords, y_coords);
153 cn_denorm->SetupNonLinear(&blob.denorm(), box, UINT8_MAX, UINT8_MAX, 0.0f, 0.0f, x_coords,
154 y_coords);
155 } else {
156 cn_denorm->SetupNormalization(nullptr, nullptr, &blob.denorm(), center.x(), center.y(),
157 51.2f / second_moments.x(), 51.2f / second_moments.y(), 128.0f,
158 128.0f);
159 }
160}

◆ shape_table()

const ShapeTable * tesseract::Classify::shape_table ( ) const
inline

Definition at line 102 of file classify.h.

102 {
103 return shape_table_;
104 }

◆ ShapeIDToClassID()

int tesseract::Classify::ShapeIDToClassID ( int  shape_id) const

Definition at line 2121 of file adaptmatch.cpp.

2121 {
2122 for (unsigned id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
2123 int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
2124 ASSERT_HOST(font_set_id >= 0);
2125 const FontSet &fs = fontset_table_.at(font_set_id);
2126 for (auto f : fs) {
2127 if (f == shape_id) {
2128 return id;
2129 }
2130 }
2131 }
2132 tprintf("Shape %d not found\n", shape_id);
2133 return -1;
2134}

◆ ShowBestMatchFor()

void tesseract::Classify::ShowBestMatchFor ( int  shape_id,
const INT_FEATURE_STRUCT features,
int  num_features 
)

This routine displays debug information for the best config of the given shape_id for the given set of features.

Parameters
shape_id: classifier id to work with
features: features of the unknown character
num_features: Number of features in the features array.

Definition at line 2065 of file adaptmatch.cpp.

2066 {
2067 uint32_t config_mask;
2068 if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
2069 tprintf("No built-in templates for class/shape %d\n", shape_id);
2070 return;
2071 }
2072 if (num_features <= 0) {
2073 tprintf("Illegal blob (char norm features)!\n");
2074 return;
2075 }
2076 UnicharRating cn_result;
2077 classify_norm_method.set_value(character);
2079 features, &cn_result, classify_adapt_feature_threshold, NO_DEBUG,
2080 matcher_debug_separate_windows);
2081 tprintf("\n");
2082 config_mask = 1 << cn_result.config;
2083
2084 tprintf("Static Shape ID: %d\n", shape_id);
2086 im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), AllProtosOn, &config_mask, num_features,
2087 features, &cn_result, classify_adapt_feature_threshold, matcher_debug_flags,
2088 matcher_debug_separate_windows);
2090} /* ShowBestMatchFor */

◆ ShowMatchDisplay()

void tesseract::Classify::ShowMatchDisplay ( )

This routine sends the shapes in the global display lists to the match debugger window.

Globals:

  • FeatureShapes display list containing feature matches
  • ProtoShapes display list containing proto matches

Definition at line 868 of file intproto.cpp.

868 {
870 if (ProtoDisplayWindow) {
871 ProtoDisplayWindow->Clear();
872 }
873 if (FeatureDisplayWindow) {
874 FeatureDisplayWindow->Clear();
875 }
876 ClearFeatureSpaceWindow(static_cast<NORM_METHOD>(static_cast<int>(classify_norm_method)),
877 IntMatchWindow);
879 if (ProtoDisplayWindow) {
880 ProtoDisplayWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y, INT_MAX_X, INT_MAX_Y);
881 }
882 if (FeatureDisplayWindow) {
883 FeatureDisplayWindow->ZoomToRectangle(INT_MIN_X, INT_MIN_Y, INT_MAX_X, INT_MAX_Y);
884 }
885} /* ShowMatchDisplay */
#define INT_MAX_Y
Definition: intproto.cpp:64
#define INT_MIN_Y
Definition: intproto.cpp:62
#define INT_MIN_X
Definition: intproto.cpp:61
#define INT_MAX_X
Definition: intproto.cpp:63
void InitIntMatchWindowIfReqd()
Definition: intproto.cpp:1587
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:889
void void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:742

◆ StartBackupAdaptiveClassifier()

void tesseract::Classify::StartBackupAdaptiveClassifier ( )

Definition at line 625 of file adaptmatch.cpp.

625 {
627 BackupAdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset);
628}

◆ STRING_VAR_H()

tesseract::Classify::STRING_VAR_H ( classify_learn_debug_str  )

◆ SwitchAdaptiveClassifier()

void tesseract::Classify::SwitchAdaptiveClassifier ( )

Definition at line 609 of file adaptmatch.cpp.

609 {
610 if (BackupAdaptedTemplates == nullptr) {
612 return;
613 }
614 if (classify_learning_debug_level > 0) {
615 tprintf("Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\n",
616 NumAdaptationsFailed);
617 }
618 delete AdaptedTemplates;
620 BackupAdaptedTemplates = nullptr;
621 NumAdaptationsFailed = 0;
622}
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:596

◆ TempConfigReliable()

bool tesseract::Classify::TempConfigReliable ( CLASS_ID  class_id,
const TEMP_CONFIG_STRUCT *  config 
)

Definition at line 2138 of file adaptmatch.cpp.

2138 {
2139 if (classify_learning_debug_level >= 1) {
2140 tprintf("NumTimesSeen for config of %s is %d\n",
2141 getDict().getUnicharset().debug_str(class_id).c_str(), config->NumTimesSeen);
2142 }
2143 if (config->NumTimesSeen >= matcher_sufficient_examples_for_prototyping) {
2144 return true;
2145 } else if (config->NumTimesSeen < matcher_min_examples_for_prototyping) {
2146 return false;
2147 } else if (use_ambigs_for_adaption) {
2148 // Go through the ambigs vector and see whether we have already seen
2149 // enough times all the characters represented by the ambigs vector.
2150 const UnicharIdVector *ambigs = getDict().getUnicharAmbigs().AmbigsForAdaption(class_id);
2151 int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2152 for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2153 ADAPT_CLASS_STRUCT *ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];
2154 assert(ambig_class != nullptr);
2155 if (ambig_class->NumPermConfigs == 0 &&
2156 ambig_class->MaxNumTimesSeen < matcher_min_examples_for_prototyping) {
2157 if (classify_learning_debug_level >= 1) {
2158 tprintf(
2159 "Ambig %s has not been seen enough times,"
2160 " not making config for %s permanent\n",
2161 getDict().getUnicharset().debug_str((*ambigs)[ambig]).c_str(),
2162 getDict().getUnicharset().debug_str(class_id).c_str());
2163 }
2164 return false;
2165 }
2166 }
2167 }
2168 return true;
2169}
std::vector< UNICHAR_ID > UnicharIdVector
Definition: ambigs.h:38
const UnicharIdVector * AmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:198
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:111

◆ UpdateAmbigsGroup()

void tesseract::Classify::UpdateAmbigsGroup ( CLASS_ID  class_id,
TBLOB Blob 
)

Definition at line 2171 of file adaptmatch.cpp.

2171 {
2173 int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2174 if (classify_learning_debug_level >= 1) {
2175 tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
2176 getDict().getUnicharset().debug_str(class_id).c_str(), class_id);
2177 }
2178 for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2179 CLASS_ID ambig_class_id = (*ambigs)[ambig];
2180 const ADAPT_CLASS_STRUCT *ambigs_class = AdaptedTemplates->Class[ambig_class_id];
2181 for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
2182 if (ConfigIsPermanent(ambigs_class, cfg)) {
2183 continue;
2184 }
2185 const TEMP_CONFIG_STRUCT *config = TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
2186 if (config != nullptr && TempConfigReliable(ambig_class_id, config)) {
2187 if (classify_learning_debug_level >= 1) {
2188 tprintf("Making config %d of %s permanent\n", cfg,
2189 getDict().getUnicharset().debug_str(ambig_class_id).c_str());
2190 }
2191 MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);
2192 }
2193 }
2194 }
2195}
const UnicharIdVector * ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:208

◆ WriteAdaptedTemplates()

void tesseract::Classify::WriteAdaptedTemplates ( FILE *  File,
ADAPT_TEMPLATES_STRUCT *  Templates 
)

This routine saves Templates to File in a binary format.

Parameters
File: open text file to write Templates to
Templates: set of adapted templates to write to File
Note
Globals: none

Definition at line 345 of file adaptive.cpp.

345 {
346 /* first write the high level adaptive template struct */
347 fwrite(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1, File);
348
349 /* then write out the basic integer templates */
350 WriteIntTemplates(File, Templates->Templates, unicharset);
351
352 /* then write out the adaptive info for each class */
353 for (unsigned i = 0; i < (Templates->Templates)->NumClasses; i++) {
354 WriteAdaptedClass(File, Templates->Class[i], Templates->Templates->Class[i]->NumConfigs);
355 }
356} /* WriteAdaptedTemplates */
void WriteAdaptedClass(FILE *File, ADAPT_CLASS_STRUCT *Class, int NumConfigs)
Definition: adaptive.cpp:307
void WriteIntTemplates(FILE *File, INT_TEMPLATES_STRUCT *Templates, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:919

◆ WriteIntTemplates()

void tesseract::Classify::WriteIntTemplates ( FILE *  File,
INT_TEMPLATES_STRUCT *  Templates,
const UNICHARSET &  target_unicharset 
)

This routine writes Templates to File. The format is an efficient binary format. File must already be open for writing.

Parameters
File: open file to write templates to
Templates: templates to save into File
target_unicharset: the UNICHARSET to use

Definition at line 919 of file intproto.cpp.

920 {
921 INT_CLASS_STRUCT *Class;
922 uint32_t unicharset_size = target_unicharset.size();
923 int version_id = -5; // When negated by the reader -1 becomes +1 etc.
924
925 if (Templates->NumClasses != unicharset_size) {
926 tprintf(
927 "Warning: executing WriteIntTemplates() with %d classes in"
928 " Templates, while target_unicharset size is %" PRIu32 "\n",
929 Templates->NumClasses, unicharset_size);
930 }
931
932 /* first write the high level template struct */
933 fwrite(&unicharset_size, sizeof(unicharset_size), 1, File);
934 fwrite(&version_id, sizeof(version_id), 1, File);
935 fwrite(&Templates->NumClassPruners, sizeof(Templates->NumClassPruners), 1, File);
936 fwrite(&Templates->NumClasses, sizeof(Templates->NumClasses), 1, File);
937
938 /* then write out the class pruners */
939 for (unsigned i = 0; i < Templates->NumClassPruners; i++) {
940 fwrite(Templates->ClassPruners[i], sizeof(CLASS_PRUNER_STRUCT), 1, File);
941 }
942
943 /* then write out each class */
944 for (unsigned i = 0; i < Templates->NumClasses; i++) {
945 Class = Templates->Class[i];
946
947 /* first write out the high level struct for the class */
948 fwrite(&Class->NumProtos, sizeof(Class->NumProtos), 1, File);
949 fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);
950 ASSERT_HOST(Class->NumConfigs == this->fontset_table_.at(Class->font_set_id).size());
951 fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);
952 for (int j = 0; j < Class->NumConfigs; ++j) {
953 fwrite(&Class->ConfigLengths[j], sizeof(uint16_t), 1, File);
954 }
955
956 /* then write out the proto lengths */
957 if (MaxNumIntProtosIn(Class) > 0) {
958 fwrite(&Class->ProtoLengths[0], sizeof(uint8_t), MaxNumIntProtosIn(Class), File);
959 }
960
961 /* then write out the proto sets */
962 for (int j = 0; j < Class->NumProtoSets; j++) {
963 fwrite(Class->ProtoSets[j], sizeof(PROTO_SET_STRUCT), 1, File);
964 }
965
966 /* then write the fonts info */
967 fwrite(&Class->font_set_id, sizeof(int), 1, File);
968 }
969
970 /* Write the fonts info tables */
971 using namespace std::placeholders; // for _1, _2
972 this->fontinfo_table_.write(File, std::bind(write_info, _1, _2));
973 this->fontinfo_table_.write(File, std::bind(write_spacing_info, _1, _2));
974 this->fontset_table_.write(File, std::bind(write_set, _1, _2));
975} /* WriteIntTemplates */
bool write_set(FILE *f, const FontSet &fs)
Definition: fontinfo.cpp:222
bool write_info(FILE *f, const FontInfo &fi)
Definition: fontinfo.cpp:157
bool write_spacing_info(FILE *f, const FontInfo &fi)
Definition: fontinfo.cpp:194
bool write(FILE *f, const std::function< bool(FILE *, const T &)> &cb) const

◆ WriteTRFile()

bool tesseract::Classify::WriteTRFile ( const char *  filename)

Definition at line 60 of file blobclass.cpp.

60 {
61 bool result = false;
62 std::string tr_filename = filename;
63 tr_filename += ".tr";
64 FILE *fp = fopen(tr_filename.c_str(), "wb");
65 if (fp) {
66 result = tesseract::Serialize(fp, &tr_file_data_[0], tr_file_data_.length());
67 fclose(fp);
68 }
69 tr_file_data_.resize(0);
70 return result;
71}
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:236

Member Data Documentation

◆ AdaptedTemplates

ADAPT_TEMPLATES_STRUCT* tesseract::Classify::AdaptedTemplates = nullptr

Definition at line 420 of file classify.h.

◆ AllConfigsOff

BIT_VECTOR tesseract::Classify::AllConfigsOff = nullptr

Definition at line 429 of file classify.h.

◆ AllConfigsOn

BIT_VECTOR tesseract::Classify::AllConfigsOn = nullptr

Definition at line 428 of file classify.h.

◆ AllProtosOn

BIT_VECTOR tesseract::Classify::AllProtosOn = nullptr

Definition at line 427 of file classify.h.

◆ BackupAdaptedTemplates

ADAPT_TEMPLATES_STRUCT* tesseract::Classify::BackupAdaptedTemplates = nullptr

Definition at line 424 of file classify.h.

◆ EnableLearning

bool tesseract::Classify::EnableLearning = true

Definition at line 484 of file classify.h.

◆ feature_defs_

FEATURE_DEFS_STRUCT tesseract::Classify::feature_defs_
protected

Definition at line 446 of file classify.h.

◆ fontinfo_table_

UnicityTable<FontInfo> tesseract::Classify::fontinfo_table_

Definition at line 434 of file classify.h.

◆ fontset_table_

UnicityTable<FontSet> tesseract::Classify::fontset_table_

Definition at line 442 of file classify.h.

◆ im_

IntegerMatcher tesseract::Classify::im_
protected

Definition at line 445 of file classify.h.

◆ NormProtos

NORM_PROTOS* tesseract::Classify::NormProtos = nullptr

Definition at line 432 of file classify.h.

◆ PreTrainedTemplates

INT_TEMPLATES_STRUCT* tesseract::Classify::PreTrainedTemplates = nullptr

Definition at line 419 of file classify.h.

◆ shape_table_

ShapeTable* tesseract::Classify::shape_table_ = nullptr
protected

Definition at line 451 of file classify.h.

◆ TempProtoMask

BIT_VECTOR tesseract::Classify::TempProtoMask = nullptr

Definition at line 430 of file classify.h.


The documentation for this class was generated from the following files: