tesseract v5.3.3.20231005
adaptmatch.cpp
1/******************************************************************************
2 ** Filename: adaptmatch.cpp
3 ** Purpose: High level adaptive matcher.
4 ** Author: Dan Johnson
5 **
6 ** (c) Copyright Hewlett-Packard Company, 1988.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 ******************************************************************************/
17
18/*-----------------------------------------------------------------------------
19 Include Files and Type Defines
20-----------------------------------------------------------------------------*/
21#ifdef HAVE_CONFIG_H
22# include "config_auto.h"
23#endif
24
25#include "adaptive.h" // for ADAPT_CLASS
26#include "ambigs.h" // for UnicharIdVector, UnicharAmbigs
27#include "bitvec.h" // for FreeBitVector, NewBitVector, BIT_VECTOR
28#include "blobs.h" // for TBLOB, TWERD
29#include "classify.h" // for Classify, CST_FRAGMENT, CST_WHOLE
30#include "dict.h" // for Dict
31#include "errcode.h" // for ASSERT_HOST
32#include "featdefs.h" // for CharNormDesc
33#include "float2int.h" // for BASELINE_Y_SHIFT
34#include "fontinfo.h" // for ScoredFont, FontSet
35#include "intfx.h" // for BlobToTrainingSample, INT_FX_RESULT_S...
36#include "intmatcher.h" // for CP_RESULT_STRUCT, IntegerMatcher
37#include "intproto.h" // for INT_FEATURE_STRUCT, (anonymous), Clas...
38#include "matchdefs.h" // for CLASS_ID, FEATURE_ID, PROTO_ID, NO_PROTO
39#include "mfoutline.h" // for baseline, character, MF_SCALE_FACTOR
40#include "normalis.h" // for DENORM, kBlnBaselineOffset, kBlnXHeight
41#include "normfeat.h" // for ActualOutlineLength, CharNormLength
42#include "ocrfeatures.h" // for FEATURE_STRUCT, FEATURE
43#include "oldlist.h" // for push, delete_d
44#include "outfeat.h" // for OutlineFeatDir, OutlineFeatLength
45#include "pageres.h" // for WERD_RES
46#include "params.h" // for IntParam, BoolParam, DoubleParam, Str...
47#include "picofeat.h" // for PicoFeatDir, PicoFeatX, PicoFeatY
48#include "protos.h" // for PROTO_STRUCT, FillABC
49#include "ratngs.h" // for BLOB_CHOICE_IT, BLOB_CHOICE_LIST, BLO...
50#include "rect.h" // for TBOX
51#include "scrollview.h" // for ScrollView, ScrollView::BROWN, Scroll...
52#include "seam.h" // for SEAM
53#include "shapeclassifier.h" // for ShapeClassifier
54#include "shapetable.h" // for UnicharRating, ShapeTable, Shape, Uni...
55#include "tessclassifier.h" // for TessClassifier
56#include "tessdatamanager.h" // for TessdataManager, TESSDATA_INTTEMP
57#include "tprintf.h" // for tprintf
58#include "trainingsample.h" // for TrainingSample
59#include "unicharset.h" // for UNICHARSET, CHAR_FRAGMENT, UNICHAR_SPACE
60#include "unicity_table.h" // for UnicityTable
61
62#include <tesseract/unichar.h> // for UNICHAR_ID, INVALID_UNICHAR_ID
63#include "helpers.h" // for IntCastRounded, ClipToRange
64#include "serialis.h" // for TFile
65
66#include <algorithm> // for max, min
67#include <cassert> // for assert
68#include <cmath> // for fabs
69#include <cstdint> // for INT32_MAX, UINT8_MAX
70#include <cstdio> // for fflush, fclose, fopen, stdout, FILE
71#include <cstring> // for strstr, memset, strcmp
72
73namespace tesseract {
74
75// TODO: The parameter classify_enable_adaptive_matcher can cause
76// a segmentation fault if it is set to false (issue #256),
77// so override it here.
78#define classify_enable_adaptive_matcher true
79
80#define ADAPT_TEMPLATE_SUFFIX ".a"
81
82#define MAX_MATCHES 10
83#define UNLIKELY_NUM_FEAT 200
84#define NO_DEBUG 0
85#define MAX_ADAPTABLE_WERD_SIZE 40
86
87#define ADAPTABLE_WERD_ADJUSTMENT (0.05)
88
89#define Y_DIM_OFFSET (Y_SHIFT - BASELINE_Y_SHIFT)
90
91#define WORST_POSSIBLE_RATING (0.0f)
92
93struct ADAPT_RESULTS {
94 int32_t BlobLength;
95 bool HasNonfragment;
96 UNICHAR_ID best_unichar_id;
97 int best_match_index;
98 float best_rating;
99 std::vector<UnicharRating> match;
100 std::vector<CP_RESULT_STRUCT> CPResults;
101
104 inline void Initialize() {
105 BlobLength = INT32_MAX;
106 HasNonfragment = false;
107 ComputeBest();
108 }
109 // Computes best_unichar_id, best_match_index and best_rating.
110 void ComputeBest() {
111 best_unichar_id = INVALID_UNICHAR_ID;
112 best_match_index = -1;
113 best_rating = -1.0f;
114 for (unsigned i = 0; i < match.size(); ++i) {
115 if (match[i].rating > best_rating) {
116 best_rating = match[i].rating;
117 best_unichar_id = match[i].unichar_id;
118 best_match_index = i;
119 }
120 }
121 }
122};
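// Illustrative sketch, not part of the original file: the intended lifecycle of
// the ADAPT_RESULTS accumulator defined above, as the matchers below use it.
// The unichar id and rating values here are hypothetical.
[[maybe_unused]] static void AdaptResultsUsageSketch() {
  ADAPT_RESULTS results;
  results.Initialize(); // BlobLength = INT32_MAX, no best match yet.
  results.match.push_back(UnicharRating(0 /* hypothetical id */, 0.93f));
  results.ComputeBest(); // best_rating == 0.93f, best_match_index == 0.
}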
123
124struct PROTO_KEY {
125 ADAPT_TEMPLATES_STRUCT *Templates;
126 CLASS_ID ClassId;
127 int ConfigId;
128};
129
130// Comparison function for sorting ratings in descending order; ties are broken by ascending unichar_id.
131static bool SortDescendingRating(const UnicharRating &a, const UnicharRating &b) {
132 if (a.rating != b.rating) {
133 return a.rating > b.rating;
134 } else {
135 return a.unichar_id < b.unichar_id;
136 }
137}
138
139/*-----------------------------------------------------------------------------
140 Private Macros
141-----------------------------------------------------------------------------*/
142inline bool MarginalMatch(float confidence, float matcher_great_threshold) {
143 return (1.0f - confidence) > matcher_great_threshold;
144}
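// Worked example for MarginalMatch() (numbers are hypothetical): a match with
// confidence 0.80 has a distance of 1.0 - 0.80 = 0.20; with a threshold of,
// say, 0.125 that distance exceeds the threshold, so the match counts as
// marginal. DoAdaptiveMatch() below uses this test (with
// matcher_reliable_adaptive_result as the threshold) to decide whether to fall
// back to the character-normalized classifier.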
145
146/*-----------------------------------------------------------------------------
147 Private Function Prototypes
148-----------------------------------------------------------------------------*/
149// Returns the index of the given id in results, if present, or the size of the
150// vector (index it will go at) if not present.
151static unsigned FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) {
152 for (unsigned i = 0; i < results.match.size(); i++) {
153 if (results.match[i].unichar_id == id) {
154 return i;
155 }
156 }
157 return results.match.size();
158}
159
160// Returns the current rating for a unichar id if we have rated it, defaulting
161// to WORST_POSSIBLE_RATING.
162static float ScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS &results) {
163 unsigned index = FindScoredUnichar(id, results);
164 if (index >= results.match.size()) {
165 return WORST_POSSIBLE_RATING;
166 }
167 return results.match[index].rating;
168}
169
170void InitMatcherRatings(float *Rating);
171
172int MakeTempProtoPerm(void *item1, void *item2);
173
174void SetAdaptiveThreshold(float Threshold);
175
176/*-----------------------------------------------------------------------------
177 Public Code
178-----------------------------------------------------------------------------*/
202void Classify::AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices) {
203 assert(Choices != nullptr);
204 auto *Results = new ADAPT_RESULTS;
205 Results->Initialize();
206
207 ASSERT_HOST(AdaptedTemplates != nullptr);
208
209 DoAdaptiveMatch(Blob, Results);
210
211 RemoveBadMatches(Results);
212 std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating);
213 RemoveExtraPuncs(Results);
214 Results->ComputeBest();
215 ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results, Choices);
216
217 // TODO(rays) Move to before ConvertMatchesToChoices!
218 if (LargeSpeckle(*Blob) || Choices->empty()) {
219 AddLargeSpeckleTo(Results->BlobLength, Choices);
220 }
221
222 if (matcher_debug_level >= 1) {
223 tprintf("AD Matches = ");
224 PrintAdaptiveMatchResults(*Results);
225 }
226
227#ifndef GRAPHICS_DISABLED
228 if (classify_enable_adaptive_debugger) {
229 DebugAdaptiveClassifier(Blob, Results);
230 }
231#endif
232
233 delete Results;
234} /* AdaptiveClassifier */
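// Illustrative sketch, not part of the original file: minimal calling code for
// AdaptiveClassifier() above. It assumes a Classify instance and a normalized
// TBLOB are already available from the recognition pipeline; the function and
// parameter names here are hypothetical.
[[maybe_unused]] static void AdaptiveClassifierUsageSketch(Classify *classify, TBLOB *blob) {
  BLOB_CHOICE_LIST choices; // Receives the classes, best first.
  classify->AdaptiveClassifier(blob, &choices);
  BLOB_CHOICE_IT it(&choices);
  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
    const BLOB_CHOICE *choice = it.data();
    tprintf("id=%d rating=%g certainty=%g\n", choice->unichar_id(), choice->rating(),
            choice->certainty());
  }
}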
235
236#ifndef GRAPHICS_DISABLED
237
238// If *win is nullptr, sets it to a new ScrollView() object with title msg.
239// Clears the window and draws baselines.
240void Classify::RefreshDebugWindow(ScrollView **win, const char *msg, int y_offset,
241 const TBOX &wbox) {
242 const int kSampleSpaceWidth = 500;
243 if (*win == nullptr) {
244 *win = new ScrollView(msg, 100, y_offset, kSampleSpaceWidth * 2, 200, kSampleSpaceWidth * 2,
245 200, true);
246 }
247 (*win)->Clear();
248 (*win)->Pen(64, 64, 64);
249 (*win)->Line(-kSampleSpaceWidth, kBlnBaselineOffset, kSampleSpaceWidth, kBlnBaselineOffset);
250 (*win)->Line(-kSampleSpaceWidth, kBlnXHeight + kBlnBaselineOffset, kSampleSpaceWidth,
251 kBlnXHeight + kBlnBaselineOffset);
252 (*win)->ZoomToRectangle(wbox.left(), wbox.top(), wbox.right(), wbox.bottom());
253}
254
255#endif // !GRAPHICS_DISABLED
256
257// Learns the given word using its chopped_word, seam_array, denorm,
258// box_word, best_state, and correct_text to learn both correctly and
259// incorrectly segmented blobs. If fontname is not nullptr, then LearnBlob
260// is called and the data will be saved in an internal buffer.
261// Otherwise AdaptToBlob is called for adaption within a document.
262void Classify::LearnWord(const char *fontname, WERD_RES *word) {
263 int word_len = word->correct_text.size();
264 if (word_len == 0) {
265 return;
266 }
267
268 float *thresholds = nullptr;
269 if (fontname == nullptr) {
270 // Adaption mode.
271 if (!EnableLearning || word->best_choice == nullptr) {
272 return; // Can't or won't adapt.
273 }
274
275 if (classify_learning_debug_level >= 1) {
276 tprintf("\n\nAdapting to word = %s\n", word->best_choice->debug_string().c_str());
277 }
278 thresholds = new float[word_len];
279 word->ComputeAdaptionThresholds(getDict().certainty_scale, matcher_perfect_threshold,
280 matcher_good_threshold, matcher_rating_margin, thresholds);
281 }
282 int start_blob = 0;
283
284#ifndef GRAPHICS_DISABLED
285 if (classify_debug_character_fragments) {
286 if (learn_fragmented_word_debug_win_ != nullptr) {
287 learn_fragmented_word_debug_win_->Wait();
288 }
289 RefreshDebugWindow(&learn_fragments_debug_win_, "LearnPieces", 400,
290 word->chopped_word->bounding_box());
291 RefreshDebugWindow(&learn_fragmented_word_debug_win_, "LearnWord", 200,
292 word->chopped_word->bounding_box());
293 word->chopped_word->plot(learn_fragmented_word_debug_win_);
294 ScrollView::Update();
295 }
296#endif // !GRAPHICS_DISABLED
297
298 for (int ch = 0; ch < word_len; ++ch) {
299 if (classify_debug_character_fragments) {
300 tprintf("\nLearning %s\n", word->correct_text[ch].c_str());
301 }
302 if (word->correct_text[ch].length() > 0) {
303 float threshold = thresholds != nullptr ? thresholds[ch] : 0.0f;
304
305 LearnPieces(fontname, start_blob, word->best_state[ch], threshold, CST_WHOLE,
306 word->correct_text[ch].c_str(), word);
307
308 if (word->best_state[ch] > 1 && !disable_character_fragments) {
309 // Check that the character breaks into meaningful fragments
310 // that each match a whole character with at least
311 // classify_character_fragments_garbage_certainty_threshold
312 bool garbage = false;
313 int frag;
314 for (frag = 0; frag < word->best_state[ch]; ++frag) {
315 TBLOB *frag_blob = word->chopped_word->blobs[start_blob + frag];
316 if (classify_character_fragments_garbage_certainty_threshold < 0) {
317 garbage |= LooksLikeGarbage(frag_blob);
318 }
319 }
320 // Learn the fragments.
321 if (!garbage) {
322 bool pieces_all_natural = word->PiecesAllNatural(start_blob, word->best_state[ch]);
323 if (pieces_all_natural || !prioritize_division) {
324 for (frag = 0; frag < word->best_state[ch]; ++frag) {
325 std::vector<std::string> tokens = split(word->correct_text[ch], ' ');
326
327 tokens[0] = CHAR_FRAGMENT::to_string(tokens[0].c_str(), frag, word->best_state[ch],
328 pieces_all_natural);
329
330 std::string full_string;
331 for (unsigned i = 0; i < tokens.size(); i++) {
332 full_string += tokens[i];
333 if (i != tokens.size() - 1) {
334 full_string += ' ';
335 }
336 }
337 LearnPieces(fontname, start_blob + frag, 1, threshold, CST_FRAGMENT,
338 full_string.c_str(), word);
339 }
340 }
341 }
342 }
343
344 // TODO(rays): re-enable this part of the code when we switch to the
345 // new classifier that needs to see examples of garbage.
346 /*
347if (word->best_state[ch] > 1) {
348 // If the next blob is good, make junk with the rightmost fragment.
349 if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
350 LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
351 word->best_state[ch + 1] + 1,
352 threshold, CST_IMPROPER, INVALID_UNICHAR, word);
353 }
354 // If the previous blob is good, make junk with the leftmost fragment.
355 if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
356 LearnPieces(fontname, start_blob - word->best_state[ch - 1],
357 word->best_state[ch - 1] + 1,
358 threshold, CST_IMPROPER, INVALID_UNICHAR, word);
359 }
360}
361// If the next blob is good, make a join with it.
362if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
363 std::string joined_text = word->correct_text[ch];
364 joined_text += word->correct_text[ch + 1];
365 LearnPieces(fontname, start_blob,
366 word->best_state[ch] + word->best_state[ch + 1],
367 threshold, CST_NGRAM, joined_text.c_str(), word);
368}
369*/
370 }
371 start_blob += word->best_state[ch];
372 }
373 delete[] thresholds;
374} // LearnWord.
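// Illustrative sketch, not part of the original file: the two ways LearnWord()
// above is typically driven. The word is assumed to carry correct_text and
// best_state from the normal pipeline; the font name is hypothetical.
[[maybe_unused]] static void LearnWordUsageSketch(Classify *classify, WERD_RES *word) {
  // Static training path: features are buffered internally via LearnBlob().
  classify->LearnWord("SomeFontName", word);
  // In-document adaptation path: blobs are fed to AdaptToChar() instead.
  classify->LearnWord(nullptr, word);
}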
375
376// Builds a blob of length fragments, from the word, starting at start,
377// and then learns it, as having the given correct_text.
378// If fontname is not nullptr, then LearnBlob is called and the data will be
379// saved in an internal buffer for static training.
380// Otherwise AdaptToBlob is called for adaption within a document.
381// threshold is a magic number required by AdaptToChar and generated by
382// ComputeAdaptionThresholds.
383// Although it can be partly inferred from the string, segmentation is
384// provided to explicitly clarify the character segmentation.
385void Classify::LearnPieces(const char *fontname, int start, int length, float threshold,
386 CharSegmentationType segmentation, const char *correct_text,
387 WERD_RES *word) {
388 // TODO(daria) Remove/modify this if/when we want
389 // to train and/or adapt to n-grams.
390 if (segmentation != CST_WHOLE && (segmentation != CST_FRAGMENT || disable_character_fragments)) {
391 return;
392 }
393
394 if (length > 1) {
395 SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start, start + length - 1);
396 }
397 TBLOB *blob = word->chopped_word->blobs[start];
398 // Rotate the blob if needed for classification.
399 TBLOB *rotated_blob = blob->ClassifyNormalizeIfNeeded();
400 if (rotated_blob == nullptr) {
401 rotated_blob = blob;
402 }
403
404#ifndef GRAPHICS_DISABLED
405 // Draw debug windows showing the blob that is being learned if needed.
406 if (strcmp(classify_learn_debug_str.c_str(), correct_text) == 0) {
407 RefreshDebugWindow(&learn_debug_win_, "LearnPieces", 600, word->chopped_word->bounding_box());
408 rotated_blob->plot(learn_debug_win_, ScrollView::GREEN, ScrollView::BROWN);
409 learn_debug_win_->Update();
410 learn_debug_win_->Wait();
411 }
412 if (classify_debug_character_fragments && segmentation == CST_FRAGMENT) {
413 ASSERT_HOST(learn_fragments_debug_win_ != nullptr); // set up in LearnWord
414 blob->plot(learn_fragments_debug_win_, ScrollView::BLUE, ScrollView::BROWN);
415 learn_fragments_debug_win_->Update();
416 }
417#endif // !GRAPHICS_DISABLED
418
419 if (fontname != nullptr) {
420 classify_norm_method.set_value(character); // force char norm spc 30/11/93
421 tess_bn_matching.set_value(false); // turn it off
422 tess_cn_matching.set_value(false);
423 DENORM bl_denorm, cn_denorm;
424 INT_FX_RESULT_STRUCT fx_info;
425 SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm, &bl_denorm, &cn_denorm, &fx_info);
426 LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
427 } else if (unicharset.contains_unichar(correct_text)) {
428 UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
429 int font_id = word->fontinfo != nullptr ? fontinfo_table_.get_index(*word->fontinfo) : 0;
430 if (classify_learning_debug_level >= 1) {
431 tprintf("Adapting to char = %s, thr= %g font_id= %d\n", unicharset.id_to_unichar(class_id),
432 threshold, font_id);
433 }
434 // If fontname is nullptr we are doing recognition
435 // (as opposed to training), so we must have already set word fonts.
436 AdaptToChar(rotated_blob, class_id, font_id, threshold, AdaptedTemplates);
437 if (BackupAdaptedTemplates != nullptr) {
438 // Adapt the backup templates too. They will be used if the primary gets
439 // too full.
440 AdaptToChar(rotated_blob, class_id, font_id, threshold, BackupAdaptedTemplates);
441 }
442 } else if (classify_debug_level >= 1) {
443 tprintf("Can't adapt to %s not in unicharset\n", correct_text);
444 }
445 if (rotated_blob != blob) {
446 delete rotated_blob;
447 }
448
449 SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start, start + length - 1);
450} // LearnPieces.
451
452/*---------------------------------------------------------------------------*/
464void Classify::EndAdaptiveClassifier() {
465 std::string Filename;
466 FILE *File;
467
468 if (AdaptedTemplates != nullptr && classify_enable_adaptive_matcher &&
469 classify_save_adapted_templates) {
470 Filename = imagefile + ADAPT_TEMPLATE_SUFFIX;
471 File = fopen(Filename.c_str(), "wb");
472 if (File == nullptr) {
473 tprintf("Unable to save adapted templates to %s!\n", Filename.c_str());
474 } else {
475 tprintf("\nSaving adapted templates to %s ...", Filename.c_str());
476 fflush(stdout);
477 WriteAdaptedTemplates(File, AdaptedTemplates);
478 tprintf("\n");
479 fclose(File);
480 }
481 }
482
483 delete AdaptedTemplates;
484 AdaptedTemplates = nullptr;
485 delete BackupAdaptedTemplates;
486 BackupAdaptedTemplates = nullptr;
487
488 if (PreTrainedTemplates != nullptr) {
489 delete PreTrainedTemplates;
490 PreTrainedTemplates = nullptr;
491 }
494 if (AllProtosOn != nullptr) {
495 FreeBitVector(AllProtosOn);
496 FreeBitVector(AllConfigsOn);
497 FreeBitVector(AllConfigsOff);
498 FreeBitVector(TempProtoMask);
499 AllProtosOn = nullptr;
500 AllConfigsOn = nullptr;
501 AllConfigsOff = nullptr;
502 TempProtoMask = nullptr;
503 }
504 delete shape_table_;
505 shape_table_ = nullptr;
506 delete static_classifier_;
507 static_classifier_ = nullptr;
508} /* EndAdaptiveClassifier */
509
510/*---------------------------------------------------------------------------*/
527void Classify::InitAdaptiveClassifier(TessdataManager *mgr) {
528 if (!classify_enable_adaptive_matcher) {
529 return;
530 }
531 if (AllProtosOn != nullptr) {
532 EndAdaptiveClassifier(); // Don't leak with multiple inits.
533 }
534
535 // If there is no language_data_path_prefix, the classifier will be
536 // adaptive only.
537 if (language_data_path_prefix.length() > 0 && mgr != nullptr) {
538 TFile fp;
541
542 if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) {
543 shape_table_ = new ShapeTable(unicharset);
544 if (!shape_table_->DeSerialize(&fp)) {
545 tprintf("Error loading shape table!\n");
546 delete shape_table_;
547 shape_table_ = nullptr;
548 }
549 }
550
552 ReadNewCutoffs(&fp, CharNormCutoffs);
553
556 static_classifier_ = new TessClassifier(false, this);
557 }
558
560
561 AllProtosOn = NewBitVector(MAX_NUM_PROTOS);
562 AllConfigsOn = NewBitVector(MAX_NUM_CONFIGS);
563 AllConfigsOff = NewBitVector(MAX_NUM_CONFIGS);
564 TempProtoMask = NewBitVector(MAX_NUM_PROTOS);
565 set_all_bits(AllProtosOn, WordsInVectorOfSize(MAX_NUM_PROTOS));
566 set_all_bits(AllConfigsOn, WordsInVectorOfSize(MAX_NUM_CONFIGS));
567 zero_all_bits(AllConfigsOff, WordsInVectorOfSize(MAX_NUM_CONFIGS));
568
569 for (uint16_t &BaselineCutoff : BaselineCutoffs) {
570 BaselineCutoff = 0;
571 }
572
573 if (classify_use_pre_adapted_templates) {
574 TFile fp;
575 std::string Filename = imagefile;
576 Filename += ADAPT_TEMPLATE_SUFFIX;
577 if (!fp.Open(Filename.c_str(), nullptr)) {
578 tprintf("\nUnable to read adapted templates from %s!\n", Filename.c_str());
579 } else {
580 tprintf("\nReading pre-adapted templates from %s ...\n", Filename.c_str());
581 fflush(stdout);
582 AdaptedTemplates = ReadAdaptedTemplates(&fp);
583 tprintf("\n");
585
586 for (unsigned i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) {
587 BaselineCutoffs[i] = CharNormCutoffs[i];
588 }
589 }
590 } else {
591 delete AdaptedTemplates;
592 AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(unicharset);
593 }
594} /* InitAdaptiveClassifier */
595
596void Classify::ResetAdaptiveClassifierInternal() {
597 if (classify_learning_debug_level > 0) {
598 tprintf("Resetting adaptive classifier (NumAdaptationsFailed=%d)\n", NumAdaptationsFailed);
599 }
600 delete AdaptedTemplates;
601 AdaptedTemplates = new ADAPT_TEMPLATES_STRUCT(getDict().getUnicharset());
602 delete BackupAdaptedTemplates;
603 BackupAdaptedTemplates = nullptr;
604 NumAdaptationsFailed = 0;
605}
606
607// If there are backup adapted templates, switches to those, otherwise resets
608// the main adaptive classifier (because it is full.)
609void Classify::SwitchAdaptiveClassifier() {
610 if (BackupAdaptedTemplates == nullptr) {
611 ResetAdaptiveClassifierInternal();
612 return;
613 }
614 if (classify_learning_debug_level > 0) {
615 tprintf("Switch to backup adaptive classifier (NumAdaptationsFailed=%d)\n",
616 NumAdaptationsFailed);
617 }
618 delete AdaptedTemplates;
619 AdaptedTemplates = BackupAdaptedTemplates;
620 BackupAdaptedTemplates = nullptr;
621 NumAdaptationsFailed = 0;
622}
623
624// Resets the backup adaptive classifier to empty.
628}
629
630/*---------------------------------------------------------------------------*/
647void Classify::SettupPass1() {
648 EnableLearning = classify_enable_learning;
649
651
652} /* SettupPass1 */
653
654/*---------------------------------------------------------------------------*/
663void Classify::SettupPass2() {
664 EnableLearning = false;
666
667} /* SettupPass2 */
668
669/*---------------------------------------------------------------------------*/
686void Classify::InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS_STRUCT *Class,
687 ADAPT_TEMPLATES_STRUCT *Templates) {
688 FEATURE_SET Features;
689 int Fid, Pid;
690 FEATURE Feature;
691 int NumFeatures;
692 PROTO_STRUCT *Proto;
693 INT_CLASS_STRUCT *IClass;
694 TEMP_CONFIG_STRUCT *Config;
695
696 classify_norm_method.set_value(baseline);
697 Features = ExtractOutlineFeatures(Blob);
698 NumFeatures = Features->NumFeatures;
699 if (NumFeatures > UNLIKELY_NUM_FEAT || NumFeatures <= 0) {
700 delete Features;
701 return;
702 }
703
704 Config = new TEMP_CONFIG_STRUCT(NumFeatures - 1, FontinfoId);
705 TempConfigFor(Class, 0) = Config;
706
707 /* this is a kludge to construct cutoffs for adapted templates */
708 if (Templates == AdaptedTemplates) {
709 BaselineCutoffs[ClassId] = CharNormCutoffs[ClassId];
710 }
711
712 IClass = ClassForClassId(Templates->Templates, ClassId);
713
714 for (Fid = 0; Fid < Features->NumFeatures; Fid++) {
715 Pid = AddIntProto(IClass);
716 assert(Pid != NO_PROTO);
717
718 Feature = Features->Features[Fid];
719 auto TempProto = new TEMP_PROTO_STRUCT;
720 Proto = &(TempProto->Proto);
721
722 /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
723 ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
724 instead of the -0.25 to 0.75 used in baseline normalization */
725 Proto->Angle = Feature->Params[OutlineFeatDir];
726 Proto->X = Feature->Params[OutlineFeatX];
727 Proto->Y = Feature->Params[OutlineFeatY] - Y_DIM_OFFSET;
728 Proto->Length = Feature->Params[OutlineFeatLength];
729 FillABC(Proto);
730
731 TempProto->ProtoId = Pid;
732 SET_BIT(Config->Protos, Pid);
733
734 ConvertProto(Proto, Pid, IClass);
735 AddProtoToProtoPruner(Proto, Pid, IClass, classify_learning_debug_level >= 2);
736
737 Class->TempProtos = push(Class->TempProtos, TempProto);
738 }
739 delete Features;
740
741 AddIntConfig(IClass);
742 ConvertConfig(AllProtosOn, 0, IClass);
743
744 if (classify_learning_debug_level >= 1) {
745 tprintf("Added new class '%s' with class id %d and %d protos.\n",
746 unicharset.id_to_unichar(ClassId), ClassId, NumFeatures);
747#ifndef GRAPHICS_DISABLED
748 if (classify_learning_debug_level > 1) {
749 DisplayAdaptedChar(Blob, IClass);
750 }
751#endif
752 }
753
754 if (IsEmptyAdaptedClass(Class)) {
755 (Templates->NumNonEmptyClasses)++;
756 }
757} /* InitAdaptedClass */
758
759/*---------------------------------------------------------------------------*/
778int Classify::GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures,
779 FEATURE_SET *FloatFeatures) {
780 FEATURE_SET Features;
781 int NumFeatures;
782
783 classify_norm_method.set_value(baseline);
784 Features = ExtractPicoFeatures(Blob);
785
786 NumFeatures = Features->NumFeatures;
787 if (NumFeatures == 0 || NumFeatures > UNLIKELY_NUM_FEAT) {
788 delete Features;
789 return 0;
790 }
791
792 ComputeIntFeatures(Features, IntFeatures);
793 *FloatFeatures = Features;
794
795 return NumFeatures;
796} /* GetAdaptiveFeatures */
797
798/*-----------------------------------------------------------------------------
799 Private Code
800-----------------------------------------------------------------------------*/
801/*---------------------------------------------------------------------------*/
811bool Classify::AdaptableWord(WERD_RES *word) {
812 if (word->best_choice == nullptr) {
813 return false;
814 }
815 auto BestChoiceLength = word->best_choice->length();
816 float adaptable_score = getDict().segment_penalty_dict_case_ok + ADAPTABLE_WERD_ADJUSTMENT;
817 return // rules that apply in general - simplest to compute first
818 BestChoiceLength > 0 && BestChoiceLength == word->rebuild_word->NumBlobs() &&
819 BestChoiceLength <= MAX_ADAPTABLE_WERD_SIZE &&
820 // This basically ensures that the word is at least a dictionary match
821 // (freq word, user word, system dawg word, etc).
822 // Since all the other adjustments will push the adjust factor higher
823 // than adaptable_score = 1.1 + 0.05 = 1.15, and since other flags
824 // already ensure that the word is a dictionary word, this check can
825 // at times be redundant.
826 word->best_choice->adjust_factor() <= adaptable_score &&
827 // Make sure that alternative choices are not dictionary words.
828 word->AlternativeChoiceAdjustmentsWorseThan(adaptable_score);
829}
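// Worked example for the adaptability test above: with
// segment_penalty_dict_case_ok at its assumed default of 1.1,
// adaptable_score = 1.1 + ADAPTABLE_WERD_ADJUSTMENT = 1.1 + 0.05 = 1.15.
// A best choice produced by a dictionary permuter keeps adjust_factor() at or
// below 1.15 and is eligible for adaptation, while a word whose adjustments
// pushed the factor above 1.15 (or that has a dictionary-quality alternative
// choice) is rejected.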
830
831/*---------------------------------------------------------------------------*/
843void Classify::AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold,
844 ADAPT_TEMPLATES_STRUCT *adaptive_templates) {
845 int NumFeatures;
846 INT_FEATURE_ARRAY IntFeatures;
847 UnicharRating int_result;
848 INT_CLASS_STRUCT *IClass;
849 ADAPT_CLASS_STRUCT *Class;
850 TEMP_CONFIG_STRUCT *TempConfig;
851 FEATURE_SET FloatFeatures;
852 int NewTempConfigId;
853
854 if (!LegalClassId(ClassId)) {
855 return;
856 }
857
858 int_result.unichar_id = ClassId;
859 Class = adaptive_templates->Class[ClassId];
860 assert(Class != nullptr);
861 if (IsEmptyAdaptedClass(Class)) {
862 InitAdaptedClass(Blob, ClassId, FontinfoId, Class, adaptive_templates);
863 } else {
864 IClass = ClassForClassId(adaptive_templates->Templates, ClassId);
865
866 NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures);
867 if (NumFeatures <= 0) {
868 return; // Features already freed by GetAdaptiveFeatures.
869 }
870
871 // Only match configs with the matching font.
872 BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS);
873 for (int cfg = 0; cfg < IClass->NumConfigs; ++cfg) {
874 if (GetFontinfoId(Class, cfg) == FontinfoId) {
875 SET_BIT(MatchingFontConfigs, cfg);
876 } else {
877 reset_bit(MatchingFontConfigs, cfg);
878 }
879 }
880 im_.Match(IClass, AllProtosOn, MatchingFontConfigs, NumFeatures, IntFeatures, &int_result,
881 classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows);
882 FreeBitVector(MatchingFontConfigs);
883
884 SetAdaptiveThreshold(Threshold);
885
886 if (1.0f - int_result.rating <= Threshold) {
887 if (ConfigIsPermanent(Class, int_result.config)) {
888 if (classify_learning_debug_level >= 1) {
889 tprintf("Found good match to perm config %d = %4.1f%%.\n", int_result.config,
890 int_result.rating * 100.0);
891 }
892 delete FloatFeatures;
893 return;
894 }
895
896 TempConfig = TempConfigFor(Class, int_result.config);
897 IncreaseConfidence(TempConfig);
898 if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) {
899 Class->MaxNumTimesSeen = TempConfig->NumTimesSeen;
900 }
901 if (classify_learning_debug_level >= 1) {
902 tprintf("Increasing reliability of temp config %d to %d.\n", int_result.config,
903 TempConfig->NumTimesSeen);
904 }
905
906 if (TempConfigReliable(ClassId, TempConfig)) {
907 MakePermanent(adaptive_templates, ClassId, int_result.config, Blob);
908 UpdateAmbigsGroup(ClassId, Blob);
909 }
910 } else {
911 if (classify_learning_debug_level >= 1) {
912 tprintf("Found poor match to temp config %d = %4.1f%%.\n", int_result.config,
913 int_result.rating * 100.0);
914#ifndef GRAPHICS_DISABLED
915 if (classify_learning_debug_level > 2) {
916 DisplayAdaptedChar(Blob, IClass);
917 }
918#endif
919 }
920 NewTempConfigId = MakeNewTemporaryConfig(adaptive_templates, ClassId, FontinfoId, NumFeatures,
921 IntFeatures, FloatFeatures);
922 if (NewTempConfigId >= 0 &&
923 TempConfigReliable(ClassId, TempConfigFor(Class, NewTempConfigId))) {
924 MakePermanent(adaptive_templates, ClassId, NewTempConfigId, Blob);
925 UpdateAmbigsGroup(ClassId, Blob);
926 }
927
928#ifndef GRAPHICS_DISABLED
929 if (classify_learning_debug_level > 1) {
930 DisplayAdaptedChar(Blob, IClass);
931 }
932#endif
933 }
934 delete FloatFeatures;
935 }
936} /* AdaptToChar */
937
938#ifndef GRAPHICS_DISABLED
939
941 INT_FX_RESULT_STRUCT fx_info;
942 std::vector<INT_FEATURE_STRUCT> bl_features;
943 TrainingSample *sample =
944 BlobToTrainingSample(*blob, classify_nonlinear_norm, &fx_info, &bl_features);
945 if (sample == nullptr) {
946 return;
947 }
948
949 UnicharRating int_result;
950 im_.Match(int_class, AllProtosOn, AllConfigsOn, bl_features.size(), &bl_features[0], &int_result,
951 classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows);
952 tprintf("Best match to temp config %d = %4.1f%%.\n", int_result.config,
953 int_result.rating * 100.0);
954 if (classify_learning_debug_level >= 2) {
955 uint32_t ConfigMask;
956 ConfigMask = 1 << int_result.config;
958 im_.Match(int_class, AllProtosOn, static_cast<BIT_VECTOR>(&ConfigMask), bl_features.size(),
959 &bl_features[0], &int_result, classify_adapt_feature_threshold, 6 | 0x19,
960 matcher_debug_separate_windows);
962 }
963
964 delete sample;
965}
966
967#endif
968
986void Classify::AddNewResult(const UnicharRating &new_result, ADAPT_RESULTS *results) {
987 auto old_match = FindScoredUnichar(new_result.unichar_id, *results);
988
989 if (new_result.rating + matcher_bad_match_pad < results->best_rating ||
990 (old_match < results->match.size() &&
991 new_result.rating <= results->match[old_match].rating)) {
992 return; // New one not good enough.
993 }
994
995 if (!unicharset.get_fragment(new_result.unichar_id)) {
996 results->HasNonfragment = true;
997 }
998
999 if (old_match < results->match.size()) {
1000 results->match[old_match].rating = new_result.rating;
1001 } else {
1002 results->match.push_back(new_result);
1003 }
1004
1005 if (new_result.rating > results->best_rating &&
1006 // Ensure that fragments do not affect best rating, class and config.
1007 // This is needed so that at least one non-fragmented character is
1008 // always present in the results.
1009 // TODO(daria): verify that this helps accuracy and does not
1010 // hurt performance.
1011 !unicharset.get_fragment(new_result.unichar_id)) {
1012 results->best_match_index = old_match;
1013 results->best_rating = new_result.rating;
1014 results->best_unichar_id = new_result.unichar_id;
1015 }
1016} /* AddNewResult */
1017
1018/*---------------------------------------------------------------------------*/
1037void Classify::AmbigClassifier(const std::vector<INT_FEATURE_STRUCT> &int_features,
1038 const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob,
1039 INT_TEMPLATES_STRUCT *templates, ADAPT_CLASS_STRUCT **classes,
1040 UNICHAR_ID *ambiguities, ADAPT_RESULTS *results) {
1041 if (int_features.empty()) {
1042 return;
1043 }
1044 auto *CharNormArray = new uint8_t[unicharset.size()];
1045 UnicharRating int_result;
1046
1047 results->BlobLength = GetCharNormFeature(fx_info, templates, nullptr, CharNormArray);
1048 bool debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1049 if (debug) {
1050 tprintf("AM Matches = ");
1051 }
1052
1053 int top = blob->bounding_box().top();
1054 int bottom = blob->bounding_box().bottom();
1055 while (*ambiguities >= 0) {
1056 CLASS_ID class_id = *ambiguities;
1057
1058 int_result.unichar_id = class_id;
1059 im_.Match(ClassForClassId(templates, class_id), AllProtosOn, AllConfigsOn, int_features.size(),
1060 &int_features[0], &int_result, classify_adapt_feature_threshold, NO_DEBUG,
1061 matcher_debug_separate_windows);
1062
1063 ExpandShapesAndApplyCorrections(nullptr, debug, class_id, bottom, top, 0, results->BlobLength,
1064 classify_integer_matcher_multiplier, CharNormArray, &int_result,
1065 results);
1066 ambiguities++;
1067 }
1068 delete[] CharNormArray;
1069} /* AmbigClassifier */
1070
1071/*---------------------------------------------------------------------------*/
1074void Classify::MasterMatcher(INT_TEMPLATES_STRUCT *templates, int16_t num_features,
1075 const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors,
1076 ADAPT_CLASS_STRUCT **classes, int debug, int matcher_multiplier,
1077 const TBOX &blob_box, const std::vector<CP_RESULT_STRUCT> &results,
1078 ADAPT_RESULTS *final_results) {
1079 int top = blob_box.top();
1080 int bottom = blob_box.bottom();
1081 UnicharRating int_result;
1082 for (auto &&result : results) {
1083 CLASS_ID class_id = result.Class;
1084 BIT_VECTOR protos = classes != nullptr ? classes[class_id]->PermProtos : AllProtosOn;
1085 BIT_VECTOR configs = classes != nullptr ? classes[class_id]->PermConfigs : AllConfigsOn;
1086
1087 int_result.unichar_id = class_id;
1088 im_.Match(ClassForClassId(templates, class_id), protos, configs, num_features, features,
1089 &int_result, classify_adapt_feature_threshold, debug, matcher_debug_separate_windows);
1090 bool is_debug = matcher_debug_level >= 2 || classify_debug_level > 1;
1091 ExpandShapesAndApplyCorrections(classes, is_debug, class_id, bottom, top, result.Rating,
1092 final_results->BlobLength, matcher_multiplier, norm_factors,
1093 &int_result, final_results);
1094 }
1095}
1096
1097// Converts configs to fonts, and if the result is not adapted, and a
1098// shape_table_ is present, the shape is expanded to include all
1099// unichar_ids represented, before applying a set of corrections to the
1100// distance rating in int_result, (see ComputeCorrectedRating.)
1101// The results are added to the final_results output.
1102void Classify::ExpandShapesAndApplyCorrections(ADAPT_CLASS_STRUCT **classes, bool debug, int class_id,
1103 int bottom, int top, float cp_rating,
1104 int blob_length, int matcher_multiplier,
1105 const uint8_t *cn_factors, UnicharRating *int_result,
1106 ADAPT_RESULTS *final_results) {
1107 if (classes != nullptr) {
1108 // Adapted result. Convert configs to fontinfo_ids.
1109 int_result->adapted = true;
1110 for (auto &font : int_result->fonts) {
1111 font.fontinfo_id = GetFontinfoId(classes[class_id], font.fontinfo_id);
1112 }
1113 } else {
1114 // Pre-trained result. Map fonts using font_sets_.
1115 int_result->adapted = false;
1116 for (auto &font : int_result->fonts) {
1117 font.fontinfo_id = ClassAndConfigIDToFontOrShapeID(class_id, font.fontinfo_id);
1118 }
1119 if (shape_table_ != nullptr) {
1120 // Two possible cases:
1121 // 1. Flat shapetable. All unichar-ids of the shapes referenced by
1122 // int_result->fonts are the same. In this case build a new vector of
1123 // mapped fonts and replace the fonts in int_result.
1124 // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced
1125 // by int_result. In this case, build a vector of UnicharRating to
1126 // gather together different font-ids for each unichar. Also covers case 1.
1127 std::vector<UnicharRating> mapped_results;
1128 for (auto &f : int_result->fonts) {
1129 int shape_id = f.fontinfo_id;
1130 const Shape &shape = shape_table_->GetShape(shape_id);
1131 for (int c = 0; c < shape.size(); ++c) {
1132 int unichar_id = shape[c].unichar_id;
1133 if (!unicharset.get_enabled(unichar_id)) {
1134 continue;
1135 }
1136 // Find the mapped_result for unichar_id.
1137 unsigned r = 0;
1138 for (r = 0; r < mapped_results.size() && mapped_results[r].unichar_id != unichar_id;
1139 ++r) {
1140 }
1141 if (r == mapped_results.size()) {
1142 mapped_results.push_back(*int_result);
1143 mapped_results[r].unichar_id = unichar_id;
1144 mapped_results[r].fonts.clear();
1145 }
1146 for (int font_id : shape[c].font_ids) {
1147 mapped_results[r].fonts.emplace_back(font_id, f.score);
1148 }
1149 }
1150 }
1151 for (auto &m : mapped_results) {
1152 m.rating = ComputeCorrectedRating(debug, m.unichar_id, cp_rating, int_result->rating,
1153 int_result->feature_misses, bottom, top, blob_length,
1154 matcher_multiplier, cn_factors);
1155 AddNewResult(m, final_results);
1156 }
1157 return;
1158 }
1159 }
1160 if (unicharset.get_enabled(class_id)) {
1161 int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating, int_result->rating,
1162 int_result->feature_misses, bottom, top,
1163 blob_length, matcher_multiplier, cn_factors);
1164 AddNewResult(*int_result, final_results);
1165 }
1166}
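// Illustrative example for the shape expansion above (shape contents are
// hypothetical): if a pre-trained result maps to a shape containing the
// unichars {'1', 'l', 'I'} with font ids {3, 7}, the single UnicharRating is
// expanded into three UnicharRatings, one per enabled unichar, each carrying
// both ScoredFonts (3 and 7) at the original score, and each is then re-rated
// by ComputeCorrectedRating() before being added to final_results.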
1167
1168// Applies a set of corrections to the confidence im_rating,
1169// including the cn_correction, miss penalty and additional penalty
1170// for non-alnums being vertical misfits. Returns the corrected confidence.
1171double Classify::ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating,
1172 double im_rating, int feature_misses, int bottom, int top,
1173 int blob_length, int matcher_multiplier,
1174 const uint8_t *cn_factors) {
1175 // Compute class feature corrections.
1176 double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length, cn_factors[unichar_id],
1177 matcher_multiplier);
1178 double miss_penalty = tessedit_class_miss_scale * feature_misses;
1179 double vertical_penalty = 0.0;
1180 // Penalize non-alnums for being vertical misfits.
1181 if (!unicharset.get_isalpha(unichar_id) && !unicharset.get_isdigit(unichar_id) &&
1182 cn_factors[unichar_id] != 0 && classify_misfit_junk_penalty > 0.0) {
1183 int min_bottom, max_bottom, min_top, max_top;
1184 unicharset.get_top_bottom(unichar_id, &min_bottom, &max_bottom, &min_top, &max_top);
1185 if (debug) {
1186 tprintf("top=%d, vs [%d, %d], bottom=%d, vs [%d, %d]\n", top, min_top, max_top, bottom,
1187 min_bottom, max_bottom);
1188 }
1189 if (top < min_top || top > max_top || bottom < min_bottom || bottom > max_bottom) {
1190 vertical_penalty = classify_misfit_junk_penalty;
1191 }
1192 }
1193 double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty);
1194 if (result < WORST_POSSIBLE_RATING) {
1195 result = WORST_POSSIBLE_RATING;
1196 }
1197 if (debug) {
1198 tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n",
1199 unicharset.id_to_unichar(unichar_id), result * 100.0, cp_rating * 100.0,
1200 (1.0 - im_rating) * 100.0, (cn_corrected - (1.0 - im_rating)) * 100.0,
1201 cn_factors[unichar_id], miss_penalty * 100.0, vertical_penalty * 100.0);
1202 }
1203 return result;
1204}
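// Worked example for ComputeCorrectedRating() (numbers are hypothetical): an
// integer-matcher rating of im_rating = 0.90 corresponds to a distance of
// 1.0 - 0.90 = 0.10. Suppose the character-normalization correction raises it
// to cn_corrected = 0.12, missed features contribute miss_penalty = 0.01, and
// there is no vertical misfit (vertical_penalty = 0.0). The corrected
// confidence is then 1.0 - (0.12 + 0.01 + 0.0) = 0.87, clipped below at
// WORST_POSSIBLE_RATING (0.0f).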
1205
1206/*---------------------------------------------------------------------------*/
1224UNICHAR_ID *Classify::BaselineClassifier(TBLOB *Blob,
1225 const std::vector<INT_FEATURE_STRUCT> &int_features,
1226 const INT_FX_RESULT_STRUCT &fx_info,
1227 ADAPT_TEMPLATES_STRUCT *Templates, ADAPT_RESULTS *Results) {
1228 if (int_features.empty()) {
1229 return nullptr;
1230 }
1231 auto *CharNormArray = new uint8_t[unicharset.size()];
1232 ClearCharNormArray(CharNormArray);
1233
1235 PruneClasses(Templates->Templates, int_features.size(), -1, &int_features[0], CharNormArray,
1236 BaselineCutoffs, &Results->CPResults);
1237
1238 if (matcher_debug_level >= 2 || classify_debug_level > 1) {
1239 tprintf("BL Matches = ");
1240 }
1241
1242 MasterMatcher(Templates->Templates, int_features.size(), &int_features[0], CharNormArray,
1243 Templates->Class, matcher_debug_flags, 0, Blob->bounding_box(), Results->CPResults,
1244 Results);
1245
1246 delete[] CharNormArray;
1247 CLASS_ID ClassId = Results->best_unichar_id;
1248 if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0) {
1249 return nullptr;
1250 }
1251
1252 return Templates->Class[ClassId]
1253 ->Config[Results->match[Results->best_match_index].config]
1254 .Perm->Ambigs;
1255} /* BaselineClassifier */
1256
1257/*---------------------------------------------------------------------------*/
1273int Classify::CharNormClassifier(TBLOB *blob, const TrainingSample &sample,
1274 ADAPT_RESULTS *adapt_results) {
1275 // This is the length that is used for scaling ratings vs certainty.
1277 std::vector<UnicharRating> unichar_results;
1278 static_classifier_->UnicharClassifySample(sample, blob->denorm().pix(), 0, -1, &unichar_results);
1279 // Convert results to the format used internally by AdaptiveClassifier.
1280 for (auto &r : unichar_results) {
1281 AddNewResult(r, adapt_results);
1282 }
1283 return sample.num_features();
1284} /* CharNormClassifier */
1285
1286// As CharNormClassifier, but operates on a TrainingSample and outputs to
1287// a vector of ShapeRating without conversion to classes.
1288int Classify::CharNormTrainingSample(bool pruner_only, int keep_this, const TrainingSample &sample,
1289 std::vector<UnicharRating> *results) {
1290 results->clear();
1291 std::unique_ptr<ADAPT_RESULTS> adapt_results(new ADAPT_RESULTS());
1292 adapt_results->Initialize();
1293 // Compute the bounding box of the features.
1294 uint32_t num_features = sample.num_features();
1295 // Only the top and bottom of the blob_box are used by MasterMatcher, so
1296 // fabricate right and left using top and bottom.
1297 TBOX blob_box(sample.geo_feature(GeoBottom), sample.geo_feature(GeoBottom),
1298 sample.geo_feature(GeoTop), sample.geo_feature(GeoTop));
1299 // Compute the char_norm_array from the saved cn_feature.
1300 FEATURE norm_feature = sample.GetCNFeature();
1301 std::vector<uint8_t> char_norm_array(unicharset.size());
1302 auto num_pruner_classes = std::max(static_cast<unsigned>(unicharset.size()), PreTrainedTemplates->NumClasses);
1303 std::vector<uint8_t> pruner_norm_array(num_pruner_classes);
1304 adapt_results->BlobLength = static_cast<int>(ActualOutlineLength(norm_feature) * 20 + 0.5f);
1305 ComputeCharNormArrays(norm_feature, PreTrainedTemplates, &char_norm_array[0], &pruner_norm_array[0]);
1306
1307 PruneClasses(PreTrainedTemplates, num_features, keep_this, sample.features(), &pruner_norm_array[0],
1308 shape_table_ != nullptr ? &shapetable_cutoffs_[0] : CharNormCutoffs,
1309 &adapt_results->CPResults);
1310 if (keep_this >= 0) {
1311 adapt_results->CPResults[0].Class = keep_this;
1312 adapt_results->CPResults.resize(1);
1313 }
1314 if (pruner_only) {
1315 // Convert pruner results to output format.
1316 for (auto &it : adapt_results->CPResults) {
1317 int class_id = it.Class;
1318 results->push_back(UnicharRating(class_id, 1.0f - it.Rating));
1319 }
1320 } else {
1321 MasterMatcher(PreTrainedTemplates, num_features, sample.features(), &char_norm_array[0], nullptr,
1322 matcher_debug_flags, classify_integer_matcher_multiplier, blob_box,
1323 adapt_results->CPResults, adapt_results.get());
1324 // Convert master matcher results to output format.
1325 for (auto &i : adapt_results->match) {
1326 results->push_back(i);
1327 }
1328 if (results->size() > 1) {
1329 std::sort(results->begin(), results->end(), SortDescendingRating);
1330 }
1331 }
1332 return num_features;
1333} /* CharNormTrainingSample */
1334
1335/*---------------------------------------------------------------------------*/
1347void Classify::ClassifyAsNoise(ADAPT_RESULTS *results) {
1348 float rating = results->BlobLength / matcher_avg_noise_size;
1349 rating *= rating;
1350 rating /= 1 + rating;
1351
1352 AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results);
1353} /* ClassifyAsNoise */
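// Worked example for ClassifyAsNoise() (numbers are hypothetical): with
// BlobLength = 24 and matcher_avg_noise_size = 12, rating = (24 / 12)^2 = 4,
// then 4 / (1 + 4) = 0.8, so UNICHAR_SPACE is added with confidence
// 1.0 - 0.8 = 0.2; the larger the blob relative to typical speckle, the less
// plausible the "noise" answer becomes.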
1354
1361void Classify::ConvertMatchesToChoices(const DENORM &denorm, const TBOX &box,
1362 ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices) {
1363 assert(Choices != nullptr);
1364 float Rating;
1365 float Certainty;
1366 BLOB_CHOICE_IT temp_it;
1367 bool contains_nonfrag = false;
1368 temp_it.set_to_list(Choices);
1369 int choices_length = 0;
1370 // With no shape_table_ maintain the previous MAX_MATCHES as the maximum
1371 // number of returned results, but with a shape_table_ we want to have room
1372 // for at least the biggest shape (which might contain hundreds of Indic
1373 // grapheme fragments) and more, so use double the size of the biggest shape
1374 // if that is more than the default.
1375 int max_matches = MAX_MATCHES;
1376 if (shape_table_ != nullptr) {
1377 max_matches = shape_table_->MaxNumUnichars() * 2;
1378 if (max_matches < MAX_MATCHES) {
1379 max_matches = MAX_MATCHES;
1380 }
1381 }
1382
1383 float best_certainty = -FLT_MAX;
1384 for (auto &it : Results->match) {
1385 const UnicharRating &result = it;
1386 bool adapted = result.adapted;
1387 bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != nullptr);
1388 if (temp_it.length() + 1 == max_matches && !contains_nonfrag && current_is_frag) {
1389 continue; // look for a non-fragmented character to fill the
1390 // last spot in Choices if only fragments are present
1391 }
1392 // BlobLength can never be legally 0, this means recognition failed.
1393 // But we must return a classification result because some invoking
1394 // functions (chopper/permuter) do not anticipate a null blob choice.
1395 // So we need to assign a poor, but not infinitely bad score.
1396 if (Results->BlobLength == 0) {
1397 Certainty = -20;
1398 Rating = 100; // should be -certainty * real_blob_length
1399 } else {
1400 Rating = Certainty = (1.0f - result.rating);
1401 Rating *= rating_scale * Results->BlobLength;
1402 Certainty *= -(getDict().certainty_scale);
1403 }
1404 // Adapted results, by their very nature, should have good certainty.
1405 // Those that don't are at best misleading, and often lead to errors,
1406 // so don't accept adapted results that are too far behind the best result,
1407 // whether adapted or static.
1408 // TODO(rays) find some way of automatically tuning these constants.
1409 if (Certainty > best_certainty) {
1410 best_certainty = std::min(Certainty, static_cast<float>(classify_adapted_pruning_threshold));
1411 } else if (adapted && Certainty / classify_adapted_pruning_factor < best_certainty) {
1412 continue; // Don't accept bad adapted results.
1413 }
1414
1415 float min_xheight, max_xheight, yshift;
1416 denorm.XHeightRange(result.unichar_id, unicharset, box, &min_xheight, &max_xheight, &yshift);
1417 auto *choice = new BLOB_CHOICE(
1418 result.unichar_id, Rating, Certainty, unicharset.get_script(result.unichar_id), min_xheight,
1419 max_xheight, yshift, adapted ? BCC_ADAPTED_CLASSIFIER : BCC_STATIC_CLASSIFIER);
1420 choice->set_fonts(result.fonts);
1421 temp_it.add_to_end(choice);
1422 contains_nonfrag |= !current_is_frag; // update contains_nonfrag
1423 choices_length++;
1424 if (choices_length >= max_matches) {
1425 break;
1426 }
1427 }
1428 Results->match.resize(choices_length);
1429} // ConvertMatchesToChoices
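// Worked example for the scaling above (numbers are hypothetical): a match
// with result.rating = 0.8 on a blob with BlobLength = 10 starts from a
// confidence of 1.0 - 0.8 = 0.2, giving
//   Rating    = 0.2 * rating_scale * 10    (a cost: lower is better)
//   Certainty = 0.2 * -certainty_scale     (negative: closer to 0 is better)
// so stronger internal matches produce cheaper Ratings and less negative
// Certainties in the emitted BLOB_CHOICEs.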
1430
1431/*---------------------------------------------------------------------------*/
1432#ifndef GRAPHICS_DISABLED
1440void Classify::DebugAdaptiveClassifier(TBLOB *blob, ADAPT_RESULTS *Results) {
1441 if (static_classifier_ == nullptr) {
1442 return;
1443 }
1444 INT_FX_RESULT_STRUCT fx_info;
1445 std::vector<INT_FEATURE_STRUCT> bl_features;
1446 TrainingSample *sample = BlobToTrainingSample(*blob, false, &fx_info, &bl_features);
1447 if (sample == nullptr) {
1448 return;
1449 }
1450 static_classifier_->DebugDisplay(*sample, blob->denorm().pix(), Results->best_unichar_id);
1451} /* DebugAdaptiveClassifier */
1452#endif
1453
1454/*---------------------------------------------------------------------------*/
1474void Classify::DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results) {
1475 UNICHAR_ID *Ambiguities;
1476
1477 INT_FX_RESULT_STRUCT fx_info;
1478 std::vector<INT_FEATURE_STRUCT> bl_features;
1479 TrainingSample *sample =
1480 BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info, &bl_features);
1481 if (sample == nullptr) {
1482 return;
1483 }
1484
1485 // TODO: With LSTM, static_classifier_ is nullptr.
1486 // Return to avoid crash in CharNormClassifier.
1487 if (static_classifier_ == nullptr) {
1488 delete sample;
1489 return;
1490 }
1491
1492 if (AdaptedTemplates->NumPermClasses < matcher_permanent_classes_min || tess_cn_matching) {
1493 CharNormClassifier(Blob, *sample, Results);
1494 } else {
1495 Ambiguities = BaselineClassifier(Blob, bl_features, fx_info, AdaptedTemplates, Results);
1496 if ((!Results->match.empty() &&
1497 MarginalMatch(Results->best_rating, matcher_reliable_adaptive_result) &&
1498 !tess_bn_matching) ||
1499 Results->match.empty()) {
1500 CharNormClassifier(Blob, *sample, Results);
1501 } else if (Ambiguities && *Ambiguities >= 0 && !tess_bn_matching) {
1502 AmbigClassifier(bl_features, fx_info, Blob, PreTrainedTemplates, AdaptedTemplates->Class,
1503 Ambiguities, Results);
1504 }
1505 }
1506
1507 // Force the blob to be classified as noise
1508 // if the results contain only fragments.
1509 // TODO(daria): verify that this is better than
1510 // just adding a nullptr classification.
1511 if (!Results->HasNonfragment || Results->match.empty()) {
1512 ClassifyAsNoise(Results);
1513 }
1514 delete sample;
1515} /* DoAdaptiveMatch */
1516
1517/*---------------------------------------------------------------------------*/
1532UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob, CLASS_ID CorrectClass) {
1533 auto *Results = new ADAPT_RESULTS();
1534 UNICHAR_ID *Ambiguities;
1535
1536 Results->Initialize();
1537 INT_FX_RESULT_STRUCT fx_info;
1538 std::vector<INT_FEATURE_STRUCT> bl_features;
1539 TrainingSample *sample =
1540 BlobToTrainingSample(*Blob, classify_nonlinear_norm, &fx_info, &bl_features);
1541 if (sample == nullptr) {
1542 delete Results;
1543 return nullptr;
1544 }
1545
1546 CharNormClassifier(Blob, *sample, Results);
1547 delete sample;
1548 RemoveBadMatches(Results);
1549 std::sort(Results->match.begin(), Results->match.end(), SortDescendingRating);
1550
1551 /* copy the class ids into a -1 terminated list of ambiguities - don't copy
1552 if the correct class is the only class id matched */
1553 Ambiguities = new UNICHAR_ID[Results->match.size() + 1];
1554 if (Results->match.size() > 1 ||
1555 (Results->match.size() == 1 && Results->match[0].unichar_id != CorrectClass)) {
1556 unsigned i;
1557 for (i = 0; i < Results->match.size(); i++) {
1558 Ambiguities[i] = Results->match[i].unichar_id;
1559 }
1560 Ambiguities[i] = -1;
1561 } else {
1562 Ambiguities[0] = -1;
1563 }
1564
1565 delete Results;
1566 return Ambiguities;
1567} /* GetAmbiguities */
1568
1569// Returns true if the given blob looks too dissimilar to any character
1570// present in the classifier templates.
1571bool Classify::LooksLikeGarbage(TBLOB *blob) {
1572 auto *ratings = new BLOB_CHOICE_LIST();
1573 AdaptiveClassifier(blob, ratings);
1574 BLOB_CHOICE_IT ratings_it(ratings);
1576 if (classify_debug_character_fragments) {
1577 print_ratings_list("======================\nLooksLikeGarbage() got ", ratings, unicharset);
1578 }
1579 for (ratings_it.mark_cycle_pt(); !ratings_it.cycled_list(); ratings_it.forward()) {
1580 if (unicharset.get_fragment(ratings_it.data()->unichar_id()) != nullptr) {
1581 continue;
1582 }
1583 float certainty = ratings_it.data()->certainty();
1584 delete ratings;
1585 return certainty < classify_character_fragments_garbage_certainty_threshold;
1586 }
1587 delete ratings;
1588 return true; // no whole characters in ratings
1589}
1590
1591/*---------------------------------------------------------------------------*/
1613int Classify::GetCharNormFeature(const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES_STRUCT *templates,
1614 uint8_t *pruner_norm_array, uint8_t *char_norm_array) {
1615 auto norm_feature = new FEATURE_STRUCT(&CharNormDesc);
1616 float baseline = kBlnBaselineOffset;
1617 float scale = MF_SCALE_FACTOR;
1618 norm_feature->Params[CharNormY] = (fx_info.Ymean - baseline) * scale;
1619 norm_feature->Params[CharNormLength] = fx_info.Length * scale / LENGTH_COMPRESSION;
1620 norm_feature->Params[CharNormRx] = fx_info.Rx * scale;
1621 norm_feature->Params[CharNormRy] = fx_info.Ry * scale;
1622 // Deletes norm_feature.
1623 ComputeCharNormArrays(norm_feature, templates, char_norm_array, pruner_norm_array);
1625} /* GetCharNormFeature */
1626
1627// Computes the char_norm_array for the unicharset and, if not nullptr, the
1628// pruner_array as appropriate according to the existence of the shape_table.
1629void Classify::ComputeCharNormArrays(FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates,
1630 uint8_t *char_norm_array, uint8_t *pruner_array) {
1631 ComputeIntCharNormArray(*norm_feature, char_norm_array);
1632 //if (pruner_array != nullptr) {
1633 if (shape_table_ == nullptr) {
1634 ComputeIntCharNormArray(*norm_feature, pruner_array);
1635 } else {
1636 memset(&pruner_array[0], UINT8_MAX, templates->NumClasses * sizeof(pruner_array[0]));
1637 // Each entry in the pruner norm array is the MIN of all the entries of
1638 // the corresponding unichars in the CharNormArray.
1639 for (unsigned id = 0; id < templates->NumClasses; ++id) {
1640 int font_set_id = templates->Class[id]->font_set_id;
1641 const FontSet &fs = fontset_table_.at(font_set_id);
1642 for (auto f : fs) {
1643 const Shape &shape = shape_table_->GetShape(f);
1644 for (int c = 0; c < shape.size(); ++c) {
1645 if (char_norm_array[shape[c].unichar_id] < pruner_array[id]) {
1646 pruner_array[id] = char_norm_array[shape[c].unichar_id];
1647 }
1648 }
1649 }
1650 }
1651 }
1652 //}
1653 delete norm_feature;
1654}
1655
1656/*---------------------------------------------------------------------------*/
1669int Classify::MakeNewTemporaryConfig(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int FontinfoId,
1670 int NumFeatures, INT_FEATURE_ARRAY Features,
1671 FEATURE_SET FloatFeatures) {
1672 INT_CLASS_STRUCT *IClass;
1673 ADAPT_CLASS_STRUCT *Class;
1674 PROTO_ID OldProtos[MAX_NUM_PROTOS];
1675 FEATURE_ID BadFeatures[MAX_NUM_INT_FEATURES];
1676 int NumOldProtos;
1677 int NumBadFeatures;
1678 int MaxProtoId, OldMaxProtoId;
1679 int MaskSize;
1680 int ConfigId;
1681 int i;
1682 int debug_level = NO_DEBUG;
1683
1684 if (classify_learning_debug_level >= 3) {
1686 }
1687
1688 IClass = ClassForClassId(Templates->Templates, ClassId);
1689 Class = Templates->Class[ClassId];
1690
1691 if (IClass->NumConfigs >= MAX_NUM_CONFIGS) {
1692 ++NumAdaptationsFailed;
1693 if (classify_learning_debug_level >= 1) {
1694 tprintf("Cannot make new temporary config: maximum number exceeded.\n");
1695 }
1696 return -1;
1697 }
1698
1699 OldMaxProtoId = IClass->NumProtos - 1;
1700
1701 NumOldProtos = im_.FindGoodProtos(IClass, AllProtosOn, AllConfigsOff, NumFeatures, Features,
1702 OldProtos, classify_adapt_proto_threshold, debug_level);
1703
1704 MaskSize = WordsInVectorOfSize(MAX_NUM_PROTOS);
1705 zero_all_bits(TempProtoMask, MaskSize);
1706 for (i = 0; i < NumOldProtos; i++) {
1707 SET_BIT(TempProtoMask, OldProtos[i]);
1708 }
1709
1710 NumBadFeatures = im_.FindBadFeatures(IClass, TempProtoMask, AllConfigsOn, NumFeatures, Features,
1711 BadFeatures, classify_adapt_feature_threshold, debug_level);
1712
1713 MaxProtoId =
1714 MakeNewTempProtos(FloatFeatures, NumBadFeatures, BadFeatures, IClass, Class, TempProtoMask);
1715 if (MaxProtoId == NO_PROTO) {
1716 ++NumAdaptationsFailed;
1717 if (classify_learning_debug_level >= 1) {
1718 tprintf("Cannot make new temp protos: maximum number exceeded.\n");
1719 }
1720 return -1;
1721 }
1722
1723 ConfigId = AddIntConfig(IClass);
1724 ConvertConfig(TempProtoMask, ConfigId, IClass);
1725 auto Config = new TEMP_CONFIG_STRUCT(MaxProtoId, FontinfoId);
1726 TempConfigFor(Class, ConfigId) = Config;
1727 copy_all_bits(TempProtoMask, Config->Protos, Config->ProtoVectorSize);
1728
1729 if (classify_learning_debug_level >= 1) {
1730 tprintf(
1731 "Making new temp config %d fontinfo id %d"
1732 " using %d old and %d new protos.\n",
1733 ConfigId, Config->FontinfoId, NumOldProtos, MaxProtoId - OldMaxProtoId);
1734 }
1735
1736 return ConfigId;
1737} /* MakeNewTemporaryConfig */
1738
1739/*---------------------------------------------------------------------------*/
1758PROTO_ID Classify::MakeNewTempProtos(FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[],
1759 INT_CLASS_STRUCT *IClass, ADAPT_CLASS_STRUCT *Class,
1760 BIT_VECTOR TempProtoMask) {
1761 FEATURE_ID *ProtoStart;
1762 FEATURE_ID *ProtoEnd;
1763 FEATURE_ID *LastBad;
1764 PROTO_STRUCT *Proto;
1765 FEATURE F1, F2;
1766 float X1, X2, Y1, Y2;
1767 float A1, A2, AngleDelta;
1768 float SegmentLength;
1769 PROTO_ID Pid;
1770
1771 for (ProtoStart = BadFeat, LastBad = ProtoStart + NumBadFeat; ProtoStart < LastBad;
1772 ProtoStart = ProtoEnd) {
1773 F1 = Features->Features[*ProtoStart];
1774 X1 = F1->Params[PicoFeatX];
1775 Y1 = F1->Params[PicoFeatY];
1776 A1 = F1->Params[PicoFeatDir];
1777
1778 for (ProtoEnd = ProtoStart + 1, SegmentLength = GetPicoFeatureLength(); ProtoEnd < LastBad;
1779 ProtoEnd++, SegmentLength += GetPicoFeatureLength()) {
1780 F2 = Features->Features[*ProtoEnd];
1781 X2 = F2->Params[PicoFeatX];
1782 Y2 = F2->Params[PicoFeatY];
1783 A2 = F2->Params[PicoFeatDir];
1784
1785 AngleDelta = std::fabs(A1 - A2);
1786 if (AngleDelta > 0.5f) {
1787 AngleDelta = 1 - AngleDelta;
1788 }
1789
1790 if (AngleDelta > matcher_clustering_max_angle_delta || std::fabs(X1 - X2) > SegmentLength ||
1791 std::fabs(Y1 - Y2) > SegmentLength) {
1792 break;
1793 }
1794 }
1795
1796 F2 = Features->Features[*(ProtoEnd - 1)];
1797 X2 = F2->Params[PicoFeatX];
1798 Y2 = F2->Params[PicoFeatY];
1799 A2 = F2->Params[PicoFeatDir];
1800
1801 Pid = AddIntProto(IClass);
1802 if (Pid == NO_PROTO) {
1803 return (NO_PROTO);
1804 }
1805
1806 auto TempProto = new TEMP_PROTO_STRUCT;
1807 Proto = &(TempProto->Proto);
1808
1809 /* compute proto params - NOTE that Y_DIM_OFFSET must be used because
1810 ConvertProto assumes that the Y dimension varies from -0.5 to 0.5
1811 instead of the -0.25 to 0.75 used in baseline normalization */
1812 Proto->Length = SegmentLength;
1813 Proto->Angle = A1;
1814 Proto->X = (X1 + X2) / 2;
1815 Proto->Y = (Y1 + Y2) / 2 - Y_DIM_OFFSET;
1816 FillABC(Proto);
1817
1818 TempProto->ProtoId = Pid;
1819 SET_BIT(TempProtoMask, Pid);
1820
1821 ConvertProto(Proto, Pid, IClass);
1822 AddProtoToProtoPruner(Proto, Pid, IClass, classify_learning_debug_level >= 2);
1823
1824 Class->TempProtos = push(Class->TempProtos, TempProto);
1825 }
1826 return IClass->NumProtos - 1;
1827} /* MakeNewTempProtos */
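// Illustrative sketch, not part of the original adaptmatch.cpp: the
// AngleDelta test in MakeNewTempProtos treats PicoFeatDir as a fraction of a
// full turn in [0, 1), so the distance between two directions has to wrap
// around 1.0. A hypothetical helper expressing just that rule:
static inline float SketchDirectionDelta(float a1, float a2) {
  float delta = std::fabs(a1 - a2); // raw difference in turns
  if (delta > 0.5f) {
    delta = 1.0f - delta; // e.g. 0.05 vs 0.95 differ by 0.10, not 0.90
  }
  return delta;
}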
1828
1829/*---------------------------------------------------------------------------*/
1839void Classify::MakePermanent(ADAPT_TEMPLATES_STRUCT *Templates, CLASS_ID ClassId, int ConfigId,
1840 TBLOB *Blob) {
1841 UNICHAR_ID *Ambigs;
1842 PROTO_KEY ProtoKey;
1843
1844 auto Class = Templates->Class[ClassId];
1845 auto Config = TempConfigFor(Class, ConfigId);
1846
1847 MakeConfigPermanent(Class, ConfigId);
1848 if (Class->NumPermConfigs == 0) {
1849 Templates->NumPermClasses++;
1850 }
1851 Class->NumPermConfigs++;
1852
1853 // Initialize permanent config.
1854 Ambigs = GetAmbiguities(Blob, ClassId);
1855 auto Perm = new PERM_CONFIG_STRUCT;
1856 Perm->Ambigs = Ambigs;
1857 Perm->FontinfoId = Config->FontinfoId;
1858
1859 // Free memory associated with temporary config (since ADAPTED_CONFIG
1860 // is a union we need to clean up before we record permanent config).
1861 ProtoKey.Templates = Templates;
1862 ProtoKey.ClassId = ClassId;
1863 ProtoKey.ConfigId = ConfigId;
1864 Class->TempProtos = delete_d(Class->TempProtos, &ProtoKey, MakeTempProtoPerm);
1865 delete Config;
1866
1867 // Record permanent config.
1868 PermConfigFor(Class, ConfigId) = Perm;
1869
1870 if (classify_learning_debug_level >= 1) {
1871 tprintf(
1872 "Making config %d for %s (ClassId %d) permanent:"
1873 " fontinfo id %d, ambiguities '",
1874 ConfigId, getDict().getUnicharset().debug_str(ClassId).c_str(), ClassId,
1875 PermConfigFor(Class, ConfigId)->FontinfoId);
1876 for (UNICHAR_ID *AmbigsPointer = Ambigs; *AmbigsPointer >= 0; ++AmbigsPointer) {
1877 tprintf("%s", unicharset.id_to_unichar(*AmbigsPointer));
1878 }
1879 tprintf("'.\n");
1880 }
1881} /* MakePermanent */
1882
1883/*---------------------------------------------------------------------------*/
1896int MakeTempProtoPerm(void *item1, void *item2) {
1897 auto TempProto = static_cast<TEMP_PROTO_STRUCT *>(item1);
1898 auto ProtoKey = static_cast<PROTO_KEY *>(item2);
1899
1900 auto Class = ProtoKey->Templates->Class[ProtoKey->ClassId];
1901 auto Config = TempConfigFor(Class, ProtoKey->ConfigId);
1902
1903 if (TempProto->ProtoId > Config->MaxProtoId || !test_bit(Config->Protos, TempProto->ProtoId)) {
1904 return false;
1905 }
1906
1907 MakeProtoPermanent(Class, TempProto->ProtoId);
1908 AddProtoToClassPruner(&(TempProto->Proto), ProtoKey->ClassId, ProtoKey->Templates->Templates);
1909 delete TempProto;
1910
1911 return true;
1912} /* MakeTempProtoPerm */
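// Illustrative sketch, not part of the original adaptmatch.cpp: delete_d
// hands every list element plus the supplied key to the callback and unlinks
// the elements for which it returns true, so MakeTempProtoPerm doubles as a
// filter predicate that also promotes and frees the matching temp protos as
// a side effect. A minimal predicate with the same shape, using a
// hypothetical name:
static int SketchIsNullElement(void *element, void *key) {
  (void)key;                 // key unused in this trivial example
  return element == nullptr; // true -> delete_d removes the element
}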
1913
1914/*---------------------------------------------------------------------------*/
1922void Classify::PrintAdaptiveMatchResults(const ADAPT_RESULTS &results) {
1923 for (auto &it : results.match) {
1924 tprintf("%s ", unicharset.debug_str(it.unichar_id).c_str());
1925 it.Print();
1926 }
1927} /* PrintAdaptiveMatchResults */
1928
1929/*---------------------------------------------------------------------------*/
1942void Classify::RemoveBadMatches(ADAPT_RESULTS *Results) {
1943 unsigned Next, NextGood;
1944 float BadMatchThreshold;
1945 static const char *romans = "i v x I V X";
1946 BadMatchThreshold = Results->best_rating - matcher_bad_match_pad;
1947
1948 if (classify_bln_numeric_mode) {
1949 UNICHAR_ID unichar_id_one =
1950 unicharset.contains_unichar("1") ? unicharset.unichar_to_id("1") : -1;
1951 UNICHAR_ID unichar_id_zero =
1952 unicharset.contains_unichar("0") ? unicharset.unichar_to_id("0") : -1;
1953 float scored_one = ScoredUnichar(unichar_id_one, *Results);
1954 float scored_zero = ScoredUnichar(unichar_id_zero, *Results);
1955
1956 for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
1957 const UnicharRating &match = Results->match[Next];
1958 if (match.rating >= BadMatchThreshold) {
1959 if (!unicharset.get_isalpha(match.unichar_id) ||
1960 strstr(romans, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
1961 } else if (unicharset.eq(match.unichar_id, "l") && scored_one < BadMatchThreshold) {
1962 Results->match[Next].unichar_id = unichar_id_one;
1963 } else if (unicharset.eq(match.unichar_id, "O") && scored_zero < BadMatchThreshold) {
1964 Results->match[Next].unichar_id = unichar_id_zero;
1965 } else {
1966 Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy.
1967 }
1968 if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) {
1969 if (NextGood == Next) {
1970 ++NextGood;
1971 } else {
1972 Results->match[NextGood++] = Results->match[Next];
1973 }
1974 }
1975 }
1976 }
1977 } else {
1978 for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
1979 if (Results->match[Next].rating >= BadMatchThreshold) {
1980 if (NextGood == Next) {
1981 ++NextGood;
1982 } else {
1983 Results->match[NextGood++] = Results->match[Next];
1984 }
1985 }
1986 }
1987 }
1988 Results->match.resize(NextGood);
1989} /* RemoveBadMatches */
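// Illustrative sketch, not part of the original adaptmatch.cpp: the
// Next/NextGood loop above is the usual in-place compaction idiom: copy the
// survivors toward the front, then resize. On a plain vector the same
// "keep ratings >= threshold" filter could be written with the standard
// erase/remove_if pattern, shown here on a vector of floats:
static void SketchDropBelow(std::vector<float> &ratings, float threshold) {
  ratings.erase(std::remove_if(ratings.begin(), ratings.end(),
                               [threshold](float r) { return r < threshold; }),
                ratings.end());
}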
1990
1991/*----------------------------------------------------------------------------*/
1999void Classify::RemoveExtraPuncs(ADAPT_RESULTS *Results) {
2000 unsigned Next, NextGood;
2001 int punc_count; /*no of garbage characters */
2002 int digit_count;
2003 /*garbage characters */
2004 static char punc_chars[] = ". , ; : / ` ~ ' - = \\ | \" ! _ ^";
2005 static char digit_chars[] = "0 1 2 3 4 5 6 7 8 9";
2006
2007 punc_count = 0;
2008 digit_count = 0;
2009 for (Next = NextGood = 0; Next < Results->match.size(); Next++) {
2010 const UnicharRating &match = Results->match[Next];
2011 bool keep = true;
2012 if (strstr(punc_chars, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2013 if (punc_count >= 2) {
2014 keep = false;
2015 }
2016 punc_count++;
2017 } else {
2018 if (strstr(digit_chars, unicharset.id_to_unichar(match.unichar_id)) != nullptr) {
2019 if (digit_count >= 1) {
2020 keep = false;
2021 }
2022 digit_count++;
2023 }
2024 }
2025 if (keep) {
2026 if (NextGood == Next) {
2027 ++NextGood;
2028 } else {
2029 Results->match[NextGood++] = match;
2030 }
2031 }
2032 }
2033 Results->match.resize(NextGood);
2034} /* RemoveExtraPuncs */
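// Illustration only, not part of the original adaptmatch.cpp: RemoveExtraPuncs
// tests membership by searching for the character's UTF-8 string inside a
// space-separated list, e.g. strstr(". , ; : / ...", ";") is non-null, so ";"
// counts as punctuation, while a letter matches neither list. Because keep is
// cleared once punc_count reaches 2 or digit_count reaches 1, at most two
// punctuation candidates and one digit candidate survive the filter.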
2035
2036/*---------------------------------------------------------------------------*/
2047void Classify::SetAdaptiveThreshold(float Threshold) {
2048 Threshold = (Threshold == matcher_good_threshold) ? 0.9f : (1 - Threshold);
2049 classify_adapt_proto_threshold.set_value(ClipToRange<int>(255 * Threshold, 0, 255));
2050 classify_adapt_feature_threshold.set_value(ClipToRange<int>(255 * Threshold, 0, 255));
2051} /* SetAdaptiveThreshold */
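// Worked example, not from the source: if the caller passes exactly
// matcher_good_threshold, Threshold becomes 0.9 and both parameters are set
// to ClipToRange<int>(255 * 0.9, 0, 255) = 229. Any other value v maps to
// 255 * (1 - v); e.g. v = 0.5 gives 127 after the integer clip.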
2052
2053#ifndef GRAPHICS_DISABLED
2054
2055/*---------------------------------------------------------------------------*/
2065void Classify::ShowBestMatchFor(int shape_id, const INT_FEATURE_STRUCT *features,
2066 int num_features) {
2067 uint32_t config_mask;
2068 if (UnusedClassIdIn(PreTrainedTemplates, shape_id)) {
2069 tprintf("No built-in templates for class/shape %d\n", shape_id);
2070 return;
2071 }
2072 if (num_features <= 0) {
2073 tprintf("Illegal blob (char norm features)!\n");
2074 return;
2075 }
2076 UnicharRating cn_result;
2077 classify_norm_method.set_value(character);
2078 im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), AllProtosOn, AllConfigsOn, num_features,
2079 features, &cn_result, classify_adapt_feature_threshold, NO_DEBUG,
2080 matcher_debug_separate_windows);
2081 tprintf("\n");
2082 config_mask = 1 << cn_result.config;
2083
2084 tprintf("Static Shape ID: %d\n", shape_id);
2086 im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), AllProtosOn, &config_mask, num_features,
2087 features, &cn_result, classify_adapt_feature_threshold, matcher_debug_flags,
2088 matcher_debug_separate_windows);
2090} /* ShowBestMatchFor */
2091
2092#endif // !GRAPHICS_DISABLED
2093
2094// Returns a string for the classifier class_id: either the corresponding
2095// unicharset debug_str or the shape_table_ debug str.
2096std::string Classify::ClassIDToDebugStr(const INT_TEMPLATES_STRUCT *templates, int class_id,
2097 int config_id) const {
2098 std::string class_string;
2099 if (templates == PreTrainedTemplates && shape_table_ != nullptr) {
2100 int shape_id = ClassAndConfigIDToFontOrShapeID(class_id, config_id);
2101 class_string = shape_table_->DebugStr(shape_id);
2102 } else {
2103 class_string = unicharset.debug_str(class_id);
2104 }
2105 return class_string;
2106}
2107
2108// Converts a classifier class_id index to a shape_table_ index
2109int Classify::ClassAndConfigIDToFontOrShapeID(int class_id, int int_result_config) const {
2110 int font_set_id = PreTrainedTemplates->Class[class_id]->font_set_id;
2111 // Older inttemps have no font_ids.
2112 if (font_set_id < 0) {
2113 return kBlankFontinfoId;
2114 }
2115 const FontSet &fs = fontset_table_.at(font_set_id);
2116 return fs.at(int_result_config);
2117}
2118
2119// Converts a shape_table_ index to a classifier class_id index (not a
2120// unichar-id!). Uses a search, so not fast.
2121int Classify::ShapeIDToClassID(int shape_id) const {
2122 for (unsigned id = 0; id < PreTrainedTemplates->NumClasses; ++id) {
2123 int font_set_id = PreTrainedTemplates->Class[id]->font_set_id;
2124 ASSERT_HOST(font_set_id >= 0);
2125 const FontSet &fs = fontset_table_.at(font_set_id);
2126 for (auto f : fs) {
2127 if (f == shape_id) {
2128 return id;
2129 }
2130 }
2131 }
2132 tprintf("Shape %d not found\n", shape_id);
2133 return -1;
2134}
2135
2136// Returns true if the given TEMP_CONFIG_STRUCT is good enough to make it
2137// a permanent config.
2138bool Classify::TempConfigReliable(CLASS_ID class_id, const TEMP_CONFIG_STRUCT *config) {
2139 if (classify_learning_debug_level >= 1) {
2140 tprintf("NumTimesSeen for config of %s is %d\n",
2141 getDict().getUnicharset().debug_str(class_id).c_str(), config->NumTimesSeen);
2142 }
2143 if (config->NumTimesSeen >= matcher_sufficient_examples_for_prototyping) {
2144 return true;
2145 } else if (config->NumTimesSeen < matcher_min_examples_for_prototyping) {
2146 return false;
2147 } else if (use_ambigs_for_adaption) {
2148 // Go through the ambigs vector and see whether we have already seen
2149 // enough times all the characters represented by the ambigs vector.
2150 const UnicharIdVector *ambigs = getDict().getUnicharAmbigs().AmbigsForAdaption(class_id);
2151 int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2152 for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2153 ADAPT_CLASS_STRUCT *ambig_class = AdaptedTemplates->Class[(*ambigs)[ambig]];
2154 assert(ambig_class != nullptr);
2155 if (ambig_class->NumPermConfigs == 0 &&
2156 ambig_class->MaxNumTimesSeen < matcher_min_examples_for_prototyping) {
2157 if (classify_learning_debug_level >= 1) {
2158 tprintf(
2159 "Ambig %s has not been seen enough times,"
2160 " not making config for %s permanent\n",
2161 getDict().getUnicharset().debug_str((*ambigs)[ambig]).c_str(),
2162 getDict().getUnicharset().debug_str(class_id).c_str());
2163 }
2164 return false;
2165 }
2166 }
2167 }
2168 return true;
2169}
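// Summary note, not from the source: TempConfigReliable applies a three-band
// rule. NumTimesSeen >= matcher_sufficient_examples_for_prototyping is always
// reliable, NumTimesSeen < matcher_min_examples_for_prototyping is never
// reliable, and in between (with use_ambigs_for_adaption set) the config is
// reliable only if every ambig class either already has a permanent config or
// has itself been seen at least matcher_min_examples_for_prototyping times.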
2170
2171void Classify::UpdateAmbigsGroup(CLASS_ID class_id, TBLOB *Blob) {
2172 const UnicharIdVector *ambigs = getDict().getUnicharAmbigs().ReverseAmbigsForAdaption(class_id);
2173 int ambigs_size = (ambigs == nullptr) ? 0 : ambigs->size();
2174 if (classify_learning_debug_level >= 1) {
2175 tprintf("Running UpdateAmbigsGroup for %s class_id=%d\n",
2176 getDict().getUnicharset().debug_str(class_id).c_str(), class_id);
2177 }
2178 for (int ambig = 0; ambig < ambigs_size; ++ambig) {
2179 CLASS_ID ambig_class_id = (*ambigs)[ambig];
2180 const ADAPT_CLASS_STRUCT *ambigs_class = AdaptedTemplates->Class[ambig_class_id];
2181 for (int cfg = 0; cfg < MAX_NUM_CONFIGS; ++cfg) {
2182 if (ConfigIsPermanent(ambigs_class, cfg)) {
2183 continue;
2184 }
2185 const TEMP_CONFIG_STRUCT *config = TempConfigFor(AdaptedTemplates->Class[ambig_class_id], cfg);
2186 if (config != nullptr && TempConfigReliable(ambig_class_id, config)) {
2187 if (classify_learning_debug_level >= 1) {
2188 tprintf("Making config %d of %s permanent\n", cfg,
2189 getDict().getUnicharset().debug_str(ambig_class_id).c_str());
2190 }
2191 MakePermanent(AdaptedTemplates, ambig_class_id, cfg, Blob);
2192 }
2193 }
2194 }
2195}
2196
2197} // namespace tesseract