tesseract v5.3.3.20231005
commontraining.cpp
Go to the documentation of this file.
1// Copyright 2008 Google Inc. All Rights Reserved.
2// Author: scharron@google.com (Samuel Charron)
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7// http://www.apache.org/licenses/LICENSE-2.0
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14#define _USE_MATH_DEFINES // for M_PI
15
16#include "commontraining.h"
17
18#ifdef DISABLED_LEGACY_ENGINE
19
20# include "params.h"
21# include "tprintf.h"
22
23namespace tesseract {
24
25INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
26INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
27STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
28STRING_PARAM_FLAG(D, "", "Directory to write output files to");
29STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
30STRING_PARAM_FLAG(X, "", "File listing font xheights");
31STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
32STRING_PARAM_FLAG(O, "", "File to write unicharset to");
33STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
34STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
35STRING_PARAM_FLAG(fonts_dir, "",
36 "If empty it uses system default. Otherwise it overrides "
37 "system default font location");
38STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", "Overrides fontconfig default temporary dir");
39
51void ParseArguments(int *argc, char ***argv) {
52 std::string usage;
53 if (*argc) {
54 usage += (*argv)[0];
55 usage += " -v | --version | ";
56 usage += (*argv)[0];
57 }
58 usage += " [.tr files ...]";
59 tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
60}
61
62} // namespace tesseract.
63
64#else
65
66# include <allheaders.h>
67# include "ccutil.h"
68# include "classify.h"
69# include "cluster.h"
70# include "clusttool.h"
71# include "featdefs.h"
72# include "fontinfo.h"
73# include "intfeaturespace.h"
74# include "mastertrainer.h"
75# include "mf.h"
76# include "oldlist.h"
77# include "params.h"
78# include "shapetable.h"
79# include "tessdatamanager.h"
80# include "tprintf.h"
81# include "unicity_table.h"
82
83namespace tesseract {
84
85// Global Variables.
86
87// global variable to hold configuration parameters to control clustering
88// -M 0.625 -B 0.05 -I 1.0 -C 1e-6.
89CLUSTERCONFIG Config = {elliptical, 0.625, 0.05, 1.0, 1e-6, 0};
91static CCUtil ccutil;
92
93INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
94static INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
95static STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
96STRING_PARAM_FLAG(D, "", "Directory to write output files to");
97STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
98STRING_PARAM_FLAG(X, "", "File listing font xheights");
99STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
100STRING_PARAM_FLAG(O, "", "File to write unicharset to");
101STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
102STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
103STRING_PARAM_FLAG(fonts_dir, "", "");
104STRING_PARAM_FLAG(fontconfig_tmpdir, "", "");
105static DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
106 "Min number of samples per proto as % of total");
107static DOUBLE_PARAM_FLAG(clusterconfig_max_illegal, Config.MaxIllegal,
108 "Max percentage of samples in a cluster which have more"
109 " than 1 feature in that cluster");
110static DOUBLE_PARAM_FLAG(clusterconfig_independence, Config.Independence,
111 "Desired independence between dimensions");
112static DOUBLE_PARAM_FLAG(clusterconfig_confidence, Config.Confidence,
113 "Desired confidence in prototypes created");
114
125void ParseArguments(int *argc, char ***argv) {
126 std::string usage;
127 if (*argc) {
128 usage += (*argv)[0];
129 usage += " -v | --version | ";
130 usage += (*argv)[0];
131 }
132 usage += " [.tr files ...]";
133 tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
134 // Set some global values based on the flags.
136 std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
137 Config.MaxIllegal = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_max_illegal)));
138 Config.Independence = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_independence)));
139 Config.Confidence = std::max(0.0, std::min(1.0, double(FLAGS_clusterconfig_confidence)));
140 // Set additional parameters from config file if specified.
141 if (!FLAGS_configfile.empty()) {
143 FLAGS_configfile.c_str(), tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY, ccutil.params());
144 }
145}
146
147// Helper loads shape table from the given file.
148ShapeTable *LoadShapeTable(const std::string &file_prefix) {
149 ShapeTable *shape_table = nullptr;
150 std::string shape_table_file = file_prefix;
151 shape_table_file += kShapeTableFileSuffix;
152 TFile shape_fp;
153 if (shape_fp.Open(shape_table_file.c_str(), nullptr)) {
154 shape_table = new ShapeTable;
155 if (!shape_table->DeSerialize(&shape_fp)) {
156 delete shape_table;
157 shape_table = nullptr;
158 tprintf("Error: Failed to read shape table %s\n", shape_table_file.c_str());
159 } else {
160 int num_shapes = shape_table->NumShapes();
161 tprintf("Read shape table %s of %d shapes\n", shape_table_file.c_str(), num_shapes);
162 }
163 } else {
164 tprintf("Warning: No shape table file present: %s\n", shape_table_file.c_str());
165 }
166 return shape_table;
167}
168
169// Helper to write the shape_table.
170void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table) {
171 std::string shape_table_file = file_prefix;
172 shape_table_file += kShapeTableFileSuffix;
173 FILE *fp = fopen(shape_table_file.c_str(), "wb");
174 if (fp != nullptr) {
175 if (!shape_table.Serialize(fp)) {
176 fprintf(stderr, "Error writing shape table: %s\n", shape_table_file.c_str());
177 }
178 fclose(fp);
179 } else {
180 fprintf(stderr, "Error creating shape table: %s\n", shape_table_file.c_str());
181 }
182}
183
200std::unique_ptr<MasterTrainer> LoadTrainingData(const char *const *filelist, bool replication,
201 ShapeTable **shape_table, std::string &file_prefix) {
204 file_prefix = "";
205 if (!FLAGS_D.empty()) {
206 file_prefix += FLAGS_D.c_str();
207 file_prefix += "/";
208 }
209 // If we are shape clustering (nullptr shape_table) or we successfully load
210 // a shape_table written by a previous shape clustering, then
211 // shape_analysis will be true, meaning that the MasterTrainer will replace
212 // some members of the unicharset with their fragments.
213 bool shape_analysis = false;
214 if (shape_table != nullptr) {
215 *shape_table = LoadShapeTable(file_prefix);
216 if (*shape_table != nullptr) {
217 shape_analysis = true;
218 }
219 } else {
220 shape_analysis = true;
221 }
222 auto trainer = std::make_unique<MasterTrainer>(NM_CHAR_ANISOTROPIC, shape_analysis, replication,
223 FLAGS_debug_level);
226 trainer->LoadUnicharset(FLAGS_U.c_str());
227 // Get basic font information from font_properties.
228 if (!FLAGS_F.empty()) {
229 if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
230 return {};
231 }
232 }
233 if (!FLAGS_X.empty()) {
234 if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
235 return {};
236 }
237 }
238 trainer->SetFeatureSpace(fs);
239 // Load training data from .tr files in filelist (terminated by nullptr).
240 for (const char *page_name = *filelist++; page_name != nullptr; page_name = *filelist++) {
241 tprintf("Reading %s ...\n", page_name);
242 trainer->ReadTrainingSamples(page_name, feature_defs, false);
243
244 // If there is a file with [lang].[fontname].exp[num].fontinfo present,
245 // read font spacing information in to fontinfo_table.
246 int pagename_len = strlen(page_name);
247 char *fontinfo_file_name = new char[pagename_len + 7];
248 strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr"
249 strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
250 trainer->AddSpacingInfo(fontinfo_file_name);
251 delete[] fontinfo_file_name;
252
253 // Load the images into memory if required by the classifier.
254 if (FLAGS_load_images) {
255 std::string image_name = page_name;
256 // Chop off the tr and replace with tif. Extension must be tif!
257 image_name.resize(image_name.length() - 2);
258 image_name += "tif";
259 trainer->LoadPageImages(image_name.c_str());
260 }
261 }
262 trainer->PostLoadCleanup();
263 // Write the master trainer if required.
264 if (!FLAGS_output_trainer.empty()) {
265 FILE *fp = fopen(FLAGS_output_trainer.c_str(), "wb");
266 if (fp == nullptr) {
267 tprintf("Can't create saved trainer data!\n");
268 } else {
269 trainer->Serialize(fp);
270 fclose(fp);
271 }
272 }
273 trainer->PreTrainingSetup();
274 if (!FLAGS_O.empty() && !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
275 fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
276 return {};
277 }
278
279 if (shape_table != nullptr) {
280 // If we previously failed to load a shapetable, then shape clustering
281 // wasn't run so make a flat one now.
282 if (*shape_table == nullptr) {
283 *shape_table = new ShapeTable;
284 trainer->SetupFlatShapeTable(*shape_table);
285 tprintf("Flat shape table summary: %s\n", (*shape_table)->SummaryStr().c_str());
286 }
287 (*shape_table)->set_unicharset(trainer->unicharset());
288 }
289 return trainer;
290}
291
292/*---------------------------------------------------------------------------*/
302LABELEDLIST FindList(LIST List, const std::string &Label) {
303 LABELEDLIST LabeledList;
304
305 iterate(List) {
306 LabeledList = reinterpret_cast<LABELEDLIST>(List->first_node());
307 if (LabeledList->Label == Label) {
308 return (LabeledList);
309 }
310 }
311 return (nullptr);
312
313} /* FindList */
314
315/*---------------------------------------------------------------------------*/
316// TODO(rays) This is now used only by cntraining. Convert cntraining to use
317// the new method or get rid of it entirely.
330void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name,
331 int max_samples, UNICHARSET *unicharset, FILE *file,
332 LIST *training_samples) {
333 char buffer[2048];
334 char unichar[UNICHAR_LEN + 1];
335 LABELEDLIST char_sample;
336 FEATURE_SET feature_samples;
337 uint32_t feature_type = ShortNameToFeatureType(feature_definitions, feature_name);
338
339 // Zero out the font_sample_count for all the classes.
340 LIST it = *training_samples;
341 iterate(it) {
342 char_sample = reinterpret_cast<LABELEDLIST>(it->first_node());
343 char_sample->font_sample_count = 0;
344 }
345
346 while (fgets(buffer, 2048, file) != nullptr) {
347 if (buffer[0] == '\n') {
348 continue;
349 }
350
351 sscanf(buffer, "%*s %s", unichar);
352 if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) {
353 unicharset->unichar_insert(unichar);
354 if (unicharset->size() > MAX_NUM_CLASSES) {
355 tprintf(
356 "Error: Size of unicharset in training is "
357 "greater than MAX_NUM_CLASSES\n");
358 exit(1);
359 }
360 }
361 char_sample = FindList(*training_samples, unichar);
362 if (char_sample == nullptr) {
363 char_sample = new LABELEDLISTNODE(unichar);
364 *training_samples = push(*training_samples, char_sample);
365 }
366 auto char_desc = ReadCharDescription(feature_definitions, file);
367 feature_samples = char_desc->FeatureSets[feature_type];
368 if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
369 char_sample->List = push(char_sample->List, feature_samples);
370 char_sample->SampleCount++;
371 char_sample->font_sample_count++;
372 } else {
373 delete feature_samples;
374 }
375 for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {
376 if (feature_type != i) {
377 delete char_desc->FeatureSets[i];
378 }
379 char_desc->FeatureSets[i] = nullptr;
380 }
381 delete char_desc;
382 }
383} // ReadTrainingSamples
384
385/*---------------------------------------------------------------------------*/
391void FreeTrainingSamples(LIST CharList) {
392 LABELEDLIST char_sample;
393 FEATURE_SET FeatureSet;
394 LIST FeatureList;
395
396 LIST nodes = CharList;
397 iterate(CharList) { /* iterate through all of the fonts */
398 char_sample = reinterpret_cast<LABELEDLIST>(CharList->first_node());
399 FeatureList = char_sample->List;
400 iterate(FeatureList) { /* iterate through all of the classes */
401 FeatureSet = reinterpret_cast<FEATURE_SET>(FeatureList->first_node());
402 delete FeatureSet;
403 }
404 FreeLabeledList(char_sample);
405 }
406 destroy(nodes);
407} /* FreeTrainingSamples */
408
409/*---------------------------------------------------------------------------*/
417void FreeLabeledList(LABELEDLIST LabeledList) {
418 destroy(LabeledList->List);
419 delete LabeledList;
420} /* FreeLabeledList */
421
422/*---------------------------------------------------------------------------*/
435 const char *program_feature_type) {
436 uint16_t N;
437 CLUSTERER *Clusterer;
438 LIST FeatureList = nullptr;
439 FEATURE_SET FeatureSet = nullptr;
440
441 int32_t desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
442 N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
443 Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
444
445 FeatureList = char_sample->List;
446 uint32_t CharID = 0;
447 std::vector<float> Sample;
448 iterate(FeatureList) {
449 FeatureSet = reinterpret_cast<FEATURE_SET>(FeatureList->first_node());
450 for (int i = 0; i < FeatureSet->MaxNumFeatures; i++) {
451 if (Sample.empty()) {
452 Sample.resize(N);
453 }
454 for (int j = 0; j < N; j++) {
455 Sample[j] = FeatureSet->Features[i]->Params[j];
456 }
457 MakeSample(Clusterer, &Sample[0], CharID);
458 }
459 CharID++;
460 }
461 return Clusterer;
462
463} /* SetUpForClustering */
464
465/*------------------------------------------------------------------------*/
466void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer,
467 CLUSTERCONFIG *clusterconfig) {
468 PROTOTYPE *Prototype;
469 bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
470
471 LIST pProtoList = ProtoList;
472 iterate(pProtoList) {
473 Prototype = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());
474 if (Prototype->Significant || Prototype->Merged) {
475 continue;
476 }
477 float best_dist = 0.125;
478 PROTOTYPE *best_match = nullptr;
479 // Find the nearest alive prototype.
480 LIST list_it = ProtoList;
481 iterate(list_it) {
482 auto *test_p = reinterpret_cast<PROTOTYPE *>(list_it->first_node());
483 if (test_p != Prototype && !test_p->Merged) {
484 float dist = ComputeDistance(Clusterer->SampleSize, Clusterer->ParamDesc, &Prototype->Mean[0],
485 &test_p->Mean[0]);
486 if (dist < best_dist) {
487 best_match = test_p;
488 best_dist = dist;
489 }
490 }
491 }
492 if (best_match != nullptr && !best_match->Significant) {
493 if (debug) {
494 auto bestMatchNumSamples = best_match->NumSamples;
495 auto prototypeNumSamples = Prototype->NumSamples;
496 tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n", bestMatchNumSamples,
497 prototypeNumSamples, best_match->Mean[0], best_match->Mean[1], Prototype->Mean[0],
498 Prototype->Mean[1]);
499 }
500 best_match->NumSamples =
501 MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc, best_match->NumSamples,
502 Prototype->NumSamples, &best_match->Mean[0], &best_match->Mean[0], &Prototype->Mean[0]);
503 Prototype->NumSamples = 0;
504 Prototype->Merged = true;
505 } else if (best_match != nullptr) {
506 if (debug) {
507 tprintf("Red proto at %g,%g matched a green one at %g,%g\n", Prototype->Mean[0],
508 Prototype->Mean[1], best_match->Mean[0], best_match->Mean[1]);
509 }
510 Prototype->Merged = true;
511 }
512 }
513 // Mark significant those that now have enough samples.
514 int min_samples = static_cast<int32_t>(clusterconfig->MinSamples * Clusterer->NumChar);
515 pProtoList = ProtoList;
516 iterate(pProtoList) {
517 Prototype = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());
518 // Process insignificant protos that do not match a green one
519 if (!Prototype->Significant && Prototype->NumSamples >= min_samples && !Prototype->Merged) {
520 if (debug) {
521 tprintf("Red proto at %g,%g becoming green\n", Prototype->Mean[0], Prototype->Mean[1]);
522 }
523 Prototype->Significant = true;
524 }
525 }
526} /* MergeInsignificantProtos */
527
528/*-----------------------------------------------------------------------------*/
529void CleanUpUnusedData(LIST ProtoList) {
530 PROTOTYPE *Prototype;
531
532 iterate(ProtoList) {
533 Prototype = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
534 delete[] Prototype->Variance.Elliptical;
535 Prototype->Variance.Elliptical = nullptr;
536 delete[] Prototype->Magnitude.Elliptical;
537 Prototype->Magnitude.Elliptical = nullptr;
538 delete[] Prototype->Weight.Elliptical;
539 Prototype->Weight.Elliptical = nullptr;
540 }
541}
542
543/*------------------------------------------------------------------------*/
544LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
545
546{
547 LIST NewProtoList = NIL_LIST;
548 auto pProtoList = ProtoList;
549 iterate(pProtoList) {
550 auto Proto = reinterpret_cast<PROTOTYPE *>(pProtoList->first_node());
551 if ((Proto->Significant && KeepSigProtos) || (!Proto->Significant && KeepInsigProtos)) {
552 auto NewProto = new PROTOTYPE;
553 NewProto->Mean = Proto->Mean;
554 NewProto->Significant = Proto->Significant;
555 NewProto->Style = Proto->Style;
556 NewProto->NumSamples = Proto->NumSamples;
557 NewProto->Cluster = nullptr;
558 NewProto->Distrib.clear();
559
560 if (Proto->Variance.Elliptical != nullptr) {
561 NewProto->Variance.Elliptical = new float[N];
562 for (int i = 0; i < N; i++) {
563 NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
564 }
565 } else {
566 NewProto->Variance.Elliptical = nullptr;
567 }
568 //---------------------------------------------
569 if (Proto->Magnitude.Elliptical != nullptr) {
570 NewProto->Magnitude.Elliptical = new float[N];
571 for (int i = 0; i < N; i++) {
572 NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
573 }
574 } else {
575 NewProto->Magnitude.Elliptical = nullptr;
576 }
577 //------------------------------------------------
578 if (Proto->Weight.Elliptical != nullptr) {
579 NewProto->Weight.Elliptical = new float[N];
580 for (int i = 0; i < N; i++) {
581 NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
582 }
583 } else {
584 NewProto->Weight.Elliptical = nullptr;
585 }
586
587 NewProto->TotalMagnitude = Proto->TotalMagnitude;
588 NewProto->LogMagnitude = Proto->LogMagnitude;
589 NewProtoList = push_last(NewProtoList, NewProto);
590 }
591 }
592 FreeProtoList(&ProtoList);
593 return (NewProtoList);
594} /* RemoveInsignificantProtos */
595
596/*----------------------------------------------------------------------------*/
597MERGE_CLASS FindClass(LIST List, const std::string &Label) {
598 MERGE_CLASS MergeClass;
599
600 iterate(List) {
601 MergeClass = reinterpret_cast<MERGE_CLASS>(List->first_node());
602 if (MergeClass->Label == Label) {
603 return (MergeClass);
604 }
605 }
606 return (nullptr);
607
608} /* FindClass */
609
610/*-----------------------------------------------------------------------------*/
616void FreeLabeledClassList(LIST ClassList) {
617 MERGE_CLASS MergeClass;
618
619 LIST nodes = ClassList;
620 iterate(ClassList) /* iterate through all of the fonts */
621 {
622 MergeClass = reinterpret_cast<MERGE_CLASS>(ClassList->first_node());
623 FreeClass(MergeClass->Class);
624 delete MergeClass;
625 }
626 destroy(nodes);
627
628} /* FreeLabeledClassList */
629
630/* SetUpForFloat2Int */
631CLASS_STRUCT *SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList) {
632 MERGE_CLASS MergeClass;
633 CLASS_TYPE Class;
634 int NumProtos;
635 int NumConfigs;
636 int NumWords;
637 int i, j;
638 float Values[3];
639 PROTO_STRUCT *NewProto;
640 PROTO_STRUCT *OldProto;
641 BIT_VECTOR NewConfig;
642 BIT_VECTOR OldConfig;
643
644 // printf("Float2Int ...\n");
645
646 auto *float_classes = new CLASS_STRUCT[unicharset.size()];
647 iterate(LabeledClassList) {
648 UnicityTable<int> font_set;
649 MergeClass = reinterpret_cast<MERGE_CLASS>(LabeledClassList->first_node());
650 Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label.c_str())];
651 NumProtos = MergeClass->Class->NumProtos;
652 NumConfigs = MergeClass->Class->NumConfigs;
653 font_set.move(&MergeClass->Class->font_set);
654 Class->NumProtos = NumProtos;
655 Class->MaxNumProtos = NumProtos;
656 Class->Prototypes.resize(NumProtos);
657 for (i = 0; i < NumProtos; i++) {
658 NewProto = ProtoIn(Class, i);
659 OldProto = ProtoIn(MergeClass->Class, i);
660 Values[0] = OldProto->X;
661 Values[1] = OldProto->Y;
662 Values[2] = OldProto->Angle;
664 NewProto->X = OldProto->X;
665 NewProto->Y = OldProto->Y;
666 NewProto->Length = OldProto->Length;
667 NewProto->Angle = OldProto->Angle;
668 NewProto->A = Values[0];
669 NewProto->B = Values[1];
670 NewProto->C = Values[2];
671 }
672
673 Class->NumConfigs = NumConfigs;
674 Class->MaxNumConfigs = NumConfigs;
675 Class->font_set.move(&font_set);
676 Class->Configurations.resize(NumConfigs);
677 NumWords = WordsInVectorOfSize(NumProtos);
678 for (i = 0; i < NumConfigs; i++) {
679 NewConfig = NewBitVector(NumProtos);
680 OldConfig = MergeClass->Class->Configurations[i];
681 for (j = 0; j < NumWords; j++) {
682 NewConfig[j] = OldConfig[j];
683 }
684 Class->Configurations[i] = NewConfig;
685 }
686 }
687 return float_classes;
688} // SetUpForFloat2Int
689
690/*--------------------------------------------------------------------------*/
// Rewrites Values in place.
// On entry Values holds {x, y, angle}. With
//   slope     = tan(angle * 2 * pi)
//   intercept = y - slope * x
//   scale     = 1 / sqrt(slope^2 + 1)
// the array becomes {slope * scale, -scale, intercept * scale}.
void Normalize(float *Values) {
  const float slope = tan(Values[2] * 2 * M_PI);
  const float intercept = Values[1] - slope * Values[0];
  const float scale = 1 / sqrt(slope * slope + 1.0);

  Values[0] = slope * scale;
  Values[1] = -scale;
  Values[2] = intercept * scale;
} // Normalize
704
705/*-------------------------------------------------------------------------*/
707
708{
709 LABELEDLIST char_sample;
710
711 LIST nodes = CharList;
712 iterate(CharList) /* iterate through all of the fonts */
713 {
714 char_sample = reinterpret_cast<LABELEDLIST>(CharList->first_node());
715 FreeLabeledList(char_sample);
716 }
717 destroy(nodes);
718
719} // FreeNormProtoList
720
721/*---------------------------------------------------------------------------*/
722void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, const std::string &CharName) {
723 auto LabeledProtoList = new LABELEDLISTNODE(CharName.c_str());
724 iterate(ProtoList) {
725 auto Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
726 LabeledProtoList->List = push(LabeledProtoList->List, Proto);
727 }
728 *NormProtoList = push(*NormProtoList, LabeledProtoList);
729}
730
731/*---------------------------------------------------------------------------*/
732int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos) {
733 int N = 0;
734 iterate(ProtoList) {
735 auto *Proto = reinterpret_cast<PROTOTYPE *>(ProtoList->first_node());
736 if ((Proto->Significant && CountSigProtos) || (!Proto->Significant && CountInsigProtos)) {
737 N++;
738 }
739 }
740 return (N);
741}
742
743} // namespace tesseract.
744
745#endif // def DISABLED_LEGACY_ENGINE
#define UNICHAR_LEN
Definition: unichar.h:31
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
uint32_t * BIT_VECTOR
Definition: bitvec.h:28
#define iterate(l)
Definition: oldlist.h:91
#define NIL_LIST
Definition: oldlist.h:75
#define DOUBLE_PARAM_FLAG(name, val, comment)
#define STRING_PARAM_FLAG(name, val, comment)
#define ProtoIn(Class, Pid)
Definition: protos.h:70
const int kBoostXYBuckets
const int kBoostDirBuckets
MERGE_CLASS FindClass(LIST List, const std::string &Label)
@ SET_PARAM_CONSTRAINT_NON_INIT_ONLY
Definition: params.h:43
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:203
void Normalize(float *Values)
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table)
float ComputeDistance(int k, PARAM_DESC *dim, float p1[], float p2[])
Definition: kdtree.cpp:400
CHAR_DESC_STRUCT * ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:172
void FreeLabeledList(LABELEDLIST LabeledList)
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
Definition: cluster.cpp:1870
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void ParseArguments(int *argc, char ***argv)
void FreeNormProtoList(LIST CharList)
LIST destroy(LIST list)
Definition: oldlist.cpp:121
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
ShapeTable * LoadShapeTable(const std::string &file_prefix)
void InitIntegerFX()
Definition: intfx.cpp:54
INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging")
CLUSTERCONFIG Config
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, const std::string &CharName)
FEATURE_DEFS_STRUCT feature_defs
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:87
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:1597
std::unique_ptr< MasterTrainer > LoadTrainingData(const char *const *filelist, bool replication, ShapeTable **shape_table, std::string &file_prefix)
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:1440
void FreeTrainingSamples(LIST CharList)
void CleanUpUnusedData(LIST ProtoList)
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:192
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
void FreeClass(CLASS_TYPE Class)
Definition: protos.cpp:119
@ elliptical
Definition: cluster.h:53
LIST push(LIST list, void *element)
Definition: oldlist.cpp:178
@ NM_CHAR_ANISOTROPIC
Definition: normalis.h:49
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID)
Definition: cluster.cpp:1491
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
void FreeLabeledClassList(LIST ClassList)
LABELEDLIST FindList(LIST List, const std::string &Label)
internal::ValueArray< T... > Values(T... v)
void move(UnicityTable< T > *from)
ParamsVectors * params()
Definition: ccutil.h:53
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41
bool Open(const char *filename, FileReader reader)
Definition: serialis.cpp:140
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
size_t size() const
Definition: unicharset.h:355
float * Elliptical
Definition: cluster.h:69
std::vector< float > Mean
Definition: cluster.h:83
FLOATUNION Magnitude
Definition: cluster.h:87
FLOATUNION Variance
Definition: cluster.h:86
unsigned NumSamples
Definition: cluster.h:80
FLOATUNION Weight
Definition: cluster.h:88
int16_t SampleSize
Definition: cluster.h:92
PARAM_DESC * ParamDesc
Definition: cluster.h:93
uint32_t NumChar
Definition: cluster.h:98
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:43
void Init(uint8_t xbuckets, uint8_t ybuckets, uint8_t thetabuckets)
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:54
std::vector< FEATURE_STRUCT * > Features
Definition: ocrfeatures.h:85
std::vector< BIT_VECTOR > Configurations
Definition: protos.h:46
UnicityTable< int > font_set
Definition: protos.h:47
int16_t MaxNumConfigs
Definition: protos.h:44
int16_t MaxNumProtos
Definition: protos.h:42
std::vector< PROTO_STRUCT > Prototypes
Definition: protos.h:45
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:255
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:250
unsigned NumShapes() const
Definition: shapetable.h:248
list_rec * first_node()
Definition: oldlist.h:107
tesseract::CLASS_TYPE Class