14#define _USE_MATH_DEFINES
18#ifdef DISABLED_LEGACY_ENGINE
36 "If empty it uses system default. Otherwise it overrides "
37 "system default font location");
38STRING_PARAM_FLAG(fontconfig_tmpdir,
"/tmp",
"Overrides fontconfig default temporary dir");
55 usage +=
" -v | --version | ";
58 usage +=
" [.tr files ...]";
66# include <allheaders.h>
106 "Min number of samples per proto as % of total");
108 "Max percentage of samples in a cluster which have more"
109 " than 1 feature in that cluster");
111 "Desired independence between dimensions");
113 "Desired confidence in prototypes created");
129 usage +=
" -v | --version | ";
132 usage +=
" [.tr files ...]";
136 std::max(0.0, std::min(1.0,
double(FLAGS_clusterconfig_min_samples_fraction)));
137 Config.
MaxIllegal = std::max(0.0, std::min(1.0,
double(FLAGS_clusterconfig_max_illegal)));
138 Config.
Independence = std::max(0.0, std::min(1.0,
double(FLAGS_clusterconfig_independence)));
139 Config.
Confidence = std::max(0.0, std::min(1.0,
double(FLAGS_clusterconfig_confidence)));
141 if (!FLAGS_configfile.empty()) {
150 std::string shape_table_file = file_prefix;
151 shape_table_file += kShapeTableFileSuffix;
153 if (shape_fp.
Open(shape_table_file.c_str(),
nullptr)) {
157 shape_table =
nullptr;
158 tprintf(
"Error: Failed to read shape table %s\n", shape_table_file.c_str());
160 int num_shapes = shape_table->
NumShapes();
161 tprintf(
"Read shape table %s of %d shapes\n", shape_table_file.c_str(), num_shapes);
164 tprintf(
"Warning: No shape table file present: %s\n", shape_table_file.c_str());
171 std::string shape_table_file = file_prefix;
172 shape_table_file += kShapeTableFileSuffix;
173 FILE *fp = fopen(shape_table_file.c_str(),
"wb");
176 fprintf(stderr,
"Error writing shape table: %s\n", shape_table_file.c_str());
180 fprintf(stderr,
"Error creating shape table: %s\n", shape_table_file.c_str());
200std::unique_ptr<MasterTrainer>
LoadTrainingData(
const char *
const *filelist,
bool replication,
201 ShapeTable **shape_table, std::string &file_prefix) {
205 if (!FLAGS_D.empty()) {
206 file_prefix += FLAGS_D.c_str();
213 bool shape_analysis =
false;
214 if (shape_table !=
nullptr) {
216 if (*shape_table !=
nullptr) {
217 shape_analysis =
true;
220 shape_analysis =
true;
222 auto trainer = std::make_unique<MasterTrainer>(
NM_CHAR_ANISOTROPIC, shape_analysis, replication,
226 trainer->LoadUnicharset(FLAGS_U.c_str());
228 if (!FLAGS_F.empty()) {
229 if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
233 if (!FLAGS_X.empty()) {
234 if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
238 trainer->SetFeatureSpace(fs);
240 for (
const char *page_name = *filelist++; page_name !=
nullptr; page_name = *filelist++) {
241 tprintf(
"Reading %s ...\n", page_name);
242 trainer->ReadTrainingSamples(page_name,
feature_defs,
false);
246 int pagename_len = strlen(page_name);
247 char *fontinfo_file_name =
new char[pagename_len + 7];
248 strncpy(fontinfo_file_name, page_name, pagename_len - 2);
249 strcpy(fontinfo_file_name + pagename_len - 2,
"fontinfo");
250 trainer->AddSpacingInfo(fontinfo_file_name);
251 delete[] fontinfo_file_name;
254 if (FLAGS_load_images) {
255 std::string image_name = page_name;
257 image_name.resize(image_name.length() - 2);
259 trainer->LoadPageImages(image_name.c_str());
262 trainer->PostLoadCleanup();
264 if (!FLAGS_output_trainer.empty()) {
265 FILE *fp = fopen(FLAGS_output_trainer.c_str(),
"wb");
267 tprintf(
"Can't create saved trainer data!\n");
269 trainer->Serialize(fp);
273 trainer->PreTrainingSetup();
274 if (!FLAGS_O.empty() && !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
275 fprintf(stderr,
"Failed to save unicharset to file %s\n", FLAGS_O.c_str());
279 if (shape_table !=
nullptr) {
282 if (*shape_table ==
nullptr) {
284 trainer->SetupFlatShapeTable(*shape_table);
285 tprintf(
"Flat shape table summary: %s\n", (*shape_table)->SummaryStr().c_str());
287 (*shape_table)->set_unicharset(trainer->unicharset());
307 if (LabeledList->
Label == Label) {
308 return (LabeledList);
332 LIST *training_samples) {
340 LIST it = *training_samples;
346 while (fgets(buffer, 2048,
file) !=
nullptr) {
347 if (buffer[0] ==
'\n') {
351 sscanf(buffer,
"%*s %s", unichar);
356 "Error: Size of unicharset in training is "
357 "greater than MAX_NUM_CLASSES\n");
361 char_sample =
FindList(*training_samples, unichar);
362 if (char_sample ==
nullptr) {
364 *training_samples =
push(*training_samples, char_sample);
367 feature_samples = char_desc->FeatureSets[feature_type];
369 char_sample->
List =
push(char_sample->
List, feature_samples);
373 delete feature_samples;
375 for (
size_t i = 0;
i < char_desc->NumFeatureSets;
i++) {
376 if (feature_type !=
i) {
377 delete char_desc->FeatureSets[
i];
379 char_desc->FeatureSets[
i] =
nullptr;
396 LIST nodes = CharList;
399 FeatureList = char_sample->
List;
435 const char *program_feature_type) {
438 LIST FeatureList =
nullptr;
445 FeatureList = char_sample->
List;
447 std::vector<float> Sample;
451 if (Sample.empty()) {
454 for (
int j = 0; j < N; j++) {
455 Sample[j] = FeatureSet->
Features[
i]->Params[j];
469 bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
471 LIST pProtoList = ProtoList;
477 float best_dist = 0.125;
480 LIST list_it = ProtoList;
483 if (test_p != Prototype && !test_p->
Merged) {
486 if (dist < best_dist) {
492 if (best_match !=
nullptr && !best_match->
Significant) {
494 auto bestMatchNumSamples = best_match->
NumSamples;
495 auto prototypeNumSamples = Prototype->
NumSamples;
496 tprintf(
"Merging red clusters (%d+%d) at %g,%g and %g,%g\n", bestMatchNumSamples,
497 prototypeNumSamples, best_match->
Mean[0], best_match->
Mean[1], Prototype->
Mean[0],
505 }
else if (best_match !=
nullptr) {
507 tprintf(
"Red proto at %g,%g matched a green one at %g,%g\n", Prototype->
Mean[0],
508 Prototype->
Mean[1], best_match->
Mean[0], best_match->
Mean[1]);
514 int min_samples =
static_cast<int32_t
>(clusterconfig->
MinSamples * Clusterer->
NumChar);
515 pProtoList = ProtoList;
521 tprintf(
"Red proto at %g,%g becoming green\n", Prototype->
Mean[0], Prototype->
Mean[1]);
548 auto pProtoList = ProtoList;
550 auto Proto =
reinterpret_cast<PROTOTYPE *
>(pProtoList->first_node());
551 if ((Proto->Significant && KeepSigProtos) || (!Proto->Significant && KeepInsigProtos)) {
553 NewProto->
Mean = Proto->Mean;
554 NewProto->Significant = Proto->Significant;
555 NewProto->Style = Proto->Style;
556 NewProto->NumSamples = Proto->NumSamples;
557 NewProto->Cluster =
nullptr;
558 NewProto->Distrib.clear();
560 if (Proto->Variance.Elliptical !=
nullptr) {
561 NewProto->Variance.Elliptical =
new float[N];
562 for (
int i = 0;
i < N;
i++) {
563 NewProto->Variance.Elliptical[
i] = Proto->Variance.Elliptical[
i];
566 NewProto->Variance.Elliptical =
nullptr;
569 if (Proto->Magnitude.Elliptical !=
nullptr) {
570 NewProto->Magnitude.Elliptical =
new float[N];
571 for (
int i = 0;
i < N;
i++) {
572 NewProto->Magnitude.Elliptical[
i] = Proto->Magnitude.Elliptical[
i];
575 NewProto->Magnitude.Elliptical =
nullptr;
578 if (Proto->Weight.Elliptical !=
nullptr) {
579 NewProto->Weight.Elliptical =
new float[N];
580 for (
int i = 0;
i < N;
i++) {
581 NewProto->Weight.Elliptical[
i] = Proto->Weight.Elliptical[
i];
584 NewProto->Weight.Elliptical =
nullptr;
587 NewProto->TotalMagnitude = Proto->TotalMagnitude;
588 NewProto->LogMagnitude = Proto->LogMagnitude;
589 NewProtoList =
push_last(NewProtoList, NewProto);
593 return (NewProtoList);
602 if (MergeClass->
Label == Label) {
619 LIST nodes = ClassList;
657 for (
i = 0;
i < NumProtos;
i++) {
664 NewProto->
X = OldProto->
X;
665 NewProto->
Y = OldProto->
Y;
677 NumWords = WordsInVectorOfSize(NumProtos);
678 for (
i = 0;
i < NumConfigs;
i++) {
679 NewConfig = NewBitVector(NumProtos);
681 for (j = 0; j < NumWords; j++) {
682 NewConfig[j] = OldConfig[j];
687 return float_classes;
696 Slope = tan(
Values[2] * 2 * M_PI);
698 Normalizer = 1 / sqrt(Slope * Slope + 1.0);
700 Values[0] = Slope * Normalizer;
702 Values[2] = Intercept * Normalizer;
711 LIST nodes = CharList;
726 LabeledProtoList->List =
push(LabeledProtoList->List, Proto);
728 *NormProtoList =
push(*NormProtoList, LabeledProtoList);
736 if ((Proto->Significant && CountSigProtos) || (!Proto->Significant && CountInsigProtos)) {
#define DOUBLE_PARAM_FLAG(name, val, comment)
#define STRING_PARAM_FLAG(name, val, comment)
#define ProtoIn(Class, Pid)
const int kBoostXYBuckets
const int kBoostDirBuckets
MERGE_CLASS FindClass(LIST List, const std::string &Label)
@ SET_PARAM_CONSTRAINT_NON_INIT_ONLY
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
void Normalize(float *Values)
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table)
float ComputeDistance(int k, PARAM_DESC *dim, float p1[], float p2[])
CHAR_DESC_STRUCT * ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
void FreeLabeledList(LABELEDLIST LabeledList)
void tprintf(const char *format,...)
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void ParseArguments(int *argc, char ***argv)
void FreeNormProtoList(LIST CharList)
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
ShapeTable * LoadShapeTable(const std::string &file_prefix)
INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging")
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, const std::string &CharName)
FEATURE_DEFS_STRUCT feature_defs
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
void FreeProtoList(LIST *ProtoList)
std::unique_ptr< MasterTrainer > LoadTrainingData(const char *const *filelist, bool replication, ShapeTable **shape_table, std::string &file_prefix)
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
void FreeTrainingSamples(LIST CharList)
void CleanUpUnusedData(LIST ProtoList)
LIST push_last(LIST list, void *item)
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
void FreeClass(CLASS_TYPE Class)
LIST push(LIST list, void *element)
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID)
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
void FreeLabeledClassList(LIST ClassList)
LABELEDLIST FindList(LIST List, const std::string &Label)
internal::ValueArray< T... > Values(T... v)
void move(UnicityTable< T > *from)
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
bool Open(const char *filename, FileReader reader)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
bool contains_unichar(const char *const unichar_repr) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
std::vector< float > Mean
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
void Init(uint8_t xbuckets, uint8_t ybuckets, uint8_t thetabuckets)
const PARAM_DESC * ParamDesc
std::vector< FEATURE_STRUCT * > Features
std::vector< BIT_VECTOR > Configurations
UnicityTable< int > font_set
std::vector< PROTO_STRUCT > Prototypes
bool DeSerialize(TFile *fp)
bool Serialize(FILE *fp) const
unsigned NumShapes() const
tesseract::CLASS_TYPE Class