tesseract v5.3.3.20231005
commontraining.h
Go to the documentation of this file.
1// Copyright 2008 Google Inc. All Rights Reserved.
2// Author: scharron@google.com (Samuel Charron)
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7// http://www.apache.org/licenses/LICENSE-2.0
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13
14#ifndef TESSERACT_TRAINING_COMMONTRAINING_H_
15#define TESSERACT_TRAINING_COMMONTRAINING_H_
16
17#ifdef HAVE_CONFIG_H
18# include "config_auto.h"
19#endif
20
21#include "commandlineflags.h"
22#include "export.h"
23#include "tprintf.h"
24
25#include <tesseract/baseapi.h>
26
27#include <memory>
28
29namespace tesseract {
30
31TESS_COMMON_TRAINING_API
32void ParseArguments(int *argc, char ***argv);
33
// Checks whether the tesseract shared library in use reports the same
// version string this program was compiled with.
//
// This function must be inline because otherwise it would be part of the
// shared library, so it could not compare the versions.
//
// The check is only compiled in when HAVE_CONFIG_H is defined; without it the
// function is a no-op. TESSERACT_VERSION_STR is presumably provided by
// config_auto.h -- TODO confirm. On a mismatch both version strings are
// reported through tprintf and the process terminates with exit status 1.
static inline void CheckSharedLibraryVersion() {
#ifdef HAVE_CONFIG_H
  // Query the library once so the comparison and the message agree.
  const char *library_version = TessBaseAPI::Version();
  if (strcmp(TESSERACT_VERSION_STR, library_version) != 0) {
    tprintf(
        "ERROR: shared library version mismatch (was %s, expected %s)\n"
        "Did you use a wrong shared tesseract library?\n",
        library_version, TESSERACT_VERSION_STR);
    exit(1);
  }
#endif
}
48
49} // namespace tesseract
50
51#ifndef DISABLED_LEGACY_ENGINE
52
53# include "cluster.h"
54# include "featdefs.h"
55# include "intproto.h"
56# include "oldlist.h"
57
58namespace tesseract {
59
60class Classify;
61class MasterTrainer;
62class ShapeTable;
63
65// Globals ///////////////////////////////////////////////////////////////////
67
68TESS_COMMON_TRAINING_API
69extern FEATURE_DEFS_STRUCT feature_defs;
70
71// Must be defined in the file that "implements" commonTraining facilities.
72TESS_COMMON_TRAINING_API
73extern CLUSTERCONFIG Config;
74
76// Structs ///////////////////////////////////////////////////////////////////
82 LABELEDLISTNODE(const char *label) : Label(label) {
83 }
84 std::string Label;
85 int SampleCount = 0;
87 LIST List = nullptr;
88};
90
93 }
94 std::string Label;
97};
99
101// Functions /////////////////////////////////////////////////////////////////
103
104// Helper loads shape table from the given file.
105ShapeTable *LoadShapeTable(const std::string &file_prefix);
106// Helper to write the shape_table.
107TESS_COMMON_TRAINING_API
108void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table);
109
110// Creates a MasterTrainer and loads the training data into it:
111// Initializes feature_defs and IntegerFX.
112// Loads the shape_table if shape_table != nullptr.
113// Loads initial unicharset from -U command-line option.
114// If FLAGS_input_trainer is set, loads the majority of data from there, else:
115// Loads font info from -F option.
116// Loads xheights from -X option.
117// Loads samples from .tr files in remaining command-line args.
118// Deletes outliers and computes canonical samples.
119// If FLAGS_output_trainer is set, saves the trainer for future use.
120// Computes canonical and cloud features.
121// If shape_table is not nullptr, but failed to load, make a fake flat one,
122// as shape clustering was not run.
123TESS_COMMON_TRAINING_API
124std::unique_ptr<MasterTrainer> LoadTrainingData(const char *const *filelist, bool replication,
125 ShapeTable **shape_table, std::string &file_prefix);
126
127LABELEDLIST FindList(tesseract::LIST List, const std::string &Label);
128
129TESS_COMMON_TRAINING_API
131 const char *feature_name, int max_samples,
132 tesseract::UNICHARSET *unicharset, FILE *file,
133 tesseract::LIST *training_samples);
134
135void WriteTrainingSamples(const tesseract::FEATURE_DEFS_STRUCT &FeatureDefs, char *Directory,
136 tesseract::LIST CharList, const char *program_feature_type);
137
138TESS_COMMON_TRAINING_API
140
141TESS_COMMON_TRAINING_API
142void FreeLabeledList(LABELEDLIST LabeledList);
143
144TESS_COMMON_TRAINING_API
145void FreeLabeledClassList(tesseract::LIST ClassListList);
146
147TESS_COMMON_TRAINING_API
149 LABELEDLIST CharSample, const char *program_feature_type);
150
151TESS_COMMON_TRAINING_API
152tesseract::LIST RemoveInsignificantProtos(tesseract::LIST ProtoList, bool KeepSigProtos,
153 bool KeepInsigProtos, int N);
154
155TESS_COMMON_TRAINING_API
156void CleanUpUnusedData(tesseract::LIST ProtoList);
157
158TESS_COMMON_TRAINING_API
159void MergeInsignificantProtos(tesseract::LIST ProtoList, const char *label,
161
162TESS_COMMON_TRAINING_API
163MERGE_CLASS FindClass(tesseract::LIST List, const std::string &Label);
164
165TESS_COMMON_TRAINING_API
167 tesseract::LIST LabeledClassList);
168
169void Normalize(float *Values);
170
171TESS_COMMON_TRAINING_API
173
174TESS_COMMON_TRAINING_API
175void AddToNormProtosList(tesseract::LIST *NormProtoList, tesseract::LIST ProtoList, const std::string &CharName);
176
177TESS_COMMON_TRAINING_API
178int NumberOfProtos(tesseract::LIST ProtoList, bool CountSigProtos, bool CountInsigProtos);
179
181
182} // namespace tesseract
183
184#endif // def DISABLED_LEGACY_ENGINE
185
186#endif // TESSERACT_TRAINING_COMMONTRAINING_H_
#define MAX_NUM_PROTOS
Definition: intproto.h:48
#define MAX_NUM_CONFIGS
Definition: intproto.h:47
void WriteTrainingSamples(const tesseract::FEATURE_DEFS_STRUCT &FeatureDefs, char *Directory, tesseract::LIST CharList, const char *program_feature_type)
MERGE_CLASS FindClass(LIST List, const std::string &Label)
void Normalize(float *Values)
void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table)
void FreeLabeledList(LABELEDLIST LabeledList)
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void ParseArguments(int *argc, char ***argv)
void FreeNormProtoList(LIST CharList)
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
ShapeTable * LoadShapeTable(const std::string &file_prefix)
CLUSTERCONFIG Config
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, const std::string &CharName)
FEATURE_DEFS_STRUCT feature_defs
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
std::unique_ptr< MasterTrainer > LoadTrainingData(const char *const *filelist, bool replication, ShapeTable **shape_table, std::string &file_prefix)
void FreeTrainingSamples(LIST CharList)
void CleanUpUnusedData(LIST ProtoList)
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
Definition: protos.cpp:145
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
void FreeLabeledClassList(LIST ClassList)
LABELEDLIST FindList(LIST List, const std::string &Label)
void allocNormProtos()
internal::ValueArray< T... > Values(T... v)
static const char * Version()
Definition: baseapi.cpp:241
LABELEDLISTNODE(const char *label)
MERGE_CLASS_NODE(const char *label)
tesseract::CLASS_TYPE Class
int NumMerged[MAX_NUM_PROTOS]