tesseract v5.3.3.20231005
mftraining.cpp
Go to the documentation of this file.
1/******************************************************************************
2 ** Filename: mftraining.c
3 ** Purpose: Separates training pages into files for each character.
4 ** Strips from files only the features and their parameters of
5 ** the feature type mf.
6 ** Author: Dan Johnson
7 ** Revisment: Christy Russon
8 **
9 ** (c) Copyright Hewlett-Packard Company, 1988.
10 ** Licensed under the Apache License, Version 2.0 (the "License");
11 ** you may not use this file except in compliance with the License.
12 ** You may obtain a copy of the License at
13 ** http://www.apache.org/licenses/LICENSE-2.0
14 ** Unless required by applicable law or agreed to in writing, software
15 ** distributed under the License is distributed on an "AS IS" BASIS,
16 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 ** See the License for the specific language governing permissions and
18 ** limitations under the License.
19 ******************************************************************************/
20/*----------------------------------------------------------------------------
21 Include Files and Type Defines
22----------------------------------------------------------------------------*/
23
24#define _USE_MATH_DEFINES // for M_PI
25#ifdef HAVE_CONFIG_H
26# include "config_auto.h"
27#endif
28
29#include <cmath> // for M_PI
30#include <cstdio>
31#include <cstring>
32
33#include "classify.h"
34#include "cluster.h"
35#include "clusttool.h"
36#include "commontraining.h"
37#include "featdefs.h"
38#include "fontinfo.h"
39#include "indexmapbidi.h"
40#include "intproto.h"
41#include "mastertrainer.h"
42#include "mergenf.h"
43#include "mf.h"
44#include "ocrfeatures.h"
45#include "oldlist.h"
46#include "protos.h"
47#include "shapetable.h"
48#include "tprintf.h"
49#include "unicity_table.h"
50
51using namespace tesseract;
52
53/*----------------------------------------------------------------------------
54 Public Code
55-----------------------------------------------------------------------------*/
56#ifndef GRAPHICS_DISABLED
57static void DisplayProtoList(const char *ch, LIST protolist) {
58 auto window = std::make_unique<ScrollView>("Char samples", 50, 200, 520, 520, 260, 260, true);
59 LIST proto = protolist;
60 iterate(proto) {
61 auto *prototype = reinterpret_cast<PROTOTYPE *>(proto->first_node());
62 if (prototype->Significant) {
63 window->Pen(ScrollView::GREEN);
64 } else if (prototype->NumSamples == 0) {
65 window->Pen(ScrollView::BLUE);
66 } else if (prototype->Merged) {
67 window->Pen(ScrollView::MAGENTA);
68 } else {
69 window->Pen(ScrollView::RED);
70 }
71 float x = CenterX(prototype->Mean);
72 float y = CenterY(prototype->Mean);
73 double angle = OrientationOf(prototype->Mean) * 2 * M_PI;
74 auto dx = static_cast<float>(LengthOf(prototype->Mean) * cos(angle) / 2);
75 auto dy = static_cast<float>(LengthOf(prototype->Mean) * sin(angle) / 2);
76 window->SetCursor((x - dx) * 256, (y - dy) * 256);
77 window->DrawTo((x + dx) * 256, (y + dy) * 256);
78 auto prototypeNumSamples = prototype->NumSamples;
79 if (prototype->Significant) {
80 tprintf("Green proto at (%g,%g)+(%g,%g) %d samples\n", x, y, dx, dy, prototypeNumSamples);
81 } else if (prototype->NumSamples > 0 && !prototype->Merged) {
82 tprintf("Red proto at (%g,%g)+(%g,%g) %d samples\n", x, y, dx, dy, prototypeNumSamples);
83 }
84 }
85 window->Update();
86}
87#endif // !GRAPHICS_DISABLED
88
89// Helper to run clustering on a single config.
90// Mostly copied from the old mftraining, but with renamed variables.
91static LIST ClusterOneConfig(int shape_id, const char *class_label, LIST mf_classes,
92 const ShapeTable &shape_table, MasterTrainer *trainer) {
93 int num_samples;
94 CLUSTERER *clusterer =
95 trainer->SetupForClustering(shape_table, feature_defs, shape_id, &num_samples);
96 Config.MagicSamples = num_samples;
97 LIST proto_list = ClusterSamples(clusterer, &Config);
98 CleanUpUnusedData(proto_list);
99
100 // Merge protos where reasonable to make more of them significant by
101 // representing almost all samples of the class/font.
102 MergeInsignificantProtos(proto_list, class_label, clusterer, &Config);
103#ifndef GRAPHICS_DISABLED
104 if (strcmp(FLAGS_test_ch.c_str(), class_label) == 0) {
105 DisplayProtoList(FLAGS_test_ch.c_str(), proto_list);
106 }
107#endif // !GRAPHICS_DISABLED
108 // Delete the protos that will not be used in the inttemp output file.
109 proto_list = RemoveInsignificantProtos(proto_list, true, false, clusterer->SampleSize);
110 FreeClusterer(clusterer);
111 MERGE_CLASS merge_class = FindClass(mf_classes, class_label);
112 if (merge_class == nullptr) {
113 merge_class = new MERGE_CLASS_NODE(class_label);
114 mf_classes = push(mf_classes, merge_class);
115 }
116 int config_id = AddConfigToClass(merge_class->Class);
117 merge_class->Class->font_set.push_back(shape_id);
118 LIST proto_it = proto_list;
119 iterate(proto_it) {
120 auto *prototype = reinterpret_cast<PROTOTYPE *>(proto_it->first_node());
121 // See if proto can be approximated by existing proto.
122 int p_id = FindClosestExistingProto(merge_class->Class, merge_class->NumMerged, prototype);
123 if (p_id == NO_PROTO) {
124 // Need to make a new proto, as it doesn't match anything.
125 p_id = AddProtoToClass(merge_class->Class);
126 MakeNewFromOld(ProtoIn(merge_class->Class, p_id), prototype);
127 merge_class->NumMerged[p_id] = 1;
128 } else {
129 PROTO_STRUCT dummy_proto;
130 MakeNewFromOld(&dummy_proto, prototype);
131 // Merge with the similar proto.
132 ComputeMergedProto(ProtoIn(merge_class->Class, p_id), &dummy_proto,
133 static_cast<float>(merge_class->NumMerged[p_id]), 1.0,
134 ProtoIn(merge_class->Class, p_id));
135 merge_class->NumMerged[p_id]++;
136 }
137 AddProtoToConfig(p_id, merge_class->Class->Configurations[config_id]);
138 }
139 FreeProtoList(&proto_list);
140 return mf_classes;
141}
142
143// Helper to setup the config map.
144// Setup an index mapping from the shapes in the shape table to the classes
145// that will be trained. In keeping with the original design, each shape
146// with the same list of unichars becomes a different class and the configs
147// represent the different combinations of fonts.
148static void SetupConfigMap(ShapeTable *shape_table, IndexMapBiDi *config_map) {
149 int num_configs = shape_table->NumShapes();
150 config_map->Init(num_configs, true);
151 config_map->Setup();
152 for (int c1 = 0; c1 < num_configs; ++c1) {
153 // Only process ids that are not already merged.
154 if (config_map->SparseToCompact(c1) == c1) {
155 Shape *shape1 = shape_table->MutableShape(c1);
156 // Find all the subsequent shapes that are equal.
157 for (int c2 = c1 + 1; c2 < num_configs; ++c2) {
158 if (shape_table->MutableShape(c2)->IsEqualUnichars(shape1)) {
159 config_map->Merge(c1, c2);
160 }
161 }
162 }
163 }
164 config_map->CompleteMerges();
165}
166
194int main(int argc, char **argv) {
195 tesseract::CheckSharedLibraryVersion();
196
197 ParseArguments(&argc, &argv);
198
199 ShapeTable *shape_table = nullptr;
200 std::string file_prefix;
201 // Load the training data.
202 auto trainer = tesseract::LoadTrainingData(argv + 1, false, &shape_table, file_prefix);
203 if (trainer == nullptr) {
204 return EXIT_FAILURE; // Failed.
205 }
206
207 // Setup an index mapping from the shapes in the shape table to the classes
208 // that will be trained. In keeping with the original design, each shape
209 // with the same list of unichars becomes a different class and the configs
210 // represent the different combinations of fonts.
211 IndexMapBiDi config_map;
212 SetupConfigMap(shape_table, &config_map);
213
214 WriteShapeTable(file_prefix, *shape_table);
215 // If the shape_table is flat, then either we didn't run shape clustering, or
216 // it did nothing, so we just output the trainer's unicharset.
217 // Otherwise shape_set will hold a fake unicharset with an entry for each
218 // shape in the shape table, and we will output that instead.
219 UNICHARSET shape_set;
220 const UNICHARSET *unicharset = &trainer->unicharset();
221 // If we ran shapeclustering (and it worked) then at least one shape will
222 // have multiple unichars, so we have to build a fake unicharset.
223 if (shape_table->AnyMultipleUnichars()) {
224 unicharset = &shape_set;
225 // Now build a fake unicharset for the compact shape space to keep the
226 // output modules happy that we are doing things correctly.
227 int num_shapes = config_map.CompactSize();
228 for (int s = 0; s < num_shapes; ++s) {
229 char shape_label[14];
230 snprintf(shape_label, sizeof(shape_label), "sh%04d", s);
231 shape_set.unichar_insert(shape_label);
232 }
233 }
234
235 // Now train each config separately.
236 int num_configs = shape_table->NumShapes();
237 LIST mf_classes = NIL_LIST;
238 for (int s = 0; s < num_configs; ++s) {
239 int unichar_id, font_id;
240 if (unicharset == &shape_set) {
241 // Using fake unichar_ids from the config_map/shape_set.
242 unichar_id = config_map.SparseToCompact(s);
243 } else {
244 // Get the real unichar_id from the shape table/unicharset.
245 shape_table->GetFirstUnicharAndFont(s, &unichar_id, &font_id);
246 }
247 const char *class_label = unicharset->id_to_unichar(unichar_id);
248 mf_classes = ClusterOneConfig(s, class_label, mf_classes, *shape_table, trainer.get());
249 }
250 std::string inttemp_file = file_prefix;
251 inttemp_file += "inttemp";
252 std::string pffmtable_file = file_prefix;
253 pffmtable_file += "pffmtable";
254 CLASS_STRUCT *float_classes = SetUpForFloat2Int(*unicharset, mf_classes);
255 // Now write the inttemp and pffmtable.
256 trainer->WriteInttempAndPFFMTable(trainer->unicharset(), *unicharset, *shape_table, float_classes,
257 inttemp_file.c_str(), pffmtable_file.c_str());
258 for (size_t c = 0; c < unicharset->size(); ++c) {
259 FreeClassFields(&float_classes[c]);
260 }
261 delete[] float_classes;
262 FreeLabeledClassList(mf_classes);
263 delete shape_table;
264 printf("Done!\n");
265 if (!FLAGS_test_ch.empty()) {
266 // If we are displaying debug window(s), wait for the user to look at them.
267 printf("Hit return to exit...\n");
268 while (getchar() != '\n') {
269 ;
270 }
271 }
272 return EXIT_SUCCESS;
273} /* main */
#define NO_PROTO
Definition: matchdefs.h:41
#define iterate(l)
Definition: oldlist.h:91
#define NIL_LIST
Definition: oldlist.h:75
#define CenterX(M)
Definition: mergenf.h:48
#define CenterY(M)
Definition: mergenf.h:49
#define LengthOf(M)
Definition: mergenf.h:50
#define OrientationOf(M)
Definition: mergenf.h:51
void ComputeMergedProto(PROTO_STRUCT *p1, PROTO_STRUCT *p2, float w1, float w2, PROTO_STRUCT *MergedProto)
Definition: mergenf.cpp:130
int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[], PROTOTYPE *Prototype)
Definition: mergenf.cpp:158
void MakeNewFromOld(PROTO_STRUCT *New, PROTOTYPE *Old)
Definition: mergenf.cpp:194
int main(int argc, char **argv)
Definition: mftraining.cpp:194
#define AddProtoToConfig(Pid, Config)
Definition: protos.h:61
#define ProtoIn(Class, Pid)
Definition: protos.h:70
const double y
MERGE_CLASS FindClass(LIST List, const std::string &Label)
void WriteShapeTable(const std::string &file_prefix, const ShapeTable &shape_table)
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void ParseArguments(int *argc, char ***argv)
int AddConfigToClass(CLASS_TYPE Class)
Definition: protos.cpp:49
int AddProtoToClass(CLASS_TYPE Class)
Definition: protos.cpp:82
CLUSTERCONFIG Config
void FreeClassFields(CLASS_TYPE Class)
Definition: protos.cpp:131
FEATURE_DEFS_STRUCT feature_defs
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *clusterconfig)
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:1597
std::unique_ptr< MasterTrainer > LoadTrainingData(const char *const *filelist, bool replication, ShapeTable **shape_table, std::string &file_prefix)
void CleanUpUnusedData(LIST ProtoList)
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:1575
LIST push(LIST list, void *element)
Definition: oldlist.cpp:178
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N)
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:1543
void FreeLabeledClassList(LIST ClassList)
int push_back(T object)
Add an element in the table.
Definition: unicity_table.h:80
int CompactSize() const
Definition: indexmapbidi.h:63
void Init(int size, bool all_mapped)
bool Merge(int compact_index1, int compact_index2)
int SparseToCompact(int sparse_index) const override
Definition: indexmapbidi.h:140
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
size_t size() const
Definition: unicharset.h:355
int16_t SampleSize
Definition: cluster.h:92
std::vector< BIT_VECTOR > Configurations
Definition: protos.h:46
UnicityTable< int > font_set
Definition: protos.h:47
bool IsEqualUnichars(Shape *other)
Definition: shapetable.cpp:222
bool AnyMultipleUnichars() const
Definition: shapetable.cpp:458
Shape * MutableShape(unsigned shape_id)
Definition: shapetable.h:295
unsigned NumShapes() const
Definition: shapetable.h:248
void GetFirstUnicharAndFont(unsigned shape_id, int *unichar_id, int *font_id) const
Definition: shapetable.cpp:420
list_rec * first_node()
Definition: oldlist.h:107
tesseract::CLASS_TYPE Class
int NumMerged[MAX_NUM_PROTOS]
void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
const UNICHARSET & unicharset() const
CLUSTERER * SetupForClustering(const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)