tesseract v5.3.3.20231005
cntraining.cpp File Reference
#include <tesseract/unichar.h>
#include <cmath>
#include <cstdio>
#include <cstring>
#include "cluster.h"
#include "clusttool.h"
#include "commontraining.h"
#include "featdefs.h"
#include "ocrfeatures.h"
#include "oldlist.h"

Go to the source code of this file.

Macros

#define PROGRAM_FEATURE_TYPE   "cn"
 

Functions

int main (int argc, char *argv[])
 

Macro Definition Documentation

◆ PROGRAM_FEATURE_TYPE

#define PROGRAM_FEATURE_TYPE   "cn"

Definition at line 33 of file cntraining.cpp.

Function Documentation

◆ main()

int main ( int  argc,
char *  argv[] 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName CharName NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

It then appends these samples into a separate file for each character. The name of the file is

DirectoryName/FontName/CharName.FeatureTypeName

The DirectoryName can be specified via a command line argument. If not specified, it defaults to the current directory. The format of the resulting files is:

   NumberOfFeatures(M)
      Feature1
      ...
      FeatureM
   NumberOfFeatures(M)
   ...

The output files each have a header which describes the type of feature which the file contains. This header is in the format required by the clusterer. A command line argument can also be used to specify that only the first N samples of each class should be used.

Parameters
argc	number of command line arguments
argv	array of command line arguments
Returns
0 on success

Definition at line 103 of file cntraining.cpp.

103 {
104 tesseract::CheckSharedLibraryVersion();
105
106 // Set the global Config parameters before parsing the command line.
107 Config = CNConfig;
108
109 LIST CharList = NIL_LIST;
110 CLUSTERER *Clusterer = nullptr;
111 LIST ProtoList = NIL_LIST;
112 LIST NormProtoList = NIL_LIST;
113 LIST pCharList;
114 LABELEDLIST CharSample;
115 FEATURE_DEFS_STRUCT FeatureDefs;
116 InitFeatureDefs(&FeatureDefs);
117
118 ParseArguments(&argc, &argv);
119 int num_fonts = 0;
120 for (const char *PageName = *++argv; PageName != nullptr; PageName = *++argv) {
121 printf("Reading %s ...\n", PageName);
122 FILE *TrainingPage = fopen(PageName, "rb");
123 ASSERT_HOST(TrainingPage);
124 if (TrainingPage) {
125 ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr, TrainingPage, &CharList);
126 fclose(TrainingPage);
127 ++num_fonts;
128 }
129 }
130 printf("Clustering ...\n");
131 // To allow an individual font to form a separate cluster,
132 // reduce the min samples:
133 // Config.MinSamples = 0.5 / num_fonts;
134 pCharList = CharList;
135 // The norm protos will count the source protos, so we keep them here in
136 // freeable_protos, so they can be freed later.
137 std::vector<LIST> freeable_protos;
138 iterate(pCharList) {
139 // Cluster
140 CharSample = reinterpret_cast<LABELEDLIST>(pCharList->first_node());
141 Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
142 if (Clusterer == nullptr) { // To avoid a SIGSEGV
143 fprintf(stderr, "Error: nullptr clusterer!\n");
144 return EXIT_FAILURE;
145 }
146 float SavedMinSamples = Config.MinSamples;
147 // To disable the tendency to produce a single cluster for all fonts,
148 // make MagicSamples an impossible to achieve number:
149 // Config.MagicSamples = CharSample->SampleCount * 10;
150 Config.MagicSamples = CharSample->SampleCount;
151 while (Config.MinSamples > 0.001) {
152 ProtoList = ClusterSamples(Clusterer, &Config);
153 if (NumberOfProtos(ProtoList, true, false) > 0) {
154 break;
155 } else {
156 Config.MinSamples *= 0.95;
157 printf(
158 "0 significant protos for %s."
159 " Retrying clustering with MinSamples = %f%%\n",
160 CharSample->Label.c_str(), Config.MinSamples);
161 }
162 }
163 Config.MinSamples = SavedMinSamples;
164 AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
165 freeable_protos.push_back(ProtoList);
166 FreeClusterer(Clusterer);
167 }
168 FreeTrainingSamples(CharList);
169 int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
170 WriteNormProtos(FLAGS_D.c_str(), NormProtoList, FeatureDefs.FeatureDesc[desc_index]);
171 FreeNormProtoList(NormProtoList);
172 for (auto &freeable_proto : freeable_protos) {
173 FreeProtoList(&freeable_proto);
174 }
175 printf("\n");
176 return EXIT_SUCCESS;
177} // main
#define ASSERT_HOST(x)
Definition: errcode.h:54
#define iterate(l)
Definition: oldlist.h:91
#define NIL_LIST
Definition: oldlist.h:75
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:33
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:203
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
void ParseArguments(int *argc, char ***argv)
void FreeNormProtoList(LIST CharList)
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
CLUSTERCONFIG Config
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, const std::string &CharName)
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:87
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:1597
void FreeTrainingSamples(LIST CharList)
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:1575
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos)
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:1543
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:43
list_rec * first_node()
Definition: oldlist.h:107