All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
cntraining.cpp
Go to the documentation of this file.
1 /******************************************************************************
2 ** Filename: cntraining.cpp
3 ** Purpose: Generates a normproto and pffmtable.
4 ** Author: Dan Johnson
5 ** Revision: Christy Russon
6 ** History: Fri Aug 18 08:53:50 1989, DSJ, Created.
7 ** 5/25/90, DSJ, Adapted to multiple feature types.
8 ** Tuesday, May 17, 1998 Changes made to make feature specific and
9 ** simplify structures. First step in simplifying training process.
10 **
11  ** (c) Copyright Hewlett-Packard Company, 1988.
12  ** Licensed under the Apache License, Version 2.0 (the "License");
13  ** you may not use this file except in compliance with the License.
14  ** You may obtain a copy of the License at
15  ** http://www.apache.org/licenses/LICENSE-2.0
16  ** Unless required by applicable law or agreed to in writing, software
17  ** distributed under the License is distributed on an "AS IS" BASIS,
18  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  ** See the License for the specific language governing permissions and
20  ** limitations under the License.
21 ******************************************************************************/
22 
23 
24 /*----------------------------------------------------------------------------
25  Include Files and Type Defines
26 ----------------------------------------------------------------------------*/
27 #include "oldlist.h"
28 #include "efio.h"
29 #include "emalloc.h"
30 #include "featdefs.h"
31 #include "tessopt.h"
32 #include "ocrfeatures.h"
33 #include "clusttool.h"
34 #include "cluster.h"
35 #include <string.h>
36 #include <stdio.h>
37 #include <math.h>
38 #include "unichar.h"
39 #include "commontraining.h"
40 
41 #define PROGRAM_FEATURE_TYPE "cn"
42 
44 
45 /*----------------------------------------------------------------------------
46  Public Function Prototypes
47 ----------------------------------------------------------------------------*/
// Program entry point; see the definition below.
int main(int argc, char **argv);
51 
52 /*----------------------------------------------------------------------------
53  Private Function Prototypes
54 ----------------------------------------------------------------------------*/
55 
56 void WriteNormProtos (
57  const char *Directory,
58  LIST LabeledProtoList,
59  CLUSTERER *Clusterer);
60 
61 /*
62 PARAMDESC *ConvertToPARAMDESC(
63  PARAM_DESC* Param_Desc,
64  int N);
65 */
66 
67 void WriteProtos(
68  FILE *File,
69  uinT16 N,
70  LIST ProtoList,
71  BOOL8 WriteSigProtos,
72  BOOL8 WriteInsigProtos);
73 
74 /*----------------------------------------------------------------------------
75  Global Data Definitions and Declarations
76 ----------------------------------------------------------------------------*/
77 /* global variable to hold configuration parameters to control clustering */
78 //-M 0.025 -B 0.05 -I 0.8 -C 1e-3
80 {
81  elliptical, 0.025, 0.05, 0.8, 1e-3, 0
82 };
83 
84 
85 /*----------------------------------------------------------------------------
86  Public Code
87 ----------------------------------------------------------------------------*/
88 /*---------------------------------------------------------------------------*/
137 int main(int argc, char* argv[])
138 {
139  // Set the global Config parameters before parsing the command line.
140  Config = CNConfig;
141 
142  const char *PageName;
143  FILE *TrainingPage;
144  LIST CharList = NIL_LIST;
145  CLUSTERER *Clusterer = NULL;
146  LIST ProtoList = NIL_LIST;
147  LIST NormProtoList = NIL_LIST;
148  LIST pCharList;
149  LABELEDLIST CharSample;
150  FEATURE_DEFS_STRUCT FeatureDefs;
151  InitFeatureDefs(&FeatureDefs);
152 
153  ParseArguments(&argc, &argv);
154  int num_fonts = 0;
155  while ((PageName = GetNextFilename(argc, argv)) != NULL) {
156  printf("Reading %s ...\n", PageName);
157  TrainingPage = Efopen(PageName, "rb");
159  100, NULL, TrainingPage, &CharList);
160  fclose(TrainingPage);
161  ++num_fonts;
162  }
163  printf("Clustering ...\n");
164  // To allow an individual font to form a separate cluster,
165  // reduce the min samples:
166  // Config.MinSamples = 0.5 / num_fonts;
167  pCharList = CharList;
168  iterate(pCharList) {
169  //Cluster
170  CharSample = (LABELEDLIST)first_node(pCharList);
171  Clusterer =
172  SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
173  float SavedMinSamples = Config.MinSamples;
174  // To disable the tendency to produce a single cluster for all fonts,
175  // make MagicSamples an impossible to achieve number:
176  // Config.MagicSamples = CharSample->SampleCount * 10;
177  Config.MagicSamples = CharSample->SampleCount;
178  while (Config.MinSamples > 0.001) {
179  ProtoList = ClusterSamples(Clusterer, &Config);
180  if (NumberOfProtos(ProtoList, 1, 0) > 0) {
181  break;
182  } else {
183  Config.MinSamples *= 0.95;
184  printf("0 significant protos for %s."
185  " Retrying clustering with MinSamples = %f%%\n",
186  CharSample->Label, Config.MinSamples);
187  }
188  }
189  Config.MinSamples = SavedMinSamples;
190  AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
191  }
192  FreeTrainingSamples(CharList);
193  if (Clusterer == NULL) { // To avoid a SIGSEGV
194  fprintf(stderr, "Error: NULL clusterer!\n");
195  return 1;
196  }
197  WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer);
198  FreeNormProtoList(NormProtoList);
199  FreeProtoList(&ProtoList);
200  FreeClusterer(Clusterer);
201  printf ("\n");
202  return 0;
203 } // main
204 
205 
206 /*----------------------------------------------------------------------------
207  Private Code
208 ----------------------------------------------------------------------------*/
209 
210 /*----------------------------------------------------------------------------*/
223  const char *Directory,
224  LIST LabeledProtoList,
225  CLUSTERER *Clusterer)
226 {
227  FILE *File;
228  STRING Filename;
229  LABELEDLIST LabeledProto;
230  int N;
231 
232  Filename = "";
233  if (Directory != NULL && Directory[0] != '\0')
234  {
235  Filename += Directory;
236  Filename += "/";
237  }
238  Filename += "normproto";
239  printf ("\nWriting %s ...", Filename.string());
240  File = Efopen (Filename.string(), "wb");
241  fprintf(File,"%0d\n",Clusterer->SampleSize);
242  WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc);
243  iterate(LabeledProtoList)
244  {
245  LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
246  N = NumberOfProtos(LabeledProto->List, true, false);
247  if (N < 1) {
248  printf ("\nError! Not enough protos for %s: %d protos"
249  " (%d significant protos"
250  ", %d insignificant protos)\n",
251  LabeledProto->Label, N,
252  NumberOfProtos(LabeledProto->List, 1, 0),
253  NumberOfProtos(LabeledProto->List, 0, 1));
254  exit(1);
255  }
256  fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
257  WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false);
258  }
259  fclose (File);
260 
261 } // WriteNormProtos
262 
263 /*-------------------------------------------------------------------------*/
265  FILE *File,
266  uinT16 N,
267  LIST ProtoList,
268  BOOL8 WriteSigProtos,
269  BOOL8 WriteInsigProtos)
270 {
271  PROTOTYPE *Proto;
272 
273  // write prototypes
274  iterate(ProtoList)
275  {
276  Proto = (PROTOTYPE *) first_node ( ProtoList );
277  if (( Proto->Significant && WriteSigProtos ) ||
278  ( ! Proto->Significant && WriteInsigProtos ) )
279  WritePrototype( File, N, Proto );
280  }
281 } // WriteProtos
void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto)
Definition: clusttool.cpp:343
void FreeNormProtoList(LIST CharList)
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
#define NIL_LIST
Definition: oldlist.h:126
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
void WriteProtos(FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
Definition: cntraining.cpp:264
unsigned char BOOL8
Definition: host.h:113
unsigned Significant
Definition: cluster.h:68
struct LABELEDLISTNODE * LABELEDLIST
const char * GetNextFilename(int argc, const char *const *argv)
void WriteNormProtos(const char *Directory, LIST LabeledProtoList, CLUSTERER *Clusterer)
Definition: cntraining.cpp:222
int MagicSamples
Definition: cluster.h:55
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:515
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:571
int main(int argc, char **argv)
void ParseArguments(int *argc, char ***argv)
void FreeTrainingSamples(LIST CharList)
CLUSTERCONFIG CNConfig
Definition: cntraining.cpp:79
FLOAT32 MinSamples
Definition: cluster.h:50
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
CLUSTERCONFIG Config
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:41
PARAM_DESC * ParamDesc
Definition: cluster.h:88
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:543
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
inT16 SampleSize
Definition: cluster.h:87
void WriteParamDesc(FILE *File, uinT16 N, PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:314
DECLARE_STRING_PARAM_FLAG(D)
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
unsigned short uinT16
Definition: host.h:101