All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
commontraining.cpp
Go to the documentation of this file.
1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Author: scharron@google.com (Samuel Charron)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 #include "commontraining.h"
15 
16 #include "allheaders.h"
17 #include "ccutil.h"
18 #include "classify.h"
19 #include "cluster.h"
20 #include "clusttool.h"
21 #include "efio.h"
22 #include "emalloc.h"
23 #include "featdefs.h"
24 #include "fontinfo.h"
25 #include "freelist.h"
26 #include "globals.h"
27 #include "intfeaturespace.h"
28 #include "mastertrainer.h"
29 #include "mf.h"
30 #include "ndminx.h"
31 #include "oldlist.h"
32 #include "params.h"
33 #include "shapetable.h"
34 #include "tessdatamanager.h"
35 #include "tessopt.h"
36 #include "tprintf.h"
37 #include "unicity_table.h"
38 
39 #include <math.h>
40 
41 using tesseract::CCUtil;
46 
47 // Global Variables.
48 
49 // global variable to hold configuration parameters to control clustering
50 // -M 0.625 -B 0.05 -I 1.0 -C 1e-6.
// The MinSamples, MaxIllegal, Independence and Confidence fields of Config
// are overwritten by ParseArguments() from the clusterconfig_* flags below.
51 CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 };
54 
// Command-line flags shared by the training tools in this file. Each macro
// invocation declares one flag as (name, default value, help text); the value
// is read elsewhere in this file as FLAGS_<name>.
55 INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
56 INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
57 STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
58 STRING_PARAM_FLAG(D, "", "Directory to write output files to");
59 STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
60 STRING_PARAM_FLAG(X, "", "File listing font xheights");
61 STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
62 STRING_PARAM_FLAG(O, "", "File to write unicharset to");
63 STRING_PARAM_FLAG(T, "", "File to load trainer from");
64 STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
65 STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
// The four clustering flags take their defaults from the initial values of
// Config above; ParseArguments() clamps each of them to [0, 1] before
// storing it back into Config.
66 DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
67  "Min number of samples per proto as % of total");
68 DOUBLE_PARAM_FLAG(clusterconfig_max_illegal, Config.MaxIllegal,
69  "Max percentage of samples in a cluster which have more"
70  " than 1 feature in that cluster");
71 DOUBLE_PARAM_FLAG(clusterconfig_independence, Config.Independence,
72  "Desired independence between dimensions");
73 DOUBLE_PARAM_FLAG(clusterconfig_confidence, Config.Confidence,
74  "Desired confidence in prototypes created");
75 
// Parses the command line of a training tool.
//   argc, argv: pointers to main()'s arguments. They are handed to
//               ParseCommandLineFlags() with remove_flags == true.
// Side effects: sets the global tessoptind to 1, copies the four
// clusterconfig_* flags (each clamped to [0, 1]) into the global Config, and,
// when the configfile flag is non-empty, passes that file name and
// ccutil.params() to a further call (see the note inside the function).
88 void ParseArguments(int* argc, char ***argv) {
89  STRING usage;
// Build the usage text from the program name, if there is one.
90  if (*argc) {
91  usage += (*argv)[0];
92  }
93  usage += " [.tr files ...]";
94  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
95  // Record the index of the first non-flag argument to 1, since we set
96  // remove_flags to true when parsing the flags.
97  tessoptind = 1;
98  // Set some global values based on the flags.
// MIN(1.0, x) caps the value at 1 and MAX(0.0, ...) floors it at 0.
99  Config.MinSamples =
100  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
101  Config.MaxIllegal =
102  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_max_illegal)));
103  Config.Independence =
104  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_independence)));
105  Config.Confidence =
106  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_confidence)));
107  // Set additional parameters from config file if specified.
108  if (!FLAGS_configfile.empty()) {
// NOTE(review): listing lines 109 and 111 (the start of this call and one of
// its arguments) are absent from this copy of the file; presumably this is a
// ReadParamsFile() call - confirm against the original source.
110  FLAGS_configfile.c_str(),
112  ccutil.params());
113  }
114 }
115 
116 namespace tesseract {
117 // Helper loads shape table from the given file.
118 ShapeTable* LoadShapeTable(const STRING& file_prefix) {
119  ShapeTable* shape_table = NULL;
120  STRING shape_table_file = file_prefix;
121  shape_table_file += kShapeTableFileSuffix;
122  FILE* shape_fp = fopen(shape_table_file.string(), "rb");
123  if (shape_fp != NULL) {
124  shape_table = new ShapeTable;
125  if (!shape_table->DeSerialize(false, shape_fp)) {
126  delete shape_table;
127  shape_table = NULL;
128  tprintf("Error: Failed to read shape table %s\n",
129  shape_table_file.string());
130  } else {
131  int num_shapes = shape_table->NumShapes();
132  tprintf("Read shape table %s of %d shapes\n",
133  shape_table_file.string(), num_shapes);
134  }
135  fclose(shape_fp);
136  } else {
137  tprintf("Warning: No shape table file present: %s\n",
138  shape_table_file.string());
139  }
140  return shape_table;
141 }
142 
143 // Helper to write the shape_table.
144 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) {
145  STRING shape_table_file = file_prefix;
146  shape_table_file += kShapeTableFileSuffix;
147  FILE* fp = fopen(shape_table_file.string(), "wb");
148  if (fp != NULL) {
149  if (!shape_table.Serialize(fp)) {
150  fprintf(stderr, "Error writing shape table: %s\n",
151  shape_table_file.string());
152  }
153  fclose(fp);
154  } else {
155  fprintf(stderr, "Error creating shape table: %s\n",
156  shape_table_file.string());
157  }
158 }
159 
// Builds a MasterTrainer either from the .tr files named on the command line
// or from a previously saved trainer file (flag T).
//   argc, argv:   command-line arguments; the file names are fetched with
//                 GetNextFilename().
//   replication:  forwarded to the trainer's construction.
//   shape_table:  if non-NULL, receives a shape table: the one loaded from
//                 *file_prefix if that succeeds, otherwise a newly created
//                 flat table. If NULL, no shape table is loaded or created.
//   file_prefix:  output; set to "" or to flag D followed by "/".
// Returns the trainer, or NULL (after deleting the trainer) when loading font
// info, loading xheights, deserializing the trainer or saving the unicharset
// fails.
175 MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
176  bool replication,
177  ShapeTable** shape_table,
178  STRING* file_prefix) {
179  InitFeatureDefs(&feature_defs);
180  InitIntegerFX();
181  *file_prefix = "";
182  if (!FLAGS_D.empty()) {
183  *file_prefix += FLAGS_D.c_str();
184  *file_prefix += "/";
185  }
186  // If we are shape clustering (NULL shape_table) or we successfully load
187  // a shape_table written by a previous shape clustering, then
188  // shape_analysis will be true, meaning that the MasterTrainer will replace
189  // some members of the unicharset with their fragments.
190  bool shape_analysis = false;
191  if (shape_table != NULL) {
192  *shape_table = LoadShapeTable(*file_prefix);
193  if (*shape_table != NULL)
194  shape_analysis = true;
195  } else {
196  shape_analysis = true;
197  }
// NOTE(review): listing line 198 is absent from this copy of the file;
// presumably it declares `trainer` and opens the constructor call whose
// remaining arguments follow - confirm against the original source.
199  shape_analysis,
200  replication,
201  FLAGS_debug_level);
202  IntFeatureSpace fs;
// NOTE(review): listing line 203 is absent; presumably it initialises fs
// before fs is passed to SetFeatureSpace() below - confirm.
// With no saved trainer (flag T empty), build everything from the inputs.
204  if (FLAGS_T.empty()) {
205  trainer->LoadUnicharset(FLAGS_U.c_str());
206  // Get basic font information from font_properties.
207  if (!FLAGS_F.empty()) {
208  if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
209  delete trainer;
210  return NULL;
211  }
212  }
213  if (!FLAGS_X.empty()) {
214  if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
215  delete trainer;
216  return NULL;
217  }
218  }
219  trainer->SetFeatureSpace(fs);
220  const char* page_name;
221  // Load training data from .tr files on the command line.
222  while ((page_name = GetNextFilename(argc, argv)) != NULL) {
223  tprintf("Reading %s ...\n", page_name);
224  trainer->ReadTrainingSamples(page_name, feature_defs, false);
225
226  // If there is a file with [lang].[fontname].exp[num].fontinfo present,
227  // read font spacing information in to fontinfo_table.
228  int pagename_len = strlen(page_name);
// The buffer holds the name minus its last 2 characters, plus "fontinfo"
// (8 characters) and the terminating NUL: (len - 2) + 8 + 1 == len + 7.
// NOTE(review): a page_name shorter than 2 characters makes
// pagename_len - 2 negative, which becomes a huge count in strncpy.
229  char *fontinfo_file_name = new char[pagename_len + 7];
230  strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr"
231  strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
232  trainer->AddSpacingInfo(fontinfo_file_name);
233  delete[] fontinfo_file_name;
234
235  // Load the images into memory if required by the classifier.
236  if (FLAGS_load_images) {
237  STRING image_name = page_name;
238  // Chop off the tr and replace with tif. Extension must be tif!
239  image_name.truncate_at(image_name.length() - 2);
240  image_name += "tif";
241  trainer->LoadPageImages(image_name.string());
242  }
243  }
244  trainer->PostLoadCleanup();
245  // Write the master trainer if required.
246  if (!FLAGS_output_trainer.empty()) {
247  FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
248  if (fp == NULL) {
249  tprintf("Can't create saved trainer data!\n");
250  } else {
251  trainer->Serialize(fp);
252  fclose(fp);
253  }
254  }
255  } else {
// Flag T names a saved trainer: restore it instead of reading .tr files.
256  bool success = false;
257  tprintf("Loading master trainer from file:%s\n",
258  FLAGS_T.c_str());
259  FILE* fp = fopen(FLAGS_T.c_str(), "rb");
260  if (fp == NULL) {
261  tprintf("Can't read file %s to initialize master trainer\n",
262  FLAGS_T.c_str());
263  } else {
264  success = trainer->DeSerialize(false, fp);
265  fclose(fp);
266  }
// success stays false both when the file could not be opened and when
// DeSerialize() reported failure.
267  if (!success) {
268  tprintf("Deserialize of master trainer failed!\n");
269  delete trainer;
270  return NULL;
271  }
272  trainer->SetFeatureSpace(fs);
273  }
274  trainer->PreTrainingSetup();
// Optionally save the trainer's unicharset to the file named by flag O.
275  if (!FLAGS_O.empty() &&
276  !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
277  fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
278  delete trainer;
279  return NULL;
280  }
281  if (shape_table != NULL) {
282  // If we previously failed to load a shapetable, then shape clustering
283  // wasn't run so make a flat one now.
284  if (*shape_table == NULL) {
285  *shape_table = new ShapeTable;
286  trainer->SetupFlatShapeTable(*shape_table);
287  tprintf("Flat shape table summary: %s\n",
288  (*shape_table)->SummaryStr().string());
289  }
290  (*shape_table)->set_unicharset(trainer->unicharset());
291  }
292  return trainer;
293 }
294 
295 } // namespace tesseract.
296 
297 /*---------------------------------------------------------------------------*/
310 const char *GetNextFilename(int argc, const char* const * argv) {
311  if (tessoptind < argc)
312  return argv[tessoptind++];
313  else
314  return NULL;
315 } /* GetNextFilename */
316 
317 
318 
319 /*---------------------------------------------------------------------------*/
332  LIST List,
333  char *Label)
334 {
335  LABELEDLIST LabeledList;
336 
337  iterate (List)
338  {
339  LabeledList = (LABELEDLIST) first_node (List);
340  if (strcmp (LabeledList->Label, Label) == 0)
341  return (LabeledList);
342  }
343  return (NULL);
344 
345 } /* FindList */
346 
347 /*---------------------------------------------------------------------------*/
358  const char *Label)
359 {
360  LABELEDLIST LabeledList;
361 
362  LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE));
363  LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
364  strcpy (LabeledList->Label, Label);
365  LabeledList->List = NIL_LIST;
366  LabeledList->SampleCount = 0;
367  LabeledList->font_sample_count = 0;
368  return (LabeledList);
369 
370 } /* NewLabeledList */
371 
372 /*---------------------------------------------------------------------------*/
373 // TODO(rays) This is now used only by cntraining. Convert cntraining to use
374 // the new method or get rid of it entirely.
394 void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs,
395  const char *feature_name, int max_samples,
396  UNICHARSET* unicharset,
397  FILE* file, LIST* training_samples) {
398  char buffer[2048];
399  char unichar[UNICHAR_LEN + 1];
400  LABELEDLIST char_sample;
401  FEATURE_SET feature_samples;
402  CHAR_DESC char_desc;
403  int i;
404  int feature_type = ShortNameToFeatureType(feature_defs, feature_name);
405  // Zero out the font_sample_count for all the classes.
406  LIST it = *training_samples;
407  iterate(it) {
408  char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
409  char_sample->font_sample_count = 0;
410  }
411 
412  while (fgets(buffer, 2048, file) != NULL) {
413  if (buffer[0] == '\n')
414  continue;
415 
416  sscanf(buffer, "%*s %s", unichar);
417  if (unicharset != NULL && !unicharset->contains_unichar(unichar)) {
418  unicharset->unichar_insert(unichar);
419  if (unicharset->size() > MAX_NUM_CLASSES) {
420  tprintf("Error: Size of unicharset in training is "
421  "greater than MAX_NUM_CLASSES\n");
422  exit(1);
423  }
424  }
425  char_sample = FindList(*training_samples, unichar);
426  if (char_sample == NULL) {
427  char_sample = NewLabeledList(unichar);
428  *training_samples = push(*training_samples, char_sample);
429  }
430  char_desc = ReadCharDescription(feature_defs, file);
431  feature_samples = char_desc->FeatureSets[feature_type];
432  if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
433  char_sample->List = push(char_sample->List, feature_samples);
434  char_sample->SampleCount++;
435  char_sample->font_sample_count++;
436  } else {
437  FreeFeatureSet(feature_samples);
438  }
439  for (i = 0; i < char_desc->NumFeatureSets; i++) {
440  if (feature_type != i)
441  FreeFeatureSet(char_desc->FeatureSets[i]);
442  }
443  free(char_desc);
444  }
445 } // ReadTrainingSamples
446 
447 
448 /*---------------------------------------------------------------------------*/
458 void FreeTrainingSamples(LIST CharList) {
459  LABELEDLIST char_sample;
460  FEATURE_SET FeatureSet;
461  LIST FeatureList;
462 
463 
464  iterate(CharList) { /* iterate thru all of the fonts */
465  char_sample = (LABELEDLIST) first_node(CharList);
466  FeatureList = char_sample->List;
467  iterate(FeatureList) { /* iterate thru all of the classes */
468  FeatureSet = (FEATURE_SET) first_node(FeatureList);
469  FreeFeatureSet(FeatureSet);
470  }
471  FreeLabeledList(char_sample);
472  }
473  destroy(CharList);
474 } /* FreeTrainingSamples */
475 
476 /*---------------------------------------------------------------------------*/
487 void FreeLabeledList(LABELEDLIST LabeledList) {
488  destroy(LabeledList->List);
489  free(LabeledList->Label);
490  free(LabeledList);
491 } /* FreeLabeledList */
492 
493 /*---------------------------------------------------------------------------*/
508  LABELEDLIST char_sample,
509  const char* program_feature_type) {
510  uinT16 N;
511  int i, j;
512  FLOAT32 *Sample = NULL;
513  CLUSTERER *Clusterer;
514  inT32 CharID;
515  LIST FeatureList = NULL;
516  FEATURE_SET FeatureSet = NULL;
517 
518  int desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
519  N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
520  Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
521 
522  FeatureList = char_sample->List;
523  CharID = 0;
524  iterate(FeatureList) {
525  FeatureSet = (FEATURE_SET) first_node(FeatureList);
526  for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
527  if (Sample == NULL)
528  Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
529  for (j = 0; j < N; j++)
530  Sample[j] = FeatureSet->Features[i]->Params[j];
531  MakeSample (Clusterer, Sample, CharID);
532  }
533  CharID++;
534  }
535  if ( Sample != NULL ) free( Sample );
536  return( Clusterer );
537 
538 } /* SetUpForClustering */
539 
540 /*------------------------------------------------------------------------*/
541 void MergeInsignificantProtos(LIST ProtoList, const char* label,
542  CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
543  PROTOTYPE *Prototype;
544  bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
545 
546  LIST pProtoList = ProtoList;
547  iterate(pProtoList) {
548  Prototype = (PROTOTYPE *) first_node (pProtoList);
549  if (Prototype->Significant || Prototype->Merged)
550  continue;
551  FLOAT32 best_dist = 0.125;
552  PROTOTYPE* best_match = NULL;
553  // Find the nearest alive prototype.
554  LIST list_it = ProtoList;
555  iterate(list_it) {
556  PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
557  if (test_p != Prototype && !test_p->Merged) {
558  FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
559  Clusterer->ParamDesc,
560  Prototype->Mean, test_p->Mean);
561  if (dist < best_dist) {
562  best_match = test_p;
563  best_dist = dist;
564  }
565  }
566  }
567  if (best_match != NULL && !best_match->Significant) {
568  if (debug)
569  tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
570  best_match->NumSamples, Prototype->NumSamples,
571  best_match->Mean[0], best_match->Mean[1],
572  Prototype->Mean[0], Prototype->Mean[1]);
573  best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
574  Clusterer->ParamDesc,
575  best_match->NumSamples,
576  Prototype->NumSamples,
577  best_match->Mean,
578  best_match->Mean, Prototype->Mean);
579  Prototype->NumSamples = 0;
580  Prototype->Merged = 1;
581  } else if (best_match != NULL) {
582  if (debug)
583  tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
584  Prototype->Mean[0], Prototype->Mean[1],
585  best_match->Mean[0], best_match->Mean[1]);
586  Prototype->Merged = 1;
587  }
588  }
589  // Mark significant those that now have enough samples.
590  int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar);
591  pProtoList = ProtoList;
592  iterate(pProtoList) {
593  Prototype = (PROTOTYPE *) first_node (pProtoList);
594  // Process insignificant protos that do not match a green one
595  if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
596  !Prototype->Merged) {
597  if (debug)
598  tprintf("Red proto at %g,%g becoming green\n",
599  Prototype->Mean[0], Prototype->Mean[1]);
600  Prototype->Significant = true;
601  }
602  }
603 } /* MergeInsignificantProtos */
604 
605 /*-----------------------------------------------------------------------------*/
607  LIST ProtoList)
608 {
609  PROTOTYPE* Prototype;
610 
611  iterate(ProtoList)
612  {
613  Prototype = (PROTOTYPE *) first_node (ProtoList);
614  if(Prototype->Variance.Elliptical != NULL)
615  {
616  memfree(Prototype->Variance.Elliptical);
617  Prototype->Variance.Elliptical = NULL;
618  }
619  if(Prototype->Magnitude.Elliptical != NULL)
620  {
621  memfree(Prototype->Magnitude.Elliptical);
622  Prototype->Magnitude.Elliptical = NULL;
623  }
624  if(Prototype->Weight.Elliptical != NULL)
625  {
626  memfree(Prototype->Weight.Elliptical);
627  Prototype->Weight.Elliptical = NULL;
628  }
629  }
630 }
631 
632 /*------------------------------------------------------------------------*/
634  LIST ProtoList,
635  BOOL8 KeepSigProtos,
636  BOOL8 KeepInsigProtos,
637  int N)
638 
639 {
640  LIST NewProtoList = NIL_LIST;
641  LIST pProtoList;
642  PROTOTYPE* Proto;
643  PROTOTYPE* NewProto;
644  int i;
645 
646  pProtoList = ProtoList;
647  iterate(pProtoList)
648  {
649  Proto = (PROTOTYPE *) first_node (pProtoList);
650  if ((Proto->Significant && KeepSigProtos) ||
651  (!Proto->Significant && KeepInsigProtos))
652  {
653  NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
654 
655  NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
656  NewProto->Significant = Proto->Significant;
657  NewProto->Style = Proto->Style;
658  NewProto->NumSamples = Proto->NumSamples;
659  NewProto->Cluster = NULL;
660  NewProto->Distrib = NULL;
661 
662  for (i=0; i < N; i++)
663  NewProto->Mean[i] = Proto->Mean[i];
664  if (Proto->Variance.Elliptical != NULL)
665  {
666  NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
667  for (i=0; i < N; i++)
668  NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
669  }
670  else
671  NewProto->Variance.Elliptical = NULL;
672  //---------------------------------------------
673  if (Proto->Magnitude.Elliptical != NULL)
674  {
675  NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
676  for (i=0; i < N; i++)
677  NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
678  }
679  else
680  NewProto->Magnitude.Elliptical = NULL;
681  //------------------------------------------------
682  if (Proto->Weight.Elliptical != NULL)
683  {
684  NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
685  for (i=0; i < N; i++)
686  NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
687  }
688  else
689  NewProto->Weight.Elliptical = NULL;
690 
691  NewProto->TotalMagnitude = Proto->TotalMagnitude;
692  NewProto->LogMagnitude = Proto->LogMagnitude;
693  NewProtoList = push_last(NewProtoList, NewProto);
694  }
695  }
696  FreeProtoList(&ProtoList);
697  return (NewProtoList);
698 } /* RemoveInsignificantProtos */
699 
700 /*----------------------------------------------------------------------------*/
702  LIST List,
703  const char *Label)
704 {
705  MERGE_CLASS MergeClass;
706 
707  iterate (List)
708  {
709  MergeClass = (MERGE_CLASS) first_node (List);
710  if (strcmp (MergeClass->Label, Label) == 0)
711  return (MergeClass);
712  }
713  return (NULL);
714 
715 } /* FindClass */
716 
717 /*---------------------------------------------------------------------------*/
719  const char *Label)
720 {
721  MERGE_CLASS MergeClass;
722 
723  MergeClass = new MERGE_CLASS_NODE;
724  MergeClass->Label = (char*)Emalloc (strlen (Label)+1);
725  strcpy (MergeClass->Label, Label);
726  MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
727  return (MergeClass);
728 
729 } /* NewLabeledClass */
730 
731 /*-----------------------------------------------------------------------------*/
742  LIST ClassList)
743 {
744  MERGE_CLASS MergeClass;
745 
746  iterate (ClassList) /* iterate thru all of the fonts */
747  {
748  MergeClass = (MERGE_CLASS) first_node (ClassList);
749  free (MergeClass->Label);
750  FreeClass(MergeClass->Class);
751  delete MergeClass;
752  }
753  destroy (ClassList);
754 
755 } /* FreeLabeledClassList */
756 
757 /* SetUpForFloat2Int */
759  LIST LabeledClassList) {
760  MERGE_CLASS MergeClass;
761  CLASS_TYPE Class;
762  int NumProtos;
763  int NumConfigs;
764  int NumWords;
765  int i, j;
766  float Values[3];
767  PROTO NewProto;
768  PROTO OldProto;
769  BIT_VECTOR NewConfig;
770  BIT_VECTOR OldConfig;
771 
772  // printf("Float2Int ...\n");
773 
774  CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
775  iterate(LabeledClassList)
776  {
777  UnicityTableEqEq<int> font_set;
778  MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
779  Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
780  NumProtos = MergeClass->Class->NumProtos;
781  NumConfigs = MergeClass->Class->NumConfigs;
782  font_set.move(&MergeClass->Class->font_set);
783  Class->NumProtos = NumProtos;
784  Class->MaxNumProtos = NumProtos;
785  Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos);
786  for(i=0; i < NumProtos; i++)
787  {
788  NewProto = ProtoIn(Class, i);
789  OldProto = ProtoIn(MergeClass->Class, i);
790  Values[0] = OldProto->X;
791  Values[1] = OldProto->Y;
792  Values[2] = OldProto->Angle;
793  Normalize(Values);
794  NewProto->X = OldProto->X;
795  NewProto->Y = OldProto->Y;
796  NewProto->Length = OldProto->Length;
797  NewProto->Angle = OldProto->Angle;
798  NewProto->A = Values[0];
799  NewProto->B = Values[1];
800  NewProto->C = Values[2];
801  }
802 
803  Class->NumConfigs = NumConfigs;
804  Class->MaxNumConfigs = NumConfigs;
805  Class->font_set.move(&font_set);
806  Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs);
807  NumWords = WordsInVectorOfSize(NumProtos);
808  for(i=0; i < NumConfigs; i++)
809  {
810  NewConfig = NewBitVector(NumProtos);
811  OldConfig = MergeClass->Class->Configurations[i];
812  for(j=0; j < NumWords; j++)
813  NewConfig[j] = OldConfig[j];
814  Class->Configurations[i] = NewConfig;
815  }
816  }
817  return float_classes;
818 } // SetUpForFloat2Int
819 
820 /*--------------------------------------------------------------------------*/
821 void Normalize (
822  float *Values)
823 {
824  register float Slope;
825  register float Intercept;
826  register float Normalizer;
827 
828  Slope = tan (Values [2] * 2 * PI);
829  Intercept = Values [1] - Slope * Values [0];
830  Normalizer = 1 / sqrt (Slope * Slope + 1.0);
831 
832  Values [0] = Slope * Normalizer;
833  Values [1] = - Normalizer;
834  Values [2] = Intercept * Normalizer;
835 } // Normalize
836 
837 /*-------------------------------------------------------------------------*/
839  LIST CharList)
840 
841 {
842  LABELEDLIST char_sample;
843 
844  iterate (CharList) /* iterate thru all of the fonts */
845  {
846  char_sample = (LABELEDLIST) first_node (CharList);
847  FreeLabeledList (char_sample);
848  }
849  destroy (CharList);
850 
851 } // FreeNormProtoList
852 
853 /*---------------------------------------------------------------------------*/
855  LIST* NormProtoList,
856  LIST ProtoList,
857  char* CharName)
858 {
859  PROTOTYPE* Proto;
860  LABELEDLIST LabeledProtoList;
861 
862  LabeledProtoList = NewLabeledList(CharName);
863  iterate(ProtoList)
864  {
865  Proto = (PROTOTYPE *) first_node (ProtoList);
866  LabeledProtoList->List = push(LabeledProtoList->List, Proto);
867  }
868  *NormProtoList = push(*NormProtoList, LabeledProtoList);
869 }
870 
871 /*---------------------------------------------------------------------------*/
873  LIST ProtoList,
874  BOOL8 CountSigProtos,
875  BOOL8 CountInsigProtos)
876 {
877  int N = 0;
878  PROTOTYPE *Proto;
879 
880  iterate(ProtoList)
881  {
882  Proto = (PROTOTYPE *) first_node ( ProtoList );
883  if (( Proto->Significant && CountSigProtos ) ||
884  ( ! Proto->Significant && CountInsigProtos ) )
885  N++;
886  }
887  return(N);
888 }
MERGE_CLASS NewLabeledClass(const char *Label)
void memfree(void *element)
Definition: freelist.cpp:30
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:263
bool LoadFontInfo(const char *filename)
PROTO_STRUCT * PROTO
Definition: protos.h:52
void LoadPageImages(const char *filename)
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
#define WordsInVectorOfSize(NumBits)
Definition: bitvec.h:63
bool save_to_file(const char *const filename) const
Definition: unicharset.h:306
float FLOAT32
Definition: host.h:111
#define MAX(x, y)
Definition: ndminx.h:24
DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,"Min number of samples per proto as % of total")
void Init(uinT8 xbuckets, uinT8 ybuckets, uinT8 thetabuckets)
void FreeNormProtoList(LIST CharList)
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
#define ProtoIn(Class, Pid)
Definition: protos.h:123
#define NIL_LIST
Definition: oldlist.h:126
DISTRIBUTION * Distrib
Definition: cluster.h:77
#define MAX_NUM_CONFIGS
Definition: intproto.h:46
void FreeLabeledList(LABELEDLIST LabeledList)
STRING_PARAM_FLAG(configfile,"","File to load more configs from")
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
uinT32 * BIT_VECTOR
Definition: bitvec.h:28
#define tprintf(...)
Definition: tprintf.h:31
#define MIN(x, y)
Definition: ndminx.h:28
inT16 NumConfigs
Definition: protos.h:62
BIT_VECTOR NewBitVector(int NumBits)
Definition: bitvec.cpp:90
const int kBoostXYBuckets
const int kBoostDirBuckets
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
LIST RemoveInsignificantProtos(LIST ProtoList, BOOL8 KeepSigProtos, BOOL8 KeepInsigProtos, int N)
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:44
void SetupFlatShapeTable(ShapeTable *shape_table)
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:250
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:400
unsigned char BOOL8
Definition: host.h:113
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
Definition: cluster.cpp:457
inT32 length() const
Definition: strngs.cpp:188
uinT32 NumFeatureSets
Definition: featdefs.h:43
bool DeSerialize(bool swap, FILE *fp)
Definition: shapetable.cpp:256
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 * Mean
Definition: cluster.h:78
UnicityTableEqEq< int > font_set
Definition: protos.h:65
FEATURE Features[1]
Definition: ocrfeatures.h:72
inT16 NumProtos
Definition: protos.h:59
bool Serialize(FILE *fp) const
void FreeLabeledClassList(LIST ClassList)
ShapeTable * LoadShapeTable(const STRING &file_prefix)
unsigned Significant
Definition: cluster.h:68
FLOATUNION Weight
Definition: cluster.h:83
MERGE_CLASS FindClass(LIST List, const char *Label)
struct LABELEDLISTNODE * LABELEDLIST
FLOAT32 Independence
Definition: cluster.h:53
void CleanUpUnusedData(LIST ProtoList)
const char * GetNextFilename(int argc, const char *const *argv)
FLOAT32 MaxIllegal
Definition: cluster.h:51
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
Definition: protos.cpp:248
FLOAT32 X
Definition: protos.h:47
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FEATURE_DEFS_STRUCT feature_defs
void Normalize(float *Values)
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
FLOAT32 Angle
Definition: protos.h:49
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
void truncate_at(inT32 index)
Definition: strngs.cpp:264
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
unsigned NumSamples
Definition: cluster.h:75
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:571
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:338
bool LoadXHeights(const char *filename)
FLOAT64 Confidence
Definition: cluster.h:54
void * Emalloc(int Size)
Definition: emalloc.cpp:47
void ParseArguments(int *argc, char ***argv)
void FreeTrainingSamples(LIST CharList)
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Elliptical
Definition: cluster.h:64
CLUSTER * Cluster
Definition: cluster.h:76
inT16 MaxNumConfigs
Definition: protos.h:63
const UNICHARSET & unicharset() const
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
FLOAT32 MinSamples
Definition: cluster.h:50
inT32 NumChar
Definition: cluster.h:93
FLOAT32 B
Definition: protos.h:45
void LoadUnicharset(const char *filename)
#define first_node(l)
Definition: oldlist.h:139
FLOAT32 ComputeDistance(int k, PARAM_DESC *dim, FLOAT32 p1[], FLOAT32 p2[])
Definition: kdtree.cpp:473
void FreeClass(CLASS_TYPE Class)
Definition: protos.cpp:215
LIST destroy(LIST list)
Definition: oldlist.cpp:187
#define iterate(l)
Definition: oldlist.h:159
ParamsVectors * params()
Definition: ccutil.h:65
CLUSTERCONFIG Config
FLOAT32 Params[1]
Definition: ocrfeatures.h:65
FEATURE_SET_STRUCT * FEATURE_SET
Definition: ocrfeatures.h:74
FLOAT32 Length
Definition: protos.h:50
bool DeSerialize(bool swap, FILE *fp)
PARAM_DESC * ParamDesc
Definition: cluster.h:88
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
void InitIntegerFX()
Definition: intfx.cpp:55
#define PI
Definition: const.h:19
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
void move(UnicityTable< T > *from)
unsigned Style
Definition: cluster.h:74
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:302
FLOAT32 C
Definition: protos.h:46
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41
Definition: strngs.h:44
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
#define NULL
Definition: host.h:144
LABELEDLIST FindList(LIST List, char *Label)
#define MAX_NUM_PROTOS
Definition: intproto.h:47
#define UNICHAR_LEN
Definition: unichar.h:30
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:59
int size() const
Definition: unicharset.h:297
void SetFeatureSpace(const IntFeatureSpace &fs)
Definition: mastertrainer.h:85
const char * string() const
Definition: strngs.cpp:193
FLOAT32 A
Definition: protos.h:44
unsigned Merged
Definition: cluster.h:69
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
inT16 MaxNumProtos
Definition: protos.h:60
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:78
inT16 SampleSize
Definition: cluster.h:87
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
MERGE_CLASS_NODE * MERGE_CLASS
CCUtil ccutil
INT_PARAM_FLAG(debug_level, 0,"Level of Trainer debugging")
PROTO Prototypes
Definition: protos.h:61
CLASS_TYPE Class
CONFIGS Configurations
Definition: protos.h:64
int NumShapes() const
Definition: shapetable.h:278
LABELEDLIST NewLabeledList(const char *Label)
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
unsigned short uinT16
Definition: host.h:101
int inT32
Definition: host.h:102
FLOAT32 Y
Definition: protos.h:48
int tessoptind
Definition: tessopt.cpp:24
bool AddSpacingInfo(const char *filename)
const char * c_str() const
Definition: strngs.cpp:204