tesseract  4.0.0-beta.1-59-g2cc4
cntraining.cpp File Reference
#include "oldlist.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "tessopt.h"
#include "ocrfeatures.h"
#include "clusttool.h"
#include "cluster.h"
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "unichar.h"
#include "commontraining.h"

Go to the source code of this file.

Macros

#define PROGRAM_FEATURE_TYPE   "cn"
 

Functions

 DECLARE_STRING_PARAM_FLAG (D)
 
int main (int argc, char **argv)
 
void WriteNormProtos (const char *Directory, LIST LabeledProtoList, const FEATURE_DESC_STRUCT *feature_desc)
 
void WriteProtos (FILE *File, uint16_t N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
 
int main (int argc, char *argv[])
 

Variables

CLUSTERCONFIG CNConfig
 

Macro Definition Documentation

◆ PROGRAM_FEATURE_TYPE

#define PROGRAM_FEATURE_TYPE   "cn"

Definition at line 40 of file cntraining.cpp.

Function Documentation

◆ DECLARE_STRING_PARAM_FLAG()

DECLARE_STRING_PARAM_FLAG ( )

◆ main() [1/2]

int main ( int  argc,
char **  argv 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName UTF8-char-str xmin ymin xmax ymax page-number
    NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

The result of this program is a binary inttemp file used by the OCR engine.

Parameters
argc: number of command line arguments
argv: array of command line arguments
Returns
none
Note
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.
History: Mon May 18 1998, Christy Russson, Revision started.

Definition at line 422 of file tesseractmain.cpp.

422  {
423  const char* lang = "eng";
424  const char* image = NULL;
425  const char* outputbase = NULL;
426  const char* datapath = NULL;
427  bool list_langs = false;
428  bool print_parameters = false;
429  int arg_i = 1;
432  /* main() calls functions like ParseArgs which call exit().
433  * This results in memory leaks if vars_vec and vars_values are
434  * declared as auto variables (destructor is not called then). */
435  static GenericVector<STRING> vars_vec;
436  static GenericVector<STRING> vars_values;
437 
438 #if !defined(DEBUG)
439  // Disable debugging and informational messages from Leptonica.
440  setMsgSeverity(L_SEVERITY_ERROR);
441 #endif
442 
443 #if defined(HAVE_TIFFIO_H) && defined(_WIN32)
444  /* Show libtiff warnings on console (not in GUI). */
445  TIFFSetWarningHandler(Win32WarningHandler);
446 #endif /* HAVE_TIFFIO_H && _WIN32 */
447 
448  ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &list_langs,
449  &print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode,
450  &enginemode);
451 
452  bool banner = false;
453  if (outputbase != NULL && strcmp(outputbase, "-") &&
454  strcmp(outputbase, "stdout")) {
455  banner = true;
456  }
457 
458  PERF_COUNT_START("Tesseract:main")
459 
460  // Call GlobalDawgCache here to create the global DawgCache object before
461  // the TessBaseAPI object. This fixes the order of destructor calls:
462  // first TessBaseAPI must be destructed, DawgCache must be the last object.
463  tesseract::Dict::GlobalDawgCache();
464 
465  // Avoid memory leak caused by auto variable when return is called.
466  static tesseract::TessBaseAPI api;
467 
468  api.SetOutputName(outputbase);
469 
470  int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]),
471  argc - arg_i, &vars_vec, &vars_values, false);
472 
473  SetVariablesFromCLArgs(&api, argc, argv);
474 
475  if (list_langs) {
476  PrintLangsList(&api);
477  return EXIT_SUCCESS;
478  }
479 
480  if (init_failed) {
481  fprintf(stderr, "Could not initialize tesseract.\n");
482  return EXIT_FAILURE;
483  }
484 
485  if (print_parameters) {
486  FILE* fout = stdout;
487  fprintf(stdout, "Tesseract parameters:\n");
488  api.PrintVariables(fout);
489  api.End();
490  return EXIT_SUCCESS;
491  }
492 
493  FixPageSegMode(&api, pagesegmode);
494 
495  if (pagesegmode == tesseract::PSM_AUTO_ONLY) {
496  int ret_val = EXIT_SUCCESS;
497 
498  Pix* pixs = pixRead(image);
499  if (!pixs) {
500  fprintf(stderr, "Cannot open input file: %s\n", image);
501  return 2;
502  }
503 
504  api.SetImage(pixs);
505 
506  tesseract::Orientation orientation;
509  float deskew_angle;
510 
511  tesseract::PageIterator* it = api.AnalyseLayout();
512  if (it) {
513  it->Orientation(&orientation, &direction, &order, &deskew_angle);
514  tprintf(
515  "Orientation: %d\nWritingDirection: %d\nTextlineOrder: %d\n"
516  "Deskew angle: %.4f\n",
517  orientation, direction, order, deskew_angle);
518  } else {
519  ret_val = EXIT_FAILURE;
520  }
521 
522  delete it;
523 
524  pixDestroy(&pixs);
525  return ret_val;
526  }
527 
528  // set in_training_mode to true when using one of these configs:
529  // ambigs.train, box.train, box.train.stderr, linebox, rebox
530  bool b = false;
531  bool in_training_mode =
532  (api.GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
533  (api.GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
534  (api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b);
535 
536  // Avoid memory leak caused by auto variable when exit() is called.
538 
539  if (in_training_mode) {
540  renderers.push_back(NULL);
541  } else {
542  PreloadRenderers(&api, &renderers, pagesegmode, outputbase);
543  }
544 
545  if (!renderers.empty()) {
546  if (banner) PrintBanner();
547  bool succeed = api.ProcessPages(image, NULL, 0, renderers[0]);
548  if (!succeed) {
549  fprintf(stderr, "Error during processing.\n");
550  return EXIT_FAILURE;
551  }
552  }
553 
555 
556  return EXIT_SUCCESS;
557 }
Fully automatic page segmentation, but no OSD.
Definition: publictypes.h:168
bool empty() const
Definition: genericvector.h:91
#define PERF_COUNT_START(FUNCT_NAME)
struct TessBaseAPI TessBaseAPI
Definition: capi.h:83
void Orientation(tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
int push_back(T * object)
#define tprintf(...)
Definition: tprintf.h:31
#define PERF_COUNT_END
Automatic page segmentation, but no OSD, or OCR.
Definition: publictypes.h:167

◆ main() [2/2]

int main ( int  argc,
char *  argv[] 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName CharName NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

It then appends these samples into a separate file for each character. The name of the file is

DirectoryName/FontName/CharName.FeatureTypeName

The DirectoryName can be specified via a command line argument. If not specified, it defaults to the current directory. The format of the resulting files is:

   NumberOfFeatures(M)
      Feature1
      ...
      FeatureM
   NumberOfFeatures(M)
   ...

The output files each have a header which describes the type of feature which the file contains. This header is in the format required by the clusterer. A command line argument can also be used to specify that only the first N samples of each class should be used.

Parameters
argc: number of command line arguments
argv: array of command line arguments
Returns
0 on success; 1 if a clusterer could not be set up for a character (NULL clusterer)
Note
Globals: none
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.

Definition at line 133 of file cntraining.cpp.

133  {
134  // Set the global Config parameters before parsing the command line.
135  Config = CNConfig;
136 
137  const char *PageName;
138  FILE *TrainingPage;
139  LIST CharList = NIL_LIST;
140  CLUSTERER *Clusterer = nullptr;
141  LIST ProtoList = NIL_LIST;
142  LIST NormProtoList = NIL_LIST;
143  LIST pCharList;
144  LABELEDLIST CharSample;
145  FEATURE_DEFS_STRUCT FeatureDefs;
146  InitFeatureDefs(&FeatureDefs);
147 
148  ParseArguments(&argc, &argv);
149  int num_fonts = 0;
150  while ((PageName = GetNextFilename(argc, argv)) != nullptr) {
151  printf("Reading %s ...\n", PageName);
152  TrainingPage = Efopen(PageName, "rb");
153  ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr,
154  TrainingPage, &CharList);
155  fclose(TrainingPage);
156  ++num_fonts;
157  }
158  printf("Clustering ...\n");
159  // To allow an individual font to form a separate cluster,
160  // reduce the min samples:
161  // Config.MinSamples = 0.5 / num_fonts;
162  pCharList = CharList;
163  // The norm protos will count the source protos, so we keep them here in
164  // freeable_protos, so they can be freed later.
165  GenericVector<LIST> freeable_protos;
166  iterate(pCharList) {
167  //Cluster
168  CharSample = (LABELEDLIST)first_node(pCharList);
169  Clusterer =
170  SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
171  if (Clusterer == nullptr) { // To avoid a SIGSEGV
172  fprintf(stderr, "Error: NULL clusterer!\n");
173  return 1;
174  }
175  float SavedMinSamples = Config.MinSamples;
176  // To disable the tendency to produce a single cluster for all fonts,
177  // make MagicSamples an impossible to achieve number:
178  // Config.MagicSamples = CharSample->SampleCount * 10;
179  Config.MagicSamples = CharSample->SampleCount;
180  while (Config.MinSamples > 0.001) {
181  ProtoList = ClusterSamples(Clusterer, &Config);
182  if (NumberOfProtos(ProtoList, 1, 0) > 0) {
183  break;
184  } else {
185  Config.MinSamples *= 0.95;
186  printf("0 significant protos for %s."
187  " Retrying clustering with MinSamples = %f%%\n",
188  CharSample->Label, Config.MinSamples);
189  }
190  }
191  Config.MinSamples = SavedMinSamples;
192  AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
193  freeable_protos.push_back(ProtoList);
194  FreeClusterer(Clusterer);
195  }
196  FreeTrainingSamples(CharList);
197  int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
198  WriteNormProtos(FLAGS_D.c_str(), NormProtoList,
199  FeatureDefs.FeatureDesc[desc_index]);
200  FreeNormProtoList(NormProtoList);
201  for (int i = 0; i < freeable_protos.size(); ++i) {
202  FreeProtoList(&freeable_protos[i]);
203  }
204  printf ("\n");
205  return 0;
206 } // main
#define iterate(l)
Definition: oldlist.h:159
void WriteNormProtos(const char *Directory, LIST LabeledProtoList, const FEATURE_DESC_STRUCT *feature_desc)
Definition: cntraining.cpp:224
const char * GetNextFilename(int argc, const char *const *argv)
int MagicSamples
Definition: cluster.h:55
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
int size() const
Definition: genericvector.h:72
int push_back(T object)
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:546
void ParseArguments(int *argc, char ***argv)
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:40
struct LABELEDLISTNODE * LABELEDLIST
FLOAT32 MinSamples
Definition: cluster.h:50
CLUSTERCONFIG CNConfig
Definition: cntraining.cpp:76
void FreeNormProtoList(LIST CharList)
void FreeTrainingSamples(LIST CharList)
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:512
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:117
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:573
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:293
#define first_node(l)
Definition: oldlist.h:139
#define NIL_LIST
Definition: oldlist.h:126
CLUSTERCONFIG Config

◆ WriteNormProtos()

void WriteNormProtos ( const char *  Directory,
LIST  LabeledProtoList,
const FEATURE_DESC_STRUCT *  feature_desc 
)

This routine writes the specified samples into files which are organized according to the font name and character name of the samples.

Parameters
Directory: directory to place sample files into
LabeledProtoList: List of labeled protos
feature_desc: Description of the features
Returns
none
Note
Exceptions: none
History: Fri Aug 18 16:17:06 1989, DSJ, Created.

Definition at line 224 of file cntraining.cpp.

225  {
226  FILE *File;
227  STRING Filename;
228  LABELEDLIST LabeledProto;
229  int N;
230 
231  Filename = "";
232  if (Directory != nullptr && Directory[0] != '\0') {
233  Filename += Directory;
234  Filename += "/";
235  }
236  Filename += "normproto";
237  printf ("\nWriting %s ...", Filename.string());
238  File = Efopen (Filename.string(), "wb");
239  fprintf(File, "%0d\n", feature_desc->NumParams);
240  WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
241  iterate(LabeledProtoList)
242  {
243  LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
244  N = NumberOfProtos(LabeledProto->List, true, false);
245  if (N < 1) {
246  printf ("\nError! Not enough protos for %s: %d protos"
247  " (%d significant protos"
248  ", %d insignificant protos)\n",
249  LabeledProto->Label, N,
250  NumberOfProtos(LabeledProto->List, 1, 0),
251  NumberOfProtos(LabeledProto->List, 0, 1));
252  exit(1);
253  }
254  fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
255  WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
256  }
257  fclose (File);
258 
259 } // WriteNormProtos
#define iterate(l)
Definition: oldlist.h:159
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
Definition: strngs.h:45
void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:259
struct LABELEDLISTNODE * LABELEDLIST
const char * string() const
Definition: strngs.cpp:198
#define first_node(l)
Definition: oldlist.h:139
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:59
void WriteProtos(FILE *File, uint16_t N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
Definition: cntraining.cpp:262

◆ WriteProtos()

void WriteProtos ( FILE *  File,
uint16_t  N,
LIST  ProtoList,
BOOL8  WriteSigProtos,
BOOL8  WriteInsigProtos 
)

Definition at line 262 of file cntraining.cpp.

268 {
269  PROTOTYPE *Proto;
270 
271  // write prototypes
272  iterate(ProtoList)
273  {
274  Proto = (PROTOTYPE *) first_node ( ProtoList );
275  if (( Proto->Significant && WriteSigProtos ) ||
276  ( ! Proto->Significant && WriteInsigProtos ) )
277  WritePrototype( File, N, Proto );
278  }
279 } // WriteProtos
#define iterate(l)
Definition: oldlist.h:159
void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto)
Definition: clusttool.cpp:288
#define first_node(l)
Definition: oldlist.h:139
unsigned Significant
Definition: cluster.h:68

Variable Documentation

◆ CNConfig

CLUSTERCONFIG CNConfig
Initial value:
=
{
elliptical, 0.025, 0.05, 0.8, 1e-3, 0
}

Definition at line 76 of file cntraining.cpp.