tesseract  4.00.00dev
cntraining.cpp File Reference
#include "oldlist.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "tessopt.h"
#include "ocrfeatures.h"
#include "clusttool.h"
#include "cluster.h"
#include <string.h>
#include <stdio.h>
#include <math.h>
#include "unichar.h"
#include "commontraining.h"

Go to the source code of this file.

Macros

#define PROGRAM_FEATURE_TYPE   "cn"
 

Functions

 DECLARE_STRING_PARAM_FLAG (D)
 
int main (int argc, char **argv)
 
void WriteNormProtos (const char *Directory, LIST LabeledProtoList, const FEATURE_DESC_STRUCT *feature_desc)
 
void WriteProtos (FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
 
int main (int argc, char *argv[])
 

Variables

CLUSTERCONFIG CNConfig
 

Macro Definition Documentation

◆ PROGRAM_FEATURE_TYPE

#define PROGRAM_FEATURE_TYPE   "cn"

Definition at line 40 of file cntraining.cpp.

Function Documentation

◆ DECLARE_STRING_PARAM_FLAG()

DECLARE_STRING_PARAM_FLAG ( D )

◆ main() [1/2]

int main ( int  argc,
char **  argv 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName UTF8-char-str xmin ymin xmax ymax page-number
    NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

The result of this program is a binary inttemp file used by the OCR engine.

Parameters
argc: number of command line arguments
argv: array of command line arguments
Returns
none
Note
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.
History: Mon May 18 1998, Christy Russson, Revision started.

Definition at line 428 of file tesseractmain.cpp.

428  {
429  const char* lang = "eng";
430  const char* image = NULL;
431  const char* outputbase = NULL;
432  const char* datapath = NULL;
433  bool list_langs = false;
434  bool print_parameters = false;
435  int arg_i = 1;
438  /* main() calls functions like ParseArgs which call exit().
439  * This results in memory leaks if vars_vec and vars_values are
440  * declared as auto variables (destructor is not called then). */
441  static GenericVector<STRING> vars_vec;
442  static GenericVector<STRING> vars_values;
443 
444 #if !defined(DEBUG)
445  // Disable debugging and informational messages from Leptonica.
446  setMsgSeverity(L_SEVERITY_ERROR);
447 #endif
448 
449 #if defined(HAVE_TIFFIO_H) && defined(_WIN32)
450  /* Show libtiff warnings on console (not in GUI). */
451  TIFFSetWarningHandler(Win32WarningHandler);
452 #endif /* HAVE_TIFFIO_H && _WIN32 */
453 
454  ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &list_langs,
455  &print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode,
456  &enginemode);
457 
458  bool banner = false;
459  if (outputbase != NULL && strcmp(outputbase, "-") &&
460  strcmp(outputbase, "stdout")) {
461  banner = true;
462  }
463 
464  PERF_COUNT_START("Tesseract:main")
465 
466  // Call GlobalDawgCache here to create the global DawgCache object before
467  // the TessBaseAPI object. This fixes the order of destructor calls:
468  // first TessBaseAPI must be destructed, DawgCache must be the last object.
469  tesseract::Dict::GlobalDawgCache();
470 
471  // Avoid memory leak caused by auto variable when return is called.
472  static tesseract::TessBaseAPI api;
473 
474  api.SetOutputName(outputbase);
475 
476  int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]),
477  argc - arg_i, &vars_vec, &vars_values, false);
478 
479  SetVariablesFromCLArgs(&api, argc, argv);
480 
481  if (list_langs) {
482  PrintLangsList(&api);
483  return EXIT_SUCCESS;
484  }
485 
486  if (init_failed) {
487  fprintf(stderr, "Could not initialize tesseract.\n");
488  return EXIT_FAILURE;
489  }
490 
491  if (print_parameters) {
492  FILE* fout = stdout;
493  fprintf(stdout, "Tesseract parameters:\n");
494  api.PrintVariables(fout);
495  api.End();
496  return EXIT_SUCCESS;
497  }
498 
499  FixPageSegMode(&api, pagesegmode);
500 
501  if (pagesegmode == tesseract::PSM_AUTO_ONLY) {
502  int ret_val = EXIT_SUCCESS;
503 
504  Pix* pixs = pixRead(image);
505  if (!pixs) {
506  fprintf(stderr, "Cannot open input file: %s\n", image);
507  return 2;
508  }
509 
510  api.SetImage(pixs);
511 
512  tesseract::Orientation orientation;
515  float deskew_angle;
516 
517  tesseract::PageIterator* it = api.AnalyseLayout();
518  if (it) {
519  it->Orientation(&orientation, &direction, &order, &deskew_angle);
520  tprintf(
521  "Orientation: %d\nWritingDirection: %d\nTextlineOrder: %d\n"
522  "Deskew angle: %.4f\n",
523  orientation, direction, order, deskew_angle);
524  } else {
525  ret_val = EXIT_FAILURE;
526  }
527 
528  delete it;
529 
530  pixDestroy(&pixs);
531  return ret_val;
532  }
533 
534  // set in_training_mode to true when using one of these configs:
535  // ambigs.train, box.train, box.train.stderr, linebox, rebox
536  bool b = false;
537  bool in_training_mode =
538  (api.GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
539  (api.GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
540  (api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b);
541 
542  // Avoid memory leak caused by auto variable when exit() is called.
544 
545  if (in_training_mode) {
546  renderers.push_back(NULL);
547  } else {
548  PreloadRenderers(&api, &renderers, pagesegmode, outputbase);
549  }
550 
551  if (!renderers.empty()) {
552  if (banner) PrintBanner();
553  bool succeed = api.ProcessPages(image, NULL, 0, renderers[0]);
554  if (!succeed) {
555  fprintf(stderr, "Error during processing.\n");
556  return EXIT_FAILURE;
557  }
558  }
559 
561 
562  return EXIT_SUCCESS;
563 }
bool empty() const
Definition: genericvector.h:91
struct TessBaseAPI TessBaseAPI
Definition: capi.h:83
void Orientation(tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
#define PERF_COUNT_END
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
#define tprintf(...)
Definition: tprintf.h:31
int push_back(T * object)
#define PERF_COUNT_START(FUNCT_NAME)
Automatic page segmentation, but no OSD, or OCR.
Definition: publictypes.h:167
Fully automatic page segmentation, but no OSD.
Definition: publictypes.h:168

◆ main() [2/2]

int main ( int  argc,
char *  argv[] 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName CharName NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

It then appends these samples into a separate file for each character. The name of the file is

DirectoryName/FontName/CharName.FeatureTypeName

The DirectoryName can be specified via a command line argument. If not specified, it defaults to the current directory. The format of the resulting files is:

   NumberOfFeatures(M)
      Feature1
      ...
      FeatureM
   NumberOfFeatures(M)
   ...

The output files each have a header which describes the type of feature which the file contains. This header is in the format required by the clusterer. A command line argument can also be used to specify that only the first N samples of each class should be used.

Parameters
argc: number of command line arguments
argv: array of command line arguments
Returns
0 on success; 1 if a clusterer could not be set up for a character class
Note
Globals: none
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.

Definition at line 133 of file cntraining.cpp.

133  {
134  // Set the global Config parameters before parsing the command line.
135  Config = CNConfig;
136 
137  const char *PageName;
138  FILE *TrainingPage;
139  LIST CharList = NIL_LIST;
140  CLUSTERER *Clusterer = nullptr;
141  LIST ProtoList = NIL_LIST;
142  LIST NormProtoList = NIL_LIST;
143  LIST pCharList;
144  LABELEDLIST CharSample;
145  FEATURE_DEFS_STRUCT FeatureDefs;
146  InitFeatureDefs(&FeatureDefs);
147 
148  ParseArguments(&argc, &argv);
149  int num_fonts = 0;
150  while ((PageName = GetNextFilename(argc, argv)) != nullptr) {
151  printf("Reading %s ...\n", PageName);
152  TrainingPage = Efopen(PageName, "rb");
153  ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr,
154  TrainingPage, &CharList);
155  fclose(TrainingPage);
156  ++num_fonts;
157  }
158  printf("Clustering ...\n");
159  // To allow an individual font to form a separate cluster,
160  // reduce the min samples:
161  // Config.MinSamples = 0.5 / num_fonts;
162  pCharList = CharList;
163  // The norm protos will count the source protos, so we keep them here in
164  // freeable_protos, so they can be freed later.
165  GenericVector<LIST> freeable_protos;
166  iterate(pCharList) {
167  //Cluster
168  CharSample = (LABELEDLIST)first_node(pCharList);
169  Clusterer =
170  SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
171  if (Clusterer == nullptr) { // To avoid a SIGSEGV
172  fprintf(stderr, "Error: NULL clusterer!\n");
173  return 1;
174  }
175  float SavedMinSamples = Config.MinSamples;
176  // To disable the tendency to produce a single cluster for all fonts,
177  // make MagicSamples an impossible to achieve number:
178  // Config.MagicSamples = CharSample->SampleCount * 10;
179  Config.MagicSamples = CharSample->SampleCount;
180  while (Config.MinSamples > 0.001) {
181  ProtoList = ClusterSamples(Clusterer, &Config);
182  if (NumberOfProtos(ProtoList, 1, 0) > 0) {
183  break;
184  } else {
185  Config.MinSamples *= 0.95;
186  printf("0 significant protos for %s."
187  " Retrying clustering with MinSamples = %f%%\n",
188  CharSample->Label, Config.MinSamples);
189  }
190  }
191  Config.MinSamples = SavedMinSamples;
192  AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
193  freeable_protos.push_back(ProtoList);
194  FreeClusterer(Clusterer);
195  }
196  FreeTrainingSamples(CharList);
197  int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
198  WriteNormProtos(FLAGS_D.c_str(), NormProtoList,
199  FeatureDefs.FeatureDesc[desc_index]);
200  FreeNormProtoList(NormProtoList);
201  for (int i = 0; i < freeable_protos.size(); ++i) {
202  FreeProtoList(&freeable_protos[i]);
203  }
204  printf ("\n");
205  return 0;
206 } // main
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
CLUSTERCONFIG CNConfig
Definition: cntraining.cpp:76
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:40
int MagicSamples
Definition: cluster.h:55
struct LABELEDLISTNODE * LABELEDLIST
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:573
void FreeNormProtoList(LIST CharList)
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:546
int size() const
Definition: genericvector.h:72
const char * GetNextFilename(int argc, const char *const *argv)
CLUSTERCONFIG Config
void FreeTrainingSamples(LIST CharList)
void WriteNormProtos(const char *Directory, LIST LabeledProtoList, const FEATURE_DESC_STRUCT *feature_desc)
Definition: cntraining.cpp:224
int push_back(T object)
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
#define NIL_LIST
Definition: oldlist.h:126
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:117
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
#define first_node(l)
Definition: oldlist.h:139
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:512
#define iterate(l)
Definition: oldlist.h:159
FLOAT32 MinSamples
Definition: cluster.h:50
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
void ParseArguments(int *argc, char ***argv)
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:293

◆ WriteNormProtos()

void WriteNormProtos ( const char *  Directory,
LIST  LabeledProtoList,
const FEATURE_DESC_STRUCT *  feature_desc 
)

This routine writes the specified samples into files which are organized according to the font name and character name of the samples.

Parameters
Directory: directory to place sample files into
LabeledProtoList: List of labeled protos
feature_desc: Description of the features
Returns
none
Note
Exceptions: none
History: Fri Aug 18 16:17:06 1989, DSJ, Created.

Definition at line 224 of file cntraining.cpp.

225  {
226  FILE *File;
227  STRING Filename;
228  LABELEDLIST LabeledProto;
229  int N;
230 
231  Filename = "";
232  if (Directory != nullptr && Directory[0] != '\0') {
233  Filename += Directory;
234  Filename += "/";
235  }
236  Filename += "normproto";
237  printf ("\nWriting %s ...", Filename.string());
238  File = Efopen (Filename.string(), "wb");
239  fprintf(File, "%0d\n", feature_desc->NumParams);
240  WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
241  iterate(LabeledProtoList)
242  {
243  LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
244  N = NumberOfProtos(LabeledProto->List, true, false);
245  if (N < 1) {
246  printf ("\nError! Not enough protos for %s: %d protos"
247  " (%d significant protos"
248  ", %d insignificant protos)\n",
249  LabeledProto->Label, N,
250  NumberOfProtos(LabeledProto->List, 1, 0),
251  NumberOfProtos(LabeledProto->List, 0, 1));
252  exit(1);
253  }
254  fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
255  WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
256  }
257  fclose (File);
258 
259 } // WriteNormProtos
void WriteParamDesc(FILE *File, uinT16 N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:259
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
struct LABELEDLISTNODE * LABELEDLIST
void WriteProtos(FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
Definition: cntraining.cpp:262
const char * string() const
Definition: strngs.cpp:198
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
Definition: strngs.h:45
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:59
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159

◆ WriteProtos()

void WriteProtos ( FILE *  File,
uinT16  N,
LIST  ProtoList,
BOOL8  WriteSigProtos,
BOOL8  WriteInsigProtos 
)

Definition at line 262 of file cntraining.cpp.

268 {
269  PROTOTYPE *Proto;
270 
271  // write prototypes
272  iterate(ProtoList)
273  {
274  Proto = (PROTOTYPE *) first_node ( ProtoList );
275  if (( Proto->Significant && WriteSigProtos ) ||
276  ( ! Proto->Significant && WriteInsigProtos ) )
277  WritePrototype( File, N, Proto );
278  }
279 } // WriteProtos
void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto)
Definition: clusttool.cpp:288
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
unsigned Significant
Definition: cluster.h:68

Variable Documentation

◆ CNConfig

CLUSTERCONFIG CNConfig
Initial value:
=
{
elliptical, 0.025, 0.05, 0.8, 1e-3, 0
}

Definition at line 76 of file cntraining.cpp.