tesseract  4.00.00dev
unicharset_extractor.cpp File Reference
#include <cstdlib>
#include "boxread.h"
#include "commandlineflags.h"
#include "genericvector.h"
#include "lang_model_helpers.h"
#include "normstrngs.h"
#include "strngs.h"
#include "tprintf.h"
#include "unicharset.h"
#include "unicharset_training_utils.h"

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 STRING_PARAM_FLAG (output_unicharset, "unicharset", "Output file path")
 
 INT_PARAM_FLAG (norm_mode, 1, "Normalization mode: 1=Combine graphemes, " "2=Split graphemes, 3=Pure unicode")
 
int tesseract::Main (int argc, char **argv)
 
int main (int argc, char **argv)
 

Function Documentation

◆ INT_PARAM_FLAG()

INT_PARAM_FLAG ( norm_mode  ,
1  ,
"Normalization mode: 1=Combine graphemes, " "2=Split graphemes, 3=Pure unicode"   
)

◆ main()

int main ( int  argc,
char **  argv 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName UTF8-char-str xmin ymin xmax ymax page-number
    NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

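The result of this program is a binary inttemp file used by the OCR engine.

Note: the description above appears to have been carried over from a different training tool. According to the usage message in the listing below, this program reads box or plain text files to extract the unicharset.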

Parameters
argc: number of command line arguments
argv: array of command line arguments
Returns
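EXIT_FAILURE (after printing the usage message) if no input file remains once the flags are parsed; otherwise the value returned by tesseract::Main(argc, argv).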
Note
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.
History: Mon May 18 1998, Christy Russson, Revision started.

Definition at line 97 of file unicharset_extractor.cpp.

97  {
98  if (argc > 1) {
99  tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
100  }
101  if (argc < 2) {
102  tprintf(
103  "Usage: %s [--output_unicharset filename] [--norm_mode mode]"
104  " box_or_text_file [...]\n",
105  argv[0]);
106  tprintf("Where mode means:\n");
107  tprintf(" 1=combine graphemes (use for Latin and other simple scripts)\n");
108  tprintf(" 2=split graphemes (use for Indic/Khmer/Myanmar)\n");
109  tprintf(" 3=pure unicode (use for Arabic/Hebrew/Thai/Tibetan)\n");
110  tprintf("Reads box or plain text files to extract the unicharset.\n");
111  return EXIT_FAILURE;
112  }
113  return tesseract::Main(argc, argv);
114 }
#define tprintf(...)
Definition: tprintf.h:31
int Main(int argc, char **argv)
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)

◆ STRING_PARAM_FLAG()

STRING_PARAM_FLAG ( output_unicharset  ,
"unicharset"  ,
"Output file path"   
)