#include "commandlineflags.h"
#include "lang_model_helpers.h"
#include "tprintf.h"
#include "unicharset_training_utils.h"

Functions
	STRING_PARAM_FLAG (input_unicharset, "", "Unicharset to complete and use in encoding")

	STRING_PARAM_FLAG (script_dir, "", "Directory name for input script unicharsets")

	STRING_PARAM_FLAG (words, "", "File listing words to use for the system dictionary")

	STRING_PARAM_FLAG (puncs, "", "File listing punctuation patterns")

	STRING_PARAM_FLAG (numbers, "", "File listing number patterns")

	STRING_PARAM_FLAG (output_dir, "", "Root directory for output files")

	STRING_PARAM_FLAG (version_str, "", "Version string to add to traineddata file")

	STRING_PARAM_FLAG (lang, "", "Name of language being processed")

	BOOL_PARAM_FLAG (lang_is_rtl, false, "True if lang being processed is written right-to-left")

	BOOL_PARAM_FLAG (pass_through_recoder, false, "If true, the recoder is a simple pass-through of the" " unicharset. Otherwise, potentially a compression of it")

int	main (int argc, char **argv)

Function Documentation

◆ BOOL_PARAM_FLAG() [1/2]

BOOL_PARAM_FLAG	(	lang_is_rtl	,
		false	,
		"True if lang being processed is written right-to-left"
	)

◆ BOOL_PARAM_FLAG() [2/2]

BOOL_PARAM_FLAG	(	pass_through_recoder	,
		false	,
		"If true, the recoder is a simple pass-through of the" " unicharset. Otherwise, potentially a compression of it"
	)

◆ main()

int main	(	int	argc,
		char **	argv
	)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName UTF8-char-str xmin ymin xmax ymax page-number
    NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

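The result of this program is a binary inttemp file used by the OCR engine. (Note: this description appears to have been carried over from a different training tool. The code listed below instead loads the input unicharset, sets its basic and script properties, and combines it with the optional word, punctuation and number lists into rootdir/lang/lang.traineddata via tesseract::CombineLangModel.)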

Parameters

argc	number of command line arguments
argv	array of command line arguments

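Returns: 1 if the input unicharset cannot be loaded; otherwise the value returned by tesseract::CombineLangModel. If a required flag is missing, the program prints a usage message and terminates with status 1.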

Note: Exceptions: none; History: Fri Aug 18 08:56:17 1989, DSJ, Created.; History: Mon May 18 1998, Christy Russson, Revision started.

Definition at line 40 of file combine_lang_model.cpp.

                                 {
   tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
 
   // Check validity of input flags.
   if (FLAGS_input_unicharset.empty() || FLAGS_script_dir.empty() ||
       FLAGS_output_dir.empty() || FLAGS_lang.empty()) {
     tprintf("Usage: %s --input_unicharset filename --script_dir dirname\n",
             argv[0]);
     tprintf("  --output_dir rootdir --lang lang [--lang_is_rtl]\n");
     tprintf("  [--words file --puncs file --numbers file]\n");
     tprintf("Sets properties on the input unicharset file, and writes:\n");
     tprintf("rootdir/lang/lang.charset_size=ddd.txt\n");
     tprintf("rootdir/lang/lang.traineddata\n");
     tprintf("rootdir/lang/lang.unicharset\n");
     tprintf("If the 3 word lists are provided, the dawgs are also added to");
     tprintf(" the traineddata file.\n");
     tprintf("The output unicharset and charset_size files are just for human");
     tprintf(" readability.\n");
     exit(1);
   }
   GenericVector<STRING> words, puncs, numbers;
   // If these reads fail, we get a warning message and an empty list of words.
   tesseract::ReadFile(FLAGS_words.c_str(), nullptr).split('\n', &words);
   tesseract::ReadFile(FLAGS_puncs.c_str(), nullptr).split('\n', &puncs);
   tesseract::ReadFile(FLAGS_numbers.c_str(), nullptr).split('\n', &numbers);
   // Load the input unicharset
   UNICHARSET unicharset;
   if (!unicharset.load_from_file(FLAGS_input_unicharset.c_str(), false)) {
     tprintf("Failed to load unicharset from %s\n",
             FLAGS_input_unicharset.c_str());
     return 1;
   }
   tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(),
           FLAGS_input_unicharset.c_str());
 
   // Set unichar properties
   tprintf("Setting unichar properties\n");
   tesseract::SetupBasicProperties(/*report_errors*/ true,
                                   /*decompose (NFD)*/ false, &unicharset);
   tprintf("Setting script properties\n");
   tesseract::SetScriptProperties(FLAGS_script_dir.c_str(), &unicharset);
   // Combine everything into a traineddata file.
   return tesseract::CombineLangModel(
       unicharset, FLAGS_script_dir.c_str(), FLAGS_version_str.c_str(),
       FLAGS_output_dir.c_str(), FLAGS_lang.c_str(), FLAGS_pass_through_recoder,
       words, puncs, numbers, FLAGS_lang_is_rtl, /*reader*/ nullptr,
       /*writer*/ nullptr);
 }

◆ STRING_PARAM_FLAG() [1/8]

STRING_PARAM_FLAG	(	input_unicharset	,
		""	,
		"Unicharset to complete and use in encoding"
	)

◆ STRING_PARAM_FLAG() [2/8]

STRING_PARAM_FLAG	(	script_dir	,
		""	,
		"Directory name for input script unicharsets"
	)

◆ STRING_PARAM_FLAG() [3/8]

STRING_PARAM_FLAG	(	words	,
		""	,
		"File listing words to use for the system dictionary"
	)

◆ STRING_PARAM_FLAG() [4/8]

STRING_PARAM_FLAG	(	puncs	,
		""	,
		"File listing punctuation patterns"
	)

◆ STRING_PARAM_FLAG() [5/8]

STRING_PARAM_FLAG	(	numbers	,
		""	,
		"File listing number patterns"
	)

◆ STRING_PARAM_FLAG() [6/8]

STRING_PARAM_FLAG	(	output_dir	,
		""	,
		"Root directory for output files"
	)

◆ STRING_PARAM_FLAG() [7/8]

STRING_PARAM_FLAG	(	version_str	,
		""	,
		"Version string to add to traineddata file"
	)

◆ STRING_PARAM_FLAG() [8/8]

STRING_PARAM_FLAG	(	lang	,
		""	,
		"Name of language being processed"
	)

Functions