tesseract v5.3.3.20231005
combine_lang_model.cpp File Reference
#include "commandlineflags.h"
#include "commontraining.h"
#include "lang_model_helpers.h"
#include "tprintf.h"
#include "unicharset_training_utils.h"

Go to the source code of this file.

Functions

int main (int argc, char **argv)
 

Function Documentation

◆ main()

int main (int argc, char **argv)

Definition at line 41 of file combine_lang_model.cpp.

41 {
42 // Sets properties on the input unicharset file, and writes:
43 // rootdir/lang/lang.charset_size=ddd.txt
44 // rootdir/lang/lang.traineddata
45 // rootdir/lang/lang.unicharset
46 // If the 3 word lists are provided, the dawgs are also added
47 // to the traineddata file.
48 // The output unicharset and charset_size files are just for
49 // human readability.
50 tesseract::CheckSharedLibraryVersion();
51 tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
52
53 // If these reads fail, we get a warning message and an empty list of words.
54 std::vector<std::string> words = split(tesseract::ReadFile(FLAGS_words.c_str()), '\n');
55 std::vector<std::string> puncs = split(tesseract::ReadFile(FLAGS_puncs.c_str()), '\n');
56 std::vector<std::string> numbers = split(tesseract::ReadFile(FLAGS_numbers.c_str()), '\n');
57 // Load the input unicharset
58 UNICHARSET unicharset;
59 if (!unicharset.load_from_file(FLAGS_input_unicharset.c_str(), false)) {
60 tprintf("Failed to load unicharset from %s\n", FLAGS_input_unicharset.c_str());
61 return EXIT_FAILURE;
62 }
63 tprintf("Loaded unicharset of size %zu from file %s\n", unicharset.size(),
64 FLAGS_input_unicharset.c_str());
65
66 // Set unichar properties
67 tprintf("Setting unichar properties\n");
68 tesseract::SetupBasicProperties(/*report_errors*/ true,
69 /*decompose (NFD)*/ false, &unicharset);
70 tprintf("Setting script properties\n");
71 tesseract::SetScriptProperties(FLAGS_script_dir.c_str(), &unicharset);
72 // Combine everything into a traineddata file.
73 return tesseract::CombineLangModel(unicharset, FLAGS_script_dir.c_str(),
74 FLAGS_version_str.c_str(), FLAGS_output_dir.c_str(),
75 FLAGS_lang.c_str(), FLAGS_pass_through_recoder, words, puncs,
76 numbers, FLAGS_lang_is_rtl, /*reader*/ nullptr,
77 /*writer*/ nullptr);
78 }
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
void tprintf(const char *format, ...)
Definition: tprintf.cpp:41
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
void SetScriptProperties(const std::string &script_dir, UNICHARSET *unicharset)
std::string ReadFile(const std::string &filename, FileReader reader)
int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir, const std::string &version_str, const std::string &output_dir, const std::string &lang, bool pass_through_recoder, const std::vector< std::string > &words, const std::vector< std::string > &puncs, const std::vector< std::string > &numbers, bool lang_is_rtl, FileReader reader, FileWriter writer)
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:43
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
size_t size() const
Definition: unicharset.h:355