// tesseract v5.3.3.20231005
// combine_lang_model.cpp
// (Source listing; see the generated documentation of this file for details.)
// Copyright 2017 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)
// Purpose: Program to generate a traineddata file that can be used to train an
//          LSTM-based neural network model from a unicharset and an optional
//          set of wordlists. Eliminates the need to run
//          set_unicharset_properties, wordlist2dawg, some non-existent binary
//          to generate the recoder, and finally combine_tessdata.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
18
19#include "commandlineflags.h"
20#include "commontraining.h" // CheckSharedLibraryVersion
21#include "lang_model_helpers.h"
22#include "tprintf.h"
24
25using namespace tesseract;
26
27static STRING_PARAM_FLAG(input_unicharset, "",
28 "Filename with unicharset to complete and use in encoding");
29static STRING_PARAM_FLAG(script_dir, "", "Directory name for input script unicharsets");
30static STRING_PARAM_FLAG(words, "", "File listing words to use for the system dictionary");
31static STRING_PARAM_FLAG(puncs, "", "File listing punctuation patterns");
32static STRING_PARAM_FLAG(numbers, "", "File listing number patterns");
33static STRING_PARAM_FLAG(output_dir, "", "Root directory for output files");
34static STRING_PARAM_FLAG(version_str, "", "Version string to add to traineddata file");
35static STRING_PARAM_FLAG(lang, "", "Name of language being processed");
36static BOOL_PARAM_FLAG(lang_is_rtl, false, "True if lang being processed is written right-to-left");
37static BOOL_PARAM_FLAG(pass_through_recoder, false,
38 "If true, the recoder is a simple pass-through of the "
39 "unicharset. Otherwise, potentially a compression of it");
40
41int main(int argc, char **argv) {
42 // Sets properties on the input unicharset file, and writes:
43 // rootdir/lang/lang.charset_size=ddd.txt
44 // rootdir/lang/lang.traineddata
45 // rootdir/lang/lang.unicharset
46 // If the 3 word lists are provided, the dawgs are also added
47 // to the traineddata file.
48 // The output unicharset and charset_size files are just for
49 // human readability.
50 tesseract::CheckSharedLibraryVersion();
51 tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
52
53 // If these reads fail, we get a warning message and an empty list of words.
54 std::vector<std::string> words = split(tesseract::ReadFile(FLAGS_words.c_str()), '\n');
55 std::vector<std::string> puncs = split(tesseract::ReadFile(FLAGS_puncs.c_str()), '\n');
56 std::vector<std::string> numbers = split(tesseract::ReadFile(FLAGS_numbers.c_str()), '\n');
57 // Load the input unicharset
58 UNICHARSET unicharset;
59 if (!unicharset.load_from_file(FLAGS_input_unicharset.c_str(), false)) {
60 tprintf("Failed to load unicharset from %s\n", FLAGS_input_unicharset.c_str());
61 return EXIT_FAILURE;
62 }
63 tprintf("Loaded unicharset of size %zu from file %s\n", unicharset.size(),
64 FLAGS_input_unicharset.c_str());
65
66 // Set unichar properties
67 tprintf("Setting unichar properties\n");
68 tesseract::SetupBasicProperties(/*report_errors*/ true,
69 /*decompose (NFD)*/ false, &unicharset);
70 tprintf("Setting script properties\n");
71 tesseract::SetScriptProperties(FLAGS_script_dir.c_str(), &unicharset);
72 // Combine everything into a traineddata file.
73 return tesseract::CombineLangModel(unicharset, FLAGS_script_dir.c_str(),
74 FLAGS_version_str.c_str(), FLAGS_output_dir.c_str(),
75 FLAGS_lang.c_str(), FLAGS_pass_through_recoder, words, puncs,
76 numbers, FLAGS_lang_is_rtl, /*reader*/ nullptr,
77 /*writer*/ nullptr);
78}
#define BOOL_PARAM_FLAG(name, val, comment)
#define STRING_PARAM_FLAG(name, val, comment)
int main(int argc, char **argv)
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void SetupBasicProperties(bool report_errors, bool decompose, UNICHARSET *unicharset)
void SetScriptProperties(const std::string &script_dir, UNICHARSET *unicharset)
std::string ReadFile(const std::string &filename, FileReader reader)
int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir, const std::string &version_str, const std::string &output_dir, const std::string &lang, bool pass_through_recoder, const std::vector< std::string > &words, const std::vector< std::string > &puncs, const std::vector< std::string > &numbers, bool lang_is_rtl, FileReader reader, FileWriter writer)
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:43
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
size_t size() const
Definition: unicharset.h:355