tesseract v5.3.3.20231005
lang_model_helpers.cpp
Go to the documentation of this file.
1// Copyright 2017 Google Inc. All Rights Reserved.
2// Author: rays@google.com (Ray Smith)
3// Purpose: Collection of convenience functions to simplify creation of the
4// unicharset, recoder, and dawgs for an LSTM model.
5
6// Licensed under the Apache License, Version 2.0 (the "License");
7// you may not use this file except in compliance with the License.
8// You may obtain a copy of the License at
9// http://www.apache.org/licenses/LICENSE-2.0
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#include "lang_model_helpers.h"
17
18#include "dawg.h"
19#include "fileio.h"
20#include "tessdatamanager.h"
21#include "trie.h"
22#include "unicharcompress.h"
23
24#include <cstdlib>
25
26#include <sys/stat.h>
27#include <sys/types.h>
28
29#if defined(_WIN32)
30# include <direct.h>
31#endif
32
33namespace tesseract {
34
35// Helper makes a filename (<output_dir>/<lang>/<lang><suffix>) and writes data
36// to the file, using writer if not null, otherwise, a default writer.
37// Default writer will overwrite any existing file, but a supplied writer
38// can do its own thing. If lang is empty, returns true but does nothing.
39// NOTE that suffix should contain any required . for the filename.
40bool WriteFile(const std::string &output_dir, const std::string &lang, const std::string &suffix,
41 const std::vector<char> &data, FileWriter writer) {
42 if (lang.empty()) {
43 return true;
44 }
45 std::string dirname = output_dir + "/" + lang;
46 // Attempt to make the directory, but ignore errors, as it may not be a
47 // standard filesystem, and the writer will complain if not successful.
48#if defined(_WIN32)
49 _mkdir(dirname.c_str());
50#else
51 mkdir(dirname.c_str(), S_IRWXU | S_IRWXG);
52#endif
53 std::string filename = dirname + "/" + lang + suffix;
54 if (writer == nullptr) {
55 return SaveDataToFile(data, filename.c_str());
56 } else {
57 return (*writer)(data, filename.c_str());
58 }
59}
60
61// Helper reads a file with optional reader and returns a string.
62// On failure emits a warning message and returns an empty string.
63std::string ReadFile(const std::string &filename, FileReader reader) {
64 if (filename.empty()) {
65 return std::string();
66 }
67 std::vector<char> data;
68 bool read_result;
69 if (reader == nullptr) {
70 read_result = LoadDataFromFile(filename.c_str(), &data);
71 } else {
72 read_result = (*reader)(filename.c_str(), &data);
73 }
74 if (read_result) {
75 return std::string(&data[0], data.size());
76 }
77 tprintf("Failed to read data from: %s\n", filename.c_str());
78 return std::string();
79}
80
81// Helper writes the unicharset to file and to the traineddata.
82bool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir,
83 const std::string &lang, FileWriter writer, TessdataManager *traineddata) {
84 std::vector<char> unicharset_data;
85 TFile fp;
86 fp.OpenWrite(&unicharset_data);
87 if (!unicharset.save_to_file(&fp)) {
88 return false;
89 }
90 traineddata->OverwriteEntry(TESSDATA_LSTM_UNICHARSET, &unicharset_data[0],
91 unicharset_data.size());
92 return WriteFile(output_dir, lang, ".unicharset", unicharset_data, writer);
93}
94
95// Helper creates the recoder and writes it to the traineddata, and a human-
96// readable form to file.
97bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir,
98 const std::string &lang, FileWriter writer, std::string *radical_table_data,
99 TessdataManager *traineddata) {
100 UnicharCompress recoder;
101 // Where the unicharset is carefully setup already to contain a good
102 // compact encoding, use a pass-through recoder that does nothing.
103 // For scripts that have a large number of unicodes (Han, Hangul) we want
104 // to use the recoder to compress the symbol space by re-encoding each
105 // unicode as multiple codes from a smaller 'alphabet' that are related to the
106 // shapes in the character. Hangul Jamo is a perfect example of this.
107 // See the Hangul Syllables section, sub-section "Equivalence" in:
108 // http://www.unicode.org/versions/Unicode10.0.0/ch18.pdf
109 if (pass_through) {
110 recoder.SetupPassThrough(unicharset);
111 } else {
112 int null_char = unicharset.has_special_codes() ? UNICHAR_BROKEN : unicharset.size();
113 tprintf("Null char=%d\n", null_char);
114 if (!recoder.ComputeEncoding(unicharset, null_char, radical_table_data)) {
115 tprintf("Creation of encoded unicharset failed!!\n");
116 return false;
117 }
118 }
119 TFile fp;
120 std::vector<char> recoder_data;
121 fp.OpenWrite(&recoder_data);
122 if (!recoder.Serialize(&fp)) {
123 return false;
124 }
125 traineddata->OverwriteEntry(TESSDATA_LSTM_RECODER, &recoder_data[0], recoder_data.size());
126 std::string encoding = recoder.GetEncodingAsString(unicharset);
127 recoder_data.resize(encoding.length(), 0);
128 memcpy(&recoder_data[0], &encoding[0], encoding.length());
129 std::string suffix;
130 suffix += ".charset_size=" + std::to_string(recoder.code_range());
131 suffix += ".txt";
132 return WriteFile(output_dir, lang, suffix.c_str(), recoder_data, writer);
133}
134
135// Helper builds a dawg from the given words, using the unicharset as coding,
136// and reverse_policy for LTR/RTL, and overwrites file_type in the traineddata.
137static bool WriteDawg(const std::vector<std::string> &words, const UNICHARSET &unicharset,
138 Trie::RTLReversePolicy reverse_policy, TessdataType file_type,
139 TessdataManager *traineddata) {
140 // The first 3 arguments are not used in this case.
141 Trie trie(DAWG_TYPE_WORD, "", SYSTEM_DAWG_PERM, unicharset.size(), 0);
142 trie.add_word_list(words, unicharset, reverse_policy);
143 tprintf("Reducing Trie to SquishedDawg\n");
144 std::unique_ptr<SquishedDawg> dawg(trie.trie_to_dawg());
145 if (dawg == nullptr || dawg->NumEdges() == 0) {
146 return false;
147 }
148 TFile fp;
149 std::vector<char> dawg_data;
150 fp.OpenWrite(&dawg_data);
151 if (!dawg->write_squished_dawg(&fp)) {
152 return false;
153 }
154 traineddata->OverwriteEntry(file_type, &dawg_data[0], dawg_data.size());
155 return true;
156}
157
158// Builds and writes the dawgs, given a set of words, punctuation
159// patterns, number patterns, to the traineddata. Encoding uses the given
160// unicharset, and the punc dawgs is reversed if lang_is_rtl.
161static bool WriteDawgs(const std::vector<std::string> &words, const std::vector<std::string> &puncs,
162 const std::vector<std::string> &numbers, bool lang_is_rtl,
163 const UNICHARSET &unicharset, TessdataManager *traineddata) {
164 if (puncs.empty()) {
165 tprintf("Must have non-empty puncs list to use language models!!\n");
166 return false;
167 }
168 // For each of the dawg types, make the dawg, and write to traineddata.
169 // Dawgs are reversed as follows:
170 // Words: According to the word content.
171 // Puncs: According to lang_is_rtl.
172 // Numbers: Never.
173 // System dawg (main wordlist).
174 if (!words.empty() && !WriteDawg(words, unicharset, Trie::RRP_REVERSE_IF_HAS_RTL,
175 TESSDATA_LSTM_SYSTEM_DAWG, traineddata)) {
176 return false;
177 }
178 // punc/punc-dawg.
179 Trie::RTLReversePolicy reverse_policy =
181 if (!WriteDawg(puncs, unicharset, reverse_policy, TESSDATA_LSTM_PUNC_DAWG, traineddata)) {
182 return false;
183 }
184 // numbers/number-dawg.
185 if (!numbers.empty() && !WriteDawg(numbers, unicharset, Trie::RRP_DO_NO_REVERSE,
186 TESSDATA_LSTM_NUMBER_DAWG, traineddata)) {
187 return false;
188 }
189 return true;
190}
191
192// The main function for combine_lang_model.cpp.
193// Returns EXIT_SUCCESS or EXIT_FAILURE for error.
194int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir,
195 const std::string &version_str, const std::string &output_dir,
196 const std::string &lang, bool pass_through_recoder,
197 const std::vector<std::string> &words, const std::vector<std::string> &puncs,
198 const std::vector<std::string> &numbers, bool lang_is_rtl, FileReader reader,
199 FileWriter writer) {
200 // Build the traineddata file.
201 TessdataManager traineddata;
202 if (!version_str.empty()) {
203 traineddata.SetVersionString(traineddata.VersionString() + ":" + version_str);
204 }
205 // Unicharset and recoder.
206 if (!WriteUnicharset(unicharset, output_dir, lang, writer, &traineddata)) {
207 tprintf("Error writing unicharset!!\n");
208 return EXIT_FAILURE;
209 } else {
210 tprintf("Config file is optional, continuing...\n");
211 }
212 // If there is a config file, read it and add to traineddata.
213 std::string config_filename = script_dir + "/" + lang + "/" + lang + ".config";
214 std::string config_file = ReadFile(config_filename, reader);
215 if (config_file.length() > 0) {
216 traineddata.OverwriteEntry(TESSDATA_LANG_CONFIG, &config_file[0], config_file.length());
217 }
218 std::string radical_filename = script_dir + "/radical-stroke.txt";
219 std::string radical_data = ReadFile(radical_filename, reader);
220 if (radical_data.empty()) {
221 tprintf("Error reading radical code table %s\n", radical_filename.c_str());
222 return EXIT_FAILURE;
223 }
224 if (!WriteRecoder(unicharset, pass_through_recoder, output_dir, lang, writer, &radical_data,
225 &traineddata)) {
226 tprintf("Error writing recoder!!\n");
227 }
228 if (!words.empty() || !puncs.empty() || !numbers.empty()) {
229 if (!WriteDawgs(words, puncs, numbers, lang_is_rtl, unicharset, &traineddata)) {
230 tprintf("Error during conversion of wordlists to DAWGs!!\n");
231 return EXIT_FAILURE;
232 }
233 }
234
235 // Traineddata file.
236 std::vector<char> traineddata_data;
237 traineddata.Serialize(&traineddata_data);
238 if (!WriteFile(output_dir, lang, ".traineddata", traineddata_data, writer)) {
239 tprintf("Error writing output traineddata file!!\n");
240 return EXIT_FAILURE;
241 }
242 tprintf("Created %s/%s/%s.traineddata", output_dir.c_str(), lang.c_str(), lang.c_str());
243 return EXIT_SUCCESS;
244}
245
246} // namespace tesseract
@ DAWG_TYPE_WORD
Definition: dawg.h:66
bool(*)(const std::vector< char > &data, const char *filename) FileWriter
Definition: serialis.h:40
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
@ TESSDATA_LSTM_SYSTEM_DAWG
@ TESSDATA_LSTM_UNICHARSET
@ TESSDATA_LSTM_PUNC_DAWG
@ TESSDATA_LSTM_RECODER
@ TESSDATA_LANG_CONFIG
@ TESSDATA_LSTM_NUMBER_DAWG
bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir, const std::string &lang, FileWriter writer, std::string *radical_table_data, TessdataManager *traineddata)
bool WriteFile(const std::string &output_dir, const std::string &lang, const std::string &suffix, const std::vector< char > &data, FileWriter writer)
bool SaveDataToFile(const GenericVector< char > &data, const char *filename)
@ UNICHAR_BROKEN
Definition: unicharset.h:38
bool(*)(const char *filename, std::vector< char > *data) FileReader
Definition: baseapi.h:61
bool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir, const std::string &lang, FileWriter writer, TessdataManager *traineddata)
std::string ReadFile(const std::string &filename, FileReader reader)
int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir, const std::string &version_str, const std::string &output_dir, const std::string &lang, bool pass_through_recoder, const std::vector< std::string > &words, const std::vector< std::string > &puncs, const std::vector< std::string > &numbers, bool lang_is_rtl, FileReader reader, FileWriter writer)
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:244
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
void OpenWrite(std::vector< char > *data)
Definition: serialis.cpp:246
void OverwriteEntry(TessdataType type, const char *data, int size)
std::string VersionString() const
void SetVersionString(const std::string &v_str)
void Serialize(std::vector< char > *data) const
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
void SetupPassThrough(const UNICHARSET &unicharset)
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
bool Serialize(TFile *fp) const
bool has_special_codes() const
Definition: unicharset.h:756
bool save_to_file(const char *const filename) const
Definition: unicharset.h:361
size_t size() const
Definition: unicharset.h:355
RTLReversePolicy
Definition: trie.h:55
@ RRP_DO_NO_REVERSE
Definition: trie.h:56
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:57
@ RRP_FORCE_REVERSE
Definition: trie.h:58