40bool WriteFile(
const std::string &output_dir,
const std::string &lang,
const std::string &suffix,
41 const std::vector<char> &data,
FileWriter writer) {
45 std::string dirname = output_dir +
"/" + lang;
49 _mkdir(dirname.c_str());
51 mkdir(dirname.c_str(), S_IRWXU | S_IRWXG);
53 std::string filename = dirname +
"/" + lang + suffix;
54 if (writer ==
nullptr) {
57 return (*writer)(data, filename.c_str());
64 if (filename.empty()) {
67 std::vector<char> data;
69 if (reader ==
nullptr) {
72 read_result = (*reader)(filename.c_str(), &data);
75 return std::string(&data[0], data.size());
77 tprintf(
"Failed to read data from: %s\n", filename.c_str());
84 std::vector<char> unicharset_data;
91 unicharset_data.size());
92 return WriteFile(output_dir, lang,
".unicharset", unicharset_data, writer);
98 const std::string &lang,
FileWriter writer, std::string *radical_table_data,
113 tprintf(
"Null char=%d\n", null_char);
114 if (!recoder.
ComputeEncoding(unicharset, null_char, radical_table_data)) {
115 tprintf(
"Creation of encoded unicharset failed!!\n");
120 std::vector<char> recoder_data;
127 recoder_data.resize(encoding.length(), 0);
128 memcpy(&recoder_data[0], &encoding[0], encoding.length());
130 suffix +=
".charset_size=" + std::to_string(recoder.
code_range());
132 return WriteFile(output_dir, lang, suffix.c_str(), recoder_data, writer);
137static bool WriteDawg(
const std::vector<std::string> &words,
const UNICHARSET &unicharset,
139 TessdataManager *traineddata) {
142 trie.add_word_list(words, unicharset, reverse_policy);
143 tprintf(
"Reducing Trie to SquishedDawg\n");
144 std::unique_ptr<SquishedDawg> dawg(trie.trie_to_dawg());
145 if (dawg ==
nullptr || dawg->NumEdges() == 0) {
149 std::vector<char> dawg_data;
150 fp.OpenWrite(&dawg_data);
151 if (!dawg->write_squished_dawg(&fp)) {
154 traineddata->OverwriteEntry(file_type, &dawg_data[0], dawg_data.size());
161static bool WriteDawgs(
const std::vector<std::string> &words,
const std::vector<std::string> &puncs,
162 const std::vector<std::string> &numbers,
bool lang_is_rtl,
163 const UNICHARSET &unicharset, TessdataManager *traineddata) {
165 tprintf(
"Must have non-empty puncs list to use language models!!\n");
195 const std::string &version_str,
const std::string &output_dir,
196 const std::string &lang,
bool pass_through_recoder,
197 const std::vector<std::string> &words,
const std::vector<std::string> &puncs,
198 const std::vector<std::string> &numbers,
bool lang_is_rtl,
FileReader reader,
202 if (!version_str.empty()) {
206 if (!
WriteUnicharset(unicharset, output_dir, lang, writer, &traineddata)) {
207 tprintf(
"Error writing unicharset!!\n");
210 tprintf(
"Config file is optional, continuing...\n");
213 std::string config_filename = script_dir +
"/" + lang +
"/" + lang +
".config";
214 std::string config_file =
ReadFile(config_filename, reader);
215 if (config_file.length() > 0) {
218 std::string radical_filename = script_dir +
"/radical-stroke.txt";
219 std::string radical_data =
ReadFile(radical_filename, reader);
220 if (radical_data.empty()) {
221 tprintf(
"Error reading radical code table %s\n", radical_filename.c_str());
224 if (!
WriteRecoder(unicharset, pass_through_recoder, output_dir, lang, writer, &radical_data,
226 tprintf(
"Error writing recoder!!\n");
228 if (!words.empty() || !puncs.empty() || !numbers.empty()) {
229 if (!WriteDawgs(words, puncs, numbers, lang_is_rtl, unicharset, &traineddata)) {
230 tprintf(
"Error during conversion of wordlists to DAWGs!!\n");
236 std::vector<char> traineddata_data;
237 traineddata.
Serialize(&traineddata_data);
238 if (!
WriteFile(output_dir, lang,
".traineddata", traineddata_data, writer)) {
239 tprintf(
"Error writing output traineddata file!!\n");
242 tprintf(
"Created %s/%s/%s.traineddata", output_dir.c_str(), lang.c_str(), lang.c_str());
bool(*)(const std::vector< char > &data, const char *filename) FileWriter
void tprintf(const char *format,...)
@ TESSDATA_LSTM_SYSTEM_DAWG
@ TESSDATA_LSTM_UNICHARSET
@ TESSDATA_LSTM_PUNC_DAWG
@ TESSDATA_LSTM_NUMBER_DAWG
bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir, const std::string &lang, FileWriter writer, std::string *radical_table_data, TessdataManager *traineddata)
bool WriteFile(const std::string &output_dir, const std::string &lang, const std::string &suffix, const std::vector< char > &data, FileWriter writer)
bool SaveDataToFile(const GenericVector< char > &data, const char *filename)
bool(*)(const char *filename, std::vector< char > *data) FileReader
bool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir, const std::string &lang, FileWriter writer, TessdataManager *traineddata)
std::string ReadFile(const std::string &filename, FileReader reader)
int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir, const std::string &version_str, const std::string &output_dir, const std::string &lang, bool pass_through_recoder, const std::vector< std::string > &words, const std::vector< std::string > &puncs, const std::vector< std::string > &numbers, bool lang_is_rtl, FileReader reader, FileWriter writer)
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
void OpenWrite(std::vector< char > *data)
void OverwriteEntry(TessdataType type, const char *data, int size)
std::string VersionString() const
void SetVersionString(const std::string &v_str)
void Serialize(std::vector< char > *data) const
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
void SetupPassThrough(const UNICHARSET &unicharset)
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
bool Serialize(TFile *fp) const
bool has_special_codes() const
bool save_to_file(const char *const filename) const