tesseract v5.3.3.20231005
combine_tessdata.cpp
Go to the documentation of this file.
1
2// File: combine_tessdata.cpp
3// Description: Creates a unified traineddata file from several
4// data files produced by the training process.
5// Author: Daria Antonova
6//
7// (C) Copyright 2009, Google Inc.
8// Licensed under the Apache License, Version 2.0 (the "License");
9// you may not use this file except in compliance with the License.
10// You may obtain a copy of the License at
11// http://www.apache.org/licenses/LICENSE-2.0
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17//
19
20#include "commontraining.h" // CheckSharedLibraryVersion
21#include "lstmrecognizer.h"
22#include "tessdatamanager.h"
23
24#include <cerrno>
25#include <iostream> // std::cout
26
27using namespace tesseract;
28
29static int list_components(TessdataManager &tm, const char *filename) {
30 // Initialize TessdataManager with the data in the given traineddata file.
31 if (filename != nullptr && !tm.Init(filename)) {
32 tprintf("Failed to read %s\n", filename);
33 return EXIT_FAILURE;
34 }
35 tm.Directory();
36 return EXIT_SUCCESS;
37}
38
39static int list_network(TessdataManager &tm, const char *filename) {
40 if (filename != nullptr && !tm.Init(filename)) {
41 tprintf("Failed to read %s\n", filename);
42 return EXIT_FAILURE;
43 }
47 if (!recognizer.DeSerialize(&tm, &fp)) {
48 tprintf("Failed to deserialize LSTM in %s!\n", filename);
49 return EXIT_FAILURE;
50 }
51 std::cout << "LSTM: network=" << recognizer.GetNetwork()
52 << ", int_mode=" << recognizer.IsIntMode()
53 << ", recoding=" << recognizer.IsRecoding()
54 << ", iteration=" << recognizer.training_iteration()
55 << ", sample_iteration=" << recognizer.sample_iteration()
56 << ", null_char=" << recognizer.null_char()
57 << ", learning_rate=" << recognizer.learning_rate()
58 << ", momentum=" << recognizer.GetMomentum()
59 << ", adam_beta=" << recognizer.GetAdamBeta() << '\n';
60
61 std::cout << "Layer Learning Rates: ";
62 auto layers = recognizer.EnumerateLayers();
63 for (const auto &id : layers) {
64 auto layer = recognizer.GetLayer(id);
65 std::cout << id << "(" << layer->name() << ")"
66 << "=" << recognizer.GetLayerLearningRate(id)
67 << (layers[layers.size() - 1] != id ? ", " : "");
68 }
69 std::cout << "\n";
70 }
71 return EXIT_SUCCESS;
72}
73
74// Main program to combine/extract/overwrite tessdata components
75// in [lang].traineddata files.
76//
77// To combine all the individual tessdata components (unicharset, DAWGs,
78// classifier templates, ambiguities, language configs) located at, say,
79// /home/$USER/temp/eng.* run:
80//
81// combine_tessdata /home/$USER/temp/eng.
82//
83// The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
84//
85// Specify option -e if you would like to extract individual components
86// from a combined traineddata file. For example, to extract language config
87// file and the unicharset from tessdata/eng.traineddata run:
88//
89// combine_tessdata -e tessdata/eng.traineddata
90// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
91//
92// The desired config file and unicharset will be written to
93// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
94//
95// Specify option -o to overwrite individual components of the given
96// [lang].traineddata file. For example, to overwrite language config
97// and unichar ambiguities files in tessdata/eng.traineddata use:
98//
99// combine_tessdata -o tessdata/eng.traineddata
100// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
101//
102// As a result, tessdata/eng.traineddata will contain the new language config
103// and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
104//
105// Note: the file names of the files to extract to and to overwrite from should
106// have the appropriate file suffixes (extensions) indicating their tessdata
107// component type (.unicharset for the unicharset, .unicharambigs for unichar
108// ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
109//
110// Specify option -u to unpack all the components to the specified path:
111//
112// combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
113//
114// This will create /home/$USER/temp/eng.* files with individual tessdata
115// components from tessdata/eng.traineddata.
116//
117int main(int argc, char **argv) {
118 tesseract::CheckSharedLibraryVersion();
119
120 int i;
122 if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) {
123 printf("%s\n", tesseract::TessBaseAPI::Version());
124 return EXIT_SUCCESS;
125 } else if (argc == 2) {
126 printf("Combining tessdata files\n");
127 std::string lang = argv[1];
128 char *last = &argv[1][strlen(argv[1]) - 1];
129 if (*last != '.') {
130 lang += '.';
131 }
132 std::string output_file = lang;
133 output_file += kTrainedDataSuffix;
134 if (!tm.CombineDataFiles(lang.c_str(), output_file.c_str())) {
135 printf("Error combining tessdata files into %s\n", output_file.c_str());
136 } else {
137 printf("Output %s created successfully.\n", output_file.c_str());
138 }
139 } else if (argc >= 4 &&
140 (strcmp(argv[1], "-e") == 0 || strcmp(argv[1], "-u") == 0)) {
141 // Initialize TessdataManager with the data in the given traineddata file.
142 if (!tm.Init(argv[2])) {
143 tprintf("Failed to read %s\n", argv[2]);
144 return EXIT_FAILURE;
145 }
146 printf("Extracting tessdata components from %s\n", argv[2]);
147 if (strcmp(argv[1], "-e") == 0) {
148 for (i = 3; i < argc; ++i) {
149 errno = 0;
150 if (tm.ExtractToFile(argv[i])) {
151 printf("Wrote %s\n", argv[i]);
152 } else if (errno == 0) {
153 printf(
154 "Not extracting %s, since this component"
155 " is not present\n",
156 argv[i]);
157 return EXIT_FAILURE;
158 } else {
159 printf("Error, could not extract %s: %s\n", argv[i], strerror(errno));
160 return EXIT_FAILURE;
161 }
162 }
163 } else { // extract all the components
164 for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
165 std::string filename = argv[3];
166 char *last = &argv[3][strlen(argv[3]) - 1];
167 if (*last != '.') {
168 filename += '.';
169 }
170 filename += tesseract::kTessdataFileSuffixes[i];
171 errno = 0;
172 if (tm.ExtractToFile(filename.c_str())) {
173 printf("Wrote %s\n", filename.c_str());
174 } else if (errno != 0) {
175 printf("Error, could not extract %s: %s\n", filename.c_str(),
176 strerror(errno));
177 return EXIT_FAILURE;
178 }
179 }
180 }
181 } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
182 // Rename the current traineddata file to a temporary name.
183 const char *new_traineddata_filename = argv[2];
184 std::string traineddata_filename = new_traineddata_filename;
185 traineddata_filename += ".__tmp__";
186 if (rename(new_traineddata_filename, traineddata_filename.c_str()) != 0) {
187 tprintf("Failed to create a temporary file %s\n",
188 traineddata_filename.c_str());
189 return EXIT_FAILURE;
190 }
191
192 // Initialize TessdataManager with the data in the given traineddata file.
193 tm.Init(traineddata_filename.c_str());
194
195 // Write the updated traineddata file.
196 tm.OverwriteComponents(new_traineddata_filename, argv + 3, argc - 3);
197 } else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
198 if (!tm.Init(argv[2])) {
199 tprintf("Failed to read %s\n", argv[2]);
200 return EXIT_FAILURE;
201 }
204 tprintf("No LSTM Component found in %s!\n", argv[2]);
205 return EXIT_FAILURE;
206 }
207 tesseract::LSTMRecognizer recognizer;
208 if (!recognizer.DeSerialize(&tm, &fp)) {
209 tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
210 return EXIT_FAILURE;
211 }
212 recognizer.ConvertToInt();
213 std::vector<char> lstm_data;
214 fp.OpenWrite(&lstm_data);
215 ASSERT_HOST(recognizer.Serialize(&tm, &fp));
217 lstm_data.size());
218 if (!tm.SaveFile(argv[2], nullptr)) {
219 tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
220 return EXIT_FAILURE;
221 }
222 } else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
223 return list_components(tm, argv[2]);
224 } else if (argc == 3 && strcmp(argv[1], "-l") == 0) {
225 return list_network(tm, argv[2]);
226 } else if (argc == 3 && strcmp(argv[1], "-dl") == 0) {
227 int result = list_components(tm, argv[2]);
228 if (result == EXIT_SUCCESS) {
229 result = list_network(tm, nullptr);
230 }
231 return result;
232 } else if (argc == 3 && strcmp(argv[1], "-ld") == 0) {
233 int result = list_network(tm, argv[2]);
234 if (result == EXIT_SUCCESS) {
235 result = list_components(tm, nullptr);
236 }
237 return result;
238 } else {
239 printf(
240 "Usage for combining tessdata components:\n"
241 " %s language_data_path_prefix\n"
242 " (e.g. %s tessdata/eng.)\n\n",
243 argv[0], argv[0]);
244 printf(
245 "Usage for extracting tessdata components:\n"
246 " %s -e traineddata_file [output_component_file...]\n"
247 " (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
248 argv[0], argv[0]);
249 printf(
250 "Usage for overwriting tessdata components:\n"
251 " %s -o traineddata_file [input_component_file...]\n"
252 " (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
253 argv[0], argv[0]);
254 printf(
255 "Usage for unpacking all tessdata components:\n"
256 " %s -u traineddata_file output_path_prefix\n"
257 " (e.g. %s -u eng.traineddata tmp/eng.)\n\n",
258 argv[0], argv[0]);
259 printf(
260 "Usage for listing the network information\n"
261 " %s -l traineddata_file\n"
262 " (e.g. %s -l eng.traineddata)\n\n",
263 argv[0], argv[0]);
264 printf(
265 "Usage for listing directory of components:\n"
266 " %s -d traineddata_file\n\n",
267 argv[0]);
268 printf(
269 "Usage for compacting LSTM component to int:\n"
270 " %s -c traineddata_file\n",
271 argv[0]);
272 return EXIT_FAILURE;
273 }
274 tm.Directory();
275 return EXIT_SUCCESS;
276}
#define ASSERT_HOST(x)
Definition: errcode.h:54
int main(int argc, char **argv)
LIST last(LIST var_list)
Definition: oldlist.cpp:153
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
@ TESSDATA_NUM_ENTRIES
static const char * Version()
Definition: baseapi.cpp:241
void OpenWrite(std::vector< char > *data)
Definition: serialis.cpp:246
void OverwriteEntry(TessdataType type, const char *data, int size)
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
bool GetComponent(TessdataType type, TFile *fp)
bool SaveFile(const char *filename, FileWriter writer) const
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
bool ExtractToFile(const char *filename)
bool Init(const char *data_file_name)
std::vector< std::string > EnumerateLayers() const
float GetLayerLearningRate(const std::string &id) const
const char * GetNetwork() const
Network * GetLayer(const std::string &id) const
bool Serialize(const TessdataManager *mgr, TFile *fp) const
bool DeSerialize(const TessdataManager *mgr, TFile *fp)
const std::string & name() const
Definition: network.h:140