tesseract v5.3.3.20231005
combine_tessdata.cpp File Reference
#include "commontraining.h"
#include "lstmrecognizer.h"
#include "tessdatamanager.h"
#include <cerrno>
#include <iostream>

Go to the source code of this file.

Functions

int main (int argc, char **argv)
 

Function Documentation

◆ main()

int main ( int  argc,
char **  argv 
)

Definition at line 117 of file combine_tessdata.cpp.

117 {
118 tesseract::CheckSharedLibraryVersion();
119
120 int i;
121 tesseract::TessdataManager tm;
122 if (argc > 1 && (!strcmp(argv[1], "-v") || !strcmp(argv[1], "--version"))) {
123 printf("%s\n", tesseract::TessBaseAPI::Version());
124 return EXIT_SUCCESS;
125 } else if (argc == 2) {
126 printf("Combining tessdata files\n");
127 std::string lang = argv[1];
128 char *last = &argv[1][strlen(argv[1]) - 1];
129 if (*last != '.') {
130 lang += '.';
131 }
132 std::string output_file = lang;
133 output_file += kTrainedDataSuffix;
134 if (!tm.CombineDataFiles(lang.c_str(), output_file.c_str())) {
135 printf("Error combining tessdata files into %s\n", output_file.c_str());
136 } else {
137 printf("Output %s created successfully.\n", output_file.c_str());
138 }
139 } else if (argc >= 4 &&
140 (strcmp(argv[1], "-e") == 0 || strcmp(argv[1], "-u") == 0)) {
141 // Initialize TessdataManager with the data in the given traineddata file.
142 if (!tm.Init(argv[2])) {
143 tprintf("Failed to read %s\n", argv[2]);
144 return EXIT_FAILURE;
145 }
146 printf("Extracting tessdata components from %s\n", argv[2]);
147 if (strcmp(argv[1], "-e") == 0) {
148 for (i = 3; i < argc; ++i) {
149 errno = 0;
150 if (tm.ExtractToFile(argv[i])) {
151 printf("Wrote %s\n", argv[i]);
152 } else if (errno == 0) {
153 printf(
154 "Not extracting %s, since this component"
155 " is not present\n",
156 argv[i]);
157 return EXIT_FAILURE;
158 } else {
159 printf("Error, could not extract %s: %s\n", argv[i], strerror(errno));
160 return EXIT_FAILURE;
161 }
162 }
163 } else { // extract all the components
164 for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
165 std::string filename = argv[3];
166 char *last = &argv[3][strlen(argv[3]) - 1];
167 if (*last != '.') {
168 filename += '.';
169 }
170 filename += tesseract::kTessdataFileSuffixes[i];
171 errno = 0;
172 if (tm.ExtractToFile(filename.c_str())) {
173 printf("Wrote %s\n", filename.c_str());
174 } else if (errno != 0) {
175 printf("Error, could not extract %s: %s\n", filename.c_str(),
176 strerror(errno));
177 return EXIT_FAILURE;
178 }
179 }
180 }
181 } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
182 // Rename the current traineddata file to a temporary name.
183 const char *new_traineddata_filename = argv[2];
184 std::string traineddata_filename = new_traineddata_filename;
185 traineddata_filename += ".__tmp__";
186 if (rename(new_traineddata_filename, traineddata_filename.c_str()) != 0) {
187 tprintf("Failed to create a temporary file %s\n",
188 traineddata_filename.c_str());
189 return EXIT_FAILURE;
190 }
191
192 // Initialize TessdataManager with the data in the given traineddata file.
193 tm.Init(traineddata_filename.c_str());
194
195 // Write the updated traineddata file.
196 tm.OverwriteComponents(new_traineddata_filename, argv + 3, argc - 3);
197 } else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
198 if (!tm.Init(argv[2])) {
199 tprintf("Failed to read %s\n", argv[2]);
200 return EXIT_FAILURE;
201 }
202 tesseract::TFile fp;
203 if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
204 tprintf("No LSTM Component found in %s!\n", argv[2]);
205 return EXIT_FAILURE;
206 }
207 tesseract::LSTMRecognizer recognizer;
208 if (!recognizer.DeSerialize(&tm, &fp)) {
209 tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
210 return EXIT_FAILURE;
211 }
212 recognizer.ConvertToInt();
213 std::vector<char> lstm_data;
214 fp.OpenWrite(&lstm_data);
215 ASSERT_HOST(recognizer.Serialize(&tm, &fp));
216 tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0],
217 lstm_data.size());
218 if (!tm.SaveFile(argv[2], nullptr)) {
219 tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
220 return EXIT_FAILURE;
221 }
222 } else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
223 return list_components(tm, argv[2]);
224 } else if (argc == 3 && strcmp(argv[1], "-l") == 0) {
225 return list_network(tm, argv[2]);
226 } else if (argc == 3 && strcmp(argv[1], "-dl") == 0) {
227 int result = list_components(tm, argv[2]);
228 if (result == EXIT_SUCCESS) {
229 result = list_network(tm, nullptr);
230 }
231 return result;
232 } else if (argc == 3 && strcmp(argv[1], "-ld") == 0) {
233 int result = list_network(tm, argv[2]);
234 if (result == EXIT_SUCCESS) {
235 result = list_components(tm, nullptr);
236 }
237 return result;
238 } else {
239 printf(
240 "Usage for combining tessdata components:\n"
241 " %s language_data_path_prefix\n"
242 " (e.g. %s tessdata/eng.)\n\n",
243 argv[0], argv[0]);
244 printf(
245 "Usage for extracting tessdata components:\n"
246 " %s -e traineddata_file [output_component_file...]\n"
247 " (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
248 argv[0], argv[0]);
249 printf(
250 "Usage for overwriting tessdata components:\n"
251 " %s -o traineddata_file [input_component_file...]\n"
252 " (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
253 argv[0], argv[0]);
254 printf(
255 "Usage for unpacking all tessdata components:\n"
256 " %s -u traineddata_file output_path_prefix\n"
257 " (e.g. %s -u eng.traineddata tmp/eng.)\n\n",
258 argv[0], argv[0]);
259 printf(
260 "Usage for listing the network information\n"
261 " %s -l traineddata_file\n"
262 " (e.g. %s -l eng.traineddata)\n\n",
263 argv[0], argv[0]);
264 printf(
265 "Usage for listing directory of components:\n"
266 " %s -d traineddata_file\n\n",
267 argv[0]);
268 printf(
269 "Usage for compacting LSTM component to int:\n"
270 " %s -c traineddata_file\n",
271 argv[0]);
272 return EXIT_FAILURE;
273 }
274 tm.Directory();
275 return EXIT_SUCCESS;
276 }
#define ASSERT_HOST(x)
Definition: errcode.h:54
LIST last(LIST var_list)
Definition: oldlist.cpp:153
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
@ TESSDATA_NUM_ENTRIES
static const char * Version()
Definition: baseapi.cpp:241
void OpenWrite(std::vector< char > *data)
Definition: serialis.cpp:246
void OverwriteEntry(TessdataType type, const char *data, int size)
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
bool GetComponent(TessdataType type, TFile *fp)
bool SaveFile(const char *filename, FileWriter writer) const
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
bool ExtractToFile(const char *filename)
bool Init(const char *data_file_name)
bool Serialize(const TessdataManager *mgr, TFile *fp) const
bool DeSerialize(const TessdataManager *mgr, TFile *fp)