tesseract  4.00.00dev
combine_tessdata.cpp
Go to the documentation of this file.
1 // File: combine_tessdata.cpp
3 // Description: Creates a unified traineddata file from several
4 // data files produced by the training process.
5 // Author: Daria Antonova
6 // Created: Wed Jun 03 11:26:43 PST 2009
7 //
8 // (C) Copyright 2009, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #include "lstmrecognizer.h"
22 #include "tessdatamanager.h"
23 
24 // Main program to combine/extract/overwrite tessdata components
25 // in [lang].traineddata files.
26 //
27 // To combine all the individual tessdata components (unicharset, DAWGs,
28 // classifier templates, ambiguities, language configs) located at, say,
29 // /home/$USER/temp/eng.* run:
30 //
31 // combine_tessdata /home/$USER/temp/eng.
32 //
33 // The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
34 //
35 // Specify option -e if you would like to extract individual components
36 // from a combined traineddata file. For example, to extract language config
37 // file and the unicharset from tessdata/eng.traineddata run:
38 //
39 // combine_tessdata -e tessdata/eng.traineddata
40 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
41 //
42 // The desired config file and unicharset will be written to
43 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
44 //
45 // Specify option -o to overwrite individual components of the given
46 // [lang].traineddata file. For example, to overwrite language config
47 // and unichar ambiguities files in tessdata/eng.traineddata use:
48 //
49 // combine_tessdata -o tessdata/eng.traineddata
50 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
51 //
52 // As a result, tessdata/eng.traineddata will contain the new language config
53 // and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
54 //
55 // Note: the file names of the files to extract to and to overwrite from should
56 // have the appropriate file suffixes (extensions) indicating their tessdata
57 // component type (.unicharset for the unicharset, .unicharambigs for unichar
58 // ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
59 //
60 // Specify option -u to unpack all the components to the specified path:
61 //
62 // combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
63 //
64 // This will create /home/$USER/temp/eng.* files with individual tessdata
65 // components from tessdata/eng.traineddata.
66 //
67 int main(int argc, char **argv) {
68  int i;
70  if (argc == 2) {
71  printf("Combining tessdata files\n");
72  STRING lang = argv[1];
73  char* last = &argv[1][strlen(argv[1])-1];
74  if (*last != '.')
75  lang += '.';
76  STRING output_file = lang;
77  output_file += kTrainedDataSuffix;
78  if (!tm.CombineDataFiles(lang.string(), output_file.string())) {
79  printf("Error combining tessdata files into %s\n",
80  output_file.string());
81  } else {
82  printf("Output %s created successfully.\n", output_file.string());
83  }
84  } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
85  strcmp(argv[1], "-u") == 0)) {
86  // Initialize TessdataManager with the data in the given traineddata file.
87  if (!tm.Init(argv[2])) {
88  tprintf("Failed to read %s\n", argv[2]);
89  exit(1);
90  }
91  printf("Extracting tessdata components from %s\n", argv[2]);
92  if (strcmp(argv[1], "-e") == 0) {
93  for (i = 3; i < argc; ++i) {
94  if (tm.ExtractToFile(argv[i])) {
95  printf("Wrote %s\n", argv[i]);
96  } else {
97  printf("Not extracting %s, since this component"
98  " is not present\n", argv[i]);
99  }
100  }
101  } else { // extract all the components
102  for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
103  STRING filename = argv[3];
104  char* last = &argv[3][strlen(argv[3])-1];
105  if (*last != '.')
106  filename += '.';
107  filename += tesseract::kTessdataFileSuffixes[i];
108  if (tm.ExtractToFile(filename.string())) {
109  printf("Wrote %s\n", filename.string());
110  }
111  }
112  }
113  } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
114  // Rename the current traineddata file to a temporary name.
115  const char *new_traineddata_filename = argv[2];
116  STRING traineddata_filename = new_traineddata_filename;
117  traineddata_filename += ".__tmp__";
118  if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) {
119  tprintf("Failed to create a temporary file %s\n",
120  traineddata_filename.string());
121  exit(1);
122  }
123 
124  // Initialize TessdataManager with the data in the given traineddata file.
125  tm.Init(traineddata_filename.string());
126 
127  // Write the updated traineddata file.
128  tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
129  } else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
130  if (!tm.Init(argv[2])) {
131  tprintf("Failed to read %s\n", argv[2]);
132  exit(1);
133  }
134  tesseract::TFile fp;
135  if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
136  tprintf("No LSTM Component found in %s!\n", argv[2]);
137  exit(1);
138  }
139  tesseract::LSTMRecognizer recognizer;
140  if (!recognizer.DeSerialize(&tm, &fp)) {
141  tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
142  exit(1);
143  }
144  recognizer.ConvertToInt();
145  GenericVector<char> lstm_data;
146  fp.OpenWrite(&lstm_data);
147  ASSERT_HOST(recognizer.Serialize(&tm, &fp));
148  tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0],
149  lstm_data.size());
150  if (!tm.SaveFile(argv[2], nullptr)) {
151  tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
152  exit(1);
153  }
154  } else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
155  // Initialize TessdataManager with the data in the given traineddata file.
156  tm.Init(argv[2]);
157  } else {
158  printf("Usage for combining tessdata components:\n"
159  " %s language_data_path_prefix\n"
160  " (e.g. %s tessdata/eng.)\n\n", argv[0], argv[0]);
161  printf("Usage for extracting tessdata components:\n"
162  " %s -e traineddata_file [output_component_file...]\n"
163  " (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
164  argv[0], argv[0]);
165  printf("Usage for overwriting tessdata components:\n"
166  " %s -o traineddata_file [input_component_file...]\n"
167  " (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
168  argv[0], argv[0]);
169  printf("Usage for unpacking all tessdata components:\n"
170  " %s -u traineddata_file output_path_prefix\n"
171  " (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]);
172  printf(
173  "Usage for listing directory of components:\n"
174  " %s -d traineddata_file\n",
175  argv[0]);
176  printf(
177  "Usage for compacting LSTM component to int:\n"
178  " %s -c traineddata_file\n",
179  argv[0]);
180  return 1;
181  }
182  tm.Directory();
183 }
bool SaveFile(const STRING &filename, FileWriter writer) const
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:125
bool ExtractToFile(const char *filename)
int size() const
Definition: genericvector.h:72
bool Serialize(const TessdataManager *mgr, TFile *fp) const
LIST last(LIST var_list)
Definition: oldlist.cpp:247
#define tprintf(...)
Definition: tprintf.h:31
bool GetComponent(TessdataType type, TFile *fp)
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
const char * string() const
Definition: strngs.cpp:198
Definition: strngs.h:45
bool DeSerialize(const TessdataManager *mgr, TFile *fp)
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
void OverwriteEntry(TessdataType type, const char *data, int size)
bool Init(const char *data_file_name)
int main(int argc, char **argv)