tesseract  4.0.0-beta.1-59-g2cc4
tessdatamanager.cpp
Go to the documentation of this file.
1 // File: tessdatamanager.cpp
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 // Created: Wed Jun 03 11:26:43 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #ifdef HAVE_CONFIG_H
25 #include "config_auto.h"
26 #endif
27 
28 #include "tessdatamanager.h"
29 
30 #include <stdio.h>
31 
32 #include "helpers.h"
33 #include "serialis.h"
34 #include "strngs.h"
35 #include "tprintf.h"
36 #include "params.h"
37 
38 namespace tesseract {
39 
40 TessdataManager::TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {
42 }
43 
45  : reader_(reader),
46  is_loaded_(false),
47  swap_(false) {
49 }
50 
51 // Lazily loads from the the given filename. Won't actually read the file
52 // until it needs it.
53 void TessdataManager::LoadFileLater(const char *data_file_name) {
54  Clear();
55  data_file_name_ = data_file_name;
56 }
57 
58 bool TessdataManager::Init(const char *data_file_name) {
60  if (reader_ == nullptr) {
61  if (!LoadDataFromFile(data_file_name, &data)) return false;
62  } else {
63  if (!(*reader_)(data_file_name, &data)) return false;
64  }
65  return LoadMemBuffer(data_file_name, &data[0], data.size());
66 }
67 
68 // Loads from the given memory buffer as if a file.
69 bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
70  int size) {
71  Clear();
72  data_file_name_ = name;
73  TFile fp;
74  fp.Open(data, size);
75  int32_t num_entries = TESSDATA_NUM_ENTRIES;
76  if (fp.FRead(&num_entries, sizeof(num_entries), 1) != 1) return false;
77  swap_ = num_entries > kMaxNumTessdataEntries || num_entries < 0;
78  fp.set_swap(swap_);
79  if (swap_) ReverseN(&num_entries, sizeof(num_entries));
80  if (num_entries > kMaxNumTessdataEntries || num_entries < 0) return false;
81  GenericVector<int64_t> offset_table;
82  offset_table.resize_no_init(num_entries);
83  if (fp.FReadEndian(&offset_table[0], sizeof(offset_table[0]), num_entries) !=
84  num_entries)
85  return false;
86  for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
87  if (offset_table[i] >= 0) {
88  int64_t entry_size = size - offset_table[i];
89  int j = i + 1;
90  while (j < num_entries && offset_table[j] == -1) ++j;
91  if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
92  entries_[i].resize_no_init(entry_size);
93  if (fp.FRead(&entries_[i][0], 1, entry_size) != entry_size) return false;
94  }
95  }
96  if (entries_[TESSDATA_VERSION].empty()) {
97  SetVersionString("Pre-4.0.0");
98  }
99  is_loaded_ = true;
100  return true;
101 }
102 
103 // Overwrites a single entry of the given type.
104 void TessdataManager::OverwriteEntry(TessdataType type, const char *data,
105  int size) {
106  is_loaded_ = true;
107  entries_[type].resize_no_init(size);
108  memcpy(&entries_[type][0], data, size);
109 }
110 
111 // Saves to the given filename.
113  FileWriter writer) const {
114  ASSERT_HOST(is_loaded_);
115  GenericVector<char> data;
116  Serialize(&data);
117  if (writer == nullptr)
118  return SaveDataToFile(data, filename);
119  else
120  return (*writer)(data, filename);
121 }
122 
123 // Serializes to the given vector.
125  ASSERT_HOST(is_loaded_);
126  // Compute the offset_table and total size.
127  int64_t offset_table[TESSDATA_NUM_ENTRIES];
128  int64_t offset = sizeof(int32_t) + sizeof(offset_table);
129  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
130  if (entries_[i].empty()) {
131  offset_table[i] = -1;
132  } else {
133  offset_table[i] = offset;
134  offset += entries_[i].size();
135  }
136  }
137  data->init_to_size(offset, 0);
138  int32_t num_entries = TESSDATA_NUM_ENTRIES;
139  TFile fp;
140  fp.OpenWrite(data);
141  fp.FWrite(&num_entries, sizeof(num_entries), 1);
142  fp.FWrite(offset_table, sizeof(offset_table), 1);
143  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
144  if (!entries_[i].empty()) {
145  fp.FWrite(&entries_[i][0], entries_[i].size(), 1);
146  }
147  }
148 }
149 
150 // Resets to the initial state, keeping the reader.
152  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
153  entries_[i].clear();
154  }
155  is_loaded_ = false;
156 }
157 
158 // Prints a directory of contents.
160  tprintf("Version string:%s\n", VersionString().c_str());
161  int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
162  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
163  if (!entries_[i].empty()) {
164  tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
165  entries_[i].size(), offset);
166  offset += entries_[i].size();
167  }
168  }
169 }
170 
171 // Opens the given TFile pointer to the given component type.
172 // Returns false in case of failure.
174  if (!is_loaded_ && !Init(data_file_name_.string())) return false;
175  const TessdataManager *const_this = this;
176  return const_this->GetComponent(type, fp);
177 }
178 
179 // As non-const version except it can't load the component if not already
180 // loaded.
182  ASSERT_HOST(is_loaded_);
183  if (entries_[type].empty()) return false;
184  fp->Open(&entries_[type][0], entries_[type].size());
185  fp->set_swap(swap_);
186  return true;
187 }
188 
189 // Returns the current version string.
190 std::string TessdataManager::VersionString() const {
191  return std::string(&entries_[TESSDATA_VERSION][0],
192  entries_[TESSDATA_VERSION].size());
193 }
194 
195 // Sets the version string to the given v_str.
196 void TessdataManager::SetVersionString(const std::string &v_str) {
197  entries_[TESSDATA_VERSION].resize_no_init(v_str.size());
198  memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
199 }
200 
202  const char *language_data_path_prefix,
203  const char *output_filename) {
204  // Load individual tessdata components from files.
205  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
206  TessdataType type;
207  ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
208  STRING filename = language_data_path_prefix;
209  filename += kTessdataFileSuffixes[i];
210  FILE *fp = fopen(filename.string(), "rb");
211  if (fp != nullptr) {
212  fclose(fp);
213  if (!LoadDataFromFile(filename, &entries_[type])) {
214  tprintf("Load of file %s failed!\n", filename.string());
215  return false;
216  }
217  }
218  }
219  is_loaded_ = true;
220 
221  // Make sure that the required components are present.
222  if (!IsBaseAvailable() && !IsLSTMAvailable()) {
223  tprintf(
224  "Error: traineddata file must contain at least (a unicharset file"
225  "and inttemp) OR an lstm file.\n");
226  return false;
227  }
228  // Write updated data to the output traineddata file.
229  return SaveFile(output_filename, nullptr);
230 }
231 
233  const char *new_traineddata_filename,
234  char **component_filenames,
235  int num_new_components) {
236  // Open the files with the new components.
237  for (int i = 0; i < num_new_components; ++i) {
238  TessdataType type;
239  if (TessdataTypeFromFileName(component_filenames[i], &type)) {
240  if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
241  tprintf("Failed to read component file:%s\n", component_filenames[i]);
242  return false;
243  }
244  }
245  }
246 
247  // Write updated data to the output traineddata file.
248  return SaveFile(new_traineddata_filename, nullptr);
249 }
250 
253  ASSERT_HOST(
255  if (entries_[type].empty()) return false;
256  return SaveDataToFile(entries_[type], filename);
257 }
258 
260  TessdataType *type) {
261  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
262  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
263  *type = static_cast<TessdataType>(i);
264  return true;
265  }
266  }
267  tprintf("TessdataManager can't determine which tessdata"
268  " component is represented by %s\n", suffix);
269  return false;
270 }
271 
273  TessdataType *type) {
274  // Get the file suffix (extension)
275  const char *suffix = strrchr(filename, '.');
276  if (suffix == nullptr || *(++suffix) == '\0') return false;
277  return TessdataTypeFromFileSuffix(suffix, type);
278 }
279 
280 } // namespace tesseract
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type)
bool ExtractToFile(const char *filename)
bool Init(const char *data_file_name)
bool(* FileReader)(const STRING &filename, GenericVector< char > *data)
std::string VersionString() const
int FRead(void *buffer, int size, int count)
Definition: serialis.cpp:112
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:129
Definition: strngs.h:45
bool SaveFile(const STRING &filename, FileWriter writer) const
int size() const
Definition: genericvector.h:72
bool GetComponent(TessdataType type, TFile *fp)
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
void set_swap(bool value)
Definition: serialis.h:65
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:189
bool(* FileWriter)(const GenericVector< char > &data, const STRING &filename)
#define ASSERT_HOST(x)
Definition: errcode.h:84
void resize_no_init(int size)
Definition: genericvector.h:66
int FReadEndian(void *buffer, int size, int count)
Definition: serialis.cpp:101
#define tprintf(...)
Definition: tprintf.h:31
void OverwriteEntry(TessdataType type, const char *data, int size)
bool LoadMemBuffer(const char *name, const char *data, int size)
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:38
void LoadFileLater(const char *data_file_name)
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
void SetVersionString(const std::string &v_str)
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
const char * string() const
Definition: strngs.cpp:198
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)
int FWrite(const void *buffer, int size, int count)
Definition: serialis.cpp:152
void Serialize(GenericVector< char > *data) const
#define PACKAGE_VERSION
Definition: config_auto.h:131
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type)
void init_to_size(int size, T t)