tesseract  4.00.00dev
tessdatamanager.cpp
Go to the documentation of this file.
1 // File: tessdatamanager.cpp
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 // Created: Wed Jun 03 11:26:43 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include "tessdatamanager.h"
25 
26 #include <stdio.h>
27 
28 #include "helpers.h"
29 #include "serialis.h"
30 #include "strngs.h"
31 #include "tprintf.h"
32 #include "params.h"
33 
34 namespace tesseract {
35 
36 // Lazily loads from the the given filename. Won't actually read the file
37 // until it needs it.
38 void TessdataManager::LoadFileLater(const char *data_file_name) {
39  Clear();
40  data_file_name_ = data_file_name;
41 }
42 
43 bool TessdataManager::Init(const char *data_file_name) {
45  if (reader_ == nullptr) {
46  if (!LoadDataFromFile(data_file_name, &data)) return false;
47  } else {
48  if (!(*reader_)(data_file_name, &data)) return false;
49  }
50  return LoadMemBuffer(data_file_name, &data[0], data.size());
51 }
52 
53 // Loads from the given memory buffer as if a file.
54 bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
55  int size) {
56  Clear();
57  data_file_name_ = name;
58  TFile fp;
59  fp.Open(data, size);
60  inT32 num_entries = TESSDATA_NUM_ENTRIES;
61  if (fp.FRead(&num_entries, sizeof(num_entries), 1) != 1) return false;
62  swap_ = num_entries > kMaxNumTessdataEntries || num_entries < 0;
63  fp.set_swap(swap_);
64  if (swap_) ReverseN(&num_entries, sizeof(num_entries));
65  if (num_entries > kMaxNumTessdataEntries || num_entries < 0) return false;
66  GenericVector<inT64> offset_table;
67  offset_table.resize_no_init(num_entries);
68  if (fp.FReadEndian(&offset_table[0], sizeof(offset_table[0]), num_entries) !=
69  num_entries)
70  return false;
71  for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
72  if (offset_table[i] >= 0) {
73  inT64 entry_size = size - offset_table[i];
74  int j = i + 1;
75  while (j < num_entries && offset_table[j] == -1) ++j;
76  if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
77  entries_[i].resize_no_init(entry_size);
78  if (fp.FRead(&entries_[i][0], 1, entry_size) != entry_size) return false;
79  }
80  }
81  if (entries_[TESSDATA_VERSION].empty()) {
82  SetVersionString("Pre-4.0.0");
83  }
84  is_loaded_ = true;
85  return true;
86 }
87 
88 // Overwrites a single entry of the given type.
89 void TessdataManager::OverwriteEntry(TessdataType type, const char *data,
90  int size) {
91  is_loaded_ = true;
92  entries_[type].resize_no_init(size);
93  memcpy(&entries_[type][0], data, size);
94 }
95 
96 // Saves to the given filename.
98  FileWriter writer) const {
99  ASSERT_HOST(is_loaded_);
100  GenericVector<char> data;
101  Serialize(&data);
102  if (writer == nullptr)
103  return SaveDataToFile(data, filename);
104  else
105  return (*writer)(data, filename);
106 }
107 
108 // Serializes to the given vector.
110  ASSERT_HOST(is_loaded_);
111  // Compute the offset_table and total size.
112  inT64 offset_table[TESSDATA_NUM_ENTRIES];
113  inT64 offset = sizeof(inT32) + sizeof(offset_table);
114  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
115  if (entries_[i].empty()) {
116  offset_table[i] = -1;
117  } else {
118  offset_table[i] = offset;
119  offset += entries_[i].size();
120  }
121  }
122  data->init_to_size(offset, 0);
123  inT32 num_entries = TESSDATA_NUM_ENTRIES;
124  TFile fp;
125  fp.OpenWrite(data);
126  fp.FWrite(&num_entries, sizeof(num_entries), 1);
127  fp.FWrite(offset_table, sizeof(offset_table), 1);
128  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
129  if (!entries_[i].empty()) {
130  fp.FWrite(&entries_[i][0], entries_[i].size(), 1);
131  }
132  }
133 }
134 
135 // Resets to the initial state, keeping the reader.
137  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
138  entries_[i].clear();
139  }
140  is_loaded_ = false;
141 }
142 
143 // Prints a directory of contents.
145  tprintf("Version string:%s\n", VersionString().c_str());
146  int offset = TESSDATA_NUM_ENTRIES * sizeof(inT64);
147  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
148  if (!entries_[i].empty()) {
149  tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
150  entries_[i].size(), offset);
151  offset += entries_[i].size();
152  }
153  }
154 }
155 
156 // Opens the given TFile pointer to the given component type.
157 // Returns false in case of failure.
159  if (!is_loaded_ && !Init(data_file_name_.string())) return false;
160  const TessdataManager *const_this = this;
161  return const_this->GetComponent(type, fp);
162 }
163 
164 // As non-const version except it can't load the component if not already
165 // loaded.
167  ASSERT_HOST(is_loaded_);
168  if (entries_[type].empty()) return false;
169  fp->Open(&entries_[type][0], entries_[type].size());
170  fp->set_swap(swap_);
171  return true;
172 }
173 
174 // Returns the current version string.
176  return string(&entries_[TESSDATA_VERSION][0],
177  entries_[TESSDATA_VERSION].size());
178 }
179 
180 // Sets the version string to the given v_str.
181 void TessdataManager::SetVersionString(const string &v_str) {
182  entries_[TESSDATA_VERSION].resize_no_init(v_str.size());
183  memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
184 }
185 
187  const char *language_data_path_prefix,
188  const char *output_filename) {
189  // Load individual tessdata components from files.
190  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
191  TessdataType type;
192  ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
193  STRING filename = language_data_path_prefix;
194  filename += kTessdataFileSuffixes[i];
195  FILE *fp = fopen(filename.string(), "rb");
196  if (fp != nullptr) {
197  fclose(fp);
198  if (!LoadDataFromFile(filename, &entries_[type])) {
199  tprintf("Load of file %s failed!\n", filename.string());
200  return false;
201  }
202  }
203  }
204  is_loaded_ = true;
205 
206  // Make sure that the required components are present.
207  if (!IsBaseAvailable() && !IsLSTMAvailable()) {
208  tprintf(
209  "Error: traineddata file must contain at least (a unicharset file"
210  "and inttemp) OR an lstm file.\n");
211  return false;
212  }
213  // Write updated data to the output traineddata file.
214  return SaveFile(output_filename, nullptr);
215 }
216 
218  const char *new_traineddata_filename,
219  char **component_filenames,
220  int num_new_components) {
221  // Open the files with the new components.
222  for (int i = 0; i < num_new_components; ++i) {
223  TessdataType type;
224  if (TessdataTypeFromFileName(component_filenames[i], &type)) {
225  if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
226  tprintf("Failed to read component file:%s\n", component_filenames[i]);
227  return false;
228  }
229  }
230  }
231 
232  // Write updated data to the output traineddata file.
233  return SaveFile(new_traineddata_filename, nullptr);
234 }
235 
238  ASSERT_HOST(
240  if (entries_[type].empty()) return false;
241  return SaveDataToFile(entries_[type], filename);
242 }
243 
245  TessdataType *type) {
246  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
247  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
248  *type = static_cast<TessdataType>(i);
249  return true;
250  }
251  }
252  tprintf("TessdataManager can't determine which tessdata"
253  " component is represented by %s\n", suffix);
254  return false;
255 }
256 
258  TessdataType *type) {
259  // Get the file suffix (extension)
260  const char *suffix = strrchr(filename, '.');
261  if (suffix == nullptr || *(++suffix) == '\0') return false;
262  return TessdataTypeFromFileSuffix(suffix, type);
263 }
264 
265 } // namespace tesseract
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type)
int FWrite(const void *buffer, int size, int count)
Definition: serialis.cpp:148
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type)
bool SaveFile(const STRING &filename, FileWriter writer) const
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:125
void resize_no_init(int size)
Definition: genericvector.h:66
bool ExtractToFile(const char *filename)
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31
bool GetComponent(TessdataType type, TFile *fp)
bool(* FileWriter)(const GenericVector< char > &data, const STRING &filename)
void set_swap(bool value)
Definition: serialis.h:65
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
const char * string() const
Definition: strngs.cpp:198
bool LoadMemBuffer(const char *name, const char *data, int size)
void LoadFileLater(const char *data_file_name)
void Serialize(GenericVector< char > *data) const
Definition: strngs.h:45
int32_t inT32
Definition: host.h:38
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:38
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
int FReadEndian(void *buffer, int size, int count)
Definition: serialis.cpp:97
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:184
void OverwriteEntry(TessdataType type, const char *data, int size)
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
bool Init(const char *data_file_name)
void init_to_size(int size, T t)
int FRead(void *buffer, int size, int count)
Definition: serialis.cpp:108
int64_t inT64
Definition: host.h:40
void SetVersionString(const string &v_str)