tesseract v5.3.3.20231005
tessdatamanager.cpp
Go to the documentation of this file.
1
2// File: tessdatamanager.cpp
3// Description: Functions to handle loading/combining tesseract data files.
4// Author: Daria Antonova
5//
6// (C) Copyright 2009, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifdef HAVE_CONFIG_H
20# include "config_auto.h"
21#endif
22
23#include "tessdatamanager.h"
24
25#include <cstdio>
26#include <string>
27
28#if defined(HAVE_LIBARCHIVE)
29# include <archive.h>
30# include <archive_entry.h>
31#endif
32
33#include <tesseract/version.h>
34#include "errcode.h"
35#include "helpers.h"
36#include "params.h"
37#include "serialis.h"
38#include "tprintf.h"
39
40namespace tesseract {
41
42TessdataManager::TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {
43 SetVersionString(TESSERACT_VERSION_STR);
44}
45
47 : reader_(reader), is_loaded_(false), swap_(false) {
48 SetVersionString(TESSERACT_VERSION_STR);
49}
50
51// Lazily loads from the given filename. Won't actually read the file
52// until it needs it.
53void TessdataManager::LoadFileLater(const char *data_file_name) {
54 Clear();
55 data_file_name_ = data_file_name;
56}
57
#if defined(HAVE_LIBARCHIVE)
// Loads components from an archive file using libarchive.
// Each archive member whose file name extension maps to a TessdataType and
// whose reported size is positive is read into the matching entry.
// Returns true if the archive could be opened and is_loaded_ is set, which
// happens as soon as at least one member has been read completely.
bool TessdataManager::LoadArchiveFile(const char *filename) {
  archive *a = archive_read_new();
  if (a == nullptr) {
    return false;
  }
  bool result = false;
  archive_read_support_filter_all(a);
  archive_read_support_format_all(a);
  if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
    archive_entry *ae;
    while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
      const char *component = archive_entry_pathname(ae);
      if (component == nullptr) {
        continue;
      }
      TessdataType type;
      if (!TessdataTypeFromFileName(component, &type)) {
        // Not a known tessdata component: skip this member.
        continue;
      }
      const int64_t size = archive_entry_size(ae);
      if (size <= 0) {
        continue;
      }
      entries_[type].resize(size);
      if (archive_read_data(a, entries_[type].data(), size) == size) {
        is_loaded_ = true;
      }
    }
    result = is_loaded_;
  }
  // The reader is released on every path once it has been created.
  archive_read_free(a);
  return result;
}
#endif
89
90bool TessdataManager::Init(const char *data_file_name) {
91 std::vector<char> data;
92 if (reader_ == nullptr) {
93#if defined(HAVE_LIBARCHIVE)
94 if (LoadArchiveFile(data_file_name)) {
95 return true;
96 }
97#endif
98 if (!LoadDataFromFile(data_file_name, &data)) {
99 return false;
100 }
101 } else {
102 if (!(*reader_)(data_file_name, &data)) {
103 return false;
104 }
105 }
106 return LoadMemBuffer(data_file_name, &data[0], data.size());
107}
108
109// Loads from the given memory buffer as if a file.
110bool TessdataManager::LoadMemBuffer(const char *name, const char *data, int size) {
111 // TODO: This method supports only the proprietary file format.
112 Clear();
113 data_file_name_ = name;
114 TFile fp;
115 fp.Open(data, size);
116 uint32_t num_entries;
117 if (!fp.DeSerialize(&num_entries)) {
118 return false;
119 }
120 swap_ = num_entries > kMaxNumTessdataEntries;
121 fp.set_swap(swap_);
122 if (swap_) {
123 ReverseN(&num_entries, sizeof(num_entries));
124 }
125 if (num_entries > kMaxNumTessdataEntries) {
126 return false;
127 }
128 // TODO: optimize (no init required).
129 std::vector<int64_t> offset_table(num_entries);
130 if (!fp.DeSerialize(&offset_table[0], num_entries)) {
131 return false;
132 }
133 for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
134 if (offset_table[i] >= 0) {
135 int64_t entry_size = size - offset_table[i];
136 unsigned j = i + 1;
137 while (j < num_entries && offset_table[j] == -1) {
138 ++j;
139 }
140 if (j < num_entries) {
141 entry_size = offset_table[j] - offset_table[i];
142 }
143 entries_[i].resize(entry_size);
144 if (!fp.DeSerialize(&entries_[i][0], entry_size)) {
145 return false;
146 }
147 }
148 }
149 if (entries_[TESSDATA_VERSION].empty()) {
150 SetVersionString("Pre-4.0.0");
151 }
152 is_loaded_ = true;
153 return true;
154}
155
156// Overwrites a single entry of the given type.
157void TessdataManager::OverwriteEntry(TessdataType type, const char *data, int size) {
158 is_loaded_ = true;
159 entries_[type].resize(size);
160 memcpy(&entries_[type][0], data, size);
161}
162
163// Saves to the given filename.
164bool TessdataManager::SaveFile(const char *filename, FileWriter writer) const {
165 // TODO: This method supports only the proprietary file format.
166 ASSERT_HOST(is_loaded_);
167 std::vector<char> data;
168 Serialize(&data);
169 if (writer == nullptr) {
170 return SaveDataToFile(data, filename);
171 } else {
172 return (*writer)(data, filename);
173 }
174}
175
176// Serializes to the given vector.
177void TessdataManager::Serialize(std::vector<char> *data) const {
178 // TODO: This method supports only the proprietary file format.
179 ASSERT_HOST(is_loaded_);
180 // Compute the offset_table and total size.
181 int64_t offset_table[TESSDATA_NUM_ENTRIES];
182 int64_t offset = sizeof(int32_t) + sizeof(offset_table);
183 for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
184 if (entries_[i].empty()) {
185 offset_table[i] = -1;
186 } else {
187 offset_table[i] = offset;
188 offset += entries_[i].size();
189 }
190 }
191 data->resize(offset, 0);
192 int32_t num_entries = TESSDATA_NUM_ENTRIES;
193 TFile fp;
194 fp.OpenWrite(data);
195 fp.Serialize(&num_entries);
196 fp.Serialize(&offset_table[0], countof(offset_table));
197 for (const auto &entry : entries_) {
198 if (!entry.empty()) {
199 fp.Serialize(&entry[0], entry.size());
200 }
201 }
202}
203
204// Resets to the initial state, keeping the reader.
206 for (auto &entry : entries_) {
207 entry.clear();
208 }
209 is_loaded_ = false;
210}
211
212// Prints a directory of contents.
214 tprintf("Version:%s\n", VersionString().c_str());
215 auto offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
216 for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
217 if (!entries_[i].empty()) {
218 tprintf("%u:%s:size=%zu, offset=%zu\n", i, kTessdataFileSuffixes[i], entries_[i].size(),
219 offset);
220 offset += entries_[i].size();
221 }
222 }
223}
224
225// Opens the given TFile pointer to the given component type.
226// Returns false in case of failure.
228 if (!is_loaded_ && !Init(data_file_name_.c_str())) {
229 return false;
230 }
231 const TessdataManager *const_this = this;
232 return const_this->GetComponent(type, fp);
233}
234
235// As non-const version except it can't load the component if not already
236// loaded.
238 ASSERT_HOST(is_loaded_);
239 if (entries_[type].empty()) {
240 return false;
241 }
242 fp->Open(&entries_[type][0], entries_[type].size());
243 fp->set_swap(swap_);
244 return true;
245}
246
247// Returns the current version string.
249 return std::string(&entries_[TESSDATA_VERSION][0], entries_[TESSDATA_VERSION].size());
250}
251
252// Sets the version string to the given v_str.
253void TessdataManager::SetVersionString(const std::string &v_str) {
254 entries_[TESSDATA_VERSION].resize(v_str.size());
255 memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
256}
257
258bool TessdataManager::CombineDataFiles(const char *language_data_path_prefix,
259 const char *output_filename) {
260 // Load individual tessdata components from files.
261 for (auto filesuffix : kTessdataFileSuffixes) {
263 ASSERT_HOST(TessdataTypeFromFileSuffix(filesuffix, &type));
264 std::string filename = language_data_path_prefix;
265 filename += filesuffix;
266 FILE *fp = fopen(filename.c_str(), "rb");
267 if (fp != nullptr) {
268 fclose(fp);
269 if (!LoadDataFromFile(filename.c_str(), &entries_[type])) {
270 tprintf("Load of file %s failed!\n", filename.c_str());
271 return false;
272 }
273 }
274 }
275 is_loaded_ = true;
276
277 // Make sure that the required components are present.
278 if (!IsBaseAvailable() && !IsLSTMAvailable()) {
279 tprintf(
280 "Error: traineddata file must contain at least (a unicharset file"
281 " and inttemp) OR an lstm file.\n");
282 return false;
283 }
284 // Write updated data to the output traineddata file.
285 return SaveFile(output_filename, nullptr);
286}
287
288bool TessdataManager::OverwriteComponents(const char *new_traineddata_filename,
289 char **component_filenames, int num_new_components) {
290 // Open the files with the new components.
291 // TODO: This method supports only the proprietary file format.
292 for (int i = 0; i < num_new_components; ++i) {
294 if (TessdataTypeFromFileName(component_filenames[i], &type)) {
295 if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
296 tprintf("Failed to read component file:%s\n", component_filenames[i]);
297 return false;
298 }
299 }
300 }
301
302 // Write updated data to the output traineddata file.
303 return SaveFile(new_traineddata_filename, nullptr);
304}
305
306bool TessdataManager::ExtractToFile(const char *filename) {
308 ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));
309 if (entries_[type].empty()) {
310 return false;
311 }
312 return SaveDataToFile(entries_[type], filename);
313}
314
315bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type) {
316 for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
317 if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
318 *type = static_cast<TessdataType>(i);
319 return true;
320 }
321 }
322#if !defined(NDEBUG)
323 tprintf(
324 "TessdataManager can't determine which tessdata"
325 " component is represented by %s\n",
326 suffix);
327#endif
328 return false;
329}
330
331bool TessdataManager::TessdataTypeFromFileName(const char *filename, TessdataType *type) {
332 // Get the file suffix (extension)
333 const char *suffix = strrchr(filename, '.');
334 if (suffix == nullptr || *(++suffix) == '\0') {
335 return false;
336 }
337 return TessdataTypeFromFileSuffix(suffix, type);
338}
339
340} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:54
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:184
bool(*)(const std::vector< char > &data, const char *filename) FileWriter
Definition: serialis.h:40
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
@ TESSDATA_NUM_ENTRIES
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:34
bool SaveDataToFile(const GenericVector< char > &data, const char *filename)
bool(*)(const char *filename, std::vector< char > *data) FileReader
Definition: baseapi.h:61
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
type
Definition: upload.py:458
void OpenWrite(std::vector< char > *data)
Definition: serialis.cpp:246
bool DeSerialize(std::string &data)
Definition: serialis.cpp:94
bool Serialize(const std::string &data)
Definition: serialis.cpp:107
void set_swap(bool value)
Definition: serialis.h:75
bool Open(const char *filename, FileReader reader)
Definition: serialis.cpp:140
void OverwriteEntry(TessdataType type, const char *data, int size)
std::string VersionString() const
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
void SetVersionString(const std::string &v_str)
bool GetComponent(TessdataType type, TFile *fp)
bool SaveFile(const char *filename, FileWriter writer) const
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
bool ExtractToFile(const char *filename)
void LoadFileLater(const char *data_file_name)
bool LoadMemBuffer(const char *name, const char *data, int size)
bool Init(const char *data_file_name)
void Serialize(std::vector< char > *data) const