20# include "config_auto.h"
28#if defined(HAVE_LIBARCHIVE)
30# include <archive_entry.h>
33#include <tesseract/version.h>
47 : reader_(reader), is_loaded_(false), swap_(false) {
55 data_file_name_ = data_file_name;
58#if defined(HAVE_LIBARCHIVE)
59bool TessdataManager::LoadArchiveFile(
const char *filename) {
61 archive *a = archive_read_new();
63 archive_read_support_filter_all(a);
64 archive_read_support_format_all(a);
65 if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
67 while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
68 const char *component = archive_entry_pathname(ae);
69 if (component !=
nullptr) {
71 if (TessdataTypeFromFileName(component, &
type)) {
72 int64_t size = archive_entry_size(ae);
74 entries_[
type].resize(size);
75 if (archive_read_data(a, &entries_[
type][0], size) == size) {
91 std::vector<char> data;
92 if (reader_ ==
nullptr) {
93#if defined(HAVE_LIBARCHIVE)
94 if (LoadArchiveFile(data_file_name)) {
102 if (!(*reader_)(data_file_name, &data)) {
113 data_file_name_ = name;
116 uint32_t num_entries;
120 swap_ = num_entries > kMaxNumTessdataEntries;
123 ReverseN(&num_entries,
sizeof(num_entries));
125 if (num_entries > kMaxNumTessdataEntries) {
129 std::vector<int64_t> offset_table(num_entries);
130 if (!fp.
DeSerialize(&offset_table[0], num_entries)) {
134 if (offset_table[
i] >= 0) {
135 int64_t entry_size = size - offset_table[
i];
137 while (j < num_entries && offset_table[j] == -1) {
140 if (j < num_entries) {
141 entry_size = offset_table[j] - offset_table[
i];
143 entries_[
i].resize(entry_size);
159 entries_[
type].resize(size);
160 memcpy(&entries_[
type][0], data, size);
167 std::vector<char> data;
169 if (writer ==
nullptr) {
172 return (*writer)(data, filename);
182 int64_t offset =
sizeof(int32_t) +
sizeof(offset_table);
184 if (entries_[
i].empty()) {
185 offset_table[
i] = -1;
187 offset_table[
i] = offset;
188 offset += entries_[
i].size();
191 data->resize(offset, 0);
197 for (
const auto &entry : entries_) {
198 if (!entry.empty()) {
206 for (
auto &entry : entries_) {
217 if (!entries_[
i].empty()) {
218 tprintf(
"%u:%s:size=%zu, offset=%zu\n",
i, kTessdataFileSuffixes[
i], entries_[
i].size(),
220 offset += entries_[
i].size();
228 if (!is_loaded_ && !
Init(data_file_name_.c_str())) {
239 if (entries_[
type].empty()) {
259 const char *output_filename) {
261 for (
auto filesuffix : kTessdataFileSuffixes) {
264 std::string filename = language_data_path_prefix;
265 filename += filesuffix;
266 FILE *fp = fopen(filename.c_str(),
"rb");
270 tprintf(
"Load of file %s failed!\n", filename.c_str());
280 "Error: traineddata file must contain at least (a unicharset file"
281 " and inttemp) OR an lstm file.\n");
285 return SaveFile(output_filename,
nullptr);
289 char **component_filenames,
int num_new_components) {
292 for (
int i = 0;
i < num_new_components; ++
i) {
294 if (TessdataTypeFromFileName(component_filenames[
i], &
type)) {
296 tprintf(
"Failed to read component file:%s\n", component_filenames[
i]);
303 return SaveFile(new_traineddata_filename,
nullptr);
308 ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(filename, &
type));
309 if (entries_[
type].empty()) {
315bool TessdataManager::TessdataTypeFromFileSuffix(
const char *suffix,
TessdataType *
type) {
317 if (strcmp(kTessdataFileSuffixes[
i], suffix) == 0) {
324 "TessdataManager can't determine which tessdata"
325 " component is represented by %s\n",
331bool TessdataManager::TessdataTypeFromFileName(
const char *filename,
TessdataType *
type) {
333 const char *suffix = strrchr(filename,
'.');
334 if (suffix ==
nullptr || *(++suffix) ==
'\0') {
337 return TessdataTypeFromFileSuffix(suffix,
type);
void ReverseN(void *ptr, int num_bytes)
bool(*)(const std::vector< char > &data, const char *filename) FileWriter
void tprintf(const char *format,...)
constexpr size_t countof(T const (&)[N]) noexcept
bool SaveDataToFile(const GenericVector< char > &data, const char *filename)
bool(*)(const char *filename, std::vector< char > *data) FileReader
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
void OpenWrite(std::vector< char > *data)
bool DeSerialize(std::string &data)
bool Serialize(const std::string &data)
void set_swap(bool value)
bool Open(const char *filename, FileReader reader)
void OverwriteEntry(TessdataType type, const char *data, int size)
std::string VersionString() const
bool IsLSTMAvailable() const
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
void SetVersionString(const std::string &v_str)
bool GetComponent(TessdataType type, TFile *fp)
bool SaveFile(const char *filename, FileWriter writer) const
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
bool ExtractToFile(const char *filename)
void LoadFileLater(const char *data_file_name)
bool IsBaseAvailable() const
bool LoadMemBuffer(const char *name, const char *data, int size)
bool Init(const char *data_file_name)
void Serialize(std::vector< char > *data) const