tesseract v5.3.3.20231005
tesseract::TessdataManager Class Reference

#include <tessdatamanager.h>

Public Member Functions

 TessdataManager ()
 
 TessdataManager (FileReader reader)
 
 ~TessdataManager ()=default
 
bool swap () const
 
bool is_loaded () const
 
void LoadFileLater (const char *data_file_name)
 
bool Init (const char *data_file_name)
 
bool LoadMemBuffer (const char *name, const char *data, int size)
 
void OverwriteEntry (TessdataType type, const char *data, int size)
 
bool SaveFile (const char *filename, FileWriter writer) const
 
void Serialize (std::vector< char > *data) const
 
void Clear ()
 
void Directory () const
 
bool IsComponentAvailable (TessdataType type) const
 
bool GetComponent (TessdataType type, TFile *fp)
 
bool GetComponent (TessdataType type, TFile *fp) const
 
std::string VersionString () const
 
void SetVersionString (const std::string &v_str)
 
bool IsBaseAvailable () const
 
bool IsLSTMAvailable () const
 
const std::string & GetDataFileName () const
 
bool CombineDataFiles (const char *language_data_path_prefix, const char *output_filename)
 
bool OverwriteComponents (const char *new_traineddata_filename, char **component_filenames, int num_new_components)
 
bool ExtractToFile (const char *filename)
 

Detailed Description

Definition at line 127 of file tessdatamanager.h.

Constructor & Destructor Documentation

◆ TessdataManager() [1/2]

tesseract::TessdataManager::TessdataManager ( )

Definition at line 42 of file tessdatamanager.cpp.

42 : reader_(nullptr), is_loaded_(false), swap_(false) {
43 SetVersionString(TESSERACT_VERSION_STR);
44}
void SetVersionString(const std::string &v_str)

◆ TessdataManager() [2/2]

tesseract::TessdataManager::TessdataManager ( FileReader  reader)
explicit

Definition at line 46 of file tessdatamanager.cpp.

47 : reader_(reader), is_loaded_(false), swap_(false) {
48 SetVersionString(TESSERACT_VERSION_STR);
49}

◆ ~TessdataManager()

tesseract::TessdataManager::~TessdataManager ( )
default

Member Function Documentation

◆ Clear()

void tesseract::TessdataManager::Clear ( )

Definition at line 205 of file tessdatamanager.cpp.

205 {
206 for (auto &entry : entries_) {
207 entry.clear();
208 }
209 is_loaded_ = false;
210}

◆ CombineDataFiles()

bool tesseract::TessdataManager::CombineDataFiles ( const char *  language_data_path_prefix,
const char *  output_filename 
)

Reads all the standard tesseract config and data files for a language at the given path and bundles them up into one binary data file. Returns true if the combined traineddata file was successfully written.

Definition at line 258 of file tessdatamanager.cpp.

259 {
260 // Load individual tessdata components from files.
261 for (auto filesuffix : kTessdataFileSuffixes) {
263 ASSERT_HOST(TessdataTypeFromFileSuffix(filesuffix, &type));
264 std::string filename = language_data_path_prefix;
265 filename += filesuffix;
266 FILE *fp = fopen(filename.c_str(), "rb");
267 if (fp != nullptr) {
268 fclose(fp);
269 if (!LoadDataFromFile(filename.c_str(), &entries_[type])) {
270 tprintf("Load of file %s failed!\n", filename.c_str());
271 return false;
272 }
273 }
274 }
275 is_loaded_ = true;
276
277 // Make sure that the required components are present.
278 if (!IsBaseAvailable() && !IsLSTMAvailable()) {
279 tprintf(
280 "Error: traineddata file must contain at least (a unicharset file"
281 " and inttemp) OR an lstm file.\n");
282 return false;
283 }
284 // Write updated data to the output traineddata file.
285 return SaveFile(output_filename, nullptr);
286}
#define ASSERT_HOST(x)
Definition: errcode.h:54
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
type
Definition: upload.py:458
bool SaveFile(const char *filename, FileWriter writer) const

◆ Directory()

void tesseract::TessdataManager::Directory ( ) const

Definition at line 213 of file tessdatamanager.cpp.

213 {
214 tprintf("Version:%s\n", VersionString().c_str());
215 auto offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
216 for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
217 if (!entries_[i].empty()) {
218 tprintf("%u:%s:size=%zu, offset=%zu\n", i, kTessdataFileSuffixes[i], entries_[i].size(),
219 offset);
220 offset += entries_[i].size();
221 }
222 }
223}
@ TESSDATA_NUM_ENTRIES
std::string VersionString() const

◆ ExtractToFile()

bool tesseract::TessdataManager::ExtractToFile ( const char *  filename)

Extracts the tessdata component implied by the name of the input file from the combined traineddata loaded into TessdataManager, and writes the extracted component to the file indicated by that file name. For example, if the filename given is somepath/somelang.unicharset, the unicharset will be extracted from the data loaded into the TessdataManager and written to somepath/somelang.unicharset.

Returns
true if the component was successfully extracted, false if the component was not present in the traineddata loaded into TessdataManager.

Definition at line 306 of file tessdatamanager.cpp.

306 {
308 ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));
309 if (entries_[type].empty()) {
310 return false;
311 }
312 return SaveDataToFile(entries_[type], filename);
313}
bool SaveDataToFile(const GenericVector< char > &data, const char *filename)

◆ GetComponent() [1/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
)

Definition at line 227 of file tessdatamanager.cpp.

227 {
228 if (!is_loaded_ && !Init(data_file_name_.c_str())) {
229 return false;
230 }
231 const TessdataManager *const_this = this;
232 return const_this->GetComponent(type, fp);
233}
bool Init(const char *data_file_name)

◆ GetComponent() [2/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
) const

Definition at line 237 of file tessdatamanager.cpp.

237 {
238 ASSERT_HOST(is_loaded_);
239 if (entries_[type].empty()) {
240 return false;
241 }
242 fp->Open(&entries_[type][0], entries_[type].size());
243 fp->set_swap(swap_);
244 return true;
245}

◆ GetDataFileName()

const std::string & tesseract::TessdataManager::GetDataFileName ( ) const
inline

Definition at line 192 of file tessdatamanager.h.

192 {
193 return data_file_name_;
194 }

◆ Init()

bool tesseract::TessdataManager::Init ( const char *  data_file_name)

Opens and reads the given data file right now.

Returns
true on success.

Definition at line 90 of file tessdatamanager.cpp.

90 {
91 std::vector<char> data;
92 if (reader_ == nullptr) {
93#if defined(HAVE_LIBARCHIVE)
94 if (LoadArchiveFile(data_file_name)) {
95 return true;
96 }
97#endif
98 if (!LoadDataFromFile(data_file_name, &data)) {
99 return false;
100 }
101 } else {
102 if (!(*reader_)(data_file_name, &data)) {
103 return false;
104 }
105 }
106 return LoadMemBuffer(data_file_name, &data[0], data.size());
107}
bool LoadMemBuffer(const char *name, const char *data, int size)

◆ is_loaded()

bool tesseract::TessdataManager::is_loaded ( ) const
inline

Definition at line 137 of file tessdatamanager.h.

137 {
138 return is_loaded_;
139 }

◆ IsBaseAvailable()

bool tesseract::TessdataManager::IsBaseAvailable ( ) const
inline

Definition at line 182 of file tessdatamanager.h.

182 {
183 return !entries_[TESSDATA_UNICHARSET].empty() && !entries_[TESSDATA_INTTEMP].empty();
184 }

◆ IsComponentAvailable()

bool tesseract::TessdataManager::IsComponentAvailable ( TessdataType  type) const
inline

Definition at line 166 of file tessdatamanager.h.

166 {
167 return !entries_[type].empty();
168 }

◆ IsLSTMAvailable()

bool tesseract::TessdataManager::IsLSTMAvailable ( ) const
inline

Definition at line 187 of file tessdatamanager.h.

187 {
188 return !entries_[TESSDATA_LSTM].empty();
189 }

◆ LoadFileLater()

void tesseract::TessdataManager::LoadFileLater ( const char *  data_file_name)

Definition at line 53 of file tessdatamanager.cpp.

53 {
54 Clear();
55 data_file_name_ = data_file_name;
56}

◆ LoadMemBuffer()

bool tesseract::TessdataManager::LoadMemBuffer ( const char *  name,
const char *  data,
int  size 
)

Definition at line 110 of file tessdatamanager.cpp.

110 {
111 // TODO: This method supports only the proprietary file format.
112 Clear();
113 data_file_name_ = name;
114 TFile fp;
115 fp.Open(data, size);
116 uint32_t num_entries;
117 if (!fp.DeSerialize(&num_entries)) {
118 return false;
119 }
120 swap_ = num_entries > kMaxNumTessdataEntries;
121 fp.set_swap(swap_);
122 if (swap_) {
123 ReverseN(&num_entries, sizeof(num_entries));
124 }
125 if (num_entries > kMaxNumTessdataEntries) {
126 return false;
127 }
128 // TODO: optimize (no init required).
129 std::vector<int64_t> offset_table(num_entries);
130 if (!fp.DeSerialize(&offset_table[0], num_entries)) {
131 return false;
132 }
133 for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
134 if (offset_table[i] >= 0) {
135 int64_t entry_size = size - offset_table[i];
136 unsigned j = i + 1;
137 while (j < num_entries && offset_table[j] == -1) {
138 ++j;
139 }
140 if (j < num_entries) {
141 entry_size = offset_table[j] - offset_table[i];
142 }
143 entries_[i].resize(entry_size);
144 if (!fp.DeSerialize(&entries_[i][0], entry_size)) {
145 return false;
146 }
147 }
148 }
149 if (entries_[TESSDATA_VERSION].empty()) {
150 SetVersionString("Pre-4.0.0");
151 }
152 is_loaded_ = true;
153 return true;
154}
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:184

◆ OverwriteComponents()

bool tesseract::TessdataManager::OverwriteComponents ( const char *  new_traineddata_filename,
char **  component_filenames,
int  num_new_components 
)

Gets the individual components from the data_file_ with which the class was initialized. Overwrites the components specified by component_filenames. Writes the updated traineddata file to new_traineddata_filename.

Definition at line 288 of file tessdatamanager.cpp.

289 {
290 // Open the files with the new components.
291 // TODO: This method supports only the proprietary file format.
292 for (int i = 0; i < num_new_components; ++i) {
294 if (TessdataTypeFromFileName(component_filenames[i], &type)) {
295 if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
296 tprintf("Failed to read component file:%s\n", component_filenames[i]);
297 return false;
298 }
299 }
300 }
301
302 // Write updated data to the output traineddata file.
303 return SaveFile(new_traineddata_filename, nullptr);
304}

◆ OverwriteEntry()

void tesseract::TessdataManager::OverwriteEntry ( TessdataType  type,
const char *  data,
int  size 
)

Definition at line 157 of file tessdatamanager.cpp.

157 {
158 is_loaded_ = true;
159 entries_[type].resize(size);
160 memcpy(&entries_[type][0], data, size);
161}

◆ SaveFile()

bool tesseract::TessdataManager::SaveFile ( const char *  filename,
FileWriter  writer 
) const

Definition at line 164 of file tessdatamanager.cpp.

164 {
165 // TODO: This method supports only the proprietary file format.
166 ASSERT_HOST(is_loaded_);
167 std::vector<char> data;
168 Serialize(&data);
169 if (writer == nullptr) {
170 return SaveDataToFile(data, filename);
171 } else {
172 return (*writer)(data, filename);
173 }
174}
void Serialize(std::vector< char > *data) const

◆ Serialize()

void tesseract::TessdataManager::Serialize ( std::vector< char > *  data) const

Definition at line 177 of file tessdatamanager.cpp.

177 {
178 // TODO: This method supports only the proprietary file format.
179 ASSERT_HOST(is_loaded_);
180 // Compute the offset_table and total size.
181 int64_t offset_table[TESSDATA_NUM_ENTRIES];
182 int64_t offset = sizeof(int32_t) + sizeof(offset_table);
183 for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
184 if (entries_[i].empty()) {
185 offset_table[i] = -1;
186 } else {
187 offset_table[i] = offset;
188 offset += entries_[i].size();
189 }
190 }
191 data->resize(offset, 0);
192 int32_t num_entries = TESSDATA_NUM_ENTRIES;
193 TFile fp;
194 fp.OpenWrite(data);
195 fp.Serialize(&num_entries);
196 fp.Serialize(&offset_table[0], countof(offset_table));
197 for (const auto &entry : entries_) {
198 if (!entry.empty()) {
199 fp.Serialize(&entry[0], entry.size());
200 }
201 }
202}
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:34

◆ SetVersionString()

void tesseract::TessdataManager::SetVersionString ( const std::string &  v_str)

Definition at line 253 of file tessdatamanager.cpp.

253 {
254 entries_[TESSDATA_VERSION].resize(v_str.size());
255 memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
256}

◆ swap()

bool tesseract::TessdataManager::swap ( ) const
inline

Definition at line 134 of file tessdatamanager.h.

134 {
135 return swap_;
136 }

◆ VersionString()

std::string tesseract::TessdataManager::VersionString ( ) const

Definition at line 248 of file tessdatamanager.cpp.

248 {
249 return std::string(&entries_[TESSDATA_VERSION][0], entries_[TESSDATA_VERSION].size());
250}

The documentation for this class was generated from the following files: