tesseract  4.0.0-beta.1-59-g2cc4
tesseract::TessdataManager Class Reference

#include <tessdatamanager.h>

Public Member Functions

 TessdataManager ()
 
 TessdataManager (FileReader reader)
 
 ~TessdataManager ()
 
bool swap () const
 
bool is_loaded () const
 
void LoadFileLater (const char *data_file_name)
 
bool Init (const char *data_file_name)
 
bool LoadMemBuffer (const char *name, const char *data, int size)
 
void OverwriteEntry (TessdataType type, const char *data, int size)
 
bool SaveFile (const STRING &filename, FileWriter writer) const
 
void Serialize (GenericVector< char > *data) const
 
void Clear ()
 
void Directory () const
 
bool IsComponentAvailable (TessdataType type) const
 
bool GetComponent (TessdataType type, TFile *fp)
 
bool GetComponent (TessdataType type, TFile *fp) const
 
std::string VersionString () const
 
void SetVersionString (const std::string &v_str)
 
bool IsBaseAvailable () const
 
bool IsLSTMAvailable () const
 
const STRING & GetDataFileName () const
 
bool CombineDataFiles (const char *language_data_path_prefix, const char *output_filename)
 
bool OverwriteComponents (const char *new_traineddata_filename, char **component_filenames, int num_new_components)
 
bool ExtractToFile (const char *filename)
 

Static Public Member Functions

static bool TessdataTypeFromFileSuffix (const char *suffix, TessdataType *type)
 
static bool TessdataTypeFromFileName (const char *filename, TessdataType *type)
 

Detailed Description

Definition at line 126 of file tessdatamanager.h.

Constructor & Destructor Documentation

◆ TessdataManager() [1/2]

tesseract::TessdataManager::TessdataManager ( )

Definition at line 40 of file tessdatamanager.cpp.

40  : reader_(nullptr), is_loaded_(false), swap_(false) {
41  SetVersionString(PACKAGE_VERSION);
42 }
void SetVersionString(const std::string &v_str)
#define PACKAGE_VERSION
Definition: config_auto.h:131

◆ TessdataManager() [2/2]

tesseract::TessdataManager::TessdataManager ( FileReader  reader)
explicit

Definition at line 44 of file tessdatamanager.cpp.

45  : reader_(reader),
46  is_loaded_(false),
47  swap_(false) {
48  SetVersionString(PACKAGE_VERSION);
49 }
void SetVersionString(const std::string &v_str)
#define PACKAGE_VERSION
Definition: config_auto.h:131

◆ ~TessdataManager()

tesseract::TessdataManager::~TessdataManager ( )
inline

Definition at line 131 of file tessdatamanager.h.

131 {}

Member Function Documentation

◆ Clear()

void tesseract::TessdataManager::Clear ( )

Definition at line 151 of file tessdatamanager.cpp.

151  {
152  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
153  entries_[i].clear();
154  }
155  is_loaded_ = false;
156 }

◆ CombineDataFiles()

bool tesseract::TessdataManager::CombineDataFiles ( const char *  language_data_path_prefix,
const char *  output_filename 
)

Reads all the standard tesseract config and data files for a language at the given path and bundles them up into one binary data file. Returns true if the combined traineddata file was successfully written.

Definition at line 201 of file tessdatamanager.cpp.

203  {
204  // Load individual tessdata components from files.
205  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
206  TessdataType type;
207  ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
208  STRING filename = language_data_path_prefix;
209  filename += kTessdataFileSuffixes[i];
210  FILE *fp = fopen(filename.string(), "rb");
211  if (fp != nullptr) {
212  fclose(fp);
213  if (!LoadDataFromFile(filename, &entries_[type])) {
214  tprintf("Load of file %s failed!\n", filename.string());
215  return false;
216  }
217  }
218  }
219  is_loaded_ = true;
220 
221  // Make sure that the required components are present.
222  if (!IsBaseAvailable() && !IsLSTMAvailable()) {
223  tprintf(
224  "Error: traineddata file must contain at least (a unicharset file"
225  "and inttemp) OR an lstm file.\n");
226  return false;
227  }
228  // Write updated data to the output traineddata file.
229  return SaveFile(output_filename, nullptr);
230 }
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type)
Definition: strngs.h:45
bool SaveFile(const STRING &filename, FileWriter writer) const
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
#define ASSERT_HOST(x)
Definition: errcode.h:84
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198

◆ Directory()

void tesseract::TessdataManager::Directory ( ) const

Definition at line 159 of file tessdatamanager.cpp.

159  {
160  tprintf("Version string:%s\n", VersionString().c_str());
161  int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t);
162  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
163  if (!entries_[i].empty()) {
164  tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
165  entries_[i].size(), offset);
166  offset += entries_[i].size();
167  }
168  }
169 }
std::string VersionString() const
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31

◆ ExtractToFile()

bool tesseract::TessdataManager::ExtractToFile ( const char *  filename)

Extracts tessdata component implied by the name of the input file from the combined traineddata loaded into TessdataManager. Writes the extracted component to the file indicated by the file name. E.g. if the filename given is somepath/somelang.unicharset, unicharset will be extracted from the data loaded into the TessdataManager and will be written to somepath/somelang.unicharset.

Returns
true if the component was successfully extracted, false if the component was not present in the traineddata loaded into TessdataManager.

Definition at line 251 of file tessdatamanager.cpp.

251  {
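252  TessdataType type = TESSDATA_NUM_ENTRIES;
253  ASSERT_HOST(
254  tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));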
255  if (entries_[type].empty()) return false;
256  return SaveDataToFile(entries_[type], filename);
257 }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type)

◆ GetComponent() [1/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
)

Definition at line 173 of file tessdatamanager.cpp.

173  {
174  if (!is_loaded_ && !Init(data_file_name_.string())) return false;
175  const TessdataManager *const_this = this;
176  return const_this->GetComponent(type, fp);
177 }
bool Init(const char *data_file_name)
const char * string() const
Definition: strngs.cpp:198

◆ GetComponent() [2/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
) const

Definition at line 181 of file tessdatamanager.cpp.

181  {
182  ASSERT_HOST(is_loaded_);
183  if (entries_[type].empty()) return false;
184  fp->Open(&entries_[type][0], entries_[type].size());
185  fp->set_swap(swap_);
186  return true;
187 }
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ GetDataFileName()

const STRING& tesseract::TessdataManager::GetDataFileName ( ) const
inline

Definition at line 186 of file tessdatamanager.h.

186 { return data_file_name_; }

◆ Init()

bool tesseract::TessdataManager::Init ( const char *  data_file_name)

Opens and reads the given data file right now.

Returns
true on success.

Definition at line 58 of file tessdatamanager.cpp.

58  {
59  GenericVector<char> data;
60  if (reader_ == nullptr) {
61  if (!LoadDataFromFile(data_file_name, &data)) return false;
62  } else {
63  if (!(*reader_)(data_file_name, &data)) return false;
64  }
65  return LoadMemBuffer(data_file_name, &data[0], data.size());
66 }
int size() const
Definition: genericvector.h:72
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
bool LoadMemBuffer(const char *name, const char *data, int size)

◆ is_loaded()

bool tesseract::TessdataManager::is_loaded ( ) const
inline

Definition at line 134 of file tessdatamanager.h.

134 { return is_loaded_; }

◆ IsBaseAvailable()

bool tesseract::TessdataManager::IsBaseAvailable ( ) const
inline

Definition at line 177 of file tessdatamanager.h.

177  {
178  return !entries_[TESSDATA_UNICHARSET].empty() &&
179  !entries_[TESSDATA_INTTEMP].empty();
180  }
bool empty() const
Definition: genericvector.h:91

◆ IsComponentAvailable()

bool tesseract::TessdataManager::IsComponentAvailable ( TessdataType  type) const
inline

Definition at line 161 of file tessdatamanager.h.

161  {
162  return !entries_[type].empty();
163  }
bool empty() const
Definition: genericvector.h:91

◆ IsLSTMAvailable()

bool tesseract::TessdataManager::IsLSTMAvailable ( ) const
inline

Definition at line 183 of file tessdatamanager.h.

183 { return !entries_[TESSDATA_LSTM].empty(); }
bool empty() const
Definition: genericvector.h:91

◆ LoadFileLater()

void tesseract::TessdataManager::LoadFileLater ( const char *  data_file_name)

Definition at line 53 of file tessdatamanager.cpp.

53  {
54  Clear();
55  data_file_name_ = data_file_name;
56 }

◆ LoadMemBuffer()

bool tesseract::TessdataManager::LoadMemBuffer ( const char *  name,
const char *  data,
int  size 
)

Definition at line 69 of file tessdatamanager.cpp.

70  {
71  Clear();
72  data_file_name_ = name;
73  TFile fp;
74  fp.Open(data, size);
75  int32_t num_entries = TESSDATA_NUM_ENTRIES;
76  if (fp.FRead(&num_entries, sizeof(num_entries), 1) != 1) return false;
77  swap_ = num_entries > kMaxNumTessdataEntries || num_entries < 0;
78  fp.set_swap(swap_);
79  if (swap_) ReverseN(&num_entries, sizeof(num_entries));
80  if (num_entries > kMaxNumTessdataEntries || num_entries < 0) return false;
81  GenericVector<int64_t> offset_table;
82  offset_table.resize_no_init(num_entries);
83  if (fp.FReadEndian(&offset_table[0], sizeof(offset_table[0]), num_entries) !=
84  num_entries)
85  return false;
86  for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
87  if (offset_table[i] >= 0) {
88  int64_t entry_size = size - offset_table[i];
89  int j = i + 1;
90  while (j < num_entries && offset_table[j] == -1) ++j;
91  if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
92  entries_[i].resize_no_init(entry_size);
93  if (fp.FRead(&entries_[i][0], 1, entry_size) != entry_size) return false;
94  }
95  }
96  if (entries_[TESSDATA_VERSION].empty()) {
97  SetVersionString("Pre-4.0.0");
98  }
99  is_loaded_ = true;
100  return true;
101 }
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:189
void resize_no_init(int size)
Definition: genericvector.h:66
void SetVersionString(const std::string &v_str)

◆ OverwriteComponents()

bool tesseract::TessdataManager::OverwriteComponents ( const char *  new_traineddata_filename,
char **  component_filenames,
int  num_new_components 
)

Gets the individual components from the data_file_ with which the class was initialized. Overwrites the components specified by component_filenames. Writes the updated traineddata file to new_traineddata_filename.

Definition at line 232 of file tessdatamanager.cpp.

235  {
236  // Open the files with the new components.
237  for (int i = 0; i < num_new_components; ++i) {
238  TessdataType type;
239  if (TessdataTypeFromFileName(component_filenames[i], &type)) {
240  if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
241  tprintf("Failed to read component file:%s\n", component_filenames[i]);
242  return false;
243  }
244  }
245  }
246 
247  // Write updated data to the output traineddata file.
248  return SaveFile(new_traineddata_filename, nullptr);
249 }
bool SaveFile(const STRING &filename, FileWriter writer) const
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
#define tprintf(...)
Definition: tprintf.h:31
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type)

◆ OverwriteEntry()

void tesseract::TessdataManager::OverwriteEntry ( TessdataType  type,
const char *  data,
int  size 
)

Definition at line 104 of file tessdatamanager.cpp.

105  {
106  is_loaded_ = true;
107  entries_[type].resize_no_init(size);
108  memcpy(&entries_[type][0], data, size);
109 }
void resize_no_init(int size)
Definition: genericvector.h:66

◆ SaveFile()

bool tesseract::TessdataManager::SaveFile ( const STRING &  filename,
FileWriter  writer 
) const

Definition at line 112 of file tessdatamanager.cpp.

113  {
114  ASSERT_HOST(is_loaded_);
115  GenericVector<char> data;
116  Serialize(&data);
117  if (writer == nullptr)
118  return SaveDataToFile(data, filename);
119  else
120  return (*writer)(data, filename);
121 }
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)
void Serialize(GenericVector< char > *data) const

◆ Serialize()

void tesseract::TessdataManager::Serialize ( GenericVector< char > *  data) const

Definition at line 124 of file tessdatamanager.cpp.

124  {
125  ASSERT_HOST(is_loaded_);
126  // Compute the offset_table and total size.
127  int64_t offset_table[TESSDATA_NUM_ENTRIES];
128  int64_t offset = sizeof(int32_t) + sizeof(offset_table);
129  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
130  if (entries_[i].empty()) {
131  offset_table[i] = -1;
132  } else {
133  offset_table[i] = offset;
134  offset += entries_[i].size();
135  }
136  }
137  data->init_to_size(offset, 0);
138  int32_t num_entries = TESSDATA_NUM_ENTRIES;
139  TFile fp;
140  fp.OpenWrite(data);
141  fp.FWrite(&num_entries, sizeof(num_entries), 1);
142  fp.FWrite(offset_table, sizeof(offset_table), 1);
143  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
144  if (!entries_[i].empty()) {
145  fp.FWrite(&entries_[i][0], entries_[i].size(), 1);
146  }
147  }
148 }
int size() const
Definition: genericvector.h:72
#define ASSERT_HOST(x)
Definition: errcode.h:84
void init_to_size(int size, T t)

◆ SetVersionString()

void tesseract::TessdataManager::SetVersionString ( const std::string &  v_str)

Definition at line 196 of file tessdatamanager.cpp.

196  {
197  entries_[TESSDATA_VERSION].resize_no_init(v_str.size());
198  memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
199 }
void resize_no_init(int size)
Definition: genericvector.h:66

◆ swap()

bool tesseract::TessdataManager::swap ( ) const
inline

Definition at line 133 of file tessdatamanager.h.

133 { return swap_; }

◆ TessdataTypeFromFileName()

bool tesseract::TessdataManager::TessdataTypeFromFileName ( const char *  filename,
TessdataType *  type 
)
static

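Tries to determine the tessdata component type from the suffix (the text after the last '.') of filename and stores it in type. Returns true on success; returns false if filename has no '.', ends with '.', or the suffix is not a known tessdata component suffix.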

Definition at line 272 of file tessdatamanager.cpp.

273  {
274  // Get the file suffix (extension)
275  const char *suffix = strrchr(filename, '.');
276  if (suffix == nullptr || *(++suffix) == '\0') return false;
277  return TessdataTypeFromFileSuffix(suffix, type);
278 }
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type)

◆ TessdataTypeFromFileSuffix()

bool tesseract::TessdataManager::TessdataTypeFromFileSuffix ( const char *  suffix,
TessdataType type 
)
static

Fills type with the TessdataType of the tessdata component represented by the given file suffix. E.g. unicharset -> TESSDATA_UNICHARSET.

Returns
true if the tessdata component type could be determined from the given file suffix; otherwise prints a diagnostic and returns false.

Definition at line 259 of file tessdatamanager.cpp.

260  {
261  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
262  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
263  *type = static_cast<TessdataType>(i);
264  return true;
265  }
266  }
267  tprintf("TessdataManager can't determine which tessdata"
268  " component is represented by %s\n", suffix);
269  return false;
270 }
#define tprintf(...)
Definition: tprintf.h:31

◆ VersionString()

std::string tesseract::TessdataManager::VersionString ( ) const

Definition at line 190 of file tessdatamanager.cpp.

190  {
191  return std::string(&entries_[TESSDATA_VERSION][0],
192  entries_[TESSDATA_VERSION].size());
193 }

The documentation for this class was generated from the following files: