tesseract  4.00.00dev
tesseract::TessdataManager Class Reference

#include <tessdatamanager.h>

Public Member Functions

 TessdataManager ()
 
 TessdataManager (FileReader reader)
 
 ~TessdataManager ()
 
bool swap () const
 
bool is_loaded () const
 
void LoadFileLater (const char *data_file_name)
 
bool Init (const char *data_file_name)
 
bool LoadMemBuffer (const char *name, const char *data, int size)
 
void OverwriteEntry (TessdataType type, const char *data, int size)
 
bool SaveFile (const STRING &filename, FileWriter writer) const
 
void Serialize (GenericVector< char > *data) const
 
void Clear ()
 
void Directory () const
 
bool IsComponentAvailable (TessdataType type) const
 
bool GetComponent (TessdataType type, TFile *fp)
 
bool GetComponent (TessdataType type, TFile *fp) const
 
string VersionString () const
 
void SetVersionString (const string &v_str)
 
bool IsBaseAvailable () const
 
bool IsLSTMAvailable () const
 
const STRING & GetDataFileName () const
 
bool CombineDataFiles (const char *language_data_path_prefix, const char *output_filename)
 
bool OverwriteComponents (const char *new_traineddata_filename, char **component_filenames, int num_new_components)
 
bool ExtractToFile (const char *filename)
 

Static Public Member Functions

static bool TessdataTypeFromFileSuffix (const char *suffix, TessdataType *type)
 
static bool TessdataTypeFromFileName (const char *filename, TessdataType *type)
 

Detailed Description

Definition at line 131 of file tessdatamanager.h.

Constructor & Destructor Documentation

◆ TessdataManager() [1/2]

tesseract::TessdataManager::TessdataManager ( )
inline

Definition at line 133 of file tessdatamanager.h.

133  : reader_(nullptr), is_loaded_(false), swap_(false) {
134  SetVersionString(TESSERACT_VERSION_STR);
135  }
#define TESSERACT_VERSION_STR
Definition: version.h:8
void SetVersionString(const string &v_str)

◆ TessdataManager() [2/2]

tesseract::TessdataManager::TessdataManager ( FileReader  reader)
inline explicit

Definition at line 136 of file tessdatamanager.h.

137  : reader_(reader), is_loaded_(false), swap_(false) {
138  SetVersionString(TESSERACT_VERSION_STR);
139  }
#define TESSERACT_VERSION_STR
Definition: version.h:8
void SetVersionString(const string &v_str)

◆ ~TessdataManager()

tesseract::TessdataManager::~TessdataManager ( )
inline

Definition at line 140 of file tessdatamanager.h.

140 {}

Member Function Documentation

◆ Clear()

void tesseract::TessdataManager::Clear ( )

Definition at line 136 of file tessdatamanager.cpp.

136  {
137  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
138  entries_[i].clear();
139  }
140  is_loaded_ = false;
141 }

◆ CombineDataFiles()

bool tesseract::TessdataManager::CombineDataFiles ( const char *  language_data_path_prefix,
const char *  output_filename 
)

Reads all the standard tesseract config and data files for a language at the given path and bundles them up into one binary data file. Returns true if the combined traineddata file was successfully written.

Definition at line 186 of file tessdatamanager.cpp.

188  {
189  // Load individual tessdata components from files.
190  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
191  TessdataType type;
192  ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
193  STRING filename = language_data_path_prefix;
194  filename += kTessdataFileSuffixes[i];
195  FILE *fp = fopen(filename.string(), "rb");
196  if (fp != nullptr) {
197  fclose(fp);
198  if (!LoadDataFromFile(filename, &entries_[type])) {
199  tprintf("Load of file %s failed!\n", filename.string());
200  return false;
201  }
202  }
203  }
204  is_loaded_ = true;
205 
206  // Make sure that the required components are present.
207  if (!IsBaseAvailable() && !IsLSTMAvailable()) {
208  tprintf(
209  "Error: traineddata file must contain at least (a unicharset file"
210  "and inttemp) OR an lstm file.\n");
211  return false;
212  }
213  // Write updated data to the output traineddata file.
214  return SaveFile(output_filename, nullptr);
215 }
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type)
bool SaveFile(const STRING &filename, FileWriter writer) const
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
Definition: strngs.h:45
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)

◆ Directory()

void tesseract::TessdataManager::Directory ( ) const

Definition at line 144 of file tessdatamanager.cpp.

144  {
145  tprintf("Version string:%s\n", VersionString().c_str());
146  int offset = TESSDATA_NUM_ENTRIES * sizeof(inT64);
147  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
148  if (!entries_[i].empty()) {
149  tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
150  entries_[i].size(), offset);
151  offset += entries_[i].size();
152  }
153  }
154 }
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31
int64_t inT64
Definition: host.h:40

◆ ExtractToFile()

bool tesseract::TessdataManager::ExtractToFile ( const char *  filename)

Extracts tessdata component implied by the name of the input file from the combined traineddata loaded into TessdataManager. Writes the extracted component to the file indicated by the file name. E.g. if the filename given is somepath/somelang.unicharset, unicharset will be extracted from the data loaded into the TessdataManager and will be written to somepath/somelang.unicharset.

Returns
true if the component was successfully extracted, false if the component was not present in the traineddata loaded into TessdataManager.

Definition at line 236 of file tessdatamanager.cpp.

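236  {
237  TessdataType type = TESSDATA_NUM_ENTRIES;
238  ASSERT_HOST(
239  tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));
240  if (entries_[type].empty()) return false;
241  return SaveDataToFile(entries_[type], filename);
242 }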
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type)
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)

◆ GetComponent() [1/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
)

Definition at line 158 of file tessdatamanager.cpp.

158  {
159  if (!is_loaded_ && !Init(data_file_name_.string())) return false;
160  const TessdataManager *const_this = this;
161  return const_this->GetComponent(type, fp);
162 }
const char * string() const
Definition: strngs.cpp:198
bool Init(const char *data_file_name)

◆ GetComponent() [2/2]

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
) const

Definition at line 166 of file tessdatamanager.cpp.

166  {
167  ASSERT_HOST(is_loaded_);
168  if (entries_[type].empty()) return false;
169  fp->Open(&entries_[type][0], entries_[type].size());
170  fp->set_swap(swap_);
171  return true;
172 }
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ GetDataFileName()

const STRING& tesseract::TessdataManager::GetDataFileName ( ) const
inline

Definition at line 195 of file tessdatamanager.h.

195 { return data_file_name_; }

◆ Init()

bool tesseract::TessdataManager::Init ( const char *  data_file_name)

Opens and reads the given data file right now.

Returns
true on success.

Definition at line 43 of file tessdatamanager.cpp.

43  {
44  GenericVector<char> data;
45  if (reader_ == nullptr) {
46  if (!LoadDataFromFile(data_file_name, &data)) return false;
47  } else {
48  if (!(*reader_)(data_file_name, &data)) return false;
49  }
50  return LoadMemBuffer(data_file_name, &data[0], data.size());
51 }
int size() const
Definition: genericvector.h:72
bool LoadMemBuffer(const char *name, const char *data, int size)
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)

◆ is_loaded()

bool tesseract::TessdataManager::is_loaded ( ) const
inline

Definition at line 143 of file tessdatamanager.h.

143 { return is_loaded_; }

◆ IsBaseAvailable()

bool tesseract::TessdataManager::IsBaseAvailable ( ) const
inline

Definition at line 186 of file tessdatamanager.h.

186  {
187  return !entries_[TESSDATA_UNICHARSET].empty() &&
188  !entries_[TESSDATA_INTTEMP].empty();
189  }
bool empty() const
Definition: genericvector.h:91

◆ IsComponentAvailable()

bool tesseract::TessdataManager::IsComponentAvailable ( TessdataType  type) const
inline

Definition at line 170 of file tessdatamanager.h.

170  {
171  return !entries_[type].empty();
172  }
bool empty() const
Definition: genericvector.h:91

◆ IsLSTMAvailable()

bool tesseract::TessdataManager::IsLSTMAvailable ( ) const
inline

Definition at line 192 of file tessdatamanager.h.

192 { return !entries_[TESSDATA_LSTM].empty(); }
bool empty() const
Definition: genericvector.h:91

◆ LoadFileLater()

void tesseract::TessdataManager::LoadFileLater ( const char *  data_file_name)

Definition at line 38 of file tessdatamanager.cpp.

38  {
39  Clear();
40  data_file_name_ = data_file_name;
41 }

◆ LoadMemBuffer()

bool tesseract::TessdataManager::LoadMemBuffer ( const char *  name,
const char *  data,
int  size 
)

Definition at line 54 of file tessdatamanager.cpp.

55  {
56  Clear();
57  data_file_name_ = name;
58  TFile fp;
59  fp.Open(data, size);
60  inT32 num_entries = TESSDATA_NUM_ENTRIES;
61  if (fp.FRead(&num_entries, sizeof(num_entries), 1) != 1) return false;
62  swap_ = num_entries > kMaxNumTessdataEntries || num_entries < 0;
63  fp.set_swap(swap_);
64  if (swap_) ReverseN(&num_entries, sizeof(num_entries));
65  if (num_entries > kMaxNumTessdataEntries || num_entries < 0) return false;
66  GenericVector<inT64> offset_table;
67  offset_table.resize_no_init(num_entries);
68  if (fp.FReadEndian(&offset_table[0], sizeof(offset_table[0]), num_entries) !=
69  num_entries)
70  return false;
71  for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
72  if (offset_table[i] >= 0) {
73  inT64 entry_size = size - offset_table[i];
74  int j = i + 1;
75  while (j < num_entries && offset_table[j] == -1) ++j;
76  if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
77  entries_[i].resize_no_init(entry_size);
78  if (fp.FRead(&entries_[i][0], 1, entry_size) != entry_size) return false;
79  }
80  }
81  if (entries_[TESSDATA_VERSION].empty()) {
82  SetVersionString("Pre-4.0.0");
83  }
84  is_loaded_ = true;
85  return true;
86 }
void resize_no_init(int size)
Definition: genericvector.h:66
int32_t inT32
Definition: host.h:38
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:184
int64_t inT64
Definition: host.h:40
void SetVersionString(const string &v_str)

◆ OverwriteComponents()

bool tesseract::TessdataManager::OverwriteComponents ( const char *  new_traineddata_filename,
char **  component_filenames,
int  num_new_components 
)

Gets the individual components from the data_file_ with which the class was initialized. Overwrites the components specified by component_filenames. Writes the updated traineddata file to new_traineddata_filename.

Definition at line 217 of file tessdatamanager.cpp.

220  {
221  // Open the files with the new components.
222  for (int i = 0; i < num_new_components; ++i) {
223  TessdataType type;
224  if (TessdataTypeFromFileName(component_filenames[i], &type)) {
225  if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
226  tprintf("Failed to read component file:%s\n", component_filenames[i]);
227  return false;
228  }
229  }
230  }
231 
232  // Write updated data to the output traineddata file.
233  return SaveFile(new_traineddata_filename, nullptr);
234 }
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type)
bool SaveFile(const STRING &filename, FileWriter writer) const
#define tprintf(...)
Definition: tprintf.h:31
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)

◆ OverwriteEntry()

void tesseract::TessdataManager::OverwriteEntry ( TessdataType  type,
const char *  data,
int  size 
)

Definition at line 89 of file tessdatamanager.cpp.

90  {
91  is_loaded_ = true;
92  entries_[type].resize_no_init(size);
93  memcpy(&entries_[type][0], data, size);
94 }
void resize_no_init(int size)
Definition: genericvector.h:66

◆ SaveFile()

bool tesseract::TessdataManager::SaveFile ( const STRING &  filename,
FileWriter  writer 
) const

Definition at line 97 of file tessdatamanager.cpp.

98  {
99  ASSERT_HOST(is_loaded_);
100  GenericVector<char> data;
101  Serialize(&data);
102  if (writer == nullptr)
103  return SaveDataToFile(data, filename);
104  else
105  return (*writer)(data, filename);
106 }
void Serialize(GenericVector< char > *data) const
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)

◆ Serialize()

void tesseract::TessdataManager::Serialize ( GenericVector< char > *  data) const

Definition at line 109 of file tessdatamanager.cpp.

109  {
110  ASSERT_HOST(is_loaded_);
111  // Compute the offset_table and total size.
112  inT64 offset_table[TESSDATA_NUM_ENTRIES];
113  inT64 offset = sizeof(inT32) + sizeof(offset_table);
114  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
115  if (entries_[i].empty()) {
116  offset_table[i] = -1;
117  } else {
118  offset_table[i] = offset;
119  offset += entries_[i].size();
120  }
121  }
122  data->init_to_size(offset, 0);
123  inT32 num_entries = TESSDATA_NUM_ENTRIES;
124  TFile fp;
125  fp.OpenWrite(data);
126  fp.FWrite(&num_entries, sizeof(num_entries), 1);
127  fp.FWrite(offset_table, sizeof(offset_table), 1);
128  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
129  if (!entries_[i].empty()) {
130  fp.FWrite(&entries_[i][0], entries_[i].size(), 1);
131  }
132  }
133 }
int size() const
Definition: genericvector.h:72
int32_t inT32
Definition: host.h:38
#define ASSERT_HOST(x)
Definition: errcode.h:84
void init_to_size(int size, T t)
int64_t inT64
Definition: host.h:40

◆ SetVersionString()

void tesseract::TessdataManager::SetVersionString ( const string &  v_str)

Definition at line 181 of file tessdatamanager.cpp.

181  {
182  entries_[TESSDATA_VERSION].resize_no_init(v_str.size());
183  memcpy(&entries_[TESSDATA_VERSION][0], v_str.data(), v_str.size());
184 }
void resize_no_init(int size)
Definition: genericvector.h:66

◆ swap()

bool tesseract::TessdataManager::swap ( ) const
inline

Definition at line 142 of file tessdatamanager.h.

142 { return swap_; }

◆ TessdataTypeFromFileName()

bool tesseract::TessdataManager::TessdataTypeFromFileName ( const char *  filename,
TessdataType *  type 
)
static

Tries to determine the tessdata component type from the file suffix (extension) of the given filename. Returns true on success.

Definition at line 257 of file tessdatamanager.cpp.

258  {
259  // Get the file suffix (extension)
260  const char *suffix = strrchr(filename, '.');
261  if (suffix == nullptr || *(++suffix) == '\0') return false;
262  return TessdataTypeFromFileSuffix(suffix, type);
263 }
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type)

◆ TessdataTypeFromFileSuffix()

bool tesseract::TessdataManager::TessdataTypeFromFileSuffix ( const char *  suffix,
TessdataType *  type 
)
static

Fills type with TessdataType of the tessdata component represented by the given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET.

Returns
true if the tessdata component type could be determined from the given file name.

Definition at line 244 of file tessdatamanager.cpp.

245  {
246  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
247  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
248  *type = static_cast<TessdataType>(i);
249  return true;
250  }
251  }
252  tprintf("TessdataManager can't determine which tessdata"
253  " component is represented by %s\n", suffix);
254  return false;
255 }
#define tprintf(...)
Definition: tprintf.h:31

◆ VersionString()

string tesseract::TessdataManager::VersionString ( ) const

Definition at line 175 of file tessdatamanager.cpp.

175  {
176  return string(&entries_[TESSDATA_VERSION][0],
177  entries_[TESSDATA_VERSION].size());
178 }

The documentation for this class was generated from the following files: