All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract::TessdataManager Class Reference

#include <tessdatamanager.h>

Public Member Functions

 TessdataManager ()
 
 ~TessdataManager ()
 
int DebugLevel ()
 
bool Init (const char *data_file_name, int debug_level)
 
const STRING & GetDataFileName () const
 
FILE * GetDataFilePtr () const
 
bool SeekToStart (TessdataType tessdata_type)
 
inT64 GetEndOffset (TessdataType tessdata_type) const
 
void End ()
 
bool swap () const
 
bool OverwriteComponents (const char *new_traineddata_filename, char **component_filenames, int num_new_components)
 
bool ExtractToFile (const char *filename)
 

Static Public Member Functions

static bool WriteMetadata (inT64 *offset_table, const char *language_data_path_prefix, FILE *output_file)
 
static bool CombineDataFiles (const char *language_data_path_prefix, const char *output_filename)
 
static void CopyFile (FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
 
static bool TessdataTypeFromFileSuffix (const char *suffix, TessdataType *type, bool *text_file)
 
static bool TessdataTypeFromFileName (const char *filename, TessdataType *type, bool *text_file)
 

Detailed Description

Definition at line 133 of file tessdatamanager.h.

Constructor & Destructor Documentation

tesseract::TessdataManager::TessdataManager ( )
inline

Definition at line 135 of file tessdatamanager.h.

135  {
136  data_file_ = NULL;
137  actual_tessdata_num_entries_ = 0;
138  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
139  offset_table_[i] = -1;
140  }
141  }
#define NULL
Definition: host.h:144
tesseract::TessdataManager::~TessdataManager ( )
inline

Definition at line 142 of file tessdatamanager.h.

142 {}

Member Function Documentation

bool tesseract::TessdataManager::CombineDataFiles ( const char *  language_data_path_prefix,
const char *  output_filename 
)
static

Reads all the standard tesseract config and data files for a language at the given path and bundles them up into one binary data file. Returns true if the combined traineddata file was successfully written.

Definition at line 127 of file tessdatamanager.cpp.

129  {
130  int i;
131  inT64 offset_table[TESSDATA_NUM_ENTRIES];
132  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
133  FILE *output_file = fopen(output_filename, "wb");
134  if (output_file == NULL) {
135  tprintf("Error opening %s for writing\n", output_filename);
136  return false;
137  }
138  // Leave some space for recording the offset_table.
139  if (fseek(output_file,
140  sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
141  tprintf("Error seeking %s\n", output_filename);
142  return false;
143  }
144 
146  bool text_file = false;
147  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
148 
149  // Load individual tessdata components from files.
150  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
152  kTessdataFileSuffixes[i], &type, &text_file));
153  STRING filename = language_data_path_prefix;
154  filename += kTessdataFileSuffixes[i];
155  file_ptr[i] = fopen(filename.string(), "rb");
156  if (file_ptr[i] != NULL) {
157  offset_table[type] = ftell(output_file);
158  CopyFile(file_ptr[i], output_file, text_file, -1);
159  fclose(file_ptr[i]);
160  }
161  }
162 
163  // Make sure that the required components are present.
164  if (file_ptr[TESSDATA_UNICHARSET] == NULL) {
165  tprintf("Error opening %sunicharset file\n", language_data_path_prefix);
166  fclose(output_file);
167  return false;
168  }
169  if (file_ptr[TESSDATA_INTTEMP] != NULL &&
170  (file_ptr[TESSDATA_PFFMTABLE] == NULL ||
171  file_ptr[TESSDATA_NORMPROTO] == NULL)) {
172  tprintf("Error opening %spffmtable and/or %snormproto files"
173  " while %sinttemp file was present\n", language_data_path_prefix,
174  language_data_path_prefix, language_data_path_prefix);
175  fclose(output_file);
176  return false;
177  }
178 
179  return WriteMetadata(offset_table, language_data_path_prefix, output_file);
180 }
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type, bool *text_file)
#define tprintf(...)
Definition: tprintf.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:84
static bool WriteMetadata(inT64 *offset_table, const char *language_data_path_prefix, FILE *output_file)
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
STRING
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
int inT32
Definition: host.h:102
long long int inT64
Definition: host.h:108
void tesseract::TessdataManager::CopyFile ( FILE *  input_file,
FILE *  output_file,
bool  newline_end,
inT64  num_bytes_to_copy 
)
static

Copies data from the given input file to the output_file provided. If num_bytes_to_copy is >= 0, only num_bytes_to_copy bytes are copied from the input file; otherwise all the data in the input file is copied.

Definition at line 74 of file tessdatamanager.cpp.

75  {
76  if (num_bytes_to_copy == 0) return;
77  int buffer_size = 1024;
78  if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
79  buffer_size = num_bytes_to_copy;
80  }
81  inT64 num_bytes_copied = 0;
82  char *chunk = new char[buffer_size];
83  int bytes_read;
84  char last_char = 0x0;
85  while ((bytes_read = fread(chunk, sizeof(char),
86  buffer_size, input_file))) {
87  fwrite(chunk, sizeof(char), bytes_read, output_file);
88  last_char = chunk[bytes_read-1];
89  if (num_bytes_to_copy > 0) {
90  num_bytes_copied += bytes_read;
91  if (num_bytes_copied == num_bytes_to_copy) break;
92  if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
93  buffer_size = num_bytes_to_copy - num_bytes_copied;
94  }
95  }
96  }
97  if (newline_end) ASSERT_HOST(last_char == '\n');
98  delete[] chunk;
99 }
#define ASSERT_HOST(x)
Definition: errcode.h:84
long long int inT64
Definition: host.h:108
int tesseract::TessdataManager::DebugLevel ( )
inline

Definition at line 143 of file tessdatamanager.h.

143 { return debug_level_; }
void tesseract::TessdataManager::End ( )
inline

Closes data_file_ (if it was opened by Init()).

Definition at line 192 of file tessdatamanager.h.

192  {
193  if (data_file_ != NULL) {
194  fclose(data_file_);
195  data_file_ = NULL;
196  }
197  }
#define NULL
Definition: host.h:144
bool tesseract::TessdataManager::ExtractToFile ( const char *  filename)

Extracts tessdata component implied by the name of the input file from the combined traineddata loaded into TessdataManager. Writes the extracted component to the file indicated by the file name. E.g. if the filename given is somepath/somelang.unicharset, unicharset will be extracted from the data loaded into the TessdataManager and will be written to somepath/somelang.unicharset.

Returns
true if the component was successfully extracted, false if the component was not present in the traineddata loaded into TessdataManager.

Definition at line 258 of file tessdatamanager.cpp.

258  {
260  bool text_file = false;
262  filename, &type, &text_file));
263  if (!SeekToStart(type)) return false;
264 
265  FILE *output_file = fopen(filename, "wb");
266  if (output_file == NULL) {
267  tprintf("Error opening %s\n", filename);
268  exit(1);
269  }
270  inT64 begin_offset = ftell(GetDataFilePtr());
271  inT64 end_offset = GetEndOffset(type);
273  GetDataFilePtr(), output_file, text_file,
274  end_offset - begin_offset + 1);
275  fclose(output_file);
276  return true;
277 }
FILE * GetDataFilePtr() const
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type, bool *text_file)
#define tprintf(...)
Definition: tprintf.h:31
inT64 GetEndOffset(TessdataType tessdata_type) const
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool SeekToStart(TessdataType tessdata_type)
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
#define NULL
Definition: host.h:144
long long int inT64
Definition: host.h:108
const STRING& tesseract::TessdataManager::GetDataFileName ( ) const
inline

Definition at line 152 of file tessdatamanager.h.

152 { return data_file_name_; }
FILE* tesseract::TessdataManager::GetDataFilePtr ( ) const
inline

Returns data file pointer.

Definition at line 155 of file tessdatamanager.h.

155 { return data_file_; }
inT64 tesseract::TessdataManager::GetEndOffset ( TessdataType  tessdata_type) const
inline

Returns the end offset for the given tesseract data file type.

Definition at line 178 of file tessdatamanager.h.

178  {
179  int index = tessdata_type + 1;
180  while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) {
181  ++index; // skip tessdata types not present in the combined file
182  }
183  if (debug_level_) {
184  tprintf("TessdataManager: end offset for type %d is %lld\n",
185  tessdata_type,
186  (index == actual_tessdata_num_entries_) ? -1
187  : offset_table_[index]);
188  }
189  return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1;
190  }
#define tprintf(...)
Definition: tprintf.h:31
bool tesseract::TessdataManager::Init ( const char *  data_file_name,
int  debug_level 
)

Opens the given data file and reads the offset table.

Returns
true on success.

Definition at line 36 of file tessdatamanager.cpp.

36  {
37  int i;
38  debug_level_ = debug_level;
39  data_file_name_ = data_file_name;
40  data_file_ = fopen(data_file_name, "rb");
41  if (data_file_ == NULL) {
42  tprintf("Error opening data file %s\n", data_file_name);
43  tprintf("Please make sure the TESSDATA_PREFIX environment variable is set "
44  "to the parent directory of your \"tessdata\" directory.\n");
45  return false;
46  }
47  fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
48  swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
49  if (swap_) {
50  ReverseN(&actual_tessdata_num_entries_,
51  sizeof(actual_tessdata_num_entries_));
52  }
53  if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
54  // For forward compatability, truncate to the number we can handle.
55  actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
56  }
57  fread(offset_table_, sizeof(inT64),
58  actual_tessdata_num_entries_, data_file_);
59  if (swap_) {
60  for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
61  ReverseN(&offset_table_[i], sizeof(offset_table_[i]));
62  }
63  }
64  if (debug_level_) {
65  tprintf("TessdataManager loaded %d types of tesseract data files.\n",
66  actual_tessdata_num_entries_);
67  for (i = 0; i < actual_tessdata_num_entries_; ++i) {
68  tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
69  }
70  }
71  return true;
72 }
#define tprintf(...)
Definition: tprintf.h:31
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:177
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
long long int inT64
Definition: host.h:108
bool tesseract::TessdataManager::OverwriteComponents ( const char *  new_traineddata_filename,
char **  component_filenames,
int  num_new_components 
)

Gets the individual components from the data_file_ with which the class was initialized. Overwrites the components specified by component_filenames. Writes the updated traineddata file to new_traineddata_filename.

Definition at line 182 of file tessdatamanager.cpp.

185  {
186  int i;
187  inT64 offset_table[TESSDATA_NUM_ENTRIES];
189  bool text_file = false;
190  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
191  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
192  offset_table[i] = -1;
193  file_ptr[i] = NULL;
194  }
195  FILE *output_file = fopen(new_traineddata_filename, "wb");
196  if (output_file == NULL) {
197  tprintf("Error opening %s for writing\n", new_traineddata_filename);
198  return false;
199  }
200 
201  // Leave some space for recording the offset_table.
202  if (fseek(output_file,
203  sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
204  fclose(output_file);
205  tprintf("Error seeking %s\n", new_traineddata_filename);
206  return false;
207  }
208 
209  // Open the files with the new components.
210  for (i = 0; i < num_new_components; ++i) {
211  if (TessdataTypeFromFileName(component_filenames[i], &type, &text_file))
212  file_ptr[type] = fopen(component_filenames[i], "rb");
213  }
214 
215  // Write updated data to the output traineddata file.
216  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
217  if (file_ptr[i] != NULL) {
218  // Get the data from the opened component file.
219  offset_table[i] = ftell(output_file);
220  CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
221  fclose(file_ptr[i]);
222  } else {
223  // Get this data component from the loaded data file.
224  if (SeekToStart(static_cast<TessdataType>(i))) {
225  offset_table[i] = ftell(output_file);
226  CopyFile(data_file_, output_file, kTessdataFileIsText[i],
227  GetEndOffset(static_cast<TessdataType>(i)) -
228  ftell(data_file_) + 1);
229  }
230  }
231  }
232  const char *language_data_path_prefix = strchr(new_traineddata_filename, '.');
233  return WriteMetadata(offset_table, language_data_path_prefix, output_file);
234 }
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type, bool *text_file)
#define tprintf(...)
Definition: tprintf.h:31
inT64 GetEndOffset(TessdataType tessdata_type) const
static bool WriteMetadata(inT64 *offset_table, const char *language_data_path_prefix, FILE *output_file)
bool SeekToStart(TessdataType tessdata_type)
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
long long int inT64
Definition: host.h:108
bool tesseract::TessdataManager::SeekToStart ( TessdataType  tessdata_type)
inline

Returns false if there is no data of the given type. Otherwise does a seek on the data_file_ to position the pointer at the start of the data of the given type.

Definition at line 162 of file tessdatamanager.h.

162  {
163  if (debug_level_) {
164  tprintf("TessdataManager: seek to offset %lld - start of tessdata"
165  "type %d (%s))\n", offset_table_[tessdata_type],
166  tessdata_type, kTessdataFileSuffixes[tessdata_type]);
167  }
168  if (offset_table_[tessdata_type] < 0) {
169  return false;
170  } else {
171  ASSERT_HOST(fseek(data_file_,
172  static_cast<size_t>(offset_table_[tessdata_type]),
173  SEEK_SET) == 0);
174  return true;
175  }
176  }
#define tprintf(...)
Definition: tprintf.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool tesseract::TessdataManager::swap ( ) const
inline

Definition at line 198 of file tessdatamanager.h.

198  {
199  return swap_;
200  }
bool tesseract::TessdataManager::TessdataTypeFromFileName ( const char *  filename,
TessdataType *  type,
bool *  text_file 
)
static

Tries to determine tessdata component file suffix from filename, returns true on success.

Definition at line 250 of file tessdatamanager.cpp.

251  {
252  // Get the file suffix (extension)
253  const char *suffix = strrchr(filename, '.');
254  if (suffix == NULL || *(++suffix) == '\0') return false;
255  return TessdataTypeFromFileSuffix(suffix, type, text_file);
256 }
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type, bool *text_file)
#define NULL
Definition: host.h:144
bool tesseract::TessdataManager::TessdataTypeFromFileSuffix ( const char *  suffix,
TessdataType *  type,
bool *  text_file 
)
static

Fills type with TessdataType of the tessdata component represented by the given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET. Sets *text_file to true if the component is in text format (e.g. unicharset, unichar ambigs, config, etc).

Returns
true if the tessdata component type could be determined from the given file name.

Definition at line 236 of file tessdatamanager.cpp.

237  {
238  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
239  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
240  *type = static_cast<TessdataType>(i);
241  *text_file = kTessdataFileIsText[i];
242  return true;
243  }
244  }
245  tprintf("TessdataManager can't determine which tessdata"
246  " component is represented by %s\n", suffix);
247  return false;
248 }
#define tprintf(...)
Definition: tprintf.h:31
bool tesseract::TessdataManager::WriteMetadata ( inT64 *  offset_table,
const char *  language_data_path_prefix,
FILE *  output_file 
)
static

Writes the number of entries and the given offset table to output_file. Returns false on error.

Definition at line 101 of file tessdatamanager.cpp.

103  {
104  inT32 num_entries = TESSDATA_NUM_ENTRIES;
105  bool result = true;
106  if (fseek(output_file, 0, SEEK_SET) != 0 ||
107  fwrite(&num_entries, sizeof(inT32), 1, output_file) != 1 ||
108  fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES,
109  output_file) != TESSDATA_NUM_ENTRIES) {
110  fclose(output_file);
111  result = false;
112  tprintf("WriteMetadata failed in TessdataManager!\n");
113  } else if (fclose(output_file)) {
114  result = false;
115  tprintf("WriteMetadata failed to close file!\n");
116  } else {
117  tprintf("TessdataManager combined tesseract data files.\n");
118  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
119  tprintf("Offset for type %2d (%s%-22s) is %lld\n", i,
120  language_data_path_prefix, kTessdataFileSuffixes[i],
121  offset_table[i]);
122  }
123  }
124  return result;
125 }
#define tprintf(...)
Definition: tprintf.h:31
int inT32
Definition: host.h:102
long long int inT64
Definition: host.h:108

The documentation for this class was generated from the following files: