All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tessdatamanager.cpp
Go to the documentation of this file.
1 // File: tessdatamanager.cpp
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 // Created: Wed Jun 03 11:26:43 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include "tessdatamanager.h"
25 
26 #include <stdio.h>
27 
28 #include "helpers.h"
29 #include "serialis.h"
30 #include "strngs.h"
31 #include "tprintf.h"
32 #include "params.h"
33 
34 namespace tesseract {
35 
36 bool TessdataManager::Init(const char *data_file_name, int debug_level) {
37  int i;
38  debug_level_ = debug_level;
39  data_file_name_ = data_file_name;
40  data_file_ = fopen(data_file_name, "rb");
41  if (data_file_ == NULL) {
42  tprintf("Error opening data file %s\n", data_file_name);
43  tprintf("Please make sure the TESSDATA_PREFIX environment variable is set "
44  "to the parent directory of your \"tessdata\" directory.\n");
45  return false;
46  }
47  fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_);
48  swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries);
49  if (swap_) {
50  ReverseN(&actual_tessdata_num_entries_,
51  sizeof(actual_tessdata_num_entries_));
52  }
53  if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
54  // For forward compatability, truncate to the number we can handle.
55  actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
56  }
57  fread(offset_table_, sizeof(inT64),
58  actual_tessdata_num_entries_, data_file_);
59  if (swap_) {
60  for (i = 0 ; i < actual_tessdata_num_entries_; ++i) {
61  ReverseN(&offset_table_[i], sizeof(offset_table_[i]));
62  }
63  }
64  if (debug_level_) {
65  tprintf("TessdataManager loaded %d types of tesseract data files.\n",
66  actual_tessdata_num_entries_);
67  for (i = 0; i < actual_tessdata_num_entries_; ++i) {
68  tprintf("Offset for type %d is %lld\n", i, offset_table_[i]);
69  }
70  }
71  return true;
72 }
73 
74 void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
75  bool newline_end, inT64 num_bytes_to_copy) {
76  if (num_bytes_to_copy == 0) return;
77  int buffer_size = 1024;
78  if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
79  buffer_size = num_bytes_to_copy;
80  }
81  inT64 num_bytes_copied = 0;
82  char *chunk = new char[buffer_size];
83  int bytes_read;
84  char last_char = 0x0;
85  while ((bytes_read = fread(chunk, sizeof(char),
86  buffer_size, input_file))) {
87  fwrite(chunk, sizeof(char), bytes_read, output_file);
88  last_char = chunk[bytes_read-1];
89  if (num_bytes_to_copy > 0) {
90  num_bytes_copied += bytes_read;
91  if (num_bytes_copied == num_bytes_to_copy) break;
92  if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
93  buffer_size = num_bytes_to_copy - num_bytes_copied;
94  }
95  }
96  }
97  if (newline_end) ASSERT_HOST(last_char == '\n');
98  delete[] chunk;
99 }
100 
// NOTE(review): the line that opens this definition is not present in this
// listing; only the tail of the parameter list is visible. The body prints
// "WriteMetadata failed ...", so this is presumably
// TessdataManager::WriteMetadata taking offset_table as its first
// parameter - confirm against the original file.
//
// Seeks to the start of output_file, writes TESSDATA_NUM_ENTRIES as an inT32
// followed by TESSDATA_NUM_ENTRIES inT64 values from offset_table, and closes
// output_file on every path. Returns false if the seek, either write, or the
// close fails; otherwise prints every offset and returns true.
102  const char * language_data_path_prefix,
103  FILE *output_file) {
104  inT32 num_entries = TESSDATA_NUM_ENTRIES;
105  bool result = true;
  // The three operations short-circuit: the first failure skips the rest.
106  if (fseek(output_file, 0, SEEK_SET) != 0 ||
107  fwrite(&num_entries, sizeof(inT32), 1, output_file) != 1 ||
108  fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES,
109  output_file) != TESSDATA_NUM_ENTRIES) {
110  fclose(output_file);
111  result = false;
112  tprintf("WriteMetadata failed in TessdataManager!\n");
  // fclose runs as part of this condition, so the file is closed whether or
  // not this branch is taken.
113  } else if (fclose(output_file)) {
114  result = false;
115  tprintf("WriteMetadata failed to close file!\n");
116  } else {
117  tprintf("TessdataManager combined tesseract data files.\n");
  // language_data_path_prefix is only used for this report; it is passed to
  // a %s conversion, so it needs to be a valid string here.
118  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
119  tprintf("Offset for type %2d (%s%-22s) is %lld\n", i,
120  language_data_path_prefix, kTessdataFileSuffixes[i],
121  offset_table[i]);
122  }
123  }
124  return result;
125 }
126 
// NOTE(review): the line that opens this definition is not present in this
// listing; presumably this is TessdataManager::CombineDataFiles - confirm
// against the original file.
//
// Creates output_filename, skips past the space reserved for the header, and
// for each of the TESSDATA_NUM_ENTRIES suffixes tries to open
// language_data_path_prefix + suffix. Each file that opens has the current
// output position recorded in offset_table and its contents appended.
// Entries whose file is absent keep the offset -1. Returns false if the
// output cannot be opened or positioned, or if a required component is
// missing; otherwise returns the result of WriteMetadata.
128  const char *language_data_path_prefix,
129  const char *output_filename) {
130  int i;
131  inT64 offset_table[TESSDATA_NUM_ENTRIES];
132  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
133  FILE *output_file = fopen(output_filename, "wb");
134  if (output_file == NULL) {
135  tprintf("Error opening %s for writing\n", output_filename);
136  return false;
137  }
138  // Leave some space for recording the offset_table.
139  if (fseek(output_file,
140  sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
141  tprintf("Error seeking %s\n", output_filename);
  // NOTE(review): output_file is not closed on this return path.
142  return false;
143  }
144 
  // NOTE(review): `type` is used below but its declaration is not in the
  // visible lines; a line appears to be missing from this listing here.
146  bool text_file = false;
147  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
148 
149  // Load individual tessdata components from files.
150  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
  // NOTE(review): the next line is the tail of a call whose beginning is not
  // visible; presumably it maps the suffix to `type` and `text_file`.
152  kTessdataFileSuffixes[i], &type, &text_file));
153  STRING filename = language_data_path_prefix;
154  filename += kTessdataFileSuffixes[i];
155  file_ptr[i] = fopen(filename.string(), "rb");
156  if (file_ptr[i] != NULL) {
157  offset_table[type] = ftell(output_file);
  // -1 is non-zero and not positive, so CopyFile copies to end of input.
158  CopyFile(file_ptr[i], output_file, text_file, -1);
  // The handle is closed but file_ptr[i] stays non-NULL; the checks below
  // use it only as a "was this component found" flag.
159  fclose(file_ptr[i]);
160  }
161  }
162 
163  // Make sure that the required components are present.
164  if (file_ptr[TESSDATA_UNICHARSET] == NULL) {
165  tprintf("Error opening %sunicharset file\n", language_data_path_prefix);
166  fclose(output_file);
167  return false;
168  }
  // inttemp is optional, but when present pffmtable and normproto must be
  // present too.
169  if (file_ptr[TESSDATA_INTTEMP] != NULL &&
170  (file_ptr[TESSDATA_PFFMTABLE] == NULL ||
171  file_ptr[TESSDATA_NORMPROTO] == NULL)) {
172  tprintf("Error opening %spffmtable and/or %snormproto files"
173  " while %sinttemp file was present\n", language_data_path_prefix,
174  language_data_path_prefix, language_data_path_prefix);
175  fclose(output_file);
176  return false;
177  }
178 
  // WriteMetadata closes output_file on every path.
179  return WriteMetadata(offset_table, language_data_path_prefix, output_file);
180 }
181 
// NOTE(review): the line that opens this definition is not present in this
// listing; presumably this is TessdataManager::OverwriteComponents - confirm
// against the original file.
//
// Writes a new traineddata file to new_traineddata_filename. For each
// component type, the data comes from the matching file in
// component_filenames if one was opened; otherwise it is copied from the
// already loaded data_file_ when SeekToStart succeeds for that type. Returns
// false if the output cannot be opened or positioned; otherwise returns the
// result of WriteMetadata.
183  const char *new_traineddata_filename,
184  char **component_filenames,
185  int num_new_components) {
186  int i;
187  inT64 offset_table[TESSDATA_NUM_ENTRIES];
  // NOTE(review): `type` is used below but its declaration is not in the
  // visible lines; a line appears to be missing from this listing here.
189  bool text_file = false;
190  FILE *file_ptr[TESSDATA_NUM_ENTRIES];
191  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
192  offset_table[i] = -1;
193  file_ptr[i] = NULL;
194  }
195  FILE *output_file = fopen(new_traineddata_filename, "wb");
196  if (output_file == NULL) {
197  tprintf("Error opening %s for writing\n", new_traineddata_filename);
198  return false;
199  }
200 
201  // Leave some space for recording the offset_table.
202  if (fseek(output_file,
203  sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) {
204  fclose(output_file);
205  tprintf("Error seeking %s\n", new_traineddata_filename);
206  return false;
207  }
208 
209  // Open the files with the new components.
  // A file whose type cannot be determined is skipped. If fopen fails,
  // file_ptr[type] is NULL and the loop below falls back to data_file_.
  // NOTE(review): if two filenames map to the same type, the earlier handle
  // is overwritten without being closed.
210  for (i = 0; i < num_new_components; ++i) {
211  if (TessdataTypeFromFileName(component_filenames[i], &type, &text_file))
212  file_ptr[type] = fopen(component_filenames[i], "rb");
213  }
214 
215  // Write updated data to the output traineddata file.
216  for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
217  if (file_ptr[i] != NULL) {
218  // Get the data from the opened component file.
219  offset_table[i] = ftell(output_file);
220  CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
221  fclose(file_ptr[i]);
222  } else {
223  // Get this data component from the loaded data file.
224  if (SeekToStart(static_cast<TessdataType>(i))) {
225  offset_table[i] = ftell(output_file);
  // The byte count is (end offset - current position + 1); presumably
  // GetEndOffset returns the offset of the last byte - TODO confirm.
226  CopyFile(data_file_, output_file, kTessdataFileIsText[i],
227  GetEndOffset(static_cast<TessdataType>(i)) -
228  ftell(data_file_) + 1);
229  }
230  }
231  }
  // NOTE(review): strchr returns NULL when the filename contains no '.';
  // the result is passed on as the prefix string that WriteMetadata prints.
232  const char *language_data_path_prefix = strchr(new_traineddata_filename, '.');
233  return WriteMetadata(offset_table, language_data_path_prefix, output_file);
234 }
235 
// NOTE(review): the line that opens this definition is not present in this
// listing; presumably this is TessdataManager::TessdataTypeFromFileSuffix -
// confirm against the original file.
//
// Looks up suffix in kTessdataFileSuffixes using an exact strcmp match. On a
// match, stores the matching index in *type (as a TessdataType) and the
// corresponding kTessdataFileIsText entry in *text_file, then returns true.
// With no match, prints a message and returns false, leaving *type and
// *text_file unmodified.
237  const char *suffix, TessdataType *type, bool *text_file) {
238  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
239  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
240  *type = static_cast<TessdataType>(i);
241  *text_file = kTessdataFileIsText[i];
242  return true;
243  }
244  }
245  tprintf("TessdataManager can't determine which tessdata"
246  " component is represented by %s\n", suffix);
247  return false;
248 }
249 
// NOTE(review): the line that opens this definition is not present in this
// listing; presumably this is TessdataManager::TessdataTypeFromFileName -
// confirm against the original file.
//
// Takes the text after the last '.' in filename and passes it to
// TessdataTypeFromFileSuffix, returning its result. Returns false without
// touching *type or *text_file when filename has no '.' or nothing follows
// the last '.'.
251  const char *filename, TessdataType *type, bool *text_file) {
252  // Get the file suffix (extension)
253  const char *suffix = strrchr(filename, '.');
  // The pre-increment steps past the '.' before testing for end of string.
254  if (suffix == NULL || *(++suffix) == '\0') return false;
255  return TessdataTypeFromFileSuffix(suffix, type, text_file);
256 }
257 
// NOTE(review): the lines that open this definition (signature and the
// declaration of `type`) are not present in this listing; presumably this is
// TessdataManager::ExtractToFile(const char *filename) - confirm against the
// original file.
//
// Determines the component type from filename, seeks the loaded data file to
// the start of that component, and copies it to a newly created file named
// filename. Returns false if SeekToStart fails for the type, true after the
// copy. The process exits with status 1 if the output file cannot be opened.
260  bool text_file = false;
  // NOTE(review): the next line is the tail of a call whose beginning is not
  // visible; presumably it derives `type` and `text_file` from filename.
262  filename, &type, &text_file));
263  if (!SeekToStart(type)) return false;
264 
265  FILE *output_file = fopen(filename, "wb");
266  if (output_file == NULL) {
267  tprintf("Error opening %s\n", filename);
268  exit(1);
269  }
  // After SeekToStart the data file is positioned at the component's first
  // byte, so ftell gives its start offset.
270  inT64 begin_offset = ftell(GetDataFilePtr());
271  inT64 end_offset = GetEndOffset(type);
  // NOTE(review): the next two lines are the arguments of a call whose
  // beginning is not visible (presumably CopyFile). The byte count is
  // end_offset - begin_offset + 1; presumably end_offset is the offset of the
  // component's last byte - TODO confirm.
273  GetDataFilePtr(), output_file, text_file,
274  end_offset - begin_offset + 1);
275  fclose(output_file);
276  return true;
277 }
278 
279 } // namespace tesseract
FILE * GetDataFilePtr() const
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type, bool *text_file)
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type, bool *text_file)
#define tprintf(...)
Definition: tprintf.h:31
inT64 GetEndOffset(TessdataType tessdata_type) const
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool ExtractToFile(const char *filename)
static bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
static bool WriteMetadata(inT64 *offset_table, const char *language_data_path_prefix, FILE *output_file)
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:177
bool SeekToStart(TessdataType tessdata_type)
bool Init(const char *data_file_name, int debug_level)
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end, inT64 num_bytes_to_copy)
Definition: strngs.h:44
#define NULL
Definition: host.h:144
const char * string() const
Definition: strngs.cpp:193
int inT32
Definition: host.h:102
long long int inT64
Definition: host.h:108