tesseract  4.00.00dev
tessdatamanager.h
Go to the documentation of this file.
1 // File: tessdatamanager.h
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 // Created: Wed Jun 03 11:26:43 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
21 #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
22 
23 #include <stdio.h>
24 
25 #include "host.h"
26 #include "strngs.h"
27 #include "tprintf.h"
28 #include "version.h"
29 
30 static const char kTrainedDataSuffix[] = "traineddata";  // NOTE(review): presumably the extension of the combined data file - confirm against callers
31 
32 // When adding new tessdata types and file suffixes, please make sure to
33 // update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText.
34 static const char kLangConfigFileSuffix[] = "config";
35 static const char kUnicharsetFileSuffix[] = "unicharset";
36 static const char kAmbigsFileSuffix[] = "unicharambigs";
37 static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
38 static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
39 static const char kNormProtoFileSuffix[] = "normproto";
40 static const char kPuncDawgFileSuffix[] = "punc-dawg";
41 static const char kSystemDawgFileSuffix[] = "word-dawg";
42 static const char kNumberDawgFileSuffix[] = "number-dawg";
43 static const char kFreqDawgFileSuffix[] = "freq-dawg";
44 static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";  // deprecated
45 static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset";  // deprecated
46 static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";  // deprecated
47 static const char kShapeTableFileSuffix[] = "shapetable";
48 static const char kBigramDawgFileSuffix[] = "bigram-dawg";
49 static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
50 static const char kParamsModelFileSuffix[] = "params-model";
51 static const char kLSTMModelFileSuffix[] = "lstm";
52 static const char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg";
53 static const char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg";
54 static const char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg";
55 static const char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset";
56 static const char kLSTMRecoderFileSuffix[] = "lstm-recoder";
57 static const char kVersionFileSuffix[] = "version";
58 
59 namespace tesseract {
60 
72  TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
73  TESSDATA_CUBE_UNICHARSET, // 11 // deprecated
74  TESSDATA_CUBE_SYSTEM_DAWG, // 12 // deprecated
86 
88 };
89 
94 static const char *const kTessdataFileSuffixes[] = {  // Table of the per-component suffix strings; the slot numbers in the comments match the numbering shown on the TessdataType enumerators.
95  kLangConfigFileSuffix, // 0
96  kUnicharsetFileSuffix, // 1
97  kAmbigsFileSuffix, // 2
98  kBuiltInTemplatesFileSuffix, // 3
99  kBuiltInCutoffsFileSuffix, // 4
100  kNormProtoFileSuffix, // 5
101  kPuncDawgFileSuffix, // 6
102  kSystemDawgFileSuffix, // 7
103  kNumberDawgFileSuffix, // 8
104  kFreqDawgFileSuffix, // 9
105  kFixedLengthDawgsFileSuffix, // 10 // deprecated
106  kCubeUnicharsetFileSuffix, // 11 // deprecated
107  kCubeSystemDawgFileSuffix, // 12 // deprecated
108  kShapeTableFileSuffix, // 13
109  kBigramDawgFileSuffix, // 14
110  kUnambigDawgFileSuffix, // 15
111  kParamsModelFileSuffix, // 16
112  kLSTMModelFileSuffix, // 17
113  kLSTMPuncDawgFileSuffix, // 18
114  kLSTMSystemDawgFileSuffix, // 19
115  kLSTMNumberDawgFileSuffix, // 20
116  kLSTMUnicharsetFileSuffix, // 21
117  kLSTMRecoderFileSuffix, // 22
118  kVersionFileSuffix, // 23
119 };
120 
128 static const int kMaxNumTessdataEntries = 1000;  // NOTE(review): presumably a sanity limit on the entry count of a data file - where it is enforced is not visible here, confirm in the implementation
129 
130 
132  public:
133  TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {
135  }
136  explicit TessdataManager(FileReader reader)
137  : reader_(reader), is_loaded_(false), swap_(false) {
139  }
141 
142  bool swap() const { return swap_; }  // Returns swap_: true if the bytes need swapping.
143  bool is_loaded() const { return is_loaded_; }  // Returns is_loaded_: true if the file has been loaded.
144 
145  // Lazily loads from the given filename. Won't actually read the file
146  // until it needs it.
147  void LoadFileLater(const char *data_file_name);
152  bool Init(const char *data_file_name);
153  // Loads from the given memory buffer as if a file, remembering name as some
154  // arbitrary source id for caching.
155  bool LoadMemBuffer(const char *name, const char *data, int size);
156  // Overwrites a single entry of the given type.
157  void OverwriteEntry(TessdataType type, const char *data, int size);
158 
159  // Saves to the given filename.
160  bool SaveFile(const STRING &filename, FileWriter writer) const;
161  // Serializes to the given vector.
162  void Serialize(GenericVector<char> *data) const;
163  // Resets to the initial state, keeping the reader.
164  void Clear();
165 
166  // Prints a directory of contents.
167  void Directory() const;
168 
169  // Returns true if the component requested is present.
171  return !entries_[type].empty();
172  }
173  // Opens the given TFile pointer to the given component type.
174  // Returns false in case of failure.
175  bool GetComponent(TessdataType type, TFile *fp);
176  // As non-const version except it can't load the component if not already
177  // loaded.
178  bool GetComponent(TessdataType type, TFile *fp) const;
179 
180  // Returns the current version string.
181  string VersionString() const;
182  // Sets the version string to the given v_str.
183  void SetVersionString(const string &v_str);
184 
185  // Returns true if the base Tesseract components are present.
186  bool IsBaseAvailable() const {
187  return !entries_[TESSDATA_UNICHARSET].empty() &&  // unicharset entry is non-empty
188  !entries_[TESSDATA_INTTEMP].empty();  // and the inttemp entry is non-empty
189  }
190 
191  // Returns true if the LSTM components are present (the TESSDATA_LSTM entry is non-empty).
192  bool IsLSTMAvailable() const { return !entries_[TESSDATA_LSTM].empty(); }
193 
194  // Returns the name of the underlying data file, as a const reference to data_file_name_.
195  const STRING &GetDataFileName() const { return data_file_name_; }
196 
202  bool CombineDataFiles(const char *language_data_path_prefix,
203  const char *output_filename);
204 
210  bool OverwriteComponents(const char *new_traineddata_filename,
211  char **component_filenames,
212  int num_new_components);
213 
224  bool ExtractToFile(const char *filename);
225 
232  static bool TessdataTypeFromFileSuffix(const char *suffix,
233  TessdataType *type);
234 
239  static bool TessdataTypeFromFileName(const char *filename,
240  TessdataType *type);
241 
242  private:
243  // Name of file it came from.
244  STRING data_file_name_;
245  // Function to load the file when we need it.
246  FileReader reader_;
247  // True if the file has been loaded.
248  bool is_loaded_;
249  // True if the bytes need swapping.
250  bool swap_;
251  // Contents of each element of the traineddata file.
253 };
254 
255 } // namespace tesseract
256 
257 #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type)
bool empty() const
Definition: genericvector.h:91
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type)
bool SaveFile(const STRING &filename, FileWriter writer) const
bool ExtractToFile(const char *filename)
bool IsComponentAvailable(TessdataType type) const
bool GetComponent(TessdataType type, TFile *fp)
bool(* FileWriter)(const GenericVector< char > &data, const STRING &filename)
TessdataManager(FileReader reader)
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
bool LoadMemBuffer(const char *name, const char *data, int size)
void LoadFileLater(const char *data_file_name)
void Serialize(GenericVector< char > *data) const
Definition: strngs.h:45
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
void OverwriteEntry(TessdataType type, const char *data, int size)
const STRING & GetDataFileName() const
bool Init(const char *data_file_name)
bool(* FileReader)(const STRING &filename, GenericVector< char > *data)
#define TESSERACT_VERSION_STR
Definition: version.h:8
void SetVersionString(const string &v_str)