tesseract v5.3.3.20231005
tessdatamanager.h
Go to the documentation of this file.
1
2// File: tessdatamanager.h
3// Description: Functions to handle loading/combining tesseract data files.
4// Author: Daria Antonova
5//
6// (C) Copyright 2009, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifndef TESSERACT_CCUTIL_TESSDATAMANAGER_H_
20#define TESSERACT_CCUTIL_TESSDATAMANAGER_H_
21
22#include <tesseract/baseapi.h> // FileReader
23#include <string> // std::string
24#include <vector> // std::vector
25#include "serialis.h" // FileWriter
26
// Suffix string for the combined trained-data file.
// NOTE(review): presumably the extension of "<lang>.traineddata" files; the
// code that consumes it is not visible in this header.
27static const char kTrainedDataSuffix[] = "traineddata";
28
29// When adding new tessdata types and file suffixes, please make sure to
30// update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText.
// NOTE(review): kTessdataFileIsText is not defined anywhere in this listing;
// confirm whether it still exists or whether the comment above is stale.
//
// One suffix string per component type. Each constant below is referenced
// exactly once by the kTessdataFileSuffixes table later in this file, in the
// same order as declared here. The fixed-length-dawgs and the two cube-*
// suffixes are marked deprecated in that table.
// These are `static` at namespace scope in a header, so every translation
// unit that includes this file gets its own copy of each array.
31static const char kLangConfigFileSuffix[] = "config";
32static const char kUnicharsetFileSuffix[] = "unicharset";
33static const char kAmbigsFileSuffix[] = "unicharambigs";
34static const char kBuiltInTemplatesFileSuffix[] = "inttemp";
35static const char kBuiltInCutoffsFileSuffix[] = "pffmtable";
36static const char kNormProtoFileSuffix[] = "normproto";
37static const char kPuncDawgFileSuffix[] = "punc-dawg";
38static const char kSystemDawgFileSuffix[] = "word-dawg";
39static const char kNumberDawgFileSuffix[] = "number-dawg";
40static const char kFreqDawgFileSuffix[] = "freq-dawg";
41static const char kFixedLengthDawgsFileSuffix[] = "fixed-length-dawgs";
42static const char kCubeUnicharsetFileSuffix[] = "cube-unicharset";
43static const char kCubeSystemDawgFileSuffix[] = "cube-word-dawg";
44static const char kShapeTableFileSuffix[] = "shapetable";
45static const char kBigramDawgFileSuffix[] = "bigram-dawg";
46static const char kUnambigDawgFileSuffix[] = "unambig-dawg";
47static const char kParamsModelFileSuffix[] = "params-model";
48static const char kLSTMModelFileSuffix[] = "lstm";
49static const char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg";
50static const char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg";
51static const char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg";
52static const char kLSTMUnicharsetFileSuffix[] = "lstm-unicharset";
53static const char kLSTMRecoderFileSuffix[] = "lstm-recoder";
54static const char kVersionFileSuffix[] = "version";
55
56namespace tesseract {
57
69 TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated
70 TESSDATA_CUBE_UNICHARSET, // 11 // deprecated
71 TESSDATA_CUBE_SYSTEM_DAWG, // 12 // deprecated
83
85};
86
// Table of the file-suffix strings declared above, one row per tessdata
// component. The trailing number on each row is that row's array index
// (0..23, 24 rows in total).
// The visible part of the TessdataType enum carries the same numbers for
// entries 10-12, so the table is presumably meant to be indexed by a
// TessdataType value. NOTE(review): the rest of the enum is not visible in
// this listing; confirm the full ordering matches before relying on it.
// Rows marked "deprecated" still occupy their slots (10-12), so removing one
// would shift every later index.
91static const char *const kTessdataFileSuffixes[] = {
92 kLangConfigFileSuffix, // 0
93 kUnicharsetFileSuffix, // 1
94 kAmbigsFileSuffix, // 2
95 kBuiltInTemplatesFileSuffix, // 3
96 kBuiltInCutoffsFileSuffix, // 4
97 kNormProtoFileSuffix, // 5
98 kPuncDawgFileSuffix, // 6
99 kSystemDawgFileSuffix, // 7
100 kNumberDawgFileSuffix, // 8
101 kFreqDawgFileSuffix, // 9
102 kFixedLengthDawgsFileSuffix, // 10 // deprecated
103 kCubeUnicharsetFileSuffix, // 11 // deprecated
104 kCubeSystemDawgFileSuffix, // 12 // deprecated
105 kShapeTableFileSuffix, // 13
106 kBigramDawgFileSuffix, // 14
107 kUnambigDawgFileSuffix, // 15
108 kParamsModelFileSuffix, // 16
109 kLSTMModelFileSuffix, // 17
110 kLSTMPuncDawgFileSuffix, // 18
111 kLSTMSystemDawgFileSuffix, // 19
112 kLSTMNumberDawgFileSuffix, // 20
113 kLSTMUnicharsetFileSuffix, // 21
114 kLSTMRecoderFileSuffix, // 22
115 kVersionFileSuffix, // 23
116};
117
// Upper limit (1000) on a tessdata entry count.
// NOTE(review): nothing in this header uses the constant; presumably it is a
// sanity bound applied when reading the entry count from a data file.
// Confirm against the implementation file.
125static const int kMaxNumTessdataEntries = 1000;
126
128public:
130 explicit TessdataManager(FileReader reader);
131
132 ~TessdataManager() = default;
133
// Returns the swap_ flag, which the member declaration describes as
// "True if the bytes need swapping." Read-only accessor.
134 bool swap() const {
135 return swap_;
136 }
// Returns the is_loaded_ flag, which the member declaration describes as
// "True if the file has been loaded." Read-only accessor.
137 bool is_loaded() const {
138 return is_loaded_;
139 }
140
141 // Lazily loads from the given filename. Won't actually read the file
142 // until it needs it.
143 void LoadFileLater(const char *data_file_name);
// NOTE(review): the original doc comment for Init is missing from this
// listing. From the signature it takes the data file name and reports
// success as a bool; presumably it loads the file immediately, in contrast
// to LoadFileLater. Confirm against the implementation file.
148 bool Init(const char *data_file_name);
149 // Loads from the given memory buffer as if a file, remembering name as some
150 // arbitrary source id for caching.
151 bool LoadMemBuffer(const char *name, const char *data, int size);
152 // Overwrites a single entry of the given type.
153 void OverwriteEntry(TessdataType type, const char *data, int size);
154
155 // Saves to the given filename.
156 bool SaveFile(const char *filename, FileWriter writer) const;
157 // Serializes to the given vector.
158 void Serialize(std::vector<char> *data) const;
159 // Resets to the initial state, keeping the reader.
160 void Clear();
161
162 // Prints a directory of contents.
163 void Directory() const;
164
165 // Returns true if the component requested is present.
167 return !entries_[type].empty();
168 }
169 // Opens the given TFile pointer to the given component type.
170 // Returns false in case of failure.
171 bool GetComponent(TessdataType type, TFile *fp);
172 // As non-const version except it can't load the component if not already
173 // loaded.
174 bool GetComponent(TessdataType type, TFile *fp) const;
175
176 // Returns the current version string.
177 std::string VersionString() const;
178 // Sets the version string to the given v_str.
179 void SetVersionString(const std::string &v_str);
180
181 // Returns true if the base Tesseract components are present.
// "Present" here means that both the TESSDATA_UNICHARSET and the
// TESSDATA_INTTEMP slots of entries_ are non-empty; no other component
// is checked.
182 bool IsBaseAvailable() const {
183 return !entries_[TESSDATA_UNICHARSET].empty() && !entries_[TESSDATA_INTTEMP].empty();
184 }
185
186 // Returns true if the LSTM components are present.
// Only the TESSDATA_LSTM slot of entries_ is tested for being non-empty;
// the other lstm-* components are not checked here.
187 bool IsLSTMAvailable() const {
188 return !entries_[TESSDATA_LSTM].empty();
189 }
190
191 // Return the name of the underlying data file.
// Hands back a const reference to the data_file_name_ member rather than a
// copy, so the reference is only usable while this object is alive.
192 const std::string &GetDataFileName() const {
193 return data_file_name_;
194 }
195
// NOTE(review): the original doc comments for the three declarations below
// are missing from this listing; the descriptions are inferred from names
// and signatures only and must be confirmed against the implementation.
//
// CombineDataFiles: presumably gathers the individual component files found
// under language_data_path_prefix and writes one combined file to
// output_filename. Returns a bool status.
201 bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename);
202
// OverwriteComponents: takes an array of num_new_components component file
// names and an output traineddata file name; presumably replaces the
// matching entries and saves the result. Returns a bool status.
208 bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames,
209 int num_new_components);
210
// ExtractToFile: presumably writes a single component to `filename`, with
// the component chosen from the file name. Returns a bool status.
221 bool ExtractToFile(const char *filename);
222
223private:
224 // Use libarchive.
225 bool LoadArchiveFile(const char *filename);
226
// NOTE(review): original doc comments are missing from this listing.
// Both helpers are static, report success as a bool and write their result
// through the `type` out-parameter. Presumably the first looks `suffix` up
// in kTessdataFileSuffixes and the second derives the suffix from `filename`
// first. Confirm against the implementation file.
233 static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type);
234
239 static bool TessdataTypeFromFileName(const char *filename, TessdataType *type);
240
241 // Name of file it came from.
242 std::string data_file_name_;
243 // Function to load the file when we need it.
244 FileReader reader_;
245 // True if the file has been loaded.
// No in-class initializer here; presumably set by the constructor, which is
// not visible in this header.
246 bool is_loaded_;
247 // True if the bytes need swapping.
// No in-class initializer here either; same caveat as is_loaded_.
248 bool swap_;
249 // Contents of each element of the traineddata file.
// Fixed-size array with TESSDATA_NUM_ENTRIES slots, indexed by TessdataType
// values (see IsBaseAvailable / IsLSTMAvailable). An empty vector is what
// the accessors in this class treat as "component not present".
250 std::vector<char> entries_[TESSDATA_NUM_ENTRIES];
251};
252
253} // namespace tesseract
254
255#endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_
bool(*)(const std::vector< char > &data, const char *filename) FileWriter
Definition: serialis.h:40
@ TESSDATA_UNAMBIG_DAWG
@ TESSDATA_LSTM_SYSTEM_DAWG
@ TESSDATA_LSTM_UNICHARSET
@ TESSDATA_CUBE_SYSTEM_DAWG
@ TESSDATA_PARAMS_MODEL
@ TESSDATA_NUMBER_DAWG
@ TESSDATA_CUBE_UNICHARSET
@ TESSDATA_LSTM_PUNC_DAWG
@ TESSDATA_BIGRAM_DAWG
@ TESSDATA_LSTM_RECODER
@ TESSDATA_LANG_CONFIG
@ TESSDATA_LSTM_NUMBER_DAWG
@ TESSDATA_NUM_ENTRIES
@ TESSDATA_SHAPE_TABLE
@ TESSDATA_FIXED_LENGTH_DAWGS
@ TESSDATA_SYSTEM_DAWG
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:236
bool(*)(const char *filename, std::vector< char > *data) FileReader
Definition: baseapi.h:61
type
Definition: upload.py:458
const std::string & GetDataFileName() const
bool IsComponentAvailable(TessdataType type) const
#define TESS_API
Definition: export.h:32