tesseract v5.3.3.20231005
tessedit.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: tessedit.cpp (Formerly tessedit.c)
3 * Description: (Previously) Main program for merge of tess and editor.
4 * Now just code to load the language model and various
5 * engine-specific data files.
6 * Author: Ray Smith
7 *
8 * (C) Copyright 1992, Hewlett-Packard Ltd.
9 ** Licensed under the Apache License, Version 2.0 (the "License");
10 ** you may not use this file except in compliance with the License.
11 ** You may obtain a copy of the License at
12 ** http://www.apache.org/licenses/LICENSE-2.0
13 ** Unless required by applicable law or agreed to in writing, software
14 ** distributed under the License is distributed on an "AS IS" BASIS,
15 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 ** See the License for the specific language governing permissions and
17 ** limitations under the License.
18 *
19 **********************************************************************/
20
21// Include automatically generated configuration file if running autoconf.
22#ifdef HAVE_CONFIG_H
23# include "config_auto.h"
24#endif
25
26#include "control.h"
27#include "matchdefs.h"
28#include "pageres.h"
29#include "params.h"
30#include "stopper.h"
31#include "tesseractclass.h"
32#include "tessvars.h"
33#include "tprintf.h"
34#ifndef DISABLED_LEGACY_ENGINE
35# include "chop.h"
36# include "intmatcher.h"
37# include "reject.h"
38#endif
39#include "lstmrecognizer.h"
40
41namespace tesseract {
42
43// Read a "config" file containing a set of variable, value pairs.
44// Searches the standard places: tessdata/configs, tessdata/tessconfigs
45// and also accepts a relative or absolute path name.
46void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) {
47 std::string path = datadir;
48 path += "configs/";
49 path += filename;
50 FILE *fp;
51 if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
52 fclose(fp);
53 } else {
54 path = datadir;
55 path += "tessconfigs/";
56 path += filename;
57 if ((fp = fopen(path.c_str(), "rb")) != nullptr) {
58 fclose(fp);
59 } else {
60 path = filename;
61 }
62 }
63 ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params());
64}
65
66// Returns false if a unicharset file for the specified language was not found
67// or was invalid.
68// This function initializes TessdataManager. After TessdataManager is
69// no longer needed, TessdataManager::End() should be called.
70//
71// This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
72// it is OEM_DEFAULT, in which case the value of the variable will be obtained
73// from the language-specific config file (stored in [lang].traineddata), from
74// the config files specified on the command line or left as the default
75// OEM_TESSERACT_ONLY if none of the configs specify this variable.
76bool Tesseract::init_tesseract_lang_data(const std::string &arg0,
77 const std::string &language, OcrEngineMode oem,
78 char **configs, int configs_size,
79 const std::vector<std::string> *vars_vec,
80 const std::vector<std::string> *vars_values,
81 bool set_only_non_debug_params, TessdataManager *mgr) {
82 // Set the language data path prefix
83 lang = !language.empty() ? language : "eng";
87
88 // Initialize TessdataManager.
89 std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
90 if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) {
91 tprintf("Error opening data file %s\n", tessdata_path.c_str());
92 tprintf(
93 "Please make sure the TESSDATA_PREFIX environment variable is set"
94 " to your \"tessdata\" directory.\n");
95 return false;
96 }
97#ifdef DISABLED_LEGACY_ENGINE
98 tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
99#else
100 if (oem == OEM_DEFAULT) {
101 // Set the engine mode from availability, which can then be overridden by
102 // the config file when we read it below.
103 if (!mgr->IsLSTMAvailable()) {
104 tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
105 } else if (!mgr->IsBaseAvailable()) {
106 tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
107 } else {
108 tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
109 }
110 }
111#endif // ndef DISABLED_LEGACY_ENGINE
112
113 // If a language specific config file (lang.config) exists, load it in.
114 TFile fp;
115 if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
117 }
118
119 SetParamConstraint set_params_constraint =
121 // Load tesseract variables from config files. This is done after loading
122 // language-specific variables from [lang].traineddata file, so that custom
123 // config files can override values in [lang].traineddata file.
124 for (int i = 0; i < configs_size; ++i) {
125 read_config_file(configs[i], set_params_constraint);
126 }
127
128 // Set params specified in vars_vec (done after setting params from config
129 // files, so that params in vars_vec can override those from files).
130 if (vars_vec != nullptr && vars_values != nullptr) {
131 for (unsigned i = 0; i < vars_vec->size(); ++i) {
132 if (!ParamUtils::SetParam((*vars_vec)[i].c_str(), (*vars_values)[i].c_str(),
133 set_params_constraint, this->params())) {
134 tprintf("Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].c_str());
135 }
136 }
137 }
138
139 if (!tessedit_write_params_to_file.empty()) {
140 FILE *params_file = fopen(tessedit_write_params_to_file.c_str(), "wb");
141 if (params_file != nullptr) {
142 ParamUtils::PrintParams(params_file, this->params());
143 fclose(params_file);
144 } else {
145 tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
146 }
147 }
148
149#ifndef DISABLED_LEGACY_ENGINE
150 // Determine which ocr engine(s) should be loaded and used for recognition.
151 if (oem != OEM_DEFAULT) {
152 tessedit_ocr_engine_mode.set_value(oem);
153 }
154#endif
155
156 // If we are only loading the config file (and so not planning on doing any
157 // recognition) then there's nothing else do here.
158 if (tessedit_init_config_only) {
159 return true;
160 }
161
162// The various OcrEngineMode settings (see tesseract/publictypes.h) determine
163// which engine-specific data files need to be loaded. If LSTM_ONLY is
164// requested, the base Tesseract files are *Not* required.
165#ifdef DISABLED_LEGACY_ENGINE
166 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
167#else
168 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
169 tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
170#endif // ndef DISABLED_LEGACY_ENGINE
172 lstm_recognizer_ = new LSTMRecognizer(language_data_path_prefix.c_str());
173 ASSERT_HOST(lstm_recognizer_->Load(this->params(), lstm_use_matrix ? language : "", mgr));
174 } else {
175 tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
176 tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
177 }
178 }
179
180 // Load the unicharset
181 if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
182 // Avoid requiring a unicharset when we aren't running base tesseract.
183 unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
184 }
185#ifndef DISABLED_LEGACY_ENGINE
186 else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) {
187 tprintf(
188 "Error: Tesseract (legacy) engine requested, but components are "
189 "not present in %s!!\n",
190 tessdata_path.c_str());
191 return false;
192 }
193#endif // ndef DISABLED_LEGACY_ENGINE
195 tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
196 return false;
197 }
198 right_to_left_ = unicharset.major_right_to_left();
199
200#ifndef DISABLED_LEGACY_ENGINE
201
202 // Setup initial unichar ambigs table and read universal ambigs.
203 UNICHARSET encoder_unicharset;
204 encoder_unicharset.CopyFrom(unicharset);
205 unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
206 unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
207
208 if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
209 unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, ambigs_debug_level,
210 use_ambigs_for_adaption, &unicharset);
211 }
212
213 // Init ParamsModel.
214 // Load pass1 and pass2 weights (for now these two sets are the same, but in
215 // the future separate sets of weights can be generated).
217 language_model_->getParamsModel().SetPass(static_cast<ParamsModel::PassEnum>(p));
218 if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
219 if (!language_model_->getParamsModel().LoadFromFp(lang.c_str(), &fp)) {
220 return false;
221 }
222 }
223 }
224#endif // ndef DISABLED_LEGACY_ENGINE
225
226 return true;
227}
228
// Helper reports whether str_list contains an element equal to str.
static bool IsStrInList(const std::string &str, const std::vector<std::string> &str_list) {
  auto it = str_list.begin();
  // Advance until a match is found or the list is exhausted.
  while (it != str_list.end() && !(*it == str)) {
    ++it;
  }
  return it != str_list.end();
}
238
239// Parse a string of the form [~]<lang>[+[~]<lang>]*.
240// Langs with no prefix get appended to to_load, provided they
241// are not in there already.
242// Langs with ~ prefix get appended to not_to_load, provided they are not in
243// there already.
244void Tesseract::ParseLanguageString(const std::string &lang_str, std::vector<std::string> *to_load,
245 std::vector<std::string> *not_to_load) {
246 std::string remains(lang_str);
247 // Look whether the model file uses a prefix which must be applied to
248 // included model files as well.
249 std::string prefix;
250 size_t found = lang.find_last_of('/');
251 if (found != std::string::npos) {
252 // A prefix was found.
253 prefix = lang.substr(0, found + 1);
254 }
255 while (!remains.empty()) {
256 // Find the start of the lang code and which vector to add to.
257 const char *start = remains.c_str();
258 while (*start == '+') {
259 ++start;
260 }
261 std::vector<std::string> *target = to_load;
262 if (*start == '~') {
263 target = not_to_load;
264 ++start;
265 }
266 // Find the index of the end of the lang code in string start.
267 int end = strlen(start);
268 const char *plus = strchr(start, '+');
269 if (plus != nullptr && plus - start < end) {
270 end = plus - start;
271 }
272 std::string lang_code(start);
273 lang_code.resize(end);
274 std::string next(start + end);
275 remains = next;
276 lang_code = prefix + lang_code;
277 // Check whether lang_code is already in the target vector and add.
278 if (!IsStrInList(lang_code, *target)) {
279 target->push_back(lang_code);
280 }
281 }
282}
283
284// Initialize for potentially a set of languages defined by the language
285// string and recursively any additional languages required by any language
286// traineddata file (via tessedit_load_sublangs in its config) that is loaded.
287// See init_tesseract_internal for args.
288int Tesseract::init_tesseract(const std::string &arg0, const std::string &textbase,
289 const std::string &language, OcrEngineMode oem, char **configs,
290 int configs_size, const std::vector<std::string> *vars_vec,
291 const std::vector<std::string> *vars_values,
292 bool set_only_non_debug_params, TessdataManager *mgr) {
293 std::vector<std::string> langs_to_load;
294 std::vector<std::string> langs_not_to_load;
295 ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
296
297 for (auto *lang : sub_langs_) {
298 delete lang;
299 }
300
301 // Set the basename, compute the data directory.
302 main_setup(arg0, textbase);
303
304 sub_langs_.clear();
305 // Find the first loadable lang and load into this.
306 // Add any languages that this language requires
307 bool loaded_primary = false;
308 // Load the rest into sub_langs_.
309 // WARNING: A range based for loop does not work here because langs_to_load
310 // might be changed in the loop when a new submodel is found.
311 for (size_t lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
312 auto &lang_to_load = langs_to_load[lang_index];
313 if (!IsStrInList(lang_to_load, langs_not_to_load)) {
314 const char *lang_str = lang_to_load.c_str();
315 Tesseract *tess_to_init;
316 if (!loaded_primary) {
317 tess_to_init = this;
318 } else {
319 tess_to_init = new Tesseract;
320 tess_to_init->main_setup(arg0, textbase);
321 }
322
323 int result = tess_to_init->init_tesseract_internal(arg0, textbase, lang_str, oem, configs,
324 configs_size, vars_vec, vars_values,
325 set_only_non_debug_params, mgr);
326 // Forget that language, but keep any reader we were given.
327 mgr->Clear();
328
329 if (!loaded_primary) {
330 if (result < 0) {
331 tprintf("Failed loading language '%s'\n", lang_str);
332 } else {
333 ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
334 &langs_not_to_load);
335 loaded_primary = true;
336 }
337 } else {
338 if (result < 0) {
339 tprintf("Failed loading language '%s'\n", lang_str);
340 delete tess_to_init;
341 } else {
342 sub_langs_.push_back(tess_to_init);
343 // Add any languages that this language requires
344 ParseLanguageString(tess_to_init->tessedit_load_sublangs, &langs_to_load,
345 &langs_not_to_load);
346 }
347 }
348 }
349 }
350 if (!loaded_primary && !langs_to_load.empty()) {
351 tprintf("Tesseract couldn't load any languages!\n");
352 return -1; // Couldn't load any language!
353 }
354#ifndef DISABLED_LEGACY_ENGINE
355 if (!sub_langs_.empty()) {
356 // In multilingual mode word ratings have to be directly comparable,
357 // so use the same language model weights for all languages:
358 // use the primary language's params model if
359 // tessedit_use_primary_params_model is set,
360 // otherwise use default language model weights.
361 if (tessedit_use_primary_params_model) {
362 for (auto &sub_lang : sub_langs_) {
363 sub_lang->language_model_->getParamsModel().Copy(this->language_model_->getParamsModel());
364 }
365 tprintf("Using params model of the primary language\n");
366 } else {
367 this->language_model_->getParamsModel().Clear();
368 for (auto &sub_lang : sub_langs_) {
369 sub_lang->language_model_->getParamsModel().Clear();
370 }
371 }
372 }
373
375#endif // ndef DISABLED_LEGACY_ENGINE
376 return 0;
377}
378
379// Common initialization for a single language.
380// arg0 is the datapath for the tessdata directory, which could be the
381// path of the tessdata directory with no trailing /, or (if tessdata
382// lives in the same directory as the executable, the path of the executable,
383// hence the name arg0.
384// textbase is an optional output file basename (used only for training)
385// language is the language code to load.
386// oem controls which engine(s) will operate on the image
387// configs (argv) is an array of config filenames to load variables from.
388// May be nullptr.
389// configs_size (argc) is the number of elements in configs.
390// vars_vec is an optional vector of variables to set.
391// vars_values is an optional corresponding vector of values for the variables
392// in vars_vec.
393// If set_only_non_debug_params is true, only params that do not contain
394// "debug" in the name will be set.
395int Tesseract::init_tesseract_internal(const std::string &arg0, const std::string &textbase,
396 const std::string &language, OcrEngineMode oem,
397 char **configs, int configs_size,
398 const std::vector<std::string> *vars_vec,
399 const std::vector<std::string> *vars_values,
400 bool set_only_non_debug_params, TessdataManager *mgr) {
401 if (!init_tesseract_lang_data(arg0, language, oem, configs, configs_size, vars_vec,
402 vars_values, set_only_non_debug_params, mgr)) {
403 return -1;
404 }
405 if (tessedit_init_config_only) {
406 return 0;
407 }
408 // If only LSTM will be used, skip loading Tesseract classifier's
409 // pre-trained templates and dictionary.
410 bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
411 program_editup(textbase, init_tesseract ? mgr : nullptr, init_tesseract ? mgr : nullptr);
412 return 0; // Normal exit
413}
414
415#ifndef DISABLED_LEGACY_ENGINE
416
417// Helper builds the all_fonts table by adding new fonts from new_fonts.
418static void CollectFonts(const UnicityTable<FontInfo> &new_fonts,
419 UnicityTable<FontInfo> *all_fonts) {
420 for (int i = 0; i < new_fonts.size(); ++i) {
421 // UnicityTable uniques as we go.
422 all_fonts->push_back(new_fonts.at(i));
423 }
424}
425
426// Helper assigns an id to lang_fonts using the index in all_fonts table.
427static void AssignIds(const UnicityTable<FontInfo> &all_fonts, UnicityTable<FontInfo> *lang_fonts) {
428 for (int i = 0; i < lang_fonts->size(); ++i) {
429 auto index = all_fonts.get_index(lang_fonts->at(i));
430 lang_fonts->at(i).universal_id = index;
431 }
432}
433
434// Set the universal_id member of each font to be unique among all
435// instances of the same font loaded.
437 // Note that we can get away with bitwise copying FontInfo in
438 // all_fonts, as it is a temporary structure and we avoid setting the
439 // delete callback.
440 UnicityTable<FontInfo> all_fonts;
441
442 // Create the universal ID table.
443 CollectFonts(get_fontinfo_table(), &all_fonts);
444 for (auto &sub_lang : sub_langs_) {
445 CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);
446 }
447 // Assign ids from the table to each font table.
448 AssignIds(all_fonts, &get_fontinfo_table());
449 for (auto &sub_lang : sub_langs_) {
450 AssignIds(all_fonts, &sub_lang->get_fontinfo_table());
451 }
452 font_table_size_ = all_fonts.size();
453}
454
455#endif // ndef DISABLED_LEGACY_ENGINE
456
458 end_recog();
459}
460
461/* Define command type identifiers */
462
464} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:54
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
const char * p
@ OEM_TESSERACT_LSTM_COMBINED
Definition: publictypes.h:266
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:264
SetParamConstraint
Definition: params.h:39
@ SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
Definition: params.h:42
@ SET_PARAM_CONSTRAINT_NONE
Definition: params.h:40
@ RECOG_PSEUDO
Definition: pgedit.cpp:70
@ ACTION_2_CMD_EVENT
Definition: tessedit.cpp:463
@ RECOG_WERDS
Definition: pgedit.cpp:69
@ ACTION_1_CMD_EVENT
Definition: tessedit.cpp:463
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
@ TESSDATA_PARAMS_MODEL
@ TESSDATA_LANG_CONFIG
def next(obj)
Definition: ast.py:56
int init_tesseract(const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
Definition: tessedit.cpp:288
void ParseLanguageString(const std::string &lang_str, std::vector< std::string > *to_load, std::vector< std::string > *not_to_load)
Definition: tessedit.cpp:244
void SetupUniversalFontIds()
Definition: tessedit.cpp:436
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:46
int init_tesseract_internal(const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
Definition: tessedit.cpp:395
bool init_tesseract_lang_data(const std::string &arg0, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
Definition: tessedit.cpp:76
int size() const
Return the size used.
Definition: unicity_table.h:51
int push_back(T object)
Add an element in the table.
Definition: unicity_table.h:80
const T & at(int id) const
Return the object from an id.
Definition: unicity_table.h:56
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:64
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:51
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:72
std::string language_data_path_prefix
Definition: ccutil.h:60
ParamsVectors * params()
Definition: ccutil.h:53
UNICHARSET unicharset
Definition: ccutil.h:61
std::string lang
Definition: ccutil.h:59
std::string datadir
Definition: ccutil.h:57
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:63
void main_setup(const std::string &argv0, const std::string &basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: ccutil.cpp:46
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:51
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:164
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:81
bool GetComponent(TessdataType type, TFile *fp)
bool IsComponentAvailable(TessdataType type) const
bool Init(const char *data_file_name)
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
bool major_right_to_left() const
Definition: unicharset.cpp:983
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:438
size_t size() const
Definition: unicharset.h:355
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:324
bool Load(const ParamsVectors *params, const std::string &lang, TessdataManager *mgr)
const UNICHARSET & GetUnicharset() const
void program_editup(const std::string &textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
Definition: tface.cpp:39
std::unique_ptr< LanguageModel > language_model_
Definition: wordrec.h:382