tesseract  4.00.00dev
tessedit.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tessedit.cpp (Formerly tessedit.c)
3  * Description: (Previously) Main program for merge of tess and editor.
4  * Now just code to load the language model and various
5  * engine-specific data files.
6  * Author: Ray Smith
7  * Created: Tue Jan 07 15:21:46 GMT 1992
8  *
9  * (C) Copyright 1992, Hewlett-Packard Ltd.
10  ** Licensed under the Apache License, Version 2.0 (the "License");
11  ** you may not use this file except in compliance with the License.
12  ** You may obtain a copy of the License at
13  ** http://www.apache.org/licenses/LICENSE-2.0
14  ** Unless required by applicable law or agreed to in writing, software
15  ** distributed under the License is distributed on an "AS IS" BASIS,
16  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  ** See the License for the specific language governing permissions and
18  ** limitations under the License.
19  *
20  **********************************************************************/
21 
22 // Include automatically generated configuration file if running autoconf.
23 #ifdef HAVE_CONFIG_H
24 #include "config_auto.h"
25 #endif
26 
27 #include "stderr.h"
28 #include "basedir.h"
29 #include "tessvars.h"
30 #include "control.h"
31 #include "reject.h"
32 #include "pageres.h"
33 #include "nwmain.h"
34 #include "pgedit.h"
35 #include "tprintf.h"
36 #include "tessedit.h"
37 #include "stopper.h"
38 #include "intmatcher.h"
39 #include "chop.h"
40 #include "efio.h"
41 #include "danerror.h"
42 #include "globals.h"
43 #ifndef ANDROID_BUILD
44 #include "lstmrecognizer.h"
45 #endif
46 #include "tesseractclass.h"
47 #include "params.h"
48 
49 #define VARDIR "configs/" /*variables files */
50  // config under api
51 #define API_CONFIG "configs/api_config"
52 
53 ETEXT_DESC *global_monitor = NULL; // progress monitor
54 
55 namespace tesseract {
56 
57 // Read a "config" file containing a set of variable, value pairs.
58 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
59 // and also accepts a relative or absolute path name.
61  SetParamConstraint constraint) {
62  STRING path = datadir;
63  path += "configs/";
64  path += filename;
65  FILE* fp;
66  if ((fp = fopen(path.string(), "rb")) != NULL) {
67  fclose(fp);
68  } else {
69  path = datadir;
70  path += "tessconfigs/";
71  path += filename;
72  if ((fp = fopen(path.string(), "rb")) != NULL) {
73  fclose(fp);
74  } else {
75  path = filename;
76  }
77  }
78  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
79 }
80 
81 // Returns false if a unicharset file for the specified language was not found
82 // or was invalid.
83 // This function initializes TessdataManager. After TessdataManager is
84 // no longer needed, TessdataManager::End() should be called.
85 //
86 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
87 // it is OEM_DEFAULT, in which case the value of the variable will be obtained
88 // from the language-specific config file (stored in [lang].traineddata), from
89 // the config files specified on the command line or left as the default
90 // OEM_TESSERACT_ONLY if none of the configs specify this variable.
92  const char *arg0, const char *textbase, const char *language,
93  OcrEngineMode oem, char **configs, int configs_size,
94  const GenericVector<STRING> *vars_vec,
95  const GenericVector<STRING> *vars_values, bool set_only_non_debug_params,
96  TessdataManager *mgr) {
97  // Set the basename, compute the data directory.
98  main_setup(arg0, textbase);
99 
100  // Set the language data path prefix
101  lang = language != NULL ? language : "eng";
105 
106  // Initialize TessdataManager.
107  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
108  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) {
109  tprintf("Error opening data file %s\n", tessdata_path.string());
110  tprintf("Please make sure the TESSDATA_PREFIX environment variable is set"
111  " to your \"tessdata\" directory.\n");
112  return false;
113  }
114  if (oem == OEM_DEFAULT) {
115  // Set the engine mode from availability, which can then be overridden by
116  // the config file when we read it below.
117  if (!mgr->IsLSTMAvailable()) {
119  } else if (!mgr->IsBaseAvailable()) {
121  } else {
123  }
124  }
125 
126  // If a language specific config file (lang.config) exists, load it in.
127  TFile fp;
128  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
130  this->params());
131  }
132 
133  SetParamConstraint set_params_constraint = set_only_non_debug_params ?
135  // Load tesseract variables from config files. This is done after loading
136  // language-specific variables from [lang].traineddata file, so that custom
137  // config files can override values in [lang].traineddata file.
138  for (int i = 0; i < configs_size; ++i) {
139  read_config_file(configs[i], set_params_constraint);
140  }
141 
142  // Set params specified in vars_vec (done after setting params from config
143  // files, so that params in vars_vec can override those from files).
144  if (vars_vec != NULL && vars_values != NULL) {
145  for (int i = 0; i < vars_vec->size(); ++i) {
146  if (!ParamUtils::SetParam((*vars_vec)[i].string(),
147  (*vars_values)[i].string(),
148  set_params_constraint, this->params())) {
149  tprintf("Error setting param %s\n", (*vars_vec)[i].string());
150  exit(1);
151  }
152  }
153  }
154 
155  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
156  FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
157  if (params_file != NULL) {
158  ParamUtils::PrintParams(params_file, this->params());
159  fclose(params_file);
160  } else {
161  tprintf("Failed to open %s for writing params.\n",
162  tessedit_write_params_to_file.string());
163  }
164  }
165 
166  // Determine which ocr engine(s) should be loaded and used for recognition.
167  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
168 
169  // If we are only loading the config file (and so not planning on doing any
170  // recognition) then there's nothing else to do here.
172  return true;
173  }
174 
175 // The various OcrEngineMode settings (see publictypes.h) determine which
176 // engine-specific data files need to be loaded.
177 // If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
178 #ifndef ANDROID_BUILD
182  lstm_recognizer_ = new LSTMRecognizer;
183  ASSERT_HOST(
184  lstm_recognizer_->Load(lstm_use_matrix ? language : nullptr, mgr));
185  } else {
186  tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
188  }
189  }
190 #endif
191 
192  // Load the unicharset
194  // Avoid requiring a unicharset when we aren't running base tesseract.
195 #ifndef ANDROID_BUILD
196  unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
197 #endif
198  } else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
199  !unicharset.load_from_file(&fp, false)) {
200  return false;
201  }
202  if (unicharset.size() > MAX_NUM_CLASSES) {
203  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
204  return false;
205  }
206  right_to_left_ = unicharset.major_right_to_left();
207 
208  // Setup initial unichar ambigs table and read universal ambigs.
209  UNICHARSET encoder_unicharset;
210  encoder_unicharset.CopyFrom(unicharset);
212  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
213 
215  unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
218  }
219  // Init ParamsModel.
220  // Load pass1 and pass2 weights (for now these two sets are the same, but in
221  // the future separate sets of weights can be generated).
222  for (int p = ParamsModel::PTRAIN_PASS1;
225  static_cast<ParamsModel::PassEnum>(p));
226  if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
228  return false;
229  }
230  }
231  }
232 
233  return true;
234 }
235 
236 // Helper returns true if the given string is in the vector of strings.
237 static bool IsStrInList(const STRING& str,
238  const GenericVector<STRING>& str_list) {
239  for (int i = 0; i < str_list.size(); ++i) {
240  if (str_list[i] == str)
241  return true;
242  }
243  return false;
244 }
245 
246 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
247 // Langs with no prefix get appended to to_load, provided they
248 // are not in there already.
249 // Langs with ~ prefix get appended to not_to_load, provided they are not in
250 // there already.
251 void Tesseract::ParseLanguageString(const char* lang_str,
252  GenericVector<STRING>* to_load,
253  GenericVector<STRING>* not_to_load) {
254  STRING remains(lang_str);
255  while (remains.length() > 0) {
256  // Find the start of the lang code and which vector to add to.
257  const char* start = remains.string();
258  while (*start == '+')
259  ++start;
260  GenericVector<STRING>* target = to_load;
261  if (*start == '~') {
262  target = not_to_load;
263  ++start;
264  }
265  // Find the index of the end of the lang code in string start.
266  int end = strlen(start);
267  const char* plus = strchr(start, '+');
268  if (plus != NULL && plus - start < end)
269  end = plus - start;
270  STRING lang_code(start);
271  lang_code.truncate_at(end);
272  STRING next(start + end);
273  remains = next;
274  // Check whether lang_code is already in the target vector and add.
275  if (!IsStrInList(lang_code, *target)) {
276  target->push_back(lang_code);
277  }
278  }
279 }
280 
281 // Initialize for potentially a set of languages defined by the language
282 // string and recursively any additional languages required by any language
283 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
284 // See init_tesseract_internal for args.
285 int Tesseract::init_tesseract(const char *arg0, const char *textbase,
286  const char *language, OcrEngineMode oem,
287  char **configs, int configs_size,
288  const GenericVector<STRING> *vars_vec,
289  const GenericVector<STRING> *vars_values,
290  bool set_only_non_debug_params,
291  TessdataManager *mgr) {
292  GenericVector<STRING> langs_to_load;
293  GenericVector<STRING> langs_not_to_load;
294  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
295 
296  sub_langs_.delete_data_pointers();
297  sub_langs_.clear();
298  // Find the first loadable lang and load into this.
299  // Add any languages that this language requires
300  bool loaded_primary = false;
301  // Load the rest into sub_langs_.
302  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
303  if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
304  const char *lang_str = langs_to_load[lang_index].string();
305  Tesseract *tess_to_init;
306  if (!loaded_primary) {
307  tess_to_init = this;
308  } else {
309  tess_to_init = new Tesseract;
310  }
311 
312  int result = tess_to_init->init_tesseract_internal(
313  arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
314  vars_values, set_only_non_debug_params, mgr);
315  // Forget that language, but keep any reader we were given.
316  mgr->Clear();
317 
318  if (!loaded_primary) {
319  if (result < 0) {
320  tprintf("Failed loading language '%s'\n", lang_str);
321  } else {
322  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
323  &langs_to_load, &langs_not_to_load);
324  loaded_primary = true;
325  }
326  } else {
327  if (result < 0) {
328  tprintf("Failed loading language '%s'\n", lang_str);
329  delete tess_to_init;
330  } else {
331  sub_langs_.push_back(tess_to_init);
332  // Add any languages that this language requires
333  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
334  &langs_to_load, &langs_not_to_load);
335  }
336  }
337  }
338  }
339  if (!loaded_primary) {
340  tprintf("Tesseract couldn't load any languages!\n");
341  return -1; // Couldn't load any language!
342  }
343  if (!sub_langs_.empty()) {
344  // In multilingual mode word ratings have to be directly comparable,
345  // so use the same language model weights for all languages:
346  // use the primary language's params model if
347  // tessedit_use_primary_params_model is set,
348  // otherwise use default language model weights.
350  for (int s = 0; s < sub_langs_.size(); ++s) {
351  sub_langs_[s]->language_model_->getParamsModel().Copy(
353  }
354  tprintf("Using params model of the primary language\n");
355  } else {
357  for (int s = 0; s < sub_langs_.size(); ++s) {
358  sub_langs_[s]->language_model_->getParamsModel().Clear();
359  }
360  }
361  }
362 
364  return 0;
365 }
366 
367 // Common initialization for a single language.
368 // arg0 is the datapath for the tessdata directory, which could be the
369 // path of the tessdata directory with no trailing /, or (if tessdata
370 // lives in the same directory as the executable) the path of the executable,
371 // hence the name arg0.
372 // textbase is an optional output file basename (used only for training)
373 // language is the language code to load.
374 // oem controls which engine(s) will operate on the image
375 // configs (argv) is an array of config filenames to load variables from.
376 // May be NULL.
377 // configs_size (argc) is the number of elements in configs.
378 // vars_vec is an optional vector of variables to set.
379 // vars_values is an optional corresponding vector of values for the variables
380 // in vars_vec.
381 // If set_only_non_debug_params is true, then only the non-debug variables
382 // will be set.
383 int Tesseract::init_tesseract_internal(const char *arg0, const char *textbase,
384  const char *language, OcrEngineMode oem,
385  char **configs, int configs_size,
386  const GenericVector<STRING> *vars_vec,
387  const GenericVector<STRING> *vars_values,
388  bool set_only_non_debug_params,
389  TessdataManager *mgr) {
390  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
391  configs_size, vars_vec, vars_values,
392  set_only_non_debug_params, mgr)) {
393  return -1;
394  }
396  return 0;
397  }
398  // If only LSTM will be used, skip loading Tesseract classifier's
399  // pre-trained templates and dictionary.
401  program_editup(textbase, init_tesseract ? mgr : nullptr,
402  init_tesseract ? mgr : nullptr);
403  return 0; //Normal exit
404 }
405 
406 // Helper builds the all_fonts table by adding new fonts from new_fonts.
407 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
408  UnicityTable<FontInfo>* all_fonts) {
409  for (int i = 0; i < new_fonts.size(); ++i) {
410  // UnicityTable uniques as we go.
411  all_fonts->push_back(new_fonts.get(i));
412  }
413 }
414 
415 // Helper assigns an id to lang_fonts using the index in all_fonts table.
416 static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
417  UnicityTable<FontInfo>* lang_fonts) {
418  for (int i = 0; i < lang_fonts->size(); ++i) {
419  int index = all_fonts.get_id(lang_fonts->get(i));
420  lang_fonts->get_mutable(i)->universal_id = index;
421  }
422 }
423 
424 // Set the universal_id member of each font to be unique among all
425 // instances of the same font loaded.
427  // Note that we can get away with bitwise copying FontInfo in
428  // all_fonts, as it is a temporary structure and we avoid setting the
429  // delete callback.
430  UnicityTable<FontInfo> all_fonts;
432 
433  // Create the universal ID table.
434  CollectFonts(get_fontinfo_table(), &all_fonts);
435  for (int i = 0; i < sub_langs_.size(); ++i) {
436  CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
437  }
438  // Assign ids from the table to each font table.
439  AssignIds(all_fonts, &get_fontinfo_table());
440  for (int i = 0; i < sub_langs_.size(); ++i) {
441  AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
442  }
443  font_table_size_ = all_fonts.size();
444 }
445 
446 // init the LM component
447 int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase,
448  const char *language, TessdataManager *mgr) {
449  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
450  NULL, 0, NULL, NULL, false, mgr))
451  return -1;
453  getDict().Load(lang, mgr);
454  getDict().FinishLoad();
455  return 0;
456 }
457 
459  end_recog();
460 }
461 
462 /* Define command type identifiers */
463 
465 {
470 };
471 } // namespace tesseract
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:206
Dict & getDict() override
bool FinishLoad()
Definition: dict.cpp:328
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:251
void SetupUniversalFontIds()
Definition: tessedit.cpp:426
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:285
const T & get(int id) const
Return the object from an id.
void main_setup(const char *argv0, const char *basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:53
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:61
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:91
LanguageModel * language_model_
Definition: wordrec.h:407
int size() const
Definition: genericvector.h:72
ParamsModel & getParamsModel()
char * tessedit_write_params_to_file
bool IsComponentAvailable(TessdataType type) const
#define tprintf(...)
Definition: tprintf.h:31
bool GetComponent(TessdataType type, TFile *fp)
STRING datadir
Definition: ccutil.h:64
SetParamConstraint
Definition: params.h:36
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:40
T * get_mutable(int id)
int get_id(T object) const
UNICHARSET unicharset
Definition: ccutil.h:68
int size() const
Definition: unicharset.h:338
const char * string() const
Definition: strngs.cpp:198
int push_back(T object)
bool LoadFromFp(const char *lang, TFile *fp)
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:198
STRING lang
Definition: ccutil.h:66
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:344
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:224
int push_back(T object)
Add an element in the table.
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:70
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:120
int ambigs_debug_level
Definition: ccutil.h:85
void program_editup(const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
Definition: tface.cpp:46
Definition: strngs.h:45
bool major_right_to_left() const
Definition: unicharset.cpp:960
void truncate_at(inT32 index)
Definition: strngs.cpp:269
#define ASSERT_HOST(x)
Definition: errcode.h:84
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:445
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:91
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:387
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:69
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:60
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
Definition: tessedit.cpp:447
bool use_ambigs_for_adaption
Definition: ccutil.h:89
void SetPass(PassEnum pass)
Definition: params_model.h:72
ParamsVectors * params()
Definition: ccutil.h:62
void set_compare_callback(TessResultCallback2< bool, T const &, T const &> *cb)
ETEXT_DESC * global_monitor
Definition: tessedit.cpp:53
const UNICHARSET & GetUnicharset() const
bool Init(const char *data_file_name)
bool Load(const char *lang, TessdataManager *mgr)
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:49
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:383
inT32 length() const
Definition: strngs.cpp:193
STRING language_data_path_prefix
Definition: ccutil.h:67
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:63
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:173
int size() const
Return the size used.