All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tessedit.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tessedit.cpp (Formerly tessedit.c)
3  * Description: (Previously) Main program for merge of tess and editor.
4  * Now just code to load the language model and various
5  * engine-specific data files.
6  * Author: Ray Smith
7  * Created: Tue Jan 07 15:21:46 GMT 1992
8  *
9  * (C) Copyright 1992, Hewlett-Packard Ltd.
10  ** Licensed under the Apache License, Version 2.0 (the "License");
11  ** you may not use this file except in compliance with the License.
12  ** You may obtain a copy of the License at
13  ** http://www.apache.org/licenses/LICENSE-2.0
14  ** Unless required by applicable law or agreed to in writing, software
15  ** distributed under the License is distributed on an "AS IS" BASIS,
16  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  ** See the License for the specific language governing permissions and
18  ** limitations under the License.
19  *
20  **********************************************************************/
21 
22 #include "stderr.h"
23 #include "basedir.h"
24 #include "tessvars.h"
25 #include "control.h"
26 #include "reject.h"
27 #include "pageres.h"
28 #include "nwmain.h"
29 #include "pgedit.h"
30 #include "tprintf.h"
31 #include "tessedit.h"
32 #include "stopper.h"
33 #include "intmatcher.h"
34 #include "chop.h"
35 #include "efio.h"
36 #include "danerror.h"
37 #include "globals.h"
38 #include "tesseractclass.h"
39 #include "params.h"
40 
// Relative directory name for variables (config) files.
// NOTE(review): VARDIR is not referenced in the visible part of this listing,
// where read_config_file spells out "configs/" literally -- confirm it is
// still used elsewhere before relying on it.
41 #define VARDIR "configs/" /*variables files */
42  //config under api
// NOTE(review): API_CONFIG is likewise not referenced in the visible listing.
43 #define API_CONFIG "configs/api_config"
44 
// File-scope progress monitor pointer, defined outside namespace tesseract and
// starting out as NULL.
45 ETEXT_DESC *global_monitor = NULL; // progress monitor
46 
47 namespace tesseract {
48 
49 // Read a "config" file containing a set of variable, value pairs.
50 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
51 // and also accepts a relative or absolute path name.
53  SetParamConstraint constraint) {
54  STRING path = datadir;
55  path += "configs/";
56  path += filename;
57  FILE* fp;
58  if ((fp = fopen(path.string(), "rb")) != NULL) {
59  fclose(fp);
60  } else {
61  path = datadir;
62  path += "tessconfigs/";
63  path += filename;
64  if ((fp = fopen(path.string(), "rb")) != NULL) {
65  fclose(fp);
66  } else {
67  path = filename;
68  }
69  }
70  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
71 }
72 
73 // Returns false if a unicharset file for the specified language was not found
74 // or was invalid.
75 // This function initializes TessdataManager. After TessdataManager is
76 // no longer needed, TessdataManager::End() should be called.
77 //
78 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
79 // it is OEM_DEFAULT, in which case the value of the variable will be obtained
80 // from the language-specific config file (stored in [lang].traineddata), from
81 // the config files specified on the command line or left as the default
82 // OEM_TESSERACT_ONLY if none of the configs specify this variable.
84  const char *arg0, const char *textbase, const char *language,
85  OcrEngineMode oem, char **configs, int configs_size,
86  const GenericVector<STRING> *vars_vec,
87  const GenericVector<STRING> *vars_values,
88  bool set_only_non_debug_params) {
89  // Set the basename, compute the data directory.
90  main_setup(arg0, textbase);
91 
92  // Set the language data path prefix
93  lang = language != NULL ? language : "eng";
97 
98  // Initialize TessdataManager.
99  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
100  if (!tessdata_manager.Init(tessdata_path.string(),
102  return false;
103  }
104 
105  // If a language specific config file (lang.config) exists, load it in.
112  tprintf("Loaded language config file\n");
113  }
114  }
115 
116  SetParamConstraint set_params_constraint = set_only_non_debug_params ?
118  // Load tesseract variables from config files. This is done after loading
119  // language-specific variables from [lang].traineddata file, so that custom
120  // config files can override values in [lang].traineddata file.
121  for (int i = 0; i < configs_size; ++i) {
122  read_config_file(configs[i], set_params_constraint);
123  }
124 
125  // Set params specified in vars_vec (done after setting params from config
126  // files, so that params in vars_vec can override those from files).
127  if (vars_vec != NULL && vars_values != NULL) {
128  for (int i = 0; i < vars_vec->size(); ++i) {
129  if (!ParamUtils::SetParam((*vars_vec)[i].string(),
130  (*vars_values)[i].string(),
131  set_params_constraint, this->params())) {
132  tprintf("Error setting param %s\n", (*vars_vec)[i].string());
133  exit(1);
134  }
135  }
136  }
137 
138  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
139  FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
140  if (params_file != NULL) {
141  ParamUtils::PrintParams(params_file, this->params());
142  fclose(params_file);
144  tprintf("Wrote parameters to %s\n",
145  tessedit_write_params_to_file.string());
146  }
147  } else {
148  tprintf("Failed to open %s for writing params.\n",
149  tessedit_write_params_to_file.string());
150  }
151  }
152 
153  // Determine which ocr engine(s) should be loaded and used for recognition.
154  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
156  tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n",
157  static_cast<int>(tessedit_ocr_engine_mode));
158  }
159 
160  // If we are only loading the config file (and so not planning on doing any
161  // recognition) then there's nothing else to do here.
164  tprintf("Returning after loading config file\n");
165  }
166  return true;
167  }
168 
169  // Load the unicharset
172  return false;
173  }
174  if (unicharset.size() > MAX_NUM_CLASSES) {
175  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
176  return false;
177  }
178  if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n");
179  right_to_left_ = unicharset.major_right_to_left();
180 
181  // Setup initial unichar ambigs table and read universal ambigs.
182  UNICHARSET encoder_unicharset;
183  encoder_unicharset.CopyFrom(unicharset);
185  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
186 
189  TFile ambigs_file;
190  ambigs_file.Open(tessdata_manager.GetDataFilePtr(),
193  encoder_unicharset,
194  &ambigs_file,
196  if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n");
197  }
198 
199  // The various OcrEngineMode settings (see publictypes.h) determine which
200  // engine-specific data files need to be loaded. Currently everything needs
201  // the base tesseract data, which supplies other useful information, but
202  // alternative engines, such as cube and LSTM are optional.
203 #ifndef ANDROID_BUILD
207  tprintf("Loaded Cube w/out combiner\n");
211  tprintf("Loaded Cube with combiner\n");
212  }
213 #endif
214  // Init ParamsModel.
215  // Load pass1 and pass2 weights (for now these two sets are the same, but in
216  // the future separate sets of weights can be generated).
217  for (int p = ParamsModel::PTRAIN_PASS1;
220  static_cast<ParamsModel::PassEnum>(p));
225  return false;
226  }
227  }
228  }
230 
231  return true;
232 }
233 
234 // Helper returns true if the given string is in the vector of strings.
235 static bool IsStrInList(const STRING& str,
236  const GenericVector<STRING>& str_list) {
237  for (int i = 0; i < str_list.size(); ++i) {
238  if (str_list[i] == str)
239  return true;
240  }
241  return false;
242 }
243 
244 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
245 // Langs with no prefix get appended to to_load, provided they
246 // are not in there already.
247 // Langs with ~ prefix get appended to not_to_load, provided they are not in
248 // there already.
//
// lang_str:    '+'-separated language codes, each optionally prefixed by '~'.
// to_load:     receives the codes that carry no '~' prefix.
// not_to_load: receives the codes that carry a '~' prefix.
// Both vectors are only ever appended to, so the function can be called
// repeatedly on the same vectors to accumulate languages.
// NOTE(review): the numbered listing jumps from 273 to 275 inside the body, so
// one line (possibly a condition guarding the tprintf) appears to be missing
// from this copy -- confirm against the upstream file.
249 void Tesseract::ParseLanguageString(const char* lang_str,
250  GenericVector<STRING>* to_load,
251  GenericVector<STRING>* not_to_load) {
// remains holds the not-yet-parsed tail of lang_str; each pass of the loop
// consumes one language code from its front.
252  STRING remains(lang_str);
253  while (remains.length() > 0) {
254  // Find the start of the lang code and which vector to add to.
255  const char* start = remains.string();
// Skip the separator(s) left at the front by the previous pass.
256  while (*start == '+')
257  ++start;
258  GenericVector<STRING>* target = to_load;
259  if (*start == '~') {
260  target = not_to_load;
261  ++start;
262  }
263  // Find the index of the end of the lang code in string start.
// The code runs up to the next '+', or to the end of the string if there is
// none. When plus is non-NULL the second test below always holds, because
// strchr can only find a '+' that lies before the terminator.
264  int end = strlen(start);
265  const char* plus = strchr(start, '+');
266  if (plus != NULL && plus - start < end)
267  end = plus - start;
268  STRING lang_code(start);
269  lang_code.truncate_at(end);
// next is copied out of start (which points into remains) before remains is
// overwritten; it begins at the '+' separator, if any.
270  STRING next(start + end);
271  remains = next;
272  // Check whether lang_code is already in the target vector and add.
// NOTE(review): a string ending in '+' (or a lone '~') appears to leave an
// empty lang_code here, which is still appended -- confirm whether that is
// intended.
273  if (!IsStrInList(lang_code, *target)) {
275  tprintf("Adding language '%s' to list\n", lang_code.string());
276  target->push_back(lang_code);
277  }
278  }
279 }
280 
281 // Initialize for potentially a set of languages defined by the language
282 // string and recursively any additional languages required by any language
283 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
284 // See init_tesseract_internal for args.
286  const char *arg0, const char *textbase, const char *language,
287  OcrEngineMode oem, char **configs, int configs_size,
288  const GenericVector<STRING> *vars_vec,
289  const GenericVector<STRING> *vars_values,
290  bool set_only_non_debug_params) {
291  GenericVector<STRING> langs_to_load;
292  GenericVector<STRING> langs_not_to_load;
293  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
294 
295  sub_langs_.delete_data_pointers();
296  sub_langs_.clear();
297  // Find the first loadable lang and load into this.
298  // Add any languages that this language requires
299  bool loaded_primary = false;
300  // Load the rest into sub_langs_.
301  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
302  if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
303  const char *lang_str = langs_to_load[lang_index].string();
304  Tesseract *tess_to_init;
305  if (!loaded_primary) {
306  tess_to_init = this;
307  } else {
308  tess_to_init = new Tesseract;
309  }
310 
311  int result = tess_to_init->init_tesseract_internal(
312  arg0, textbase, lang_str, oem, configs, configs_size,
313  vars_vec, vars_values, set_only_non_debug_params);
314 
315  if (!loaded_primary) {
316  if (result < 0) {
317  tprintf("Failed loading language '%s'\n", lang_str);
318  } else {
320  tprintf("Loaded language '%s' as main language\n", lang_str);
321  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
322  &langs_to_load, &langs_not_to_load);
323  loaded_primary = true;
324  }
325  } else {
326  if (result < 0) {
327  tprintf("Failed loading language '%s'\n", lang_str);
328  delete tess_to_init;
329  } else {
331  tprintf("Loaded language '%s' as secondary language\n", lang_str);
332  sub_langs_.push_back(tess_to_init);
333  // Add any languages that this language requires
334  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
335  &langs_to_load, &langs_not_to_load);
336  }
337  }
338  }
339  }
340  if (!loaded_primary) {
341  tprintf("Tesseract couldn't load any languages!\n");
342  return -1; // Couldn't load any language!
343  }
344  if (!sub_langs_.empty()) {
345  // In multilingual mode word ratings have to be directly comparable,
346  // so use the same language model weights for all languages:
347  // use the primary language's params model if
348  // tessedit_use_primary_params_model is set,
349  // otherwise use default language model weights.
351  for (int s = 0; s < sub_langs_.size(); ++s) {
352  sub_langs_[s]->language_model_->getParamsModel().Copy(
354  }
355  tprintf("Using params model of the primary language\n");
358  }
359  } else {
361  for (int s = 0; s < sub_langs_.size(); ++s) {
362  sub_langs_[s]->language_model_->getParamsModel().Clear();
363  }
365  tprintf("Using default language params\n");
366  }
367  }
368 
370  return 0;
371 }
372 
373 // Common initialization for a single language.
374 // arg0 is the datapath for the tessdata directory, which could be the
375 // path of the tessdata directory with no trailing /, or (if tessdata
376 // lives in the same directory as the executable) the path of the executable,
377 // hence the name arg0.
378 // textbase is an optional output file basename (used only for training)
379 // language is the language code to load.
380 // oem controls which engine(s) will operate on the image
381 // configs (argv) is an array of config filenames to load variables from.
382 // May be NULL.
383 // configs_size (argc) is the number of elements in configs.
384 // vars_vec is an optional vector of variables to set.
385 // vars_values is an optional corresponding vector of values for the variables
386 // in vars_vec.
387 // If set_only_init_params is true, then only the initialization variables
388 // will be set.
390  const char *arg0, const char *textbase, const char *language,
391  OcrEngineMode oem, char **configs, int configs_size,
392  const GenericVector<STRING> *vars_vec,
393  const GenericVector<STRING> *vars_values,
394  bool set_only_non_debug_params) {
395  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
396  configs_size, vars_vec, vars_values,
397  set_only_non_debug_params)) {
398  return -1;
399  }
402  return 0;
403  }
404  // If only Cube will be used, skip loading Tesseract classifier's
405  // pre-trained templates.
406  bool init_tesseract_classifier =
409  // If only Cube will be used and if it has its own Unicharset,
410  // skip initializing permuter and loading Tesseract Dawgs.
411  bool init_dict =
414  program_editup(textbase, init_tesseract_classifier, init_dict);
416  return 0; //Normal exit
417 }
418 
419 // Helper builds the all_fonts table by adding new fonts from new_fonts.
420 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
421  UnicityTable<FontInfo>* all_fonts) {
422  for (int i = 0; i < new_fonts.size(); ++i) {
423  // UnicityTable uniques as we go.
424  all_fonts->push_back(new_fonts.get(i));
425  }
426 }
427 
428 // Helper assigns an id to lang_fonts using the index in all_fonts table.
429 static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
430  UnicityTable<FontInfo>* lang_fonts) {
431  for (int i = 0; i < lang_fonts->size(); ++i) {
432  int index = all_fonts.get_id(lang_fonts->get(i));
433  lang_fonts->get_mutable(i)->universal_id = index;
434  }
435 }
436 
437 // Set the universal_id member of each font to be unique among all
438 // instances of the same font loaded.
440  // Note that we can get away with bitwise copying FontInfo in
441  // all_fonts, as it is a temporary structure and we avoid setting the
442  // delete callback.
443  UnicityTable<FontInfo> all_fonts;
445 
446  // Create the universal ID table.
447  CollectFonts(get_fontinfo_table(), &all_fonts);
448  for (int i = 0; i < sub_langs_.size(); ++i) {
449  CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
450  }
451  // Assign ids from the table to each font table.
452  AssignIds(all_fonts, &get_fontinfo_table());
453  for (int i = 0; i < sub_langs_.size(); ++i) {
454  AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
455  }
456  font_table_size_ = all_fonts.size();
457 }
458 
459 // init the LM component
//
// Loads the language data for the given language by calling
// init_tesseract_lang_data with OEM_TESSERACT_ONLY, no config files
// (NULL, 0), no variable overrides (NULL, NULL) and the final flag false.
// arg0, textbase and language are passed straight through.
// Returns -1 if init_tesseract_lang_data reports failure, 0 otherwise.
// NOTE(review): the numbered listing jumps from 465 to 468, so the statements
// that run between the failure check and the final return are missing from
// this copy -- confirm against the upstream file.
460 int Tesseract::init_tesseract_lm(const char *arg0,
461  const char *textbase,
462  const char *language) {
463  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
464  NULL, 0, NULL, NULL, false))
465  return -1;
468  return 0;
469 }
470 
472  end_recog();
473 }
474 
475 /* Define command type identifiers */
476 
478 {
483 };
484 } // namespace tesseract
FILE * GetDataFilePtr() const
void set_compare_callback(TessResultCallback2< bool, T const &, T const & > *cb)
bool major_right_to_left() const
Definition: unicharset.cpp:931
int size() const
Definition: genericvector.h:72
static bool ReadParamsFromFp(FILE *fp, inT64 end_offset, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:66
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
SetParamConstraint
Definition: params.h:36
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
ETEXT_DESC * global_monitor
Definition: tessedit.cpp:45
UNICHARSET unicharset
Definition: ccutil.h:72
bool LoadFromFp(const char *lang, FILE *fp, inT64 end_offset)
char * tessedit_write_params_to_file
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:180
inT64 GetEndOffset(TessdataType tessdata_type) const
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346
inT32 length() const
Definition: strngs.cpp:188
TessdataManager tessdata_manager
Definition: ccutil.h:71
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:73
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:120
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:67
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:249
#define ASSERT_HOST(x)
Definition: errcode.h:84
T * get_mutable(int id)
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:186
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:53
STRING datadir
Definition: ccutil.h:67
LanguageModel * language_model_
Definition: wordrec.h:411
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:74
void truncate_at(inT32 index)
Definition: strngs.cpp:264
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:52
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:35
int push_back(T object)
Add an element in the table.
bool init_cube_objects(bool load_combiner, TessdataManager *tessdata_manager)
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:98
Dict & getDict()
Definition: classify.h:65
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:389
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:83
STRING language_data_path_prefix
Definition: ccutil.h:70
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
ParamsVectors * params()
Definition: ccutil.h:65
const T & get(int id) const
Return the object from an id.
void Load(DawgCache *dawg_cache)
Definition: dict.cpp:194
void program_editup(const char *textbase, bool init_classifier, bool init_permute)
Definition: tface.cpp:46
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:285
bool SeekToStart(TessdataType tessdata_type)
void main_setup(const char *argv0, const char *basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:53
int size() const
Return the size used.
ParamsModel & getParamsModel()
bool Init(const char *data_file_name, int debug_level)
void SetupUniversalFontIds()
Definition: tessedit.cpp:439
STRING lang
Definition: ccutil.h:69
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:41
Definition: strngs.h:44
void SetPass(PassEnum pass)
Definition: params_model.h:72
#define NULL
Definition: host.h:144
bool use_ambigs_for_adaption
Definition: ccutil.h:93
int size() const
Definition: unicharset.h:297
const char * string() const
Definition: strngs.cpp:193
int ambigs_debug_level
Definition: ccutil.h:89
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:423
int get_id(T object) const
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language)
Definition: tessedit.cpp:460