23# include "config_auto.h"
34#ifndef DISABLED_LEGACY_ENGINE
51 if ((fp = fopen(path.c_str(),
"rb")) !=
nullptr) {
55 path +=
"tessconfigs/";
57 if ((fp = fopen(path.c_str(),
"rb")) !=
nullptr) {
78 char **configs,
int configs_size,
79 const std::vector<std::string> *vars_vec,
80 const std::vector<std::string> *vars_values,
83 lang = !language.empty() ? language :
"eng";
91 tprintf(
"Error opening data file %s\n", tessdata_path.c_str());
93 "Please make sure the TESSDATA_PREFIX environment variable is set"
94 " to your \"tessdata\" directory.\n");
97#ifdef DISABLED_LEGACY_ENGINE
124 for (
int i = 0;
i < configs_size; ++
i) {
130 if (vars_vec !=
nullptr && vars_values !=
nullptr) {
131 for (
unsigned i = 0;
i < vars_vec->size(); ++
i) {
133 set_params_constraint, this->
params())) {
134 tprintf(
"Warning: The parameter '%s' was not found.\n", (*vars_vec)[
i].c_str());
139 if (!tessedit_write_params_to_file.empty()) {
140 FILE *params_file = fopen(tessedit_write_params_to_file.c_str(),
"wb");
141 if (params_file !=
nullptr) {
145 tprintf(
"Failed to open %s for writing params.\n", tessedit_write_params_to_file.c_str());
149#ifndef DISABLED_LEGACY_ENGINE
152 tessedit_ocr_engine_mode.set_value(oem);
158 if (tessedit_init_config_only) {
165#ifdef DISABLED_LEGACY_ENGINE
173 ASSERT_HOST(lstm_recognizer_->
Load(this->params(), lstm_use_matrix ? language :
"", mgr));
175 tprintf(
"Error: LSTM requested, but not present!! Loading tesseract.\n");
185#ifndef DISABLED_LEGACY_ENGINE
188 "Error: Tesseract (legacy) engine requested, but components are "
189 "not present in %s!!\n",
190 tessdata_path.c_str());
195 tprintf(
"Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
200#ifndef DISABLED_LEGACY_ENGINE
230static bool IsStrInList(
const std::string &str,
const std::vector<std::string> &str_list) {
231 for (
const auto &
i : str_list) {
245 std::vector<std::string> *not_to_load) {
246 std::string remains(lang_str);
250 size_t found =
lang.find_last_of(
'/');
251 if (found != std::string::npos) {
253 prefix =
lang.substr(0, found + 1);
255 while (!remains.empty()) {
257 const char *start = remains.c_str();
258 while (*start ==
'+') {
261 std::vector<std::string> *target = to_load;
263 target = not_to_load;
267 int end = strlen(start);
268 const char *plus = strchr(start,
'+');
269 if (plus !=
nullptr && plus - start < end) {
272 std::string lang_code(start);
273 lang_code.resize(end);
274 std::string
next(start + end);
276 lang_code = prefix + lang_code;
278 if (!IsStrInList(lang_code, *target)) {
279 target->push_back(lang_code);
289 const std::string &language,
OcrEngineMode oem,
char **configs,
290 int configs_size,
const std::vector<std::string> *vars_vec,
291 const std::vector<std::string> *vars_values,
293 std::vector<std::string> langs_to_load;
294 std::vector<std::string> langs_not_to_load;
297 for (
auto *
lang : sub_langs_) {
307 bool loaded_primary =
false;
311 for (
size_t lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
312 auto &lang_to_load = langs_to_load[lang_index];
313 if (!IsStrInList(lang_to_load, langs_not_to_load)) {
314 const char *lang_str = lang_to_load.c_str();
316 if (!loaded_primary) {
324 configs_size, vars_vec, vars_values,
325 set_only_non_debug_params, mgr);
329 if (!loaded_primary) {
331 tprintf(
"Failed loading language '%s'\n", lang_str);
335 loaded_primary =
true;
339 tprintf(
"Failed loading language '%s'\n", lang_str);
342 sub_langs_.push_back(tess_to_init);
350 if (!loaded_primary && !langs_to_load.empty()) {
351 tprintf(
"Tesseract couldn't load any languages!\n");
354#ifndef DISABLED_LEGACY_ENGINE
355 if (!sub_langs_.empty()) {
361 if (tessedit_use_primary_params_model) {
362 for (
auto &sub_lang : sub_langs_) {
363 sub_lang->language_model_->getParamsModel().Copy(this->
language_model_->getParamsModel());
365 tprintf(
"Using params model of the primary language\n");
368 for (
auto &sub_lang : sub_langs_) {
369 sub_lang->language_model_->getParamsModel().Clear();
397 char **configs,
int configs_size,
398 const std::vector<std::string> *vars_vec,
399 const std::vector<std::string> *vars_values,
402 vars_values, set_only_non_debug_params, mgr)) {
405 if (tessedit_init_config_only) {
415#ifndef DISABLED_LEGACY_ENGINE
420 for (
int i = 0;
i < new_fonts.
size(); ++
i) {
428 for (
int i = 0;
i < lang_fonts->size(); ++
i) {
429 auto index = all_fonts.get_index(lang_fonts->at(
i));
430 lang_fonts->at(
i).universal_id = index;
444 for (
auto &sub_lang : sub_langs_) {
445 CollectFonts(sub_lang->get_fontinfo_table(), &all_fonts);
449 for (
auto &sub_lang : sub_langs_) {
450 AssignIds(all_fonts, &sub_lang->get_fontinfo_table());
452 font_table_size_ = all_fonts.
size();
@ OEM_TESSERACT_LSTM_COMBINED
@ SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
@ SET_PARAM_CONSTRAINT_NONE
void tprintf(const char *format,...)
int init_tesseract(const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
void ParseLanguageString(const std::string &lang_str, std::vector< std::string > *to_load, std::vector< std::string > *not_to_load)
void SetupUniversalFontIds()
void read_config_file(const char *filename, SetParamConstraint constraint)
int init_tesseract_internal(const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
bool init_tesseract_lang_data(const std::string &arg0, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
int size() const
Return the size used.
int push_back(T object)
Add an element in the table.
const T & at(int id) const
Return the object from an id.
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
std::string language_data_path_prefix
UnicharAmbigs unichar_ambigs
void main_setup(const std::string &argv0, const std::string &basename)
CCUtil::main_setup - set location of tessdata and name of image.
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
bool IsLSTMAvailable() const
bool GetComponent(TessdataType type, TFile *fp)
bool IsBaseAvailable() const
bool IsComponentAvailable(TessdataType type) const
bool Init(const char *data_file_name)
bool load_from_file(const char *const filename, bool skip_fragments)
bool major_right_to_left() const
void CopyFrom(const UNICHARSET &src)
UnicityTable< FontInfo > & get_fontinfo_table()
bool Load(const ParamsVectors *params, const std::string &lang, TessdataManager *mgr)
const UNICHARSET & GetUnicharset() const
void program_editup(const std::string &textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
std::unique_ptr< LanguageModel > language_model_