tesseract v5.3.3.20231005
baseapi.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: baseapi.cpp
3 * Description: Simple API for calling tesseract.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 2006, Google Inc.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#define _USE_MATH_DEFINES // for M_PI
20
21// Include automatically generated configuration file if running autoconf.
22#ifdef HAVE_CONFIG_H
23# include "config_auto.h"
24#endif
25
26#include "boxword.h" // for BoxWord
27#include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST
28#include "dawg_cache.h" // for DawgCache
29#include "dict.h" // for Dict
30#include "elst.h" // for ELIST_ITERATOR, ELISTIZE, ELISTIZEH
31#include "environ.h" // for l_uint8
32#ifndef DISABLED_LEGACY_ENGINE
33#include "equationdetect.h" // for EquationDetect, destructor of equ_detect_
34#endif // ndef DISABLED_LEGACY_ENGINE
35#include "errcode.h" // for ASSERT_HOST
36#include "helpers.h" // for IntCastRounded, chomp_string
37#include "host.h" // for MAX_PATH
38#include "imageio.h" // for IFF_TIFF_G4, IFF_TIFF, IFF_TIFF_G3, ...
39#ifndef DISABLED_LEGACY_ENGINE
40# include "intfx.h" // for INT_FX_RESULT_STRUCT
41#endif
42#include "mutableiterator.h" // for MutableIterator
43#include "normalis.h" // for kBlnBaselineOffset, kBlnXHeight
44#if defined(USE_OPENCL)
45# include "openclwrapper.h" // for OpenclDevice
46#endif
47#include "pageres.h" // for PAGE_RES_IT, WERD_RES, PAGE_RES, CR_DE...
48#include "paragraphs.h" // for DetectParagraphs
49#include "params.h" // for BoolParam, IntParam, DoubleParam, Stri...
50#include "pdblock.h" // for PDBLK
51#include "points.h" // for FCOORD
52#include "polyblk.h" // for POLY_BLOCK
53#include "rect.h" // for TBOX
54#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
55#include "tessdatamanager.h" // for TessdataManager, kTrainedDataSuffix
56#include "tesseractclass.h" // for Tesseract
57#include "tprintf.h" // for tprintf
58#include "werd.h" // for WERD, WERD_IT, W_FUZZY_NON, W_FUZZY_SP
59#include "thresholder.h" // for ImageThresholder
60
61#include <tesseract/baseapi.h>
62#include <tesseract/ocrclass.h> // for ETEXT_DESC
63#include <tesseract/osdetect.h> // for OSResults, OSBestResult, OrientationId...
64#include <tesseract/renderer.h> // for TessResultRenderer
65#include <tesseract/resultiterator.h> // for ResultIterator
66
67#include <cmath> // for round, M_PI
68#include <cstdint> // for int32_t
69#include <cstring> // for strcmp, strcpy
70#include <fstream> // for size_t
71#include <iostream> // for std::cin
72#include <locale> // for std::locale::classic
73#include <memory> // for std::unique_ptr
74#include <set> // for std::pair
75#include <sstream> // for std::stringstream
76#include <vector> // for std::vector
77
78#include <allheaders.h> // for pixDestroy, boxCreate, boxaAddBox, box...
79#ifdef HAVE_LIBCURL
80# include <curl/curl.h>
81#endif
82
83#ifdef __linux__
84# include <csignal> // for sigaction, SA_RESETHAND, SIGBUS, SIGFPE
85#endif
86
87#if defined(_WIN32)
88# include <fcntl.h>
89# include <io.h>
90#else
91# include <dirent.h> // for closedir, opendir, readdir, DIR, dirent
92# include <libgen.h>
93# include <sys/stat.h> // for stat, S_IFDIR
94# include <sys/types.h>
95# include <unistd.h>
96#endif // _WIN32
97
98namespace tesseract {
99
100static BOOL_VAR(stream_filelist, false, "Stream a filelist from stdin");
101static STRING_VAR(document_title, "", "Title of output document (used for hOCR and PDF output)");
102#ifdef HAVE_LIBCURL
103static INT_VAR(curl_timeout, 0, "Timeout for curl in seconds");
104#endif
105
// Minimum width and height of a rectangle that TesseractRect will process;
// anything smaller is rejected as not worth doing.
constexpr int kMinRectSize = 10;
// Reject marker character.
// NOTE(review): the use site is not visible in this part of the file;
// presumably it marks rejected characters in Tesseract text output - confirm.
constexpr char kTesseractReject = '~';
// Reject marker character, presumably for UNLV-format output - TODO confirm.
constexpr char kUNLVReject = '~';
// Suspect marker character, presumably for UNLV-format output - TODO confirm.
constexpr char kUNLVSuspect = '^';
// Name of a variables file.
// NOTE(review): the use site is not visible here. The pointer itself is const
// so that the constant cannot be re-pointed by accident.
static const char *const kOldVarsFile = "failed_vars.txt";
118
119#ifndef DISABLED_LEGACY_ENGINE
124static const char *kInputFile = "noname.tif";
125static const char kUnknownFontName[] = "UnknownFont";
126
127static STRING_VAR(classify_font_name, kUnknownFontName,
128 "Default font name to be used in training");
129
130// Finds the name of the training font and returns it in fontname, by cutting
131// it out based on the expectation that the filename is of the form:
132// /path/to/dir/[lang].[fontname].exp[num]
133// The [lang], [fontname] and [num] fields should not have '.' characters.
134// If the global parameter classify_font_name is set, its value is used instead.
135static void ExtractFontName(const char* filename, std::string* fontname) {
136 *fontname = classify_font_name;
137 if (*fontname == kUnknownFontName) {
138 // filename is expected to be of the form [lang].[fontname].exp[num]
139 // The [lang], [fontname] and [num] fields should not have '.' characters.
140 const char *basename = strrchr(filename, '/');
141 const char *firstdot = strchr(basename ? basename : filename, '.');
142 const char *lastdot = strrchr(filename, '.');
143 if (firstdot != lastdot && firstdot != nullptr && lastdot != nullptr) {
144 ++firstdot;
145 *fontname = firstdot;
146 fontname->resize(lastdot - firstdot);
147 }
148 }
149}
150#endif
151
152/* Add all available languages recursively.
153 */
154static void addAvailableLanguages(const std::string &datadir, const std::string &base,
155 std::vector<std::string> *langs) {
156 auto base2 = base;
157 if (!base2.empty()) {
158 base2 += "/";
159 }
160 const size_t extlen = sizeof(kTrainedDataSuffix);
161#ifdef _WIN32
162 WIN32_FIND_DATA data;
163 HANDLE handle = FindFirstFile((datadir + base2 + "*").c_str(), &data);
164 if (handle != INVALID_HANDLE_VALUE) {
165 BOOL result = TRUE;
166 for (; result;) {
167 char *name = data.cFileName;
168 // Skip '.', '..', and hidden files
169 if (name[0] != '.') {
170 if ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) == FILE_ATTRIBUTE_DIRECTORY) {
171 addAvailableLanguages(datadir, base2 + name, langs);
172 } else {
173 size_t len = strlen(name);
174 if (len > extlen && name[len - extlen] == '.' &&
175 strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
176 name[len - extlen] = '\0';
177 langs->push_back(base2 + name);
178 }
179 }
180 }
181 result = FindNextFile(handle, &data);
182 }
183 FindClose(handle);
184 }
185#else // _WIN32
186 DIR *dir = opendir((datadir + base).c_str());
187 if (dir != nullptr) {
188 dirent *de;
189 while ((de = readdir(dir))) {
190 char *name = de->d_name;
191 // Skip '.', '..', and hidden files
192 if (name[0] != '.') {
193 struct stat st;
194 if (stat((datadir + base2 + name).c_str(), &st) == 0 && (st.st_mode & S_IFDIR) == S_IFDIR) {
195 addAvailableLanguages(datadir, base2 + name, langs);
196 } else {
197 size_t len = strlen(name);
198 if (len > extlen && name[len - extlen] == '.' &&
199 strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
200 name[len - extlen] = '\0';
201 langs->push_back(base2 + name);
202 }
203 }
204 }
205 }
206 closedir(dir);
207 }
208#endif
209}
210
212 : tesseract_(nullptr)
213 , osd_tesseract_(nullptr)
214 , equ_detect_(nullptr)
215 , reader_(nullptr)
216 ,
217 // thresholder_ is initialized to nullptr here, but will be set before use
218 // by: A constructor of a derived API or created
219 // implicitly when used in InternalSetImage.
220 thresholder_(nullptr)
221 , paragraph_models_(nullptr)
222 , block_list_(nullptr)
223 , page_res_(nullptr)
224 , last_oem_requested_(OEM_DEFAULT)
225 , recognition_done_(false)
226 , rect_left_(0)
227 , rect_top_(0)
228 , rect_width_(0)
229 , rect_height_(0)
230 , image_width_(0)
231 , image_height_(0) {
232}
233
235 End();
236}
237
241const char *TessBaseAPI::Version() {
242 return TESSERACT_VERSION_STR;
243}
244
252size_t TessBaseAPI::getOpenCLDevice(void **data) {
253#ifdef USE_OPENCL
254 ds_device device = OpenclDevice::getDeviceSelection();
255 if (device.type == DS_DEVICE_OPENCL_DEVICE) {
256 *data = new cl_device_id;
257 memcpy(*data, &device.oclDeviceID, sizeof(cl_device_id));
258 return sizeof(cl_device_id);
259 }
260#endif
261
262 *data = nullptr;
263 return 0;
264}
265
270void TessBaseAPI::SetInputName(const char *name) {
271 input_file_ = name ? name : "";
272}
273
275void TessBaseAPI::SetOutputName(const char *name) {
276 output_file_ = name ? name : "";
277}
278
279bool TessBaseAPI::SetVariable(const char *name, const char *value) {
280 if (tesseract_ == nullptr) {
281 tesseract_ = new Tesseract;
282 }
284 tesseract_->params());
285}
286
287bool TessBaseAPI::SetDebugVariable(const char *name, const char *value) {
288 if (tesseract_ == nullptr) {
289 tesseract_ = new Tesseract;
290 }
292}
293
294bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
295 auto *p = ParamUtils::FindParam<IntParam>(name, GlobalParams()->int_params,
297 if (p == nullptr) {
298 return false;
299 }
300 *value = (int32_t)(*p);
301 return true;
302}
303
304bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
305 auto *p = ParamUtils::FindParam<BoolParam>(name, GlobalParams()->bool_params,
307 if (p == nullptr) {
308 return false;
309 }
310 *value = bool(*p);
311 return true;
312}
313
314const char *TessBaseAPI::GetStringVariable(const char *name) const {
315 auto *p = ParamUtils::FindParam<StringParam>(name, GlobalParams()->string_params,
317 return (p != nullptr) ? p->c_str() : nullptr;
318}
319
320bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
321 auto *p = ParamUtils::FindParam<DoubleParam>(name, GlobalParams()->double_params,
323 if (p == nullptr) {
324 return false;
325 }
326 *value = (double)(*p);
327 return true;
328}
329
331bool TessBaseAPI::GetVariableAsString(const char *name, std::string *val) const {
332 return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
333}
334
335#ifndef DISABLED_LEGACY_ENGINE
336
338void TessBaseAPI::PrintFontsTable(FILE *fp) const {
339 const int fontinfo_size = tesseract_->get_fontinfo_table().size();
340 for (int font_index = 1; font_index < fontinfo_size; ++font_index) {
341 FontInfo font = tesseract_->get_fontinfo_table().at(font_index);
342 fprintf(fp, "ID=%3d: %s is_italic=%s is_bold=%s"
343 " is_fixed_pitch=%s is_serif=%s is_fraktur=%s\n",
344 font_index, font.name,
345 font.is_italic() ? "true" : "false",
346 font.is_bold() ? "true" : "false",
347 font.is_fixed_pitch() ? "true" : "false",
348 font.is_serif() ? "true" : "false",
349 font.is_fraktur() ? "true" : "false");
350 }
351}
352
353#endif
354
356void TessBaseAPI::PrintVariables(FILE *fp) const {
358}
359
368int TessBaseAPI::Init(const char *datapath, const char *language, OcrEngineMode oem, char **configs,
369 int configs_size, const std::vector<std::string> *vars_vec,
370 const std::vector<std::string> *vars_values, bool set_only_non_debug_params) {
371 return Init(datapath, 0, language, oem, configs, configs_size, vars_vec, vars_values,
372 set_only_non_debug_params, nullptr);
373}
374
375// In-memory version reads the traineddata file directly from the given
376// data[data_size] array. Also implements the version with a datapath in data,
377// flagged by data_size = 0.
378int TessBaseAPI::Init(const char *data, int data_size, const char *language, OcrEngineMode oem,
379 char **configs, int configs_size, const std::vector<std::string> *vars_vec,
380 const std::vector<std::string> *vars_values, bool set_only_non_debug_params,
381 FileReader reader) {
382 if (language == nullptr) {
383 language = "";
384 }
385 if (data == nullptr) {
386 data = "";
387 }
388 std::string datapath = data_size == 0 ? data : language;
389 // If the datapath, OcrEngineMode or the language have changed - start again.
390 // Note that the language_ field stores the last requested language that was
391 // initialized successfully, while tesseract_->lang stores the language
392 // actually used. They differ only if the requested language was nullptr, in
393 // which case tesseract_->lang is set to the Tesseract default ("eng").
394 if (tesseract_ != nullptr &&
395 (datapath_.empty() || language_.empty() || datapath_ != datapath ||
396 last_oem_requested_ != oem || (language_ != language && tesseract_->lang != language))) {
397 delete tesseract_;
398 tesseract_ = nullptr;
399 }
400#ifdef USE_OPENCL
401 OpenclDevice od;
402 od.InitEnv();
403#endif
404 bool reset_classifier = true;
405 if (tesseract_ == nullptr) {
406 reset_classifier = false;
407 tesseract_ = new Tesseract;
408 if (reader != nullptr) {
409 reader_ = reader;
410 }
412 if (data_size != 0) {
413 mgr.LoadMemBuffer(language, data, data_size);
414 }
415 if (tesseract_->init_tesseract(datapath, output_file_, language, oem, configs,
416 configs_size, vars_vec, vars_values, set_only_non_debug_params,
417 &mgr) != 0) {
418 return -1;
419 }
420 }
421
422 // Update datapath and language requested for the last valid initialization.
423 datapath_ = datapath;
424 if (datapath_.empty() && !tesseract_->datadir.empty()) {
426 }
427
428 language_ = language;
430
431#ifndef DISABLED_LEGACY_ENGINE
432 // For same language and datapath, just reset the adaptive classifier.
433 if (reset_classifier) {
435 }
436#endif // ndef DISABLED_LEGACY_ENGINE
437 return 0;
438}
439
449 return language_.c_str();
450}
451
457void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector<std::string> *langs) const {
458 langs->clear();
459 if (tesseract_ != nullptr) {
460 langs->push_back(tesseract_->lang);
461 int num_subs = tesseract_->num_sub_langs();
462 for (int i = 0; i < num_subs; ++i) {
463 langs->push_back(tesseract_->get_sub_lang(i)->lang);
464 }
465 }
466}
467
471void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const {
472 langs->clear();
473 if (tesseract_ != nullptr) {
474 addAvailableLanguages(tesseract_->datadir, "", langs);
475 std::sort(langs->begin(), langs->end());
476 }
477}
478
484 if (tesseract_ == nullptr) {
485 tesseract_ = new Tesseract;
486#ifndef DISABLED_LEGACY_ENGINE
488#endif
489 }
490}
491
497void TessBaseAPI::ReadConfigFile(const char *filename) {
499}
500
502void TessBaseAPI::ReadDebugConfigFile(const char *filename) {
504}
505
512 if (tesseract_ == nullptr) {
513 tesseract_ = new Tesseract;
514 }
515 tesseract_->tessedit_pageseg_mode.set_value(mode);
516}
517
520 if (tesseract_ == nullptr) {
521 return PSM_SINGLE_BLOCK;
522 }
523 return static_cast<PageSegMode>(static_cast<int>(tesseract_->tessedit_pageseg_mode));
524}
525
539char *TessBaseAPI::TesseractRect(const unsigned char *imagedata, int bytes_per_pixel,
540 int bytes_per_line, int left, int top, int width, int height) {
541 if (tesseract_ == nullptr || width < kMinRectSize || height < kMinRectSize) {
542 return nullptr; // Nothing worth doing.
543 }
544
545 // Since this original api didn't give the exact size of the image,
546 // we have to invent a reasonable value.
547 int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
548 SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top, bytes_per_pixel,
549 bytes_per_line);
550 SetRectangle(left, top, width, height);
551
552 return GetUTF8Text();
553}
554
555#ifndef DISABLED_LEGACY_ENGINE
561 if (tesseract_ == nullptr) {
562 return;
563 }
566}
567#endif // ndef DISABLED_LEGACY_ENGINE
568
576void TessBaseAPI::SetImage(const unsigned char *imagedata, int width, int height,
577 int bytes_per_pixel, int bytes_per_line) {
578 if (InternalSetImage()) {
579 thresholder_->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line);
581 }
582}
583
585 if (thresholder_) {
587 } else {
588 tprintf("Please call SetImage before SetSourceResolution.\n");
589 }
590}
591
600void TessBaseAPI::SetImage(Pix *pix) {
601 if (InternalSetImage()) {
602 if (pixGetSpp(pix) == 4 && pixGetInputFormat(pix) == IFF_PNG) {
603 // remove alpha channel from png
604 Pix *p1 = pixRemoveAlpha(pix);
605 pixSetSpp(p1, 3);
606 (void)pixCopy(pix, p1);
607 pixDestroy(&p1);
608 }
611 }
612}
613
619void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
620 if (thresholder_ == nullptr) {
621 return;
622 }
623 thresholder_->SetRectangle(left, top, width, height);
624 ClearResults();
625}
626
632 if (tesseract_ == nullptr || thresholder_ == nullptr) {
633 return nullptr;
634 }
635 if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
636 return nullptr;
637 }
638 return tesseract_->pix_binary().clone();
639}
640
646Boxa *TessBaseAPI::GetRegions(Pixa **pixa) {
647 return GetComponentImages(RIL_BLOCK, false, pixa, nullptr);
648}
649
658Boxa *TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa,
659 int **blockids, int **paraids) {
660 return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding, pixa, blockids, paraids);
661}
662
671Boxa *TessBaseAPI::GetStrips(Pixa **pixa, int **blockids) {
672 return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
673}
674
680Boxa *TessBaseAPI::GetWords(Pixa **pixa) {
681 return GetComponentImages(RIL_WORD, true, pixa, nullptr);
682}
683
691 return GetComponentImages(RIL_SYMBOL, true, pixa, nullptr);
692}
693
702Boxa *TessBaseAPI::GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image,
703 const int raw_padding, Pixa **pixa, int **blockids,
704 int **paraids) {
705 /*non-const*/ std::unique_ptr</*non-const*/ PageIterator> page_it(GetIterator());
706 if (page_it == nullptr) {
707 page_it.reset(AnalyseLayout());
708 }
709 if (page_it == nullptr) {
710 return nullptr; // Failed.
711 }
712
713 // Count the components to get a size for the arrays.
714 int component_count = 0;
715 int left, top, right, bottom;
716
717 if (raw_image) {
718 // Get bounding box in original raw image with padding.
719 do {
720 if (page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom) &&
721 (!text_only || PTIsTextType(page_it->BlockType()))) {
722 ++component_count;
723 }
724 } while (page_it->Next(level));
725 } else {
726 // Get bounding box from binarized imaged. Note that this could be
727 // differently scaled from the original image.
728 do {
729 if (page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom) &&
730 (!text_only || PTIsTextType(page_it->BlockType()))) {
731 ++component_count;
732 }
733 } while (page_it->Next(level));
734 }
735
736 Boxa *boxa = boxaCreate(component_count);
737 if (pixa != nullptr) {
738 *pixa = pixaCreate(component_count);
739 }
740 if (blockids != nullptr) {
741 *blockids = new int[component_count];
742 }
743 if (paraids != nullptr) {
744 *paraids = new int[component_count];
745 }
746
747 int blockid = 0;
748 int paraid = 0;
749 int component_index = 0;
750 page_it->Begin();
751 do {
752 bool got_bounding_box;
753 if (raw_image) {
754 got_bounding_box = page_it->BoundingBox(level, raw_padding, &left, &top, &right, &bottom);
755 } else {
756 got_bounding_box = page_it->BoundingBoxInternal(level, &left, &top, &right, &bottom);
757 }
758 if (got_bounding_box && (!text_only || PTIsTextType(page_it->BlockType()))) {
759 Box *lbox = boxCreate(left, top, right - left, bottom - top);
760 boxaAddBox(boxa, lbox, L_INSERT);
761 if (pixa != nullptr) {
762 Pix *pix = nullptr;
763 if (raw_image) {
764 pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left, &top);
765 } else {
766 pix = page_it->GetBinaryImage(level);
767 }
768 pixaAddPix(*pixa, pix, L_INSERT);
769 pixaAddBox(*pixa, lbox, L_CLONE);
770 }
771 if (paraids != nullptr) {
772 (*paraids)[component_index] = paraid;
773 if (page_it->IsAtFinalElement(RIL_PARA, level)) {
774 ++paraid;
775 }
776 }
777 if (blockids != nullptr) {
778 (*blockids)[component_index] = blockid;
779 if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
780 ++blockid;
781 paraid = 0;
782 }
783 }
784 ++component_index;
785 }
786 } while (page_it->Next(level));
787 return boxa;
788}
789
791 if (thresholder_ == nullptr) {
792 return 0;
793 }
795}
796
813 return AnalyseLayout(false);
814}
815
816PageIterator *TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
817 if (FindLines() == 0) {
818 if (block_list_->empty()) {
819 return nullptr; // The page was empty.
820 }
821 page_res_ = new PAGE_RES(merge_similar_words, block_list_, nullptr);
822 DetectParagraphs(false);
826 }
827 return nullptr;
828}
829
835 if (tesseract_ == nullptr) {
836 return -1;
837 }
838 if (FindLines() != 0) {
839 return -1;
840 }
841 delete page_res_;
842 if (block_list_->empty()) {
844 return 0; // Empty page.
845 }
846
848 recognition_done_ = true;
849#ifndef DISABLED_LEGACY_ENGINE
850 if (tesseract_->tessedit_resegment_from_line_boxes) {
852 } else if (tesseract_->tessedit_resegment_from_boxes) {
854 } else
855#endif // ndef DISABLED_LEGACY_ENGINE
856 {
857 page_res_ =
859 }
860
861 if (page_res_ == nullptr) {
862 return -1;
863 }
864
865 if (tesseract_->tessedit_train_line_recognizer) {
867 return -1;
868 }
870 return 0;
871 }
872#ifndef DISABLED_LEGACY_ENGINE
873 if (tesseract_->tessedit_make_boxes_from_boxes) {
875 return 0;
876 }
877#endif // ndef DISABLED_LEGACY_ENGINE
878
879 int result = 0;
880 if (tesseract_->interactive_display_mode) {
881#ifndef GRAPHICS_DISABLED
883#endif // !GRAPHICS_DISABLED
884 // The page_res is invalid after an interactive session, so cleanup
885 // in a way that lets us continue to the next page without crashing.
886 delete page_res_;
887 page_res_ = nullptr;
888 return -1;
889#ifndef DISABLED_LEGACY_ENGINE
890 } else if (tesseract_->tessedit_train_from_boxes) {
891 std::string fontname;
892 ExtractFontName(output_file_.c_str(), &fontname);
894 } else if (tesseract_->tessedit_ambigs_training) {
895 FILE *training_output_file = tesseract_->init_recog_training(input_file_.c_str());
896 // OCR the page segmented into words by tesseract.
898 training_output_file);
899 fclose(training_output_file);
900#endif // ndef DISABLED_LEGACY_ENGINE
901 } else {
902 // Now run the main recognition.
903 bool wait_for_text = true;
904 GetBoolVariable("paragraph_text_based", &wait_for_text);
905 if (!wait_for_text) {
906 DetectParagraphs(false);
907 }
908 if (tesseract_->recog_all_words(page_res_, monitor, nullptr, nullptr, 0)) {
909 if (wait_for_text) {
910 DetectParagraphs(true);
911 }
912 } else {
913 result = -1;
914 }
915 }
916 return result;
917}
918
919// Takes ownership of the input pix.
922}
923
925 return tesseract_->pix_original();
926}
927
929 if (!input_file_.empty()) {
930 return input_file_.c_str();
931 }
932 return nullptr;
933}
934
936 return tesseract_->datadir.c_str();
937}
938
940 if (thresholder_ == nullptr)
941 return -1;
943}
944
945// If flist exists, get data from there. Otherwise get data from buf.
946// Seems convoluted, but is the easiest way I know of to meet multiple
947// goals. Support streaming from stdin, and also work on platforms
948// lacking fmemopen.
949// TODO: check different logic for flist/buf and simplify.
950bool TessBaseAPI::ProcessPagesFileList(FILE *flist, std::string *buf, const char *retry_config,
951 int timeout_millisec, TessResultRenderer *renderer,
952 int tessedit_page_number) {
953 if (!flist && !buf) {
954 return false;
955 }
956 unsigned page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
957 char pagename[MAX_PATH];
958
959 std::vector<std::string> lines;
960 if (!flist) {
961 std::string line;
962 for (const auto ch : *buf) {
963 if (ch == '\n') {
964 lines.push_back(line);
965 line.clear();
966 } else {
967 line.push_back(ch);
968 }
969 }
970 if (!line.empty()) {
971 // Add last line without terminating LF.
972 lines.push_back(line);
973 }
974 if (lines.empty()) {
975 return false;
976 }
977 }
978
979 // Skip to the requested page number.
980 for (unsigned i = 0; i < page; i++) {
981 if (flist) {
982 if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
983 break;
984 }
985 }
986 }
987
988 // Begin producing output
989 if (renderer && !renderer->BeginDocument(document_title.c_str())) {
990 return false;
991 }
992
993 // Loop over all pages - or just the requested one
994 while (true) {
995 if (flist) {
996 if (fgets(pagename, sizeof(pagename), flist) == nullptr) {
997 break;
998 }
999 } else {
1000 if (page >= lines.size()) {
1001 break;
1002 }
1003 snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str());
1004 }
1005 chomp_string(pagename);
1006 Pix *pix = pixRead(pagename);
1007 if (pix == nullptr) {
1008 tprintf("Image file %s cannot be read!\n", pagename);
1009 return false;
1010 }
1011 tprintf("Page %u : %s\n", page, pagename);
1012 bool r = ProcessPage(pix, page, pagename, retry_config, timeout_millisec, renderer);
1013 pixDestroy(&pix);
1014 if (!r) {
1015 return false;
1016 }
1017 if (tessedit_page_number >= 0) {
1018 break;
1019 }
1020 ++page;
1021 }
1022
1023 // Finish producing output
1024 if (renderer && !renderer->EndDocument()) {
1025 return false;
1026 }
1027 return true;
1028}
1029
1030bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, size_t size, const char *filename,
1031 const char *retry_config, int timeout_millisec,
1032 TessResultRenderer *renderer,
1033 int tessedit_page_number) {
1034 Pix *pix = nullptr;
1035 int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
1036 size_t offset = 0;
1037 for (;; ++page) {
1038 if (tessedit_page_number >= 0) {
1039 page = tessedit_page_number;
1040 pix = (data) ? pixReadMemTiff(data, size, page) : pixReadTiff(filename, page);
1041 } else {
1042 pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset)
1043 : pixReadFromMultipageTiff(filename, &offset);
1044 }
1045 if (pix == nullptr) {
1046 break;
1047 }
1048 if (offset || page > 0) {
1049 // Only print page number for multipage TIFF file.
1050 tprintf("Page %d\n", page + 1);
1051 }
1052 auto page_string = std::to_string(page);
1053 SetVariable("applybox_page", page_string.c_str());
1054 bool r = ProcessPage(pix, page, filename, retry_config, timeout_millisec, renderer);
1055 pixDestroy(&pix);
1056 if (!r) {
1057 return false;
1058 }
1059 if (tessedit_page_number >= 0) {
1060 break;
1061 }
1062 if (!offset) {
1063 break;
1064 }
1065 }
1066 return true;
1067}
1068
1069// Master ProcessPages calls ProcessPagesInternal and then does any post-
1070// processing required due to being in a training mode.
1071bool TessBaseAPI::ProcessPages(const char *filename, const char *retry_config, int timeout_millisec,
1072 TessResultRenderer *renderer) {
1073 bool result = ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
1074#ifndef DISABLED_LEGACY_ENGINE
1075 if (result) {
1076 if (tesseract_->tessedit_train_from_boxes && !tesseract_->WriteTRFile(output_file_.c_str())) {
1077 tprintf("Write of TR file failed: %s\n", output_file_.c_str());
1078 return false;
1079 }
1080 }
1081#endif // ndef DISABLED_LEGACY_ENGINE
1082 return result;
1083}
1084
1085#ifdef HAVE_LIBCURL
// Write callback with the signature expected for CURLOPT_WRITEFUNCTION.
// Appends the size * nmemb bytes at contents to the std::string that userp
// points to.
// Returns the number of bytes appended. If appending throws (for example
// because memory is exhausted), 0 is returned instead, so that the exception
// does not propagate through the C code that invoked the callback; a return
// value that differs from the number of bytes offered reports a write error.
static size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp) {
  const size_t total = size * nmemb;
  auto *buf = static_cast<std::string *>(userp);
  try {
    buf->append(static_cast<const char *>(contents), total);
  } catch (...) {
    // Deliberately swallowed: see the function comment.
    return 0;
  }
  return total;
}
1092#endif
1093
1094// In the ideal scenario, Tesseract will start working on data as soon
1095// as it can. For example, if you stream a filelist through stdin, we
1096// should start the OCR process as soon as the first filename is
1097// available. This is particularly useful when hooking Tesseract up to
1098// slow hardware such as a book scanning machine.
1099//
1100// Unfortunately there are tradeoffs. You can't seek on stdin. That
1101// makes automatic detection of datatype (TIFF? filelist? PNG?)
1102// impractical. So we support a command line flag to explicitly
1103// identify the scenario that really matters: filelists on
1104// stdin. We'll still do our best if the user likes pipes.
1105bool TessBaseAPI::ProcessPagesInternal(const char *filename, const char *retry_config,
1106 int timeout_millisec, TessResultRenderer *renderer) {
1107 bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
1108 if (stdInput) {
1109#ifdef WIN32
1110 if (_setmode(_fileno(stdin), _O_BINARY) == -1)
1111 tprintf("ERROR: cin to binary: %s", strerror(errno));
1112#endif // WIN32
1113 }
1114
1115 if (stream_filelist) {
1116 return ProcessPagesFileList(stdin, nullptr, retry_config, timeout_millisec, renderer,
1117 tesseract_->tessedit_page_number);
1118 }
1119
1120 // At this point we are officially in autodection territory.
1121 // That means any data in stdin must be buffered, to make it
1122 // seekable.
1123 std::string buf;
1124 const l_uint8 *data = nullptr;
1125 if (stdInput) {
1126 buf.assign((std::istreambuf_iterator<char>(std::cin)), (std::istreambuf_iterator<char>()));
1127 data = reinterpret_cast<const l_uint8 *>(buf.data());
1128 } else if (strstr(filename, "://") != nullptr) {
1129 // Get image or image list by URL.
1130#ifdef HAVE_LIBCURL
1131 CURL *curl = curl_easy_init();
1132 if (curl == nullptr) {
1133 fprintf(stderr, "Error, curl_easy_init failed\n");
1134 return false;
1135 } else {
1136 CURLcode curlcode;
1137 auto error = [curl, &curlcode](const char *function) {
1138 fprintf(stderr, "Error, %s failed with error %s\n", function, curl_easy_strerror(curlcode));
1139 curl_easy_cleanup(curl);
1140 return false;
1141 };
1142 curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename);
1143 if (curlcode != CURLE_OK) {
1144 return error("curl_easy_setopt");
1145 }
1146 // Follow HTTP, HTTPS, FTP and FTPS redirects.
1147 curlcode = curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
1148 if (curlcode != CURLE_OK) {
1149 return error("curl_easy_setopt");
1150 }
1151 // Allow no more than 8 redirections to prevent endless loops.
1152 curlcode = curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 8);
1153 if (curlcode != CURLE_OK) {
1154 return error("curl_easy_setopt");
1155 }
1156 int timeout = curl_timeout;
1157 if (timeout > 0) {
1158 curlcode = curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);
1159 if (curlcode != CURLE_OK) {
1160 return error("curl_easy_setopt");
1161 }
1162 curlcode = curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
1163 if (curlcode != CURLE_OK) {
1164 return error("curl_easy_setopt");
1165 }
1166 }
1167 curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
1168 if (curlcode != CURLE_OK) {
1169 return error("curl_easy_setopt");
1170 }
1171 curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
1172 if (curlcode != CURLE_OK) {
1173 return error("curl_easy_setopt");
1174 }
1175 curlcode = curl_easy_perform(curl);
1176 if (curlcode != CURLE_OK) {
1177 return error("curl_easy_perform");
1178 }
1179 curl_easy_cleanup(curl);
1180 data = reinterpret_cast<const l_uint8 *>(buf.data());
1181 }
1182#else
1183 fprintf(stderr, "Error, this tesseract has no URL support\n");
1184 return false;
1185#endif
1186 } else {
1187 // Check whether the input file can be read.
1188 if (FILE *file = fopen(filename, "rb")) {
1189 fclose(file);
1190 } else {
1191 fprintf(stderr, "Error, cannot read input file %s: %s\n", filename, strerror(errno));
1192 return false;
1193 }
1194 }
1195
1196 // Here is our autodetection
1197 int format;
1198 int r =
1199 (data != nullptr) ? findFileFormatBuffer(data, &format) : findFileFormat(filename, &format);
1200
1201 // Maybe we have a filelist
1202 if (r != 0 || format == IFF_UNKNOWN) {
1203 std::string s;
1204 if (data != nullptr) {
1205 s = buf.c_str();
1206 } else {
1207 std::ifstream t(filename);
1208 std::string u((std::istreambuf_iterator<char>(t)), std::istreambuf_iterator<char>());
1209 s = u.c_str();
1210 }
1211 return ProcessPagesFileList(nullptr, &s, retry_config, timeout_millisec, renderer,
1212 tesseract_->tessedit_page_number);
1213 }
1214
1215 // Maybe we have a TIFF which is potentially multipage
1216 bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS || format == IFF_TIFF_RLE ||
1217 format == IFF_TIFF_G3 || format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
1218#if LIBLEPT_MAJOR_VERSION > 1 || LIBLEPT_MINOR_VERSION > 76
1219 format == IFF_TIFF_JPEG ||
1220#endif
1221 format == IFF_TIFF_ZIP);
1222
1223 // Fail early if we can, before producing any output
1224 Pix *pix = nullptr;
1225 if (!tiff) {
1226 pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename);
1227 if (pix == nullptr) {
1228 return false;
1229 }
1230 }
1231
1232 // Begin the output
1233 if (renderer && !renderer->BeginDocument(document_title.c_str())) {
1234 pixDestroy(&pix);
1235 return false;
1236 }
1237
1238 // Produce output
1239 r = (tiff) ? ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config, timeout_millisec,
1240 renderer, tesseract_->tessedit_page_number)
1241 : ProcessPage(pix, 0, filename, retry_config, timeout_millisec, renderer);
1242
1243 // Clean up memory as needed
1244 pixDestroy(&pix);
1245
1246 // End the output
1247 if (!r || (renderer && !renderer->EndDocument())) {
1248 return false;
1249 }
1250 return true;
1251}
1252
1253bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename,
1254 const char *retry_config, int timeout_millisec,
1255 TessResultRenderer *renderer) {
1256 SetInputName(filename);
1257 SetImage(pix);
1258 bool failed = false;
1259
1260 if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
1261 // Disabled character recognition
1262 if (! std::unique_ptr<const PageIterator>(AnalyseLayout())) {
1263 failed = true;
1264 }
1265 } else if (tesseract_->tessedit_pageseg_mode == PSM_OSD_ONLY) {
1266 failed = FindLines() != 0;
1267 } else if (timeout_millisec > 0) {
1268 // Running with a timeout.
1269 ETEXT_DESC monitor;
1270 monitor.cancel = nullptr;
1271 monitor.cancel_this = nullptr;
1272 monitor.set_deadline_msecs(timeout_millisec);
1273
1274 // Now run the main recognition.
1275 failed = Recognize(&monitor) < 0;
1276 } else {
1277 // Normal layout and character recognition with no timeout.
1278 failed = Recognize(nullptr) < 0;
1279 }
1280
1281 if (tesseract_->tessedit_write_images) {
1282 Pix *page_pix = GetThresholdedImage();
1283 std::string output_filename = output_file_ + ".processed";
1284 if (page_index > 0) {
1285 output_filename += std::to_string(page_index);
1286 }
1287 output_filename += ".tif";
1288 pixWrite(output_filename.c_str(), page_pix, IFF_TIFF_G4);
1289 pixDestroy(&page_pix);
1290 }
1291
1292 if (failed && retry_config != nullptr && retry_config[0] != '\0') {
1293 // Save current config variables before switching modes.
1294 FILE *fp = fopen(kOldVarsFile, "wb");
1295 if (fp == nullptr) {
1296 tprintf("Error, failed to open file \"%s\"\n", kOldVarsFile);
1297 } else {
1298 PrintVariables(fp);
1299 fclose(fp);
1300 }
1301 // Switch to alternate mode for retry.
1302 ReadConfigFile(retry_config);
1303 SetImage(pix);
1304 Recognize(nullptr);
1305 // Restore saved config variables.
1306 ReadConfigFile(kOldVarsFile);
1307 }
1308
1309 if (renderer && !failed) {
1310 failed = !renderer->AddImage(this);
1311 }
1312
1313 return !failed;
1314}
1315
1321 if (tesseract_ == nullptr || page_res_ == nullptr) {
1322 return nullptr;
1323 }
1327}
1328
1338 if (tesseract_ == nullptr || page_res_ == nullptr) {
1339 return nullptr;
1340 }
1344}
1345
1355 if (tesseract_ == nullptr || page_res_ == nullptr) {
1356 return nullptr;
1357 }
1361}
1362
1365 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1366 return nullptr;
1367 }
1368 std::string text("");
1369 const std::unique_ptr</*non-const*/ ResultIterator> it(GetIterator());
1370 do {
1371 if (it->Empty(RIL_PARA)) {
1372 continue;
1373 }
1374 auto block_type = it->BlockType();
1375 switch (block_type) {
1376 case PT_FLOWING_IMAGE:
1377 case PT_HEADING_IMAGE:
1378 case PT_PULLOUT_IMAGE:
1379 case PT_HORZ_LINE:
1380 case PT_VERT_LINE:
1381 // Ignore images and lines for text output.
1382 continue;
1383 case PT_NOISE:
1384 tprintf("TODO: Please report image which triggers the noise case.\n");
1385 ASSERT_HOST(false);
1386 default:
1387 break;
1388 }
1389
1390 const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
1391 text += para_text.get();
1392 } while (it->Next(RIL_PARA));
1393 char *result = new char[text.length() + 1];
1394 strncpy(result, text.c_str(), text.length() + 1);
1395 return result;
1396}
1397
1398static void AddBoxToTSV(const PageIterator *it, PageIteratorLevel level, std::string &text) {
1399 int left, top, right, bottom;
1400 it->BoundingBox(level, &left, &top, &right, &bottom);
1401 text += "\t" + std::to_string(left);
1402 text += "\t" + std::to_string(top);
1403 text += "\t" + std::to_string(right - left);
1404 text += "\t" + std::to_string(bottom - top);
1405}
1406
1412char *TessBaseAPI::GetTSVText(int page_number) {
1413 if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0)) {
1414 return nullptr;
1415 }
1416
1417 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1418 int page_id = page_number + 1; // we use 1-based page numbers.
1419
1420 int page_num = page_id;
1421 int block_num = 0;
1422 int par_num = 0;
1423 int line_num = 0;
1424 int word_num = 0;
1425
1426 std::string tsv_str;
1427 tsv_str += "1\t" + std::to_string(page_num); // level 1 - page
1428 tsv_str += "\t" + std::to_string(block_num);
1429 tsv_str += "\t" + std::to_string(par_num);
1430 tsv_str += "\t" + std::to_string(line_num);
1431 tsv_str += "\t" + std::to_string(word_num);
1432 tsv_str += "\t" + std::to_string(rect_left_);
1433 tsv_str += "\t" + std::to_string(rect_top_);
1434 tsv_str += "\t" + std::to_string(rect_width_);
1435 tsv_str += "\t" + std::to_string(rect_height_);
1436 tsv_str += "\t-1\t\n";
1437
1438 const std::unique_ptr</*non-const*/ ResultIterator> res_it(GetIterator());
1439 while (!res_it->Empty(RIL_BLOCK)) {
1440 if (res_it->Empty(RIL_WORD)) {
1441 res_it->Next(RIL_WORD);
1442 continue;
1443 }
1444
1445 // Add rows for any new block/paragraph/textline.
1446 if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1447 block_num++;
1448 par_num = 0;
1449 line_num = 0;
1450 word_num = 0;
1451 tsv_str += "2\t" + std::to_string(page_num); // level 2 - block
1452 tsv_str += "\t" + std::to_string(block_num);
1453 tsv_str += "\t" + std::to_string(par_num);
1454 tsv_str += "\t" + std::to_string(line_num);
1455 tsv_str += "\t" + std::to_string(word_num);
1456 AddBoxToTSV(res_it.get(), RIL_BLOCK, tsv_str);
1457 tsv_str += "\t-1\t\n"; // end of row for block
1458 }
1459 if (res_it->IsAtBeginningOf(RIL_PARA)) {
1460 par_num++;
1461 line_num = 0;
1462 word_num = 0;
1463 tsv_str += "3\t" + std::to_string(page_num); // level 3 - paragraph
1464 tsv_str += "\t" + std::to_string(block_num);
1465 tsv_str += "\t" + std::to_string(par_num);
1466 tsv_str += "\t" + std::to_string(line_num);
1467 tsv_str += "\t" + std::to_string(word_num);
1468 AddBoxToTSV(res_it.get(), RIL_PARA, tsv_str);
1469 tsv_str += "\t-1\t\n"; // end of row for para
1470 }
1471 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1472 line_num++;
1473 word_num = 0;
1474 tsv_str += "4\t" + std::to_string(page_num); // level 4 - line
1475 tsv_str += "\t" + std::to_string(block_num);
1476 tsv_str += "\t" + std::to_string(par_num);
1477 tsv_str += "\t" + std::to_string(line_num);
1478 tsv_str += "\t" + std::to_string(word_num);
1479 AddBoxToTSV(res_it.get(), RIL_TEXTLINE, tsv_str);
1480 tsv_str += "\t-1\t\n"; // end of row for line
1481 }
1482
1483 // Now, process the word...
1484 int left, top, right, bottom;
1485 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1486 word_num++;
1487 tsv_str += "5\t" + std::to_string(page_num); // level 5 - word
1488 tsv_str += "\t" + std::to_string(block_num);
1489 tsv_str += "\t" + std::to_string(par_num);
1490 tsv_str += "\t" + std::to_string(line_num);
1491 tsv_str += "\t" + std::to_string(word_num);
1492 tsv_str += "\t" + std::to_string(left);
1493 tsv_str += "\t" + std::to_string(top);
1494 tsv_str += "\t" + std::to_string(right - left);
1495 tsv_str += "\t" + std::to_string(bottom - top);
1496 tsv_str += "\t" + std::to_string(res_it->Confidence(RIL_WORD));
1497 tsv_str += "\t";
1498
1499 // Increment counts if at end of block/paragraph/textline.
1500 if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) {
1501 lcnt++;
1502 }
1503 if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) {
1504 pcnt++;
1505 }
1506 if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) {
1507 bcnt++;
1508 }
1509
1510 do {
1511 tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
1512 res_it->Next(RIL_SYMBOL);
1513 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1514 tsv_str += "\n"; // end of row
1515 wcnt++;
1516 }
1517
1518 char *ret = new char[tsv_str.length() + 1];
1519 strcpy(ret, tsv_str.c_str());
1520 return ret;
1521}
1522
1524const int kNumbersPerBlob = 5;
1529const int kBytesPerNumber = 5;
1545
1552char *TessBaseAPI::GetBoxText(int page_number) {
1553 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1554 return nullptr;
1555 }
1556 int blob_count;
1557 int utf8_length = TextLength(&blob_count);
1558 int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + kMaxBytesPerLine;
1559 char *result = new char[total_length];
1560 result[0] = '\0';
1561 int output_length = 0;
1563 do {
1564 int left, top, right, bottom;
1565 if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
1566 const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL));
1567 // Tesseract uses space for recognition failure. Fix to a reject
1568 // character, kTesseractReject so we don't create illegal box files.
1569 for (int i = 0; text[i] != '\0'; ++i) {
1570 if (text[i] == ' ') {
1571 text[i] = kTesseractReject;
1572 }
1573 }
1574 snprintf(result + output_length, total_length - output_length, "%s %d %d %d %d %d\n",
1575 text.get(), left, image_height_ - bottom, right, image_height_ - top, page_number);
1576 output_length += strlen(result + output_length);
1577 // Just in case...
1578 if (output_length + kMaxBytesPerLine > total_length) {
1579 break;
1580 }
1581 }
1582 } while (it->Next(RIL_SYMBOL));
1583 delete it;
1584 return result;
1585}
1586
1592const int kUniChs[] = {0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0};
1594const int kLatinChs[] = {0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0};
1595
1602 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1603 return nullptr;
1604 }
1605 bool tilde_crunch_written = false;
1606 bool last_char_was_newline = true;
1607 bool last_char_was_tilde = false;
1608
1609 int total_length = TextLength(nullptr);
1610 PAGE_RES_IT page_res_it(page_res_);
1611 char *result = new char[total_length];
1612 char *ptr = result;
1613 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
1614 WERD_RES *word = page_res_it.word();
1615 // Process the current word.
1616 if (word->unlv_crunch_mode != CR_NONE) {
1617 if (word->unlv_crunch_mode != CR_DELETE &&
1618 (!tilde_crunch_written ||
1619 (word->unlv_crunch_mode == CR_KEEP_SPACE && word->word->space() > 0 &&
1620 !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)))) {
1621 if (!word->word->flag(W_BOL) && word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) &&
1622 !word->word->flag(W_FUZZY_SP)) {
1623 /* Write a space to separate from preceding good text */
1624 *ptr++ = ' ';
1625 last_char_was_tilde = false;
1626 }
1627 if (!last_char_was_tilde) {
1628 // Write a reject char.
1629 last_char_was_tilde = true;
1630 *ptr++ = kUNLVReject;
1631 tilde_crunch_written = true;
1632 last_char_was_newline = false;
1633 }
1634 }
1635 } else {
1636 // NORMAL PROCESSING of non tilde crunched words.
1637 tilde_crunch_written = false;
1639 const char *wordstr = word->best_choice->unichar_string().c_str();
1640 const auto &lengths = word->best_choice->unichar_lengths();
1641 int length = lengths.length();
1642 int i = 0;
1643 int offset = 0;
1644
1645 if (last_char_was_tilde && word->word->space() == 0 && wordstr[offset] == ' ') {
1646 // Prevent adjacent tilde across words - we know that adjacent tildes
1647 // within words have been removed.
1648 // Skip the first character.
1649 offset = lengths[i++];
1650 }
1651 if (i < length && wordstr[offset] != 0) {
1652 if (!last_char_was_newline) {
1653 *ptr++ = ' ';
1654 } else {
1655 last_char_was_newline = false;
1656 }
1657 for (; i < length; offset += lengths[i++]) {
1658 if (wordstr[offset] == ' ' || wordstr[offset] == kTesseractReject) {
1659 *ptr++ = kUNLVReject;
1660 last_char_was_tilde = true;
1661 } else {
1662 if (word->reject_map[i].rejected()) {
1663 *ptr++ = kUNLVSuspect;
1664 }
1665 UNICHAR ch(wordstr + offset, lengths[i]);
1666 int uni_ch = ch.first_uni();
1667 for (int j = 0; kUniChs[j] != 0; ++j) {
1668 if (kUniChs[j] == uni_ch) {
1669 uni_ch = kLatinChs[j];
1670 break;
1671 }
1672 }
1673 if (uni_ch <= 0xff) {
1674 *ptr++ = static_cast<char>(uni_ch);
1675 last_char_was_tilde = false;
1676 } else {
1677 *ptr++ = kUNLVReject;
1678 last_char_was_tilde = true;
1679 }
1680 }
1681 }
1682 }
1683 }
1684 if (word->word->flag(W_EOL) && !last_char_was_newline) {
1685 /* Add a new line output */
1686 *ptr++ = '\n';
1687 tilde_crunch_written = false;
1688 last_char_was_newline = true;
1689 last_char_was_tilde = false;
1690 }
1691 }
1692 *ptr++ = '\n';
1693 *ptr = '\0';
1694 return result;
1695}
1696
1697#ifndef DISABLED_LEGACY_ENGINE
1698
1708bool TessBaseAPI::DetectOrientationScript(int *orient_deg, float *orient_conf,
1709 const char **script_name, float *script_conf) {
1710 OSResults osr;
1711
1712 bool osd = DetectOS(&osr);
1713 if (!osd) {
1714 return false;
1715 }
1716
1717 int orient_id = osr.best_result.orientation_id;
1718 int script_id = osr.get_best_script(orient_id);
1719 if (orient_conf) {
1720 *orient_conf = osr.best_result.oconfidence;
1721 }
1722 if (orient_deg) {
1723 *orient_deg = orient_id * 90; // convert quadrant to degrees
1724 }
1725
1726 if (script_name) {
1727 const char *script = osr.unicharset->get_script_from_script_id(script_id);
1728
1729 *script_name = script;
1730 }
1731
1732 if (script_conf) {
1733 *script_conf = osr.best_result.sconfidence;
1734 }
1735
1736 return true;
1737}
1738
1744char *TessBaseAPI::GetOsdText(int page_number) {
1745 int orient_deg;
1746 float orient_conf;
1747 const char *script_name;
1748 float script_conf;
1749
1750 if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name, &script_conf)) {
1751 return nullptr;
1752 }
1753
1754 // clockwise rotation needed to make the page upright
1755 int rotate = OrientationIdToValue(orient_deg / 90);
1756
1757 std::stringstream stream;
1758 // Use "C" locale (needed for float values orient_conf and script_conf).
1759 stream.imbue(std::locale::classic());
1760 // Use fixed notation with 2 digits after the decimal point for float values.
1761 stream.precision(2);
1762 stream << std::fixed << "Page number: " << page_number << "\n"
1763 << "Orientation in degrees: " << orient_deg << "\n"
1764 << "Rotate: " << rotate << "\n"
1765 << "Orientation confidence: " << orient_conf << "\n"
1766 << "Script: " << script_name << "\n"
1767 << "Script confidence: " << script_conf << "\n";
1768 const std::string &text = stream.str();
1769 char *result = new char[text.length() + 1];
1770 strcpy(result, text.c_str());
1771 return result;
1772}
1773
1774#endif // ndef DISABLED_LEGACY_ENGINE
1775
1778 int *conf = AllWordConfidences();
1779 if (!conf) {
1780 return 0;
1781 }
1782 int sum = 0;
1783 int *pt = conf;
1784 while (*pt >= 0) {
1785 sum += *pt++;
1786 }
1787 if (pt != conf) {
1788 sum /= pt - conf;
1789 }
1790 delete[] conf;
1791 return sum;
1792}
1793
1796 if (tesseract_ == nullptr || (!recognition_done_ && Recognize(nullptr) < 0)) {
1797 return nullptr;
1798 }
1799 int n_word = 0;
1800 PAGE_RES_IT res_it(page_res_);
1801 for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
1802 n_word++;
1803 }
1804
1805 int *conf = new int[n_word + 1];
1806 n_word = 0;
1807 for (res_it.restart_page(); res_it.word() != nullptr; res_it.forward()) {
1808 WERD_RES *word = res_it.word();
1809 WERD_CHOICE *choice = word->best_choice;
1810 int w_conf = static_cast<int>(100 + 5 * choice->certainty());
1811 // This is the eq for converting Tesseract confidence to 1..100
1812 if (w_conf < 0) {
1813 w_conf = 0;
1814 }
1815 if (w_conf > 100) {
1816 w_conf = 100;
1817 }
1818 conf[n_word++] = w_conf;
1819 }
1820 conf[n_word] = -1;
1821 return conf;
1822}
1823
1824#ifndef DISABLED_LEGACY_ENGINE
1835bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char *wordstr) {
1836 int debug = 0;
1837 GetIntVariable("applybox_debug", &debug);
1838 bool success = true;
1839 PageSegMode current_psm = GetPageSegMode();
1840 SetPageSegMode(mode);
1841 SetVariable("classify_enable_learning", "0");
1842 const std::unique_ptr<const char[]> text(GetUTF8Text());
1843 if (debug) {
1844 tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr);
1845 }
1846 if (text != nullptr) {
1848 WERD_RES *word_res = it.word();
1849 if (word_res != nullptr) {
1850 word_res->word->set_text(wordstr);
1851 // Check to see if text matches wordstr.
1852 int w = 0;
1853 int t;
1854 for (t = 0; text[t] != '\0'; ++t) {
1855 if (text[t] == '\n' || text[t] == ' ') {
1856 continue;
1857 }
1858 while (wordstr[w] == ' ') {
1859 ++w;
1860 }
1861 if (text[t] != wordstr[w]) {
1862 break;
1863 }
1864 ++w;
1865 }
1866 if (text[t] != '\0' || wordstr[w] != '\0') {
1867 // No match.
1868 delete page_res_;
1869 std::vector<TBOX> boxes;
1873 PAGE_RES_IT pr_it(page_res_);
1874 if (pr_it.word() == nullptr) {
1875 success = false;
1876 } else {
1877 word_res = pr_it.word();
1878 }
1879 } else {
1880 word_res->BestChoiceToCorrectText();
1881 }
1882 if (success) {
1883 tesseract_->EnableLearning = true;
1884 tesseract_->LearnWord(nullptr, word_res);
1885 }
1886 } else {
1887 success = false;
1888 }
1889 } else {
1890 success = false;
1891 }
1892 SetPageSegMode(current_psm);
1893 return success;
1894}
1895#endif // ndef DISABLED_LEGACY_ENGINE
1896
1904 if (thresholder_ != nullptr) {
1906 }
1907 ClearResults();
1908 if (tesseract_ != nullptr) {
1909 SetInputImage(nullptr);
1910 }
1911}
1912
1920 Clear();
1921 delete thresholder_;
1922 thresholder_ = nullptr;
1923 delete page_res_;
1924 page_res_ = nullptr;
1925 delete block_list_;
1926 block_list_ = nullptr;
1927 if (paragraph_models_ != nullptr) {
1928 for (auto model : *paragraph_models_) {
1929 delete model;
1930 }
1931 delete paragraph_models_;
1932 paragraph_models_ = nullptr;
1933 }
1934#ifndef DISABLED_LEGACY_ENGINE
1935 if (osd_tesseract_ == tesseract_) {
1936 osd_tesseract_ = nullptr;
1937 }
1938 delete osd_tesseract_;
1939 osd_tesseract_ = nullptr;
1940 delete equ_detect_;
1941 equ_detect_ = nullptr;
1942#endif // ndef DISABLED_LEGACY_ENGINE
1943 delete tesseract_;
1944 tesseract_ = nullptr;
1945 input_file_.clear();
1946 output_file_.clear();
1947 datapath_.clear();
1948 language_.clear();
1949}
1950
1951// Clear any library-level memory caches.
1952// There are a variety of expensive-to-load constant data structures (mostly
1953// language dictionaries) that are cached globally -- surviving the Init()
1954// and End() of individual TessBaseAPI's. This function allows the clearing
1955// of these caches.
1958}
1959
1964int TessBaseAPI::IsValidWord(const char *word) const {
1965 return tesseract_->getDict().valid_word(word);
1966}
1967// Returns true if utf8_character is defined in the UniCharset.
1968bool TessBaseAPI::IsValidCharacter(const char *utf8_character) const {
1969 return tesseract_->unicharset.contains_unichar(utf8_character);
1970}
1971
1972// TODO(rays) Obsolete this function and replace with a more aptly named
1973// function that returns image coordinates rather than tesseract coordinates.
1974bool TessBaseAPI::GetTextDirection(int *out_offset, float *out_slope) {
1975 const std::unique_ptr<const PageIterator> it(AnalyseLayout());
1976 if (it == nullptr) {
1977 return false;
1978 }
1979 int x1, x2, y1, y2;
1980 it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
1981 // Calculate offset and slope (NOTE: Kind of ugly)
1982 if (x2 <= x1) {
1983 x2 = x1 + 1;
1984 }
1985 // Convert the point pair to slope/offset of the baseline (in image coords.)
1986 *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);
1987 *out_offset = static_cast<int>(y1 - *out_slope * x1);
1988 // Get the y-coord of the baseline at the left and right edges of the
1989 // textline's bounding box.
1990 int left, top, right, bottom;
1991 if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {
1992 return false;
1993 }
1994 int left_y = IntCastRounded(*out_slope * left + *out_offset);
1995 int right_y = IntCastRounded(*out_slope * right + *out_offset);
1996 // Shift the baseline down so it passes through the nearest bottom-corner
1997 // of the textline's bounding box. This is the difference between the y
1998 // at the lowest (max) edge of the box and the actual box bottom.
1999 *out_offset += bottom - std::max(left_y, right_y);
2000 // Switch back to bottom-up tesseract coordinates. Requires negation of
2001 // the slope and height - offset for the offset.
2002 *out_slope = -*out_slope;
2003 *out_offset = rect_height_ - *out_offset;
2004
2005 return true;
2006}
2007
2010 if (tesseract_ != nullptr) {
2012 }
2013}
2014
2024 if (tesseract_ != nullptr) {
2026 // Set it for the sublangs too.
2027 int num_subs = tesseract_->num_sub_langs();
2028 for (int i = 0; i < num_subs; ++i) {
2030 }
2031 }
2032}
2033
2036 if (tesseract_ == nullptr) {
2037 tprintf("Please call Init before attempting to set an image.\n");
2038 return false;
2039 }
2040 if (thresholder_ == nullptr) {
2042 }
2043 ClearResults();
2044 return true;
2045}
2046
2053bool TessBaseAPI::Threshold(Pix **pix) {
2054 ASSERT_HOST(pix != nullptr);
2055 if (*pix != nullptr) {
2056 pixDestroy(pix);
2057 }
2058 // Zero resolution messes up the algorithms, so make sure it is credible.
2059 int user_dpi = 0;
2060 GetIntVariable("user_defined_dpi", &user_dpi);
2061 int y_res = thresholder_->GetScaledYResolution();
2062 if (user_dpi && (user_dpi < kMinCredibleResolution || user_dpi > kMaxCredibleResolution)) {
2063 tprintf(
2064 "Warning: User defined image dpi is outside of expected range "
2065 "(%d - %d)!\n",
2067 }
2068 // Always use user defined dpi
2069 if (user_dpi) {
2071 } else if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
2072 if (y_res != 0) {
2073 // Show warning only if a resolution was given.
2074 tprintf("Warning: Invalid resolution %d dpi. Using %d instead.\n",
2075 y_res, kMinCredibleResolution);
2076 }
2078 }
2079
2080 auto thresholding_method = static_cast<ThresholdMethod>(static_cast<int>(tesseract_->thresholding_method));
2081
2082 if (thresholding_method == ThresholdMethod::Otsu) {
2083 Image pix_binary(*pix);
2084 if (!thresholder_->ThresholdToPix(&pix_binary)) {
2085 return false;
2086 }
2087 *pix = pix_binary;
2088
2089 if (!thresholder_->IsBinary()) {
2092 } else {
2094 tesseract_->set_pix_grey(nullptr);
2095 }
2096 } else {
2097 auto [ok, pix_grey, pix_binary, pix_thresholds] = thresholder_->Threshold(this, thresholding_method);
2098
2099 if (!ok) {
2100 return false;
2101 }
2102 *pix = pix_binary;
2103
2104 tesseract_->set_pix_thresholds(pix_thresholds);
2105 tesseract_->set_pix_grey(pix_grey);
2106 }
2107
2109 &image_height_);
2110
2111 // Set the internal resolution that is used for layout parameters from the
2112 // estimated resolution, rather than the image resolution, which may be
2113 // fabricated, but we will use the image resolution, if there is one, to
2114 // report output point sizes.
2117 if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
2118 tprintf(
2119 "Estimated internal resolution %d out of range! "
2120 "Corrected to %d.\n",
2121 thresholder_->GetScaledEstimatedResolution(), estimated_res);
2122 }
2123 tesseract_->set_source_resolution(estimated_res);
2124 return true;
2125}
2126
2129 if (thresholder_ == nullptr || thresholder_->IsEmpty()) {
2130 tprintf("Please call SetImage before attempting recognition.\n");
2131 return -1;
2132 }
2133 if (recognition_done_) {
2134 ClearResults();
2135 }
2136 if (!block_list_->empty()) {
2137 return 0;
2138 }
2139 if (tesseract_ == nullptr) {
2140 tesseract_ = new Tesseract;
2141#ifndef DISABLED_LEGACY_ENGINE
2143#endif
2144 }
2145 if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
2146 return -1;
2147 }
2148
2150
2151#ifndef DISABLED_LEGACY_ENGINE
2152 if (tesseract_->textord_equation_detect) {
2153 if (equ_detect_ == nullptr && !datapath_.empty()) {
2154 equ_detect_ = new EquationDetect(datapath_.c_str(), nullptr);
2155 }
2156 if (equ_detect_ == nullptr) {
2157 tprintf("Warning: Could not set equation detector\n");
2158 } else {
2160 }
2161 }
2162#endif // ndef DISABLED_LEGACY_ENGINE
2163
2164 Tesseract *osd_tess = osd_tesseract_;
2165 OSResults osr;
2166#ifndef DISABLED_LEGACY_ENGINE
2167 if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == nullptr) {
2168 if (strcmp(language_.c_str(), "osd") == 0) {
2169 osd_tess = tesseract_;
2170 } else {
2173 if (datapath_.empty()) {
2174 tprintf(
2175 "Warning: Auto orientation and script detection requested,"
2176 " but data path is undefined\n");
2177 delete osd_tesseract_;
2178 osd_tesseract_ = nullptr;
2180 nullptr, 0, nullptr, nullptr, false, &mgr) == 0) {
2181 osd_tess = osd_tesseract_;
2183 } else {
2184 tprintf(
2185 "Warning: Auto orientation and script detection requested,"
2186 " but osd language failed to load\n");
2187 delete osd_tesseract_;
2188 osd_tesseract_ = nullptr;
2189 }
2190 }
2191 }
2192#endif // ndef DISABLED_LEGACY_ENGINE
2193
2194 if (tesseract_->SegmentPage(input_file_.c_str(), block_list_, osd_tess, &osr) < 0) {
2195 return -1;
2196 }
2197
2198 // If Devanagari is being recognized, we use different images for page seg
2199 // and for OCR.
2200 tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
2201 return 0;
2202}
2203
2206 if (tesseract_ != nullptr) {
2207 tesseract_->Clear();
2208 }
2209 delete page_res_;
2210 page_res_ = nullptr;
2211 recognition_done_ = false;
2212 if (block_list_ == nullptr) {
2213 block_list_ = new BLOCK_LIST;
2214 } else {
2215 block_list_->clear();
2216 }
2217 if (paragraph_models_ != nullptr) {
2218 for (auto model : *paragraph_models_) {
2219 delete model;
2220 }
2221 delete paragraph_models_;
2222 paragraph_models_ = nullptr;
2223 }
2224}
2225
2233int TessBaseAPI::TextLength(int *blob_count) const {
2234 if (tesseract_ == nullptr || page_res_ == nullptr) {
2235 return 0;
2236 }
2237
2238 PAGE_RES_IT page_res_it(page_res_);
2239 int total_length = 2;
2240 int total_blobs = 0;
2241 // Iterate over the data structures to extract the recognition result.
2242 for (page_res_it.restart_page(); page_res_it.word() != nullptr; page_res_it.forward()) {
2243 WERD_RES *word = page_res_it.word();
2244 WERD_CHOICE *choice = word->best_choice;
2245 if (choice != nullptr) {
2246 total_blobs += choice->length() + 2;
2247 total_length += choice->unichar_string().length() + 2;
2248 for (int i = 0; i < word->reject_map.length(); ++i) {
2249 if (word->reject_map[i].rejected()) {
2250 ++total_length;
2251 }
2252 }
2253 }
2254 }
2255 if (blob_count != nullptr) {
2256 *blob_count = total_blobs;
2257 }
2258 return total_length;
2259}
2260
2261#ifndef DISABLED_LEGACY_ENGINE
2267 if (tesseract_ == nullptr) {
2268 return false;
2269 }
2270 ClearResults();
2271 if (tesseract_->pix_binary() == nullptr && !Threshold(&tesseract_->mutable_pix_binary()->pix_)) {
2272 return false;
2273 }
2274
2275 if (input_file_.empty()) {
2276 input_file_ = kInputFile;
2277 }
2279}
2280#endif // #ifndef DISABLED_LEGACY_ENGINE
2281
2283 tesseract_->min_orientation_margin.set_value(margin);
2284}
2285
2300void TessBaseAPI::GetBlockTextOrientations(int **block_orientation, bool **vertical_writing) {
2301 delete[] * block_orientation;
2302 *block_orientation = nullptr;
2303 delete[] * vertical_writing;
2304 *vertical_writing = nullptr;
2305 BLOCK_IT block_it(block_list_);
2306
2307 block_it.move_to_first();
2308 int num_blocks = 0;
2309 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2310 if (!block_it.data()->pdblk.poly_block()->IsText()) {
2311 continue;
2312 }
2313 ++num_blocks;
2314 }
2315 if (!num_blocks) {
2316 tprintf("WARNING: Found no blocks\n");
2317 return;
2318 }
2319 *block_orientation = new int[num_blocks];
2320 *vertical_writing = new bool[num_blocks];
2321 block_it.move_to_first();
2322 int i = 0;
2323 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2324 if (!block_it.data()->pdblk.poly_block()->IsText()) {
2325 continue;
2326 }
2327 FCOORD re_rotation = block_it.data()->re_rotation();
2328 float re_theta = re_rotation.angle();
2329 FCOORD classify_rotation = block_it.data()->classify_rotation();
2330 float classify_theta = classify_rotation.angle();
2331 double rot_theta = -(re_theta - classify_theta) * 2.0 / M_PI;
2332 if (rot_theta < 0) {
2333 rot_theta += 4;
2334 }
2335 int num_rotations = static_cast<int>(rot_theta + 0.5);
2336 (*block_orientation)[i] = num_rotations;
2337 // The classify_rotation is non-zero only if the text has vertical
2338 // writing direction.
2339 (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
2340 ++i;
2341 }
2342}
2343
2344void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
2345 int debug_level = 0;
2346 GetIntVariable("paragraph_debug_level", &debug_level);
2347 if (paragraph_models_ == nullptr) {
2348 paragraph_models_ = new std::vector<ParagraphModel *>;
2349 }
2350 MutableIterator *result_it = GetMutableIterator();
2351 do { // Detect paragraphs for this block
2352 std::vector<ParagraphModel *> models;
2353 ::tesseract::DetectParagraphs(debug_level, after_text_recognition, result_it, &models);
2354 paragraph_models_->insert(paragraph_models_->end(), models.begin(), models.end());
2355 } while (result_it->Next(RIL_BLOCK));
2356 delete result_it;
2357}
2358
2360const char *TessBaseAPI::GetUnichar(int unichar_id) const {
2361 return tesseract_->unicharset.id_to_unichar(unichar_id);
2362}
2363
2365const Dawg *TessBaseAPI::GetDawg(int i) const {
2366 if (tesseract_ == nullptr || i >= NumDawgs()) {
2367 return nullptr;
2368 }
2369 return tesseract_->getDict().GetDawg(i);
2370}
2371
2374 return tesseract_ == nullptr ? 0 : tesseract_->getDict().NumDawgs();
2375}
2376
// Returns a copy of the NUL-terminated string `text` in which the five
// characters < > & " ' are replaced by the entities &lt; &gt; &amp; &quot;
// and &#39;. Every other byte is copied through unchanged.
std::string HOcrEscape(const char *text) {
  std::string escaped;
  for (const char *cursor = text; *cursor != '\0'; ++cursor) {
    const char ch = *cursor;
    if (ch == '<') {
      escaped.append("&lt;");
    } else if (ch == '>') {
      escaped.append("&gt;");
    } else if (ch == '&') {
      escaped.append("&amp;");
    } else if (ch == '"') {
      escaped.append("&quot;");
    } else if (ch == '\'') {
      escaped.append("&#39;");
    } else {
      escaped.push_back(ch);
    }
  }
  return escaped;
}
2404
2405} // namespace tesseract
#define UNICHAR_LEN
Definition: unichar.h:31
struct TessResultRenderer TessResultRenderer
Definition: capi.h:59
#define TRUE
Definition: capi.h:38
#define BOOL
Definition: capi.h:37
#define BOOL_VAR(name, val, comment)
Definition: params.h:360
#define INT_VAR(name, val, comment)
Definition: params.h:357
#define STRING_VAR(name, val, comment)
Definition: params.h:363
#define ASSERT_HOST(x)
Definition: errcode.h:54
#define MAX_PATH
Definition: host.h:41
int value
const char * p
@ W_BOL
start of line
Definition: werd.h:34
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41
@ W_EOL
end of line
Definition: werd.h:35
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
@ OEM_TESSERACT_ONLY
Definition: publictypes.h:264
@ SET_PARAM_CONSTRAINT_NON_INIT_ONLY
Definition: params.h:43
@ SET_PARAM_CONSTRAINT_DEBUG_ONLY
Definition: params.h:41
const char kTesseractReject
Definition: baseapi.cpp:109
const int kMinRectSize
Definition: baseapi.cpp:107
const int kBytesPerBoxFileLine
Definition: baseapi.cpp:1535
TESS_API int OrientationIdToValue(const int &id)
Definition: osdetect.cpp:566
@ CR_NONE
Definition: pageres.h:160
@ CR_KEEP_SPACE
Definition: pageres.h:160
@ CR_DELETE
Definition: pageres.h:160
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:186
@ PSM_OSD_ONLY
Orientation and script detection only.
Definition: publictypes.h:158
@ PSM_AUTO_ONLY
Automatic page segmentation, but no OSD, or OCR.
Definition: publictypes.h:161
@ PSM_SINGLE_BLOCK
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:166
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int IntCastRounded(double x)
Definition: helpers.h:170
int(Dict::*)(void *, const UNICHARSET &, UNICHAR_ID, bool) const DictFunc
Definition: baseapi.h:64
void chomp_string(char *str)
Definition: helpers.h:91
const int kBytesPer64BitNumber
Definition: baseapi.cpp:1537
double(Dict::*)(const char *, const char *, int, const char *, int) ProbabilityInContextFunc
Definition: baseapi.h:66
const int kMaxBytesPerLine
Definition: baseapi.cpp:1544
const int kLatinChs[]
Definition: baseapi.cpp:1594
int orientation_and_script_detection(const char *filename, OSResults *, tesseract::Tesseract *)
Definition: osdetect.cpp:188
constexpr int kMaxCredibleResolution
Definition: publictypes.h:38
std::string HOcrEscape(const char *text)
Definition: baseapi.cpp:2378
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:105
const int kBytesPerNumber
Definition: baseapi.cpp:1529
const char kUNLVReject
Definition: baseapi.cpp:111
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:36
const int kNumbersPerBlob
Definition: baseapi.cpp:1524
bool(*)(const char *filename, std::vector< char > *data) FileReader
Definition: baseapi.h:61
constexpr int kMinCredibleResolution
Definition: publictypes.h:36
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:80
void DetectParagraphs(int debug_level, std::vector< RowInfo > *row_infos, std::vector< PARA * > *row_owners, PARA_LIST *paragraphs, std::vector< ParagraphModel * > *models)
@ PT_PULLOUT_IMAGE
Definition: publictypes.h:63
@ PT_HEADING_IMAGE
Definition: publictypes.h:62
@ PT_HORZ_LINE
Definition: publictypes.h:64
@ PT_FLOWING_IMAGE
Definition: publictypes.h:61
@ PT_VERT_LINE
Definition: publictypes.h:65
const char kUNLVSuspect
Definition: baseapi.cpp:113
const int kUniChs[]
Definition: baseapi.cpp:1592
EquationDetect * equ_detect_
The equation detector.
Definition: baseapi.h:767
virtual ~TessBaseAPI()
Definition: baseapi.cpp:234
const char * GetInitLanguagesAsString() const
Definition: baseapi.cpp:448
bool ProcessPagesInternal(const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1105
const char * GetInputName()
Definition: baseapi.cpp:928
std::string input_file_
Name used by training code.
Definition: baseapi.h:773
virtual bool Threshold(Pix **pix)
Definition: baseapi.cpp:2053
bool ProcessPage(Pix *pix, int page_index, const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1253
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:834
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:772
void SetPageSegMode(PageSegMode mode)
Definition: baseapi.cpp:511
void GetBlockTextOrientations(int **block_orientation, bool **vertical_writing)
Definition: baseapi.cpp:2300
bool SetDebugVariable(const char *name, const char *value)
Definition: baseapi.cpp:287
const char * GetDatapath()
Definition: baseapi.cpp:935
bool GetVariableAsString(const char *name, std::string *val) const
Definition: baseapi.cpp:331
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:765
bool GetIntVariable(const char *name, int *value) const
Definition: baseapi.cpp:294
Boxa * GetTextlines(bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:658
void SetRectangle(int left, int top, int width, int height)
Definition: baseapi.cpp:619
int NumDawgs() const
Definition: baseapi.cpp:2373
MutableIterator * GetMutableIterator()
Definition: baseapi.cpp:1354
int IsValidWord(const char *word) const
Definition: baseapi.cpp:1964
bool SetVariable(const char *name, const char *value)
Definition: baseapi.cpp:279
bool IsValidCharacter(const char *utf8_character) const
Definition: baseapi.cpp:1968
void DetectParagraphs(bool after_text_recognition)
Definition: baseapi.cpp:2344
static const char * Version()
Definition: baseapi.cpp:241
Boxa * GetWords(Pixa **pixa)
Definition: baseapi.cpp:680
std::string language_
Last initialized language.
Definition: baseapi.h:776
void GetAvailableLanguagesAsVector(std::vector< std::string > *langs) const
Definition: baseapi.cpp:471
void SetSourceResolution(int ppi)
Definition: baseapi.cpp:584
void ReadDebugConfigFile(const char *filename)
Definition: baseapi.cpp:502
ResultIterator * GetIterator()
Definition: baseapi.cpp:1337
bool GetTextDirection(int *out_offset, float *out_slope)
Definition: baseapi.cpp:1974
bool ProcessPages(const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1071
int TextLength(int *blob_count) const
Definition: baseapi.cpp:2233
std::string datapath_
Current location of tessdata.
Definition: baseapi.h:775
int GetThresholdedImageScaleFactor() const
Definition: baseapi.cpp:790
bool DetectOS(OSResults *)
Definition: baseapi.cpp:2266
PageSegMode GetPageSegMode() const
Definition: baseapi.cpp:519
static void ClearPersistentCache()
Definition: baseapi.cpp:1956
std::vector< ParagraphModel * > * paragraph_models_
Definition: baseapi.h:770
void SetDictFunc(DictFunc f)
Definition: baseapi.cpp:2009
bool recognition_done_
page_res_ contains recognition data.
Definition: baseapi.h:778
const Dawg * GetDawg(int i) const
Definition: baseapi.cpp:2365
FileReader reader_
Reads files from any filesystem.
Definition: baseapi.h:768
char * GetTSVText(int page_number)
Definition: baseapi.cpp:1412
void SetInputName(const char *name)
Definition: baseapi.cpp:270
char * GetOsdText(int page_number)
Definition: baseapi.cpp:1744
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params)
Definition: baseapi.cpp:368
OcrEngineMode oem() const
Definition: baseapi.h:715
void PrintVariables(FILE *fp) const
Definition: baseapi.cpp:356
void GetLoadedLanguagesAsVector(std::vector< std::string > *langs) const
Definition: baseapi.cpp:457
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:769
static size_t getOpenCLDevice(void **device)
Definition: baseapi.cpp:252
std::string output_file_
Name used by debug code.
Definition: baseapi.h:774
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: baseapi.cpp:576
Boxa * GetComponentImages(PageIteratorLevel level, bool text_only, bool raw_image, int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:702
PageIterator * AnalyseLayout()
Definition: baseapi.cpp:812
char * GetBoxText(int page_number)
Definition: baseapi.cpp:1552
const char * GetStringVariable(const char *name) const
Definition: baseapi.cpp:314
void ReadConfigFile(const char *filename)
Definition: baseapi.cpp:497
bool AdaptToWordStr(PageSegMode mode, const char *wordstr)
Definition: baseapi.cpp:1835
BLOCK_LIST * block_list_
The page layout.
Definition: baseapi.h:771
void set_min_orientation_margin(double margin)
Definition: baseapi.cpp:2282
Boxa * GetStrips(Pixa **pixa, int **blockids)
Definition: baseapi.cpp:671
bool DetectOrientationScript(int *orient_deg, float *orient_conf, const char **script_name, float *script_conf)
Definition: baseapi.cpp:1708
void PrintFontsTable(FILE *fp) const
Definition: baseapi.cpp:338
char * TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height)
Definition: baseapi.cpp:539
void SetProbabilityInContextFunc(ProbabilityInContextFunc f)
Definition: baseapi.cpp:2023
LTRResultIterator * GetLTRIterator()
Definition: baseapi.cpp:1320
Tesseract * osd_tesseract_
For orientation & script detection.
Definition: baseapi.h:766
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:304
void ClearAdaptiveClassifier()
Definition: baseapi.cpp:560
bool GetDoubleVariable(const char *name, double *value) const
Definition: baseapi.cpp:320
Pix * GetThresholdedImage()
Definition: baseapi.cpp:631
const char * GetUnichar(int unichar_id) const
Definition: baseapi.cpp:2360
Boxa * GetConnectedComponents(Pixa **cc)
Definition: baseapi.cpp:690
void SetInputImage(Pix *pix)
Definition: baseapi.cpp:920
void SetOutputName(const char *name)
Definition: baseapi.cpp:275
OcrEngineMode last_oem_requested_
Last ocr language mode requested.
Definition: baseapi.h:777
Boxa * GetRegions(Pixa **pixa)
Definition: baseapi.cpp:646
char * GetUTF8Text(PageIteratorLevel level) const
void * cancel_this
monitor-aware progress callback
Definition: ocrclass.h:116
void set_deadline_msecs(int32_t deadline_msecs)
Definition: ocrclass.h:128
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:112
OSBestResult best_result
Definition: osdetect.h:80
TESS_API int get_best_script(int orientation_id) const
Definition: osdetect.cpp:113
UNICHARSET * unicharset
Definition: osdetect.h:79
virtual bool Next(PageIteratorLevel level)
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
bool AddImage(TessBaseAPI *api)
Definition: renderer.cpp:88
bool BeginDocument(const char *title)
Definition: renderer.cpp:75
bool Next(PageIteratorLevel level) override
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
void SetEquationDetect(EquationDetect *detector)
int init_tesseract(const std::string &arg0, const std::string &textbase, const std::string &language, OcrEngineMode oem, char **configs, int configs_size, const std::vector< std::string > *vars_vec, const std::vector< std::string > *vars_values, bool set_only_non_debug_params, TessdataManager *mgr)
Definition: tessedit.cpp:288
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:270
void set_pix_grey(Image grey_pix)
bool TrainLineRecognizer(const char *input_imagename, const std::string &output_basename, BLOCK_LIST *block_list)
Definition: linerec.cpp:41
PAGE_RES * ApplyBoxes(const char *filename, bool find_segmentation, BLOCK_LIST *block_list)
Definition: applybox.cpp:110
int num_sub_langs() const
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:685
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:46
void ApplyBoxTraining(const std::string &fontname, PAGE_RES *page_res)
Definition: applybox.cpp:751
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:495
void set_pix_thresholds(Image thresholds)
Dict & getDict() override
Image pix_original() const
Image * mutable_pix_binary()
void recog_training_segmented(const char *filename, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
Tesseract * get_sub_lang(int index) const
void set_pix_original(Image original_pix)
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
int SegmentPage(const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
void set_source_resolution(int ppi)
void CorrectClassifyWords(PAGE_RES *page_res)
Definition: applybox.cpp:764
Image pix_binary() const
void pgeditor_main(int width, int height, PAGE_RES *page_res)
Definition: pgedit.cpp:355
FILE * init_recog_training(const char *filename)
PAGE_RES * SetupApplyBoxes(const std::vector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:197
bool AnyLSTMLang() const
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:287
int GetScaledEstimatedResolution() const
Definition: thresholder.h:115
virtual Image GetPixRectThresholds()
int GetSourceYResolution() const
Definition: thresholder.h:99
virtual void GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth, int *imageheight)
bool IsEmpty() const
Return true if no image has been set.
Definition: thresholder.cpp:64
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:76
int GetScaledYResolution() const
Definition: thresholder.h:102
virtual std::tuple< bool, Image, Image, Image > Threshold(TessBaseAPI *api, ThresholdMethod method)
void SetRectangle(int left, int top, int width, int height)
virtual Image GetPixRectGrey()
virtual bool ThresholdToPix(Image *pix)
Returns false on error.
bool IsBinary() const
Returns true if the source image is binary.
Definition: thresholder.h:84
void SetSourceYResolution(int ppi)
Definition: thresholder.h:95
virtual void Clear()
Destroy the Pix if there is one, freeing memory.
Definition: thresholder.cpp:59
bool is_italic() const
Definition: fontinfo.h:118
bool is_fixed_pitch() const
Definition: fontinfo.h:124
bool is_bold() const
Definition: fontinfo.h:121
bool is_fraktur() const
Definition: fontinfo.h:130
bool is_serif() const
Definition: fontinfo.h:127
Pix * pix_
Definition: image.h:27
Image clone() const
Definition: image.cpp:24
WERD_CHOICE * best_choice
Definition: pageres.h:239
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:313
void BestChoiceToCorrectText()
Definition: pageres.cpp:956
WERD_RES * forward()
Definition: pageres.h:743
WERD_RES * word() const
Definition: pageres.h:763
WERD_RES * restart_page()
Definition: pageres.h:710
float angle() const
find angle
Definition: points.h:246
float y() const
Definition: points.h:209
float certainty() const
Definition: ratngs.h:315
unsigned length() const
Definition: ratngs.h:287
const std::string & unichar_lengths() const
Definition: ratngs.h:533
std::string & unichar_string()
Definition: ratngs.h:519
uint16_t length() const
Definition: rejctmap.h:333
bool flag(WERD_FLAGS mask) const
Definition: werd.h:128
uint8_t space() const
Definition: werd.h:100
void set_text(const char *new_text)
Definition: werd.h:124
ParamsVectors * params()
Definition: ccutil.h:53
UNICHARSET unicharset
Definition: ccutil.h:61
std::string lang
Definition: ccutil.h:59
std::string datadir
Definition: ccutil.h:57
std::vector< BoolParam * > bool_params
Definition: params.h:48
std::vector< StringParam * > string_params
Definition: params.h:49
std::vector< IntParam * > int_params
Definition: params.h:47
std::vector< DoubleParam * > double_params
Definition: params.h:50
static bool GetParamAsString(const char *name, const ParamsVectors *member_params, std::string *value)
Definition: params.cpp:130
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:164
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:81
bool LoadMemBuffer(const char *name, const char *data, int size)
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:886
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:262
bool WriteTRFile(const char *filename)
Definition: blobclass.cpp:60
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:527
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:324
void DeleteUnusedDawgs()
Definition: dawg_cache.h:42
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:172
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:345
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:801
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:381
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:385
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:354
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:387