tesseract  4.00.00dev
baseapi.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: baseapi.cpp
3  * Description: Simple API for calling tesseract.
4  * Author: Ray Smith
5  * Created: Fri Oct 06 15:35:01 PDT 2006
6  *
7  * (C) Copyright 2006, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #ifdef __linux__
26 #include <signal.h>
27 #endif
28 
29 #if defined(_WIN32)
30 #ifdef _MSC_VER
31 #include "vcsversion.h"
32 #elif MINGW
33 // workaround for stdlib.h with -std=c++11 for _splitpath and _MAX_FNAME
34 #undef __STRICT_ANSI__
35 #endif // _MSC_VER
36 #include <fcntl.h>
37 #include <io.h>
38 #else
39 #include <dirent.h>
40 #include <libgen.h>
41 #include <string.h>
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <unistd.h>
45 #endif // _WIN32
46 
47 #include <fstream>
48 #include <iostream>
49 #include <iterator>
50 #include <memory> // std::unique_ptr
51 #include <string>
52 
53 #include "allheaders.h"
54 
55 #include "baseapi.h"
56 #include "blobclass.h"
57 #include "resultiterator.h"
58 #include "mutableiterator.h"
59 #include "thresholder.h"
60 #include "tesseractclass.h"
61 #include "pageres.h"
62 #include "paragraphs.h"
63 #include "tessvars.h"
64 #include "control.h"
65 #include "dict.h"
66 #include "pgedit.h"
67 #include "paramsd.h"
68 #include "output.h"
69 #include "globaloc.h"
70 #include "globals.h"
71 #include "edgblob.h"
72 #include "equationdetect.h"
73 #include "tessbox.h"
74 #include "makerow.h"
75 #include "otsuthr.h"
76 #include "osdetect.h"
77 #include "params.h"
78 #include "renderer.h"
79 #include "strngs.h"
80 #include "openclwrapper.h"
81 
82 BOOL_VAR(stream_filelist, FALSE, "Stream a filelist from stdin");
83 
84 namespace tesseract {
85 
87 const int kMinRectSize = 10;
89 const char kTesseractReject = '~';
91 const char kUNLVReject = '~';
93 const char kUNLVSuspect = '^';
98 const char* kInputFile = "noname.tif";
102 const char* kOldVarsFile = "failed_vars.txt";
104 const int kMaxIntSize = 22;
105 
106 /* Add all available languages recursively.
107 */
108 static void addAvailableLanguages(const STRING &datadir, const STRING &base,
109  GenericVector<STRING>* langs)
110 {
111  const STRING base2 = (base.string()[0] == '\0') ? base : base + "/";
112  const size_t extlen = sizeof(kTrainedDataSuffix);
113 #ifdef _WIN32
114  WIN32_FIND_DATA data;
115  HANDLE handle = FindFirstFile((datadir + base2 + "*").string(), &data);
116  if (handle != INVALID_HANDLE_VALUE) {
117  BOOL result = TRUE;
118  for (; result;) {
119  char *name = data.cFileName;
120  // Skip '.', '..', and hidden files
121  if (name[0] != '.') {
122  if ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) ==
123  FILE_ATTRIBUTE_DIRECTORY) {
124  addAvailableLanguages(datadir, base2 + name, langs);
125  } else {
126  size_t len = strlen(name);
127  if (len > extlen && name[len - extlen] == '.' &&
128  strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
129  name[len - extlen] = '\0';
130  langs->push_back(base2 + name);
131  }
132  }
133  }
134  result = FindNextFile(handle, &data);
135  }
136  FindClose(handle);
137  }
138 #else // _WIN32
139  DIR* dir = opendir((datadir + base).string());
140  if (dir != NULL) {
141  dirent *de;
142  while ((de = readdir(dir))) {
143  char *name = de->d_name;
144  // Skip '.', '..', and hidden files
145  if (name[0] != '.') {
146  struct stat st;
147  if (stat((datadir + base2 + name).string(), &st) == 0 &&
148  (st.st_mode & S_IFDIR) == S_IFDIR) {
149  addAvailableLanguages(datadir, base2 + name, langs);
150  } else {
151  size_t len = strlen(name);
152  if (len > extlen && name[len - extlen] == '.' &&
153  strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
154  name[len - extlen] = '\0';
155  langs->push_back(base2 + name);
156  }
157  }
158  }
159  }
160  closedir(dir);
161  }
162 #endif
163 }
164 
166  : tesseract_(nullptr),
167  osd_tesseract_(nullptr),
168  equ_detect_(nullptr),
169  reader_(nullptr),
170  // Thresholder is initialized to NULL here, but will be set before use by:
171  // A constructor of a derived API, SetThresholder(), or
172  // created implicitly when used in InternalSetImage.
173  thresholder_(nullptr),
174  paragraph_models_(nullptr),
175  block_list_(nullptr),
176  page_res_(nullptr),
177  input_file_(nullptr),
178  output_file_(nullptr),
179  datapath_(nullptr),
180  language_(nullptr),
181  last_oem_requested_(OEM_DEFAULT),
182  recognition_done_(false),
183  truth_cb_(NULL),
184  rect_left_(0),
185  rect_top_(0),
186  rect_width_(0),
187  rect_height_(0),
188  image_width_(0),
189  image_height_(0) {}
190 
192  End();
193 }
194 
198 const char* TessBaseAPI::Version() {
199 #if defined(GIT_REV) && (defined(DEBUG) || defined(_DEBUG))
200  return GIT_REV;
201 #else
202  return TESSERACT_VERSION_STR;
203 #endif
204 }
205 
213 #ifdef USE_OPENCL
214 #if USE_DEVICE_SELECTION
215 #include "opencl_device_selection.h"
216 #endif
217 #endif
218 size_t TessBaseAPI::getOpenCLDevice(void **data) {
219 #ifdef USE_OPENCL
220 #if USE_DEVICE_SELECTION
221  ds_device device = OpenclDevice::getDeviceSelection();
222  if (device.type == DS_DEVICE_OPENCL_DEVICE) {
223  *data = new cl_device_id;
224  memcpy(*data, &device.oclDeviceID, sizeof(cl_device_id));
225  return sizeof(cl_device_id);
226  }
227 #endif
228 #endif
229 
230  *data = NULL;
231  return 0;
232 }
233 
239 #ifdef __linux__
240  struct sigaction action;
241  memset(&action, 0, sizeof(action));
242  action.sa_handler = &signal_exit;
243  action.sa_flags = SA_RESETHAND;
244  sigaction(SIGSEGV, &action, NULL);
245  sigaction(SIGFPE, &action, NULL);
246  sigaction(SIGBUS, &action, NULL);
247 #else
248  // Warn API users that an implementation is needed.
249  tprintf("CatchSignals has no non-linux implementation!\n");
250 #endif
251 }
252 
257 void TessBaseAPI::SetInputName(const char* name) {
258  if (input_file_ == NULL)
259  input_file_ = new STRING(name);
260  else
261  *input_file_ = name;
262 }
263 
265 void TessBaseAPI::SetOutputName(const char* name) {
266  if (output_file_ == NULL)
267  output_file_ = new STRING(name);
268  else
269  *output_file_ = name;
270 }
271 
272 bool TessBaseAPI::SetVariable(const char* name, const char* value) {
273  if (tesseract_ == NULL) tesseract_ = new Tesseract;
275  tesseract_->params());
276 }
277 
278 bool TessBaseAPI::SetDebugVariable(const char* name, const char* value) {
279  if (tesseract_ == NULL) tesseract_ = new Tesseract;
281  tesseract_->params());
282 }
283 
284 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
285  IntParam *p = ParamUtils::FindParam<IntParam>(
287  if (p == NULL) return false;
288  *value = (inT32)(*p);
289  return true;
290 }
291 
292 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
293  BoolParam *p = ParamUtils::FindParam<BoolParam>(
295  if (p == NULL) return false;
296  *value = (BOOL8)(*p);
297  return true;
298 }
299 
300 const char *TessBaseAPI::GetStringVariable(const char *name) const {
301  StringParam *p = ParamUtils::FindParam<StringParam>(
303  return (p != NULL) ? p->string() : NULL;
304 }
305 
306 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
307  DoubleParam *p = ParamUtils::FindParam<DoubleParam>(
309  if (p == NULL) return false;
310  *value = (double)(*p);
311  return true;
312 }
313 
315 bool TessBaseAPI::GetVariableAsString(const char *name, STRING *val) {
316  return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
317 }
318 
320 void TessBaseAPI::PrintVariables(FILE *fp) const {
322 }
323 
332 int TessBaseAPI::Init(const char* datapath, const char* language,
333  OcrEngineMode oem, char **configs, int configs_size,
334  const GenericVector<STRING> *vars_vec,
335  const GenericVector<STRING> *vars_values,
336  bool set_only_non_debug_params) {
337  return Init(datapath, 0, language, oem, configs, configs_size, vars_vec,
338  vars_values, set_only_non_debug_params, nullptr);
339 }
340 
341 // In-memory version reads the traineddata file directly from the given
342 // data[data_size] array. Also implements the version with a datapath in data,
343 // flagged by data_size = 0.
344 int TessBaseAPI::Init(const char* data, int data_size, const char* language,
345  OcrEngineMode oem, char** configs, int configs_size,
346  const GenericVector<STRING>* vars_vec,
347  const GenericVector<STRING>* vars_values,
348  bool set_only_non_debug_params, FileReader reader) {
349  PERF_COUNT_START("TessBaseAPI::Init")
350  // Default language is "eng".
351  if (language == nullptr) language = "eng";
352  STRING datapath = data_size == 0 ? data : language;
353  // If the datapath, OcrEngineMode or the language have changed - start again.
354  // Note that the language_ field stores the last requested language that was
355  // initialized successfully, while tesseract_->lang stores the language
356  // actually used. They differ only if the requested language was NULL, in
357  // which case tesseract_->lang is set to the Tesseract default ("eng").
358  if (tesseract_ != nullptr &&
359  (datapath_ == nullptr || language_ == nullptr || *datapath_ != datapath ||
360  last_oem_requested_ != oem ||
361  (*language_ != language && tesseract_->lang != language))) {
362  delete tesseract_;
363  tesseract_ = nullptr;
364  }
365  // PERF_COUNT_SUB("delete tesseract_")
366 #ifdef USE_OPENCL
367  OpenclDevice od;
368  od.InitEnv();
369 #endif
370  PERF_COUNT_SUB("OD::InitEnv()")
371  bool reset_classifier = true;
372  if (tesseract_ == nullptr) {
373  reset_classifier = false;
374  tesseract_ = new Tesseract;
375  if (reader != nullptr) reader_ = reader;
377  if (data_size != 0) {
378  mgr.LoadMemBuffer(language, data, data_size);
379  }
381  datapath.string(),
382  output_file_ != nullptr ? output_file_->string() : nullptr,
383  language, oem, configs, configs_size, vars_vec, vars_values,
384  set_only_non_debug_params, &mgr) != 0) {
385  return -1;
386  }
387  }
388  PERF_COUNT_SUB("update tesseract_")
389  // Update datapath and language requested for the last valid initialization.
390  if (datapath_ == nullptr)
391  datapath_ = new STRING(datapath);
392  else
393  *datapath_ = datapath;
394  if ((strcmp(datapath_->string(), "") == 0) &&
395  (strcmp(tesseract_->datadir.string(), "") != 0))
397 
398  if (language_ == nullptr)
399  language_ = new STRING(language);
400  else
401  *language_ = language;
403  // PERF_COUNT_SUB("update last_oem_requested_")
404  // For same language and datapath, just reset the adaptive classifier.
405  if (reset_classifier) {
407  PERF_COUNT_SUB("tesseract_->ResetAdaptiveClassifier()")
408  }
410  return 0;
411 }
412 
422  return (language_ == NULL || language_->string() == NULL) ?
423  "" : language_->string();
424 }
425 
432  GenericVector<STRING>* langs) const {
433  langs->clear();
434  if (tesseract_ != NULL) {
435  langs->push_back(tesseract_->lang);
436  int num_subs = tesseract_->num_sub_langs();
437  for (int i = 0; i < num_subs; ++i)
438  langs->push_back(tesseract_->get_sub_lang(i)->lang);
439  }
440 }
441 
446  GenericVector<STRING>* langs) const {
447  langs->clear();
448  if (tesseract_ != NULL) {
449  addAvailableLanguages(tesseract_->datadir, "", langs);
450  }
451 }
452 
459 int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
460  if (tesseract_ == NULL)
461  tesseract_ = new Tesseract;
462  else
464  TessdataManager mgr;
465  return tesseract_->init_tesseract_lm(datapath, NULL, language, &mgr);
466 }
467 
473  if (tesseract_ == NULL) {
474  tesseract_ = new Tesseract;
476  }
477 }
478 
486 }
487 
491 }
492 
499  if (tesseract_ == NULL)
500  tesseract_ = new Tesseract;
501  tesseract_->tessedit_pageseg_mode.set_value(mode);
502 }
503 
506  if (tesseract_ == NULL)
507  return PSM_SINGLE_BLOCK;
508  return static_cast<PageSegMode>(
509  static_cast<int>(tesseract_->tessedit_pageseg_mode));
510 }
511 
525 char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
526  int bytes_per_pixel,
527  int bytes_per_line,
528  int left, int top,
529  int width, int height) {
530  if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize)
531  return NULL; // Nothing worth doing.
532 
533  // Since this original api didn't give the exact size of the image,
534  // we have to invent a reasonable value.
535  int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
536  SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top,
537  bytes_per_pixel, bytes_per_line);
538  SetRectangle(left, top, width, height);
539 
540  return GetUTF8Text();
541 }
542 
548  if (tesseract_ == NULL)
549  return;
552 }
553 
561 void TessBaseAPI::SetImage(const unsigned char* imagedata,
562  int width, int height,
563  int bytes_per_pixel, int bytes_per_line) {
564  if (InternalSetImage()) {
565  thresholder_->SetImage(imagedata, width, height,
566  bytes_per_pixel, bytes_per_line);
568  }
569 }
570 
572  if (thresholder_)
574  else
575  tprintf("Please call SetImage before SetSourceResolution.\n");
576 }
577 
586 void TessBaseAPI::SetImage(Pix* pix) {
587  if (InternalSetImage()) {
588  thresholder_->SetImage(pix);
590  }
591 }
592 
598 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
599  if (thresholder_ == NULL)
600  return;
601  thresholder_->SetRectangle(left, top, width, height);
602  ClearResults();
603 }
604 
610  if (tesseract_ == nullptr || thresholder_ == nullptr) return nullptr;
611  if (tesseract_->pix_binary() == nullptr &&
613  return nullptr;
614  }
615  return pixClone(tesseract_->pix_binary());
616 }
617 
623 Boxa* TessBaseAPI::GetRegions(Pixa** pixa) {
624  return GetComponentImages(RIL_BLOCK, false, pixa, NULL);
625 }
626 
635 Boxa* TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding,
636  Pixa** pixa, int** blockids, int** paraids) {
637  return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding,
638  pixa, blockids, paraids);
639 }
640 
649 Boxa* TessBaseAPI::GetStrips(Pixa** pixa, int** blockids) {
650  return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
651 }
652 
658 Boxa* TessBaseAPI::GetWords(Pixa** pixa) {
659  return GetComponentImages(RIL_WORD, true, pixa, NULL);
660 }
661 
669  return GetComponentImages(RIL_SYMBOL, true, pixa, NULL);
670 }
671 
681  bool text_only, bool raw_image,
682  const int raw_padding,
683  Pixa** pixa, int** blockids,
684  int** paraids) {
685  PageIterator* page_it = GetIterator();
686  if (page_it == NULL)
687  page_it = AnalyseLayout();
688  if (page_it == NULL)
689  return NULL; // Failed.
690 
691  // Count the components to get a size for the arrays.
692  int component_count = 0;
693  int left, top, right, bottom;
694 
695  TessResultCallback<bool>* get_bbox = NULL;
696  if (raw_image) {
697  // Get bounding box in original raw image with padding.
699  level, raw_padding,
700  &left, &top, &right, &bottom);
701  } else {
702  // Get bounding box from binarized image. Note that this could be
703  // differently scaled from the original image.
704  get_bbox = NewPermanentTessCallback(page_it,
706  level, &left, &top, &right, &bottom);
707  }
708  do {
709  if (get_bbox->Run() &&
710  (!text_only || PTIsTextType(page_it->BlockType())))
711  ++component_count;
712  } while (page_it->Next(level));
713 
714  Boxa* boxa = boxaCreate(component_count);
715  if (pixa != NULL)
716  *pixa = pixaCreate(component_count);
717  if (blockids != NULL)
718  *blockids = new int[component_count];
719  if (paraids != NULL)
720  *paraids = new int[component_count];
721 
722  int blockid = 0;
723  int paraid = 0;
724  int component_index = 0;
725  page_it->Begin();
726  do {
727  if (get_bbox->Run() &&
728  (!text_only || PTIsTextType(page_it->BlockType()))) {
729  Box* lbox = boxCreate(left, top, right - left, bottom - top);
730  boxaAddBox(boxa, lbox, L_INSERT);
731  if (pixa != NULL) {
732  Pix* pix = NULL;
733  if (raw_image) {
734  pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left,
735  &top);
736  } else {
737  pix = page_it->GetBinaryImage(level);
738  }
739  pixaAddPix(*pixa, pix, L_INSERT);
740  pixaAddBox(*pixa, lbox, L_CLONE);
741  }
742  if (paraids != NULL) {
743  (*paraids)[component_index] = paraid;
744  if (page_it->IsAtFinalElement(RIL_PARA, level))
745  ++paraid;
746  }
747  if (blockids != NULL) {
748  (*blockids)[component_index] = blockid;
749  if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
750  ++blockid;
751  paraid = 0;
752  }
753  }
754  ++component_index;
755  }
756  } while (page_it->Next(level));
757  delete page_it;
758  delete get_bbox;
759  return boxa;
760 }
761 
763  if (thresholder_ == NULL) {
764  return 0;
765  }
766  return thresholder_->GetScaleFactor();
767 }
768 
770 void TessBaseAPI::DumpPGM(const char* filename) {
771  if (tesseract_ == NULL)
772  return;
773  FILE *fp = fopen(filename, "wb");
774  Pix* pix = tesseract_->pix_binary();
775  int width = pixGetWidth(pix);
776  int height = pixGetHeight(pix);
777  l_uint32* data = pixGetData(pix);
778  fprintf(fp, "P5 %d %d 255\n", width, height);
779  for (int y = 0; y < height; ++y, data += pixGetWpl(pix)) {
780  for (int x = 0; x < width; ++x) {
781  uint8_t b = GET_DATA_BIT(data, x) ? 0 : 255;
782  fwrite(&b, 1, 1, fp);
783  }
784  }
785  fclose(fp);
786 }
787 
804 
805 PageIterator* TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
806  if (FindLines() == 0) {
807  if (block_list_->empty())
808  return NULL; // The page was empty.
809  page_res_ = new PAGE_RES(merge_similar_words, block_list_, NULL);
810  DetectParagraphs(false);
811  return new PageIterator(
815  }
816  return NULL;
817 }
818 
824  if (tesseract_ == NULL)
825  return -1;
826  if (FindLines() != 0)
827  return -1;
828  delete page_res_;
829  if (block_list_->empty()) {
830  page_res_ = new PAGE_RES(false, block_list_,
832  return 0; // Empty page.
833  }
834 
836  recognition_done_ = true;
841  } else {
844  }
845  if (page_res_ == NULL) {
846  return -1;
847  }
851  return 0;
852  }
855  return 0;
856  }
857 
858  if (truth_cb_ != NULL) {
859  tesseract_->wordrec_run_blamer.set_value(true);
860  PageIterator *page_it = new PageIterator(
865  image_height_, page_it, this->tesseract()->pix_grey());
866  delete page_it;
867  }
868 
869  int result = 0;
871  #ifndef GRAPHICS_DISABLED
873  #endif // GRAPHICS_DISABLED
874  // The page_res is invalid after an interactive session, so cleanup
875  // in a way that lets us continue to the next page without crashing.
876  delete page_res_;
877  page_res_ = NULL;
878  return -1;
880  STRING fontname;
881  ExtractFontName(*output_file_, &fontname);
883  } else if (tesseract_->tessedit_ambigs_training) {
884  FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
885  // OCR the page segmented into words by tesseract.
887  *input_file_, page_res_, monitor, training_output_file);
888  fclose(training_output_file);
889  } else {
890  // Now run the main recognition.
891  bool wait_for_text = true;
892  GetBoolVariable("paragraph_text_based", &wait_for_text);
893  if (!wait_for_text) DetectParagraphs(false);
894  if (tesseract_->recog_all_words(page_res_, monitor, NULL, NULL, 0)) {
895  if (wait_for_text) DetectParagraphs(true);
896  } else {
897  result = -1;
898  }
899  }
900  return result;
901 }
902 
905  if (tesseract_ == NULL)
906  return -1;
907  if (thresholder_ == NULL || thresholder_->IsEmpty()) {
908  tprintf("Please call SetImage before attempting recognition.\n");
909  return -1;
910  }
911  if (page_res_ != NULL)
912  ClearResults();
913  if (FindLines() != 0)
914  return -1;
915  // Additional conditions under which chopper test cannot be run
916  if (tesseract_->interactive_display_mode) return -1;
917 
918  recognition_done_ = true;
919 
920  page_res_ = new PAGE_RES(false, block_list_,
922 
923  PAGE_RES_IT page_res_it(page_res_);
924 
925  while (page_res_it.word() != NULL) {
926  WERD_RES *word_res = page_res_it.word();
927  GenericVector<TBOX> boxes;
928  tesseract_->MaximallyChopWord(boxes, page_res_it.block()->block,
929  page_res_it.row()->row, word_res);
930  page_res_it.forward();
931  }
932  return 0;
933 }
934 
935 // Takes ownership of the input pix.
937 
939 
941  if (input_file_)
942  return input_file_->c_str();
943  return NULL;
944 }
945 
946 const char * TessBaseAPI::GetDatapath() {
947  return tesseract_->datadir.c_str();
948 }
949 
952 }
953 
954 // If flist exists, get data from there. Otherwise get data from buf.
955 // Seems convoluted, but is the easiest way I know of to meet multiple
956 // goals. Support streaming from stdin, and also work on platforms
957 // lacking fmemopen.
958 bool TessBaseAPI::ProcessPagesFileList(FILE *flist,
959  STRING *buf,
960  const char* retry_config,
961  int timeout_millisec,
962  TessResultRenderer* renderer,
963  int tessedit_page_number) {
964  if (!flist && !buf) return false;
965  int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
966  char pagename[MAX_PATH];
967 
968  GenericVector<STRING> lines;
969  if (!flist) {
970  buf->split('\n', &lines);
971  if (lines.empty()) return false;
972  }
973 
974  // Skip to the requested page number.
975  for (int i = 0; i < page; i++) {
976  if (flist) {
977  if (fgets(pagename, sizeof(pagename), flist) == NULL) break;
978  }
979  }
980 
981  // Begin producing output
982  if (renderer && !renderer->BeginDocument(unknown_title_)) {
983  return false;
984  }
985 
986  // Loop over all pages - or just the requested one
987  while (true) {
988  if (flist) {
989  if (fgets(pagename, sizeof(pagename), flist) == NULL) break;
990  } else {
991  if (page >= lines.size()) break;
992  snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str());
993  }
994  chomp_string(pagename);
995  Pix *pix = pixRead(pagename);
996  if (pix == NULL) {
997  tprintf("Image file %s cannot be read!\n", pagename);
998  return false;
999  }
1000  tprintf("Page %d : %s\n", page, pagename);
1001  bool r = ProcessPage(pix, page, pagename, retry_config,
1002  timeout_millisec, renderer);
1003  pixDestroy(&pix);
1004  if (!r) return false;
1005  if (tessedit_page_number >= 0) break;
1006  ++page;
1007  }
1008 
1009  // Finish producing output
1010  if (renderer && !renderer->EndDocument()) {
1011  return false;
1012  }
1013  return true;
1014 }
1015 
1016 bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
1017  size_t size,
1018  const char* filename,
1019  const char* retry_config,
1020  int timeout_millisec,
1021  TessResultRenderer* renderer,
1022  int tessedit_page_number) {
1023 #ifndef ANDROID_BUILD
1024  Pix *pix = NULL;
1025  int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
1026  size_t offset = 0;
1027  for (; ; ++page) {
1028  if (tessedit_page_number >= 0)
1029  page = tessedit_page_number;
1030  pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset)
1031  : pixReadFromMultipageTiff(filename, &offset);
1032  if (pix == NULL) break;
1033  tprintf("Page %d\n", page + 1);
1034  char page_str[kMaxIntSize];
1035  snprintf(page_str, kMaxIntSize - 1, "%d", page);
1036  SetVariable("applybox_page", page_str);
1037  bool r = ProcessPage(pix, page, filename, retry_config,
1038  timeout_millisec, renderer);
1039  pixDestroy(&pix);
1040  if (!r) return false;
1041  if (tessedit_page_number >= 0) break;
1042  if (!offset) break;
1043  }
1044  return true;
1045 #else
1046  return false;
1047 #endif
1048 }
1049 
1050 // Master ProcessPages calls ProcessPagesInternal and then does any post-
1051 // processing required due to being in a training mode.
1052 bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
1053  int timeout_millisec,
1054  TessResultRenderer* renderer) {
1055  bool result =
1056  ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
1057  if (result) {
1060  tprintf("Write of TR file failed: %s\n", output_file_->string());
1061  return false;
1062  }
1063  }
1064  return result;
1065 }
1066 
1067 // In the ideal scenario, Tesseract will start working on data as soon
1068 // as it can. For example, if you stream a filelist through stdin, we
1069 // should start the OCR process as soon as the first filename is
1070 // available. This is particularly useful when hooking Tesseract up to
1071 // slow hardware such as a book scanning machine.
1072 //
1073 // Unfortunately there are tradeoffs. You can't seek on stdin. That
1074 // makes automatic detection of datatype (TIFF? filelist? PNG?)
1075 // impractical. So we support a command line flag to explicitly
1076 // identify the scenario that really matters: filelists on
1077 // stdin. We'll still do our best if the user likes pipes.
1078 bool TessBaseAPI::ProcessPagesInternal(const char* filename,
1079  const char* retry_config,
1080  int timeout_millisec,
1081  TessResultRenderer* renderer) {
1082  PERF_COUNT_START("ProcessPages")
1083  bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
1084  if (stdInput) {
1085 #ifdef WIN32
1086  if (_setmode(_fileno(stdin), _O_BINARY) == -1)
1087  tprintf("ERROR: cin to binary: %s", strerror(errno));
1088 #endif // WIN32
1089  }
1090 
1091  if (stream_filelist) {
1092  return ProcessPagesFileList(stdin, NULL, retry_config,
1093  timeout_millisec, renderer,
1095  }
1096 
1097  // At this point we are officially in autodetection territory.
1098  // That means any data in stdin must be buffered, to make it
1099  // seekable.
1100  std::string buf;
1101  const l_uint8 *data = NULL;
1102  if (stdInput) {
1103  buf.assign((std::istreambuf_iterator<char>(std::cin)),
1104  (std::istreambuf_iterator<char>()));
1105  data = reinterpret_cast<const l_uint8 *>(buf.data());
1106  }
1107 
1108  // Here is our autodetection
1109  int format;
1110  int r = (stdInput) ?
1111  findFileFormatBuffer(data, &format) :
1112  findFileFormat(filename, &format);
1113 
1114  // Maybe we have a filelist
1115  if (r != 0 || format == IFF_UNKNOWN) {
1116  STRING s;
1117  if (stdInput) {
1118  s = buf.c_str();
1119  } else {
1120  std::ifstream t(filename);
1121  std::string u((std::istreambuf_iterator<char>(t)),
1122  std::istreambuf_iterator<char>());
1123  s = u.c_str();
1124  }
1125  return ProcessPagesFileList(NULL, &s, retry_config,
1126  timeout_millisec, renderer,
1128  }
1129 
1130  // Maybe we have a TIFF which is potentially multipage
1131  bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS ||
1132  format == IFF_TIFF_RLE || format == IFF_TIFF_G3 ||
1133  format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
1134  format == IFF_TIFF_ZIP);
1135 
1136  // Fail early if we can, before producing any output
1137  Pix *pix = NULL;
1138  if (!tiff) {
1139  pix = (stdInput) ? pixReadMem(data, buf.size()) : pixRead(filename);
1140  if (pix == NULL) {
1141  return false;
1142  }
1143  }
1144 
1145  // Begin the output
1146  if (renderer && !renderer->BeginDocument(unknown_title_)) {
1147  pixDestroy(&pix);
1148  return false;
1149  }
1150 
1151  // Produce output
1152  r = (tiff) ?
1153  ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config,
1154  timeout_millisec, renderer,
1156  ProcessPage(pix, 0, filename, retry_config,
1157  timeout_millisec, renderer);
1158 
1159  // Clean up memory as needed
1160  pixDestroy(&pix);
1161 
1162  // End the output
1163  if (!r || (renderer && !renderer->EndDocument())) {
1164  return false;
1165  }
1167  return true;
1168 }
1169 
1170 bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
1171  const char* retry_config, int timeout_millisec,
1172  TessResultRenderer* renderer) {
1173  PERF_COUNT_START("ProcessPage")
1174  SetInputName(filename);
1175  SetImage(pix);
1176  bool failed = false;
1177 
1179  // Disabled character recognition
1180  PageIterator* it = AnalyseLayout();
1181 
1182  if (it == NULL) {
1183  failed = true;
1184  } else {
1185  delete it;
1186  }
1188  failed = FindLines() != 0;
1189  } else if (timeout_millisec > 0) {
1190  // Running with a timeout.
1191  ETEXT_DESC monitor;
1192  monitor.cancel = NULL;
1193  monitor.cancel_this = NULL;
1194  monitor.set_deadline_msecs(timeout_millisec);
1195 
1196  // Now run the main recognition.
1197  failed = Recognize(&monitor) < 0;
1198  } else {
1199  // Normal layout and character recognition with no timeout.
1200  failed = Recognize(NULL) < 0;
1201  }
1202 
1204 #ifndef ANDROID_BUILD
1205  Pix* page_pix = GetThresholdedImage();
1206  pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4);
1207 #endif // ANDROID_BUILD
1208  }
1209 
1210  if (failed && retry_config != NULL && retry_config[0] != '\0') {
1211  // Save current config variables before switching modes.
1212  FILE* fp = fopen(kOldVarsFile, "wb");
1213  PrintVariables(fp);
1214  fclose(fp);
1215  // Switch to alternate mode for retry.
1216  ReadConfigFile(retry_config);
1217  SetImage(pix);
1218  Recognize(NULL);
1219  // Restore saved config variables.
1220  ReadConfigFile(kOldVarsFile);
1221  }
1222 
1223  if (renderer && !failed) {
1224  failed = !renderer->AddImage(this);
1225  }
1226 
1228  return !failed;
1229 }
1230 
1236  if (tesseract_ == NULL || page_res_ == NULL)
1237  return NULL;
1238  return new LTRResultIterator(
1242 }
1243 
1253  if (tesseract_ == NULL || page_res_ == NULL)
1254  return NULL;
1259 }
1260 
1270  if (tesseract_ == NULL || page_res_ == NULL)
1271  return NULL;
1272  return new MutableIterator(page_res_, tesseract_,
1276 }
1277 
1280  if (tesseract_ == NULL ||
1281  (!recognition_done_ && Recognize(NULL) < 0))
1282  return NULL;
1283  STRING text("");
1284  ResultIterator *it = GetIterator();
1285  do {
1286  if (it->Empty(RIL_PARA)) continue;
1287  const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
1288  text += para_text.get();
1289  } while (it->Next(RIL_PARA));
1290  char* result = new char[text.length() + 1];
1291  strncpy(result, text.string(), text.length() + 1);
1292  delete it;
1293  return result;
1294 }
1295 
1299 static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
1300  tesseract::Orientation orientation;
1301  tesseract::WritingDirection writing_direction;
1302  tesseract::TextlineOrder textline_order;
1303  float deskew_angle;
1304  it->Orientation(&orientation, &writing_direction, &textline_order,
1305  &deskew_angle);
1306  return orientation;
1307 }
1308 
1317 static void AddBaselineCoordsTohOCR(const PageIterator *it,
1318  PageIteratorLevel level,
1319  STRING* hocr_str) {
1320  tesseract::Orientation orientation = GetBlockTextOrientation(it);
1321  if (orientation != ORIENTATION_PAGE_UP) {
1322  hocr_str->add_str_int("; textangle ", 360 - orientation * 90);
1323  return;
1324  }
1325 
1326  int left, top, right, bottom;
1327  it->BoundingBox(level, &left, &top, &right, &bottom);
1328 
1329  // Try to get the baseline coordinates at this level.
1330  int x1, y1, x2, y2;
1331  if (!it->Baseline(level, &x1, &y1, &x2, &y2))
1332  return;
1333  // Following the description of this field of the hOCR spec, we convert the
1334  // baseline coordinates so that "the bottom left of the bounding box is the
1335  // origin".
1336  x1 -= left;
1337  x2 -= left;
1338  y1 -= bottom;
1339  y2 -= bottom;
1340 
1341  // Now fit a line through the points so we can extract coefficients for the
1342  // equation: y = p1 x + p0
1343  double p1 = 0;
1344  double p0 = 0;
1345  if (x1 == x2) {
1346  // Problem computing the polynomial coefficients.
1347  return;
1348  }
1349  p1 = (y2 - y1) / static_cast<double>(x2 - x1);
1350  p0 = y1 - static_cast<double>(p1 * x1);
1351 
1352  hocr_str->add_str_double("; baseline ", round(p1 * 1000.0) / 1000.0);
1353  hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0);
1354 }
1355 
1356 static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
1357  int num2) {
1358  const size_t BUFSIZE = 64;
1359  char id_buffer[BUFSIZE];
1360  if (num2 >= 0) {
1361  snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d", base.c_str(), num1, num2);
1362  } else {
1363  snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1);
1364  }
1365  id_buffer[BUFSIZE - 1] = '\0';
1366  *hocr_str += " id='";
1367  *hocr_str += id_buffer;
1368  *hocr_str += "'";
1369 }
1370 
1371 static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
1372  STRING* hocr_str) {
1373  int left, top, right, bottom;
1374  it->BoundingBox(level, &left, &top, &right, &bottom);
1375  // This is the only place we use double quotes instead of single quotes,
1376  // but it may too late to change for consistency
1377  hocr_str->add_str_int(" title=\"bbox ", left);
1378  hocr_str->add_str_int(" ", top);
1379  hocr_str->add_str_int(" ", right);
1380  hocr_str->add_str_int(" ", bottom);
1381  // Add baseline coordinates & heights for textlines only.
1382  if (level == RIL_TEXTLINE) {
1383  AddBaselineCoordsTohOCR(it, level, hocr_str);
1384  // add custom height measures
1385  float row_height, descenders, ascenders; // row attributes
1386  it->RowAttributes(&row_height, &descenders, &ascenders);
1387  // TODO(rays): Do we want to limit these to a single decimal place?
1388  hocr_str->add_str_double("; x_size ", row_height);
1389  hocr_str->add_str_double("; x_descenders ", descenders * -1);
1390  hocr_str->add_str_double("; x_ascenders ", ascenders);
1391  }
1392  *hocr_str += "\">";
1393 }
1394 
1395 static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level,
1396  STRING* hocr_str) {
1397  int left, top, right, bottom;
1398  it->BoundingBox(level, &left, &top, &right, &bottom);
1399  hocr_str->add_str_int("\t", left);
1400  hocr_str->add_str_int("\t", top);
1401  hocr_str->add_str_int("\t", right - left);
1402  hocr_str->add_str_int("\t", bottom - top);
1403 }
1404 
1414 char* TessBaseAPI::GetHOCRText(int page_number) {
1415  return GetHOCRText(NULL, page_number);
1416 }
1417 
// Builds the hOCR markup for the current page: one "ocr_page" div holding
// nested ocr_carea / ocr_par / ocr_line / ocrx_word elements.
//   monitor      passed to Recognize() when no page results exist yet.
//   page_number  0-based page index; written as "ppageno" and, plus one,
//                used as the page part of every element id.
// Returns a new[]-allocated, NUL-terminated string (release with delete[]),
// or NULL when tesseract_ is NULL or the on-demand recognition fails.
1427 char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
  // Recognition runs lazily: only when there are no results yet.
1428  if (tesseract_ == NULL || (page_res_ == NULL && Recognize(monitor) < 0))
1429  return NULL;
1430 
  // Running element counters; each one becomes the second number of the
  // corresponding element id (line, block, paragraph, word).
1431  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1432  int page_id = page_number + 1; // hOCR uses 1-based page numbers.
1433  bool para_is_ltr = true; // Default direction is LTR
1434  const char* paragraph_lang = NULL;
  // x_font / x_fsize are only written when the hocr_font_info variable is set.
1435  bool font_info = false;
1436  GetBoolVariable("hocr_font_info", &font_info);
1437 
1438  STRING hocr_str("");
1439 
1440  if (input_file_ == NULL)
1441  SetInputName(NULL);
1442 
1443 #ifdef _WIN32
  // NOTE(review): this rewrites *input_file_ in place on every call, so a
  // second call would presumably re-convert an already converted name -
  // confirm. The return values of the conversion calls are not checked.
1444  // convert input name from ANSI encoding to utf-8
1445  int str16_len =
1446  MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, NULL, 0);
1447  wchar_t *uni16_str = new WCHAR[str16_len];
1448  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
1449  uni16_str, str16_len);
1450  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL, 0,
1451  NULL, NULL);
1452  char *utf8_str = new char[utf8_len];
1453  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
1454  utf8_len, NULL, NULL);
1455  *input_file_ = utf8_str;
1456  delete[] uni16_str;
1457  delete[] utf8_str;
1458 #endif
1459 
  // Page element: id, escaped image name, page rectangle and page number.
1460  hocr_str += " <div class='ocr_page'";
1461  AddIdTohOCR(&hocr_str, "page", page_id, -1);
1462  hocr_str += " title='image \"";
1463  if (input_file_) {
1464  hocr_str += HOcrEscape(input_file_->string());
1465  } else {
1466  hocr_str += "unknown";
1467  }
  // NOTE(review): the third and fourth bbox values written here are
  // rect_width_ / rect_height_, whereas the element boxes below write
  // right / bottom edges - confirm this is intended.
1468  hocr_str.add_str_int("\"; bbox ", rect_left_);
1469  hocr_str.add_str_int(" ", rect_top_);
1470  hocr_str.add_str_int(" ", rect_width_);
1471  hocr_str.add_str_int(" ", rect_height_);
1472  hocr_str.add_str_int("; ppageno ", page_number);
1473  hocr_str += "'>\n";
1474 
  // Walk the results one word per loop pass. The symbol loop further down
  // is what advances the iterator to the next word.
1475  ResultIterator *res_it = GetIterator();
1476  while (!res_it->Empty(RIL_BLOCK)) {
  // Skip positions that hold no word.
1477  if (res_it->Empty(RIL_WORD)) {
1478  res_it->Next(RIL_WORD);
1479  continue;
1480  }
1481 
1482  // Open any new block/paragraph/textline.
1483  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1484  para_is_ltr = true; // reset to default direction
1485  hocr_str += " <div class='ocr_carea'";
1486  AddIdTohOCR(&hocr_str, "block", page_id, bcnt);
1487  AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
1488  }
1489  if (res_it->IsAtBeginningOf(RIL_PARA)) {
1490  hocr_str += "\n <p class='ocr_par'";
1491  para_is_ltr = res_it->ParagraphIsLtr();
1492  if (!para_is_ltr) {
1493  hocr_str += " dir='rtl'";
1494  }
1495  AddIdTohOCR(&hocr_str, "par", page_id, pcnt);
  // The language of the paragraph's first word becomes the paragraph
  // language; words only get their own lang attribute when they differ.
  // NOTE(review): the language string is appended without HOcrEscape,
  // unlike the file name and font name - presumably it never contains
  // markup characters; confirm.
1496  paragraph_lang = res_it->WordRecognitionLanguage();
1497  if (paragraph_lang) {
1498  hocr_str += " lang='";
1499  hocr_str += paragraph_lang;
1500  hocr_str += "'";
1501  }
1502  AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
1503  }
1504  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1505  hocr_str += "\n <span class='ocr_line'";
1506  AddIdTohOCR(&hocr_str, "line", page_id, lcnt);
1507  AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
1508  }
1509 
1510  // Now, process the word...
1511  hocr_str += "<span class='ocrx_word'";
1512  AddIdTohOCR(&hocr_str, "word", page_id, wcnt);
1513  int left, top, right, bottom;
1514  bool bold, italic, underlined, monospace, serif, smallcaps;
1515  int pointsize, font_id;
1516  const char *font_name;
1517  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1518  font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
1519  &monospace, &serif, &smallcaps,
1520  &pointsize, &font_id);
  // Word title: bounding box, word confidence and, optionally, font info.
1521  hocr_str.add_str_int(" title='bbox ", left);
1522  hocr_str.add_str_int(" ", top);
1523  hocr_str.add_str_int(" ", right);
1524  hocr_str.add_str_int(" ", bottom);
1525  hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
1526  if (font_info) {
1527  if (font_name) {
1528  hocr_str += "; x_font ";
1529  hocr_str += HOcrEscape(font_name);
1530  }
1531  hocr_str.add_str_int("; x_fsize ", pointsize);
1532  }
1533  hocr_str += "'";
  // Emit lang only when it differs from the enclosing paragraph's language.
1534  const char* lang = res_it->WordRecognitionLanguage();
1535  if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
1536  hocr_str += " lang='";
1537  hocr_str += lang;
1538  hocr_str += "'";
1539  }
1540  switch (res_it->WordDirection()) {
1541  // Only emit direction if different from current paragraph direction
1542  case DIR_LEFT_TO_RIGHT:
1543  if (!para_is_ltr) hocr_str += " dir='ltr'";
1544  break;
1545  case DIR_RIGHT_TO_LEFT:
1546  if (para_is_ltr) hocr_str += " dir='rtl'";
1547  break;
1548  case DIR_MIX:
1549  case DIR_NEUTRAL:
1550  default: // Do nothing.
1551  break;
1552  }
1553  hocr_str += ">";
  // Sample the end-of-element flags now: the symbol loop below moves the
  // iterator off the current word.
1554  bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
1555  bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
1556  bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
1557  if (bold) hocr_str += "<strong>";
1558  if (italic) hocr_str += "<em>";
  // Write the word symbol by symbol (escaped), stopping when the iterator
  // reaches the start of the next word or runs out of blocks.
1559  do {
1560  const std::unique_ptr<const char[]> grapheme(
1561  res_it->GetUTF8Text(RIL_SYMBOL));
1562  if (grapheme && grapheme[0] != 0) {
1563  hocr_str += HOcrEscape(grapheme.get());
1564  }
1565  res_it->Next(RIL_SYMBOL);
1566  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
  // Close the emphasis tags in the reverse order of opening.
1567  if (italic) hocr_str += "</em>";
1568  if (bold) hocr_str += "</strong>";
1569  hocr_str += "</span> ";
1570  wcnt++;
1571  // Close any ending block/paragraph/textline.
1572  if (last_word_in_line) {
1573  hocr_str += "\n </span>";
1574  lcnt++;
1575  }
1576  if (last_word_in_para) {
1577  hocr_str += "\n </p>\n";
1578  pcnt++;
1579  para_is_ltr = true; // back to default direction
1580  }
1581  if (last_word_in_block) {
1582  hocr_str += " </div>\n";
1583  bcnt++;
1584  }
1585  }
  // Close the ocr_page div.
1586  hocr_str += " </div>\n";
1587 
  // Hand the result back as a plain new[] buffer owned by the caller.
1588  char *ret = new char[hocr_str.length() + 1];
1589  strcpy(ret, hocr_str.string());
1590  delete res_it;
1591  return ret;
1592 }
1593 
1599 char* TessBaseAPI::GetTSVText(int page_number) {
1600  if (tesseract_ == NULL || (page_res_ == NULL && Recognize(NULL) < 0))
1601  return NULL;
1602 
1603  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1604  int page_id = page_number + 1; // we use 1-based page numbers.
1605 
1606  STRING tsv_str("");
1607 
1608  int page_num = page_id, block_num = 0, par_num = 0, line_num = 0,
1609  word_num = 0;
1610 
1611  tsv_str.add_str_int("1\t", page_num); // level 1 - page
1612  tsv_str.add_str_int("\t", block_num);
1613  tsv_str.add_str_int("\t", par_num);
1614  tsv_str.add_str_int("\t", line_num);
1615  tsv_str.add_str_int("\t", word_num);
1616  tsv_str.add_str_int("\t", rect_left_);
1617  tsv_str.add_str_int("\t", rect_top_);
1618  tsv_str.add_str_int("\t", rect_width_);
1619  tsv_str.add_str_int("\t", rect_height_);
1620  tsv_str += "\t-1\t\n";
1621 
1622  ResultIterator* res_it = GetIterator();
1623  while (!res_it->Empty(RIL_BLOCK)) {
1624  if (res_it->Empty(RIL_WORD)) {
1625  res_it->Next(RIL_WORD);
1626  continue;
1627  }
1628 
1629  // Add rows for any new block/paragraph/textline.
1630  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1631  block_num++, par_num = 0, line_num = 0, word_num = 0;
1632  tsv_str.add_str_int("2\t", page_num); // level 2 - block
1633  tsv_str.add_str_int("\t", block_num);
1634  tsv_str.add_str_int("\t", par_num);
1635  tsv_str.add_str_int("\t", line_num);
1636  tsv_str.add_str_int("\t", word_num);
1637  AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str);
1638  tsv_str += "\t-1\t\n"; // end of row for block
1639  }
1640  if (res_it->IsAtBeginningOf(RIL_PARA)) {
1641  par_num++, line_num = 0, word_num = 0;
1642  tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph
1643  tsv_str.add_str_int("\t", block_num);
1644  tsv_str.add_str_int("\t", par_num);
1645  tsv_str.add_str_int("\t", line_num);
1646  tsv_str.add_str_int("\t", word_num);
1647  AddBoxToTSV(res_it, RIL_PARA, &tsv_str);
1648  tsv_str += "\t-1\t\n"; // end of row for para
1649  }
1650  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1651  line_num++, word_num = 0;
1652  tsv_str.add_str_int("4\t", page_num); // level 4 - line
1653  tsv_str.add_str_int("\t", block_num);
1654  tsv_str.add_str_int("\t", par_num);
1655  tsv_str.add_str_int("\t", line_num);
1656  tsv_str.add_str_int("\t", word_num);
1657  AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str);
1658  tsv_str += "\t-1\t\n"; // end of row for line
1659  }
1660 
1661  // Now, process the word...
1662  int left, top, right, bottom;
1663  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1664  word_num++;
1665  tsv_str.add_str_int("5\t", page_num); // level 5 - word
1666  tsv_str.add_str_int("\t", block_num);
1667  tsv_str.add_str_int("\t", par_num);
1668  tsv_str.add_str_int("\t", line_num);
1669  tsv_str.add_str_int("\t", word_num);
1670  tsv_str.add_str_int("\t", left);
1671  tsv_str.add_str_int("\t", top);
1672  tsv_str.add_str_int("\t", right - left);
1673  tsv_str.add_str_int("\t", bottom - top);
1674  tsv_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
1675  tsv_str += "\t";
1676 
1677  // Increment counts if at end of block/paragraph/textline.
1678  if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) lcnt++;
1679  if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) pcnt++;
1680  if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++;
1681 
1682  do {
1683  tsv_str +=
1684  std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
1685  res_it->Next(RIL_SYMBOL);
1686  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1687  tsv_str += "\n"; // end of row
1688  wcnt++;
1689  }
1690 
1691  char* ret = new char[tsv_str.length() + 1];
1692  strcpy(ret, tsv_str.string());
1693  delete res_it;
1694  return ret;
1695 }
1696 
1698 const int kNumbersPerBlob = 5;
1703 const int kBytesPerNumber = 5;
1709 const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1;
1711 const int kBytesPer64BitNumber = 20;
1718 const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 +
1719  UNICHAR_LEN;
1720 
// Returns the recognized symbols in box-file form, one line per symbol:
//   "<text> <left> <image_height_ - bottom> <right> <image_height_ - top>
//    <page_number>\n"
// The result is a new[]-allocated, NUL-terminated buffer; NULL is returned
// when tesseract_ is NULL or the on-demand recognition fails.
// NOTE(review): this listing skips source lines 1734 and 1738, so the end
// of the total_length expression and the declaration of `it` are not
// visible here.
1727 char* TessBaseAPI::GetBoxText(int page_number) {
1728  if (tesseract_ == NULL ||
1729  (!recognition_done_ && Recognize(NULL) < 0))
1730  return NULL;
  // Size the output buffer from the blob count and the UTF-8 text length.
1731  int blob_count;
1732  int utf8_length = TextLength(&blob_count);
1733  int total_length = blob_count * kBytesPerBoxFileLine + utf8_length +
1735  char* result = new char[total_length];
  // Start with an empty string so the buffer is valid even if nothing is
  // written below.
1736  result[0] = '\0';
1737  int output_length = 0;
1739  do {
1740  int left, top, right, bottom;
  // Symbols without a bounding box produce no line.
1741  if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
1742  const std::unique_ptr</*non-const*/ char[]> text(
1743  it->GetUTF8Text(RIL_SYMBOL));
1744  // Tesseract uses space for recognition failure. Fix to a reject
1745  // character, kTesseractReject so we don't create illegal box files.
1746  for (int i = 0; text[i] != '\0'; ++i) {
1747  if (text[i] == ' ')
1748  text[i] = kTesseractReject;
1749  }
  // Append the line at the current write position; snprintf is bounded by
  // the space that is left in the buffer.
1750  snprintf(result + output_length, total_length - output_length,
1751  "%s %d %d %d %d %d\n", text.get(), left, image_height_ - bottom,
1752  right, image_height_ - top, page_number);
1753  output_length += strlen(result + output_length);
1754  // Just in case...
  // Stop early when a further worst-case line might not fit.
1755  if (output_length + kMaxBytesPerLine > total_length)
1756  break;
1757  }
1758  } while (it->Next(RIL_SYMBOL));
1759  delete it;
1760  return result;
1761 }
1762 
/**
 * Code points that are replaced by a Latin-1 stand-in. The list is
 * 0-terminated and index-aligned with kLatinChs.
 */
const int kUniChs[] = {
  0x20ac,  // EURO SIGN
  0x201c,  // LEFT DOUBLE QUOTATION MARK
  0x201d,  // RIGHT DOUBLE QUOTATION MARK
  0x2018,  // LEFT SINGLE QUOTATION MARK
  0x2019,  // RIGHT SINGLE QUOTATION MARK
  0x2022,  // BULLET
  0x2014,  // EM DASH
  0        // terminator
};
/** Replacement for the entry at the same index of kUniChs. */
const int kLatinChs[] = {
  0x00a2,  // CENT SIGN
  0x0022,  // QUOTATION MARK
  0x0022,  // QUOTATION MARK
  0x0027,  // APOSTROPHE
  0x0027,  // APOSTROPHE
  0x00b7,  // MIDDLE DOT
  0x002d,  // HYPHEN-MINUS
  0        // terminator
};
1775 
1782  if (tesseract_ == NULL ||
1783  (!recognition_done_ && Recognize(NULL) < 0))
1784  return NULL;
1785  bool tilde_crunch_written = false;
1786  bool last_char_was_newline = true;
1787  bool last_char_was_tilde = false;
1788 
1789  int total_length = TextLength(NULL);
1790  PAGE_RES_IT page_res_it(page_res_);
1791  char* result = new char[total_length];
1792  char* ptr = result;
1793  for (page_res_it.restart_page(); page_res_it.word () != NULL;
1794  page_res_it.forward()) {
1795  WERD_RES *word = page_res_it.word();
1796  // Process the current word.
1797  if (word->unlv_crunch_mode != CR_NONE) {
1798  if (word->unlv_crunch_mode != CR_DELETE &&
1799  (!tilde_crunch_written ||
1800  (word->unlv_crunch_mode == CR_KEEP_SPACE &&
1801  word->word->space() > 0 &&
1802  !word->word->flag(W_FUZZY_NON) &&
1803  !word->word->flag(W_FUZZY_SP)))) {
1804  if (!word->word->flag(W_BOL) &&
1805  word->word->space() > 0 &&
1806  !word->word->flag(W_FUZZY_NON) &&
1807  !word->word->flag(W_FUZZY_SP)) {
1808  /* Write a space to separate from preceding good text */
1809  *ptr++ = ' ';
1810  last_char_was_tilde = false;
1811  }
1812  if (!last_char_was_tilde) {
1813  // Write a reject char.
1814  last_char_was_tilde = true;
1815  *ptr++ = kUNLVReject;
1816  tilde_crunch_written = true;
1817  last_char_was_newline = false;
1818  }
1819  }
1820  } else {
1821  // NORMAL PROCESSING of non tilde crunched words.
1822  tilde_crunch_written = false;
1824  const char* wordstr = word->best_choice->unichar_string().string();
1825  const STRING& lengths = word->best_choice->unichar_lengths();
1826  int length = lengths.length();
1827  int i = 0;
1828  int offset = 0;
1829 
1830  if (last_char_was_tilde &&
1831  word->word->space() == 0 && wordstr[offset] == ' ') {
1832  // Prevent adjacent tilde across words - we know that adjacent tildes
1833  // within words have been removed.
1834  // Skip the first character.
1835  offset = lengths[i++];
1836  }
1837  if (i < length && wordstr[offset] != 0) {
1838  if (!last_char_was_newline)
1839  *ptr++ = ' ';
1840  else
1841  last_char_was_newline = false;
1842  for (; i < length; offset += lengths[i++]) {
1843  if (wordstr[offset] == ' ' ||
1844  wordstr[offset] == kTesseractReject) {
1845  *ptr++ = kUNLVReject;
1846  last_char_was_tilde = true;
1847  } else {
1848  if (word->reject_map[i].rejected())
1849  *ptr++ = kUNLVSuspect;
1850  UNICHAR ch(wordstr + offset, lengths[i]);
1851  int uni_ch = ch.first_uni();
1852  for (int j = 0; kUniChs[j] != 0; ++j) {
1853  if (kUniChs[j] == uni_ch) {
1854  uni_ch = kLatinChs[j];
1855  break;
1856  }
1857  }
1858  if (uni_ch <= 0xff) {
1859  *ptr++ = static_cast<char>(uni_ch);
1860  last_char_was_tilde = false;
1861  } else {
1862  *ptr++ = kUNLVReject;
1863  last_char_was_tilde = true;
1864  }
1865  }
1866  }
1867  }
1868  }
1869  if (word->word->flag(W_EOL) && !last_char_was_newline) {
1870  /* Add a new line output */
1871  *ptr++ = '\n';
1872  tilde_crunch_written = false;
1873  last_char_was_newline = true;
1874  last_char_was_tilde = false;
1875  }
1876  }
1877  *ptr++ = '\n';
1878  *ptr = '\0';
1879  return result;
1880 }
1881 
1891 bool TessBaseAPI::DetectOrientationScript(int* orient_deg, float* orient_conf,
1892  const char** script_name,
1893  float* script_conf) {
1894  OSResults osr;
1895 
1896  bool osd = DetectOS(&osr);
1897  if (!osd) {
1898  return false;
1899  }
1900 
1901  int orient_id = osr.best_result.orientation_id;
1902  int script_id = osr.get_best_script(orient_id);
1903  if (orient_conf) *orient_conf = osr.best_result.oconfidence;
1904  if (orient_deg) *orient_deg = orient_id * 90; // convert quadrant to degrees
1905 
1906  if (script_name) {
1907  const char* script = osr.unicharset->get_script_from_script_id(script_id);
1908 
1909  *script_name = script;
1910  }
1911 
1912  if (script_conf) *script_conf = osr.best_result.sconfidence;
1913 
1914  return true;
1915 }
1916 
1922 char* TessBaseAPI::GetOsdText(int page_number) {
1923  int orient_deg;
1924  float orient_conf;
1925  const char* script_name;
1926  float script_conf;
1927 
1928  if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name,
1929  &script_conf))
1930  return NULL;
1931 
1932  // clockwise rotation needed to make the page upright
1933  int rotate = OrientationIdToValue(orient_deg / 90);
1934 
1935  const int kOsdBufsize = 255;
1936  char* osd_buf = new char[kOsdBufsize];
1937  snprintf(osd_buf, kOsdBufsize,
1938  "Page number: %d\n"
1939  "Orientation in degrees: %d\n"
1940  "Rotate: %d\n"
1941  "Orientation confidence: %.2f\n"
1942  "Script: %s\n"
1943  "Script confidence: %.2f\n",
1944  page_number, orient_deg, rotate, orient_conf, script_name,
1945  script_conf);
1946 
1947  return osd_buf;
1948 }
1949 
1952  int* conf = AllWordConfidences();
1953  if (!conf) return 0;
1954  int sum = 0;
1955  int *pt = conf;
1956  while (*pt >= 0) sum += *pt++;
1957  if (pt != conf) sum /= pt - conf;
1958  delete [] conf;
1959  return sum;
1960 }
1961 
1964  if (tesseract_ == NULL ||
1965  (!recognition_done_ && Recognize(NULL) < 0))
1966  return NULL;
1967  int n_word = 0;
1968  PAGE_RES_IT res_it(page_res_);
1969  for (res_it.restart_page(); res_it.word() != NULL; res_it.forward())
1970  n_word++;
1971 
1972  int* conf = new int[n_word+1];
1973  n_word = 0;
1974  for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) {
1975  WERD_RES *word = res_it.word();
1976  WERD_CHOICE* choice = word->best_choice;
1977  int w_conf = static_cast<int>(100 + 5 * choice->certainty());
1978  // This is the eq for converting Tesseract confidence to 1..100
1979  if (w_conf < 0) w_conf = 0;
1980  if (w_conf > 100) w_conf = 100;
1981  conf[n_word++] = w_conf;
1982  }
1983  conf[n_word] = -1;
1984  return conf;
1985 }
1986 
// Recognizes the current image in page segmentation mode `mode` and tries
// to adapt the classifier to the expected text `wordstr`.
// Returns true when a word result was available and LearnWord was called,
// false otherwise. The previous page segmentation mode is restored before
// returning.
// NOTE(review): this listing skips source lines 2031-2033, so the
// statements that rebuild page_res_ after the "No match" delete are not
// visible here.
1997 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char* wordstr) {
1998  int debug = 0;
1999  GetIntVariable("applybox_debug", &debug);
2000  bool success = true;
  // Switch to the requested mode for this call only; it is restored at the
  // end of the function.
2001  PageSegMode current_psm = GetPageSegMode();
2002  SetPageSegMode(mode);
  // Learning is switched off while the text is recognized; it is enabled
  // again on tesseract_ just before LearnWord below.
2003  SetVariable("classify_enable_learning", "0");
2004  const std::unique_ptr<const char[]> text(GetUTF8Text());
2005  if (debug) {
2006  tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr);
2007  }
2008  if (text != NULL) {
  // Attach the expected text to the first word of the page results.
2009  PAGE_RES_IT it(page_res_);
2010  WERD_RES* word_res = it.word();
2011  if (word_res != NULL) {
2012  word_res->word->set_text(wordstr);
2013  } else {
2014  success = false;
2015  }
2016  // Check to see if text matches wordstr.
  // The comparison ignores spaces in both strings and newlines in text.
2017  int w = 0;
2018  int t = 0;
2019  for (t = 0; text[t] != '\0'; ++t) {
2020  if (text[t] == '\n' || text[t] == ' ')
2021  continue;
2022  while (wordstr[w] == ' ') ++w;
2023  if (text[t] != wordstr[w])
2024  break;
2025  ++w;
2026  }
  // Both strings must have been consumed completely for a match.
2027  if (text[t] != '\0' || wordstr[w] != '\0') {
2028  // No match.
2029  delete page_res_;
2030  GenericVector<TBOX> boxes;
2034  PAGE_RES_IT pr_it(page_res_);
2035  if (pr_it.word() == NULL)
2036  success = false;
2037  else
2038  word_res = pr_it.word();
2039  } else {
  // NOTE(review): word_res is dereferenced here without a NULL check; it
  // is NULL when the page results held no word (success == false above) -
  // confirm that the match branch cannot be reached in that case.
2040  word_res->BestChoiceToCorrectText();
2041  }
2042  if (success) {
2043  tesseract_->EnableLearning = true;
2044  tesseract_->LearnWord(NULL, word_res);
2045  }
2046  } else {
2047  success = false;
2048  }
2049  SetPageSegMode(current_psm);
2050  return success;
2051 }
2052 
2060  if (thresholder_ != NULL)
2061  thresholder_->Clear();
2062  ClearResults();
2063  if (tesseract_ != NULL) SetInputImage(NULL);
2064 }
2065 
2073  Clear();
2074  delete thresholder_;
2075  thresholder_ = NULL;
2076  delete page_res_;
2077  page_res_ = NULL;
2078  delete block_list_;
2079  block_list_ = NULL;
2080  if (paragraph_models_ != NULL) {
2082  delete paragraph_models_;
2083  paragraph_models_ = NULL;
2084  }
2085  if (osd_tesseract_ == tesseract_) osd_tesseract_ = nullptr;
2086  delete tesseract_;
2087  tesseract_ = nullptr;
2088  delete osd_tesseract_;
2089  osd_tesseract_ = NULL;
2090  delete equ_detect_;
2091  equ_detect_ = NULL;
2092  delete input_file_;
2093  input_file_ = NULL;
2094  delete output_file_;
2095  output_file_ = NULL;
2096  delete datapath_;
2097  datapath_ = NULL;
2098  delete language_;
2099  language_ = NULL;
2100 }
2101 
2102 // Clear any library-level memory caches.
2103 // There are a variety of expensive-to-load constant data structures (mostly
2104 // language dictionaries) that are cached globally -- surviving the Init()
2105 // and End() of individual TessBaseAPI's. This function allows the clearing
2106 // of these caches.
2109 }
2110 
2115 int TessBaseAPI::IsValidWord(const char *word) {
2116  return tesseract_->getDict().valid_word(word);
2117 }
2118 // Returns true if utf8_character is defined in the UniCharset.
2119 bool TessBaseAPI::IsValidCharacter(const char *utf8_character) {
2120  return tesseract_->unicharset.contains_unichar(utf8_character);
2121 }
2122 
2123 
2124 // TODO(rays) Obsolete this function and replace with a more aptly named
2125 // function that returns image coordinates rather than tesseract coordinates.
2126 bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) {
2127  PageIterator* it = AnalyseLayout();
2128  if (it == NULL) {
2129  return false;
2130  }
2131  int x1, x2, y1, y2;
2132  it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
2133  // Calculate offset and slope (NOTE: Kind of ugly)
2134  if (x2 <= x1) x2 = x1 + 1;
2135  // Convert the point pair to slope/offset of the baseline (in image coords.)
2136  *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);
2137  *out_offset = static_cast<int>(y1 - *out_slope * x1);
2138  // Get the y-coord of the baseline at the left and right edges of the
2139  // textline's bounding box.
2140  int left, top, right, bottom;
2141  if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {
2142  delete it;
2143  return false;
2144  }
2145  int left_y = IntCastRounded(*out_slope * left + *out_offset);
2146  int right_y = IntCastRounded(*out_slope * right + *out_offset);
2147  // Shift the baseline down so it passes through the nearest bottom-corner
2148  // of the textline's bounding box. This is the difference between the y
2149  // at the lowest (max) edge of the box and the actual box bottom.
2150  *out_offset += bottom - MAX(left_y, right_y);
2151  // Switch back to bottom-up tesseract coordinates. Requires negation of
2152  // the slope and height - offset for the offset.
2153  *out_slope = -*out_slope;
2154  *out_offset = rect_height_ - *out_offset;
2155  delete it;
2156 
2157  return true;
2158 }
2159 
2162  if (tesseract_ != NULL) {
2164  }
2165 }
2166 
2176  if (tesseract_ != NULL) {
2178  // Set it for the sublangs too.
2179  int num_subs = tesseract_->num_sub_langs();
2180  for (int i = 0; i < num_subs; ++i) {
2182  }
2183  }
2184 }
2185 
2188  if (tesseract_ != NULL) tesseract_->fill_lattice_ = f;
2189 }
2190 
2193  if (tesseract_ == NULL) {
2194  tprintf("Please call Init before attempting to set an image.\n");
2195  return false;
2196  }
2197  if (thresholder_ == NULL)
2199  ClearResults();
2200  return true;
2201 }
2202 
// Thresholds the current image into *pix and configures tesseract_ with
// the resulting resolution.
//   pix  must not be NULL; a Pix already stored in *pix is destroyed first.
// Returns false when thresholder_->ThresholdToPix fails, true otherwise.
// NOTE(review): this listing skips several source lines (2219-2220,
// 2226-2228, 2230-2231, 2233, 2241-2242), so the resolution fallback, the
// handling of the thresholded images and the ClipToRange bounds are only
// partly visible here.
2209 bool TessBaseAPI::Threshold(Pix** pix) {
2210  ASSERT_HOST(pix != NULL);
2211  if (*pix != NULL)
2212  pixDestroy(pix);
2213  // Zero resolution messes up the algorithms, so make sure it is credible.
2214  int y_res = thresholder_->GetScaledYResolution();
2215  if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
2216  // Use the minimum default resolution, as it is safer to under-estimate
2217  // than over-estimate resolution.
2218  tprintf("Warning. Invalid resolution %d dpi. Using %d instead.\n", y_res,
2221  }
  // The page segmentation mode currently configured on tesseract_ is
  // passed on to the thresholder.
2222  PageSegMode pageseg_mode =
2223  static_cast<PageSegMode>(
2224  static_cast<int>(tesseract_->tessedit_pageseg_mode));
2225  if (!thresholder_->ThresholdToPix(pageseg_mode, pix)) return false;
2229  if (!thresholder_->IsBinary()) {
2232  } else {
  // In this branch (thresholder_->IsBinary() is true) the grey image held
  // by tesseract_ is cleared.
2234  tesseract_->set_pix_grey(NULL);
2235  }
2236  // Set the internal resolution that is used for layout parameters from the
2237  // estimated resolution, rather than the image resolution, which may be
2238  // fabricated, but we will use the image resolution, if there is one, to
2239  // report output point sizes.
2240  int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
  // Report when clipping changed the estimate.
2243  if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
2244  tprintf("Estimated resolution %d out of range! Corrected to %d\n",
2245  thresholder_->GetScaledEstimatedResolution(), estimated_res);
2246  }
2247  tesseract_->set_source_resolution(estimated_res);
2248  SavePixForCrash(estimated_res, *pix);
2249  return true;
2250 }
2251 
2254  if (thresholder_ == NULL || thresholder_->IsEmpty()) {
2255  tprintf("Please call SetImage before attempting recognition.\n");
2256  return -1;
2257  }
2258  if (recognition_done_)
2259  ClearResults();
2260  if (!block_list_->empty()) {
2261  return 0;
2262  }
2263  if (tesseract_ == NULL) {
2264  tesseract_ = new Tesseract;
2266  }
2267  if (tesseract_->pix_binary() == NULL &&
2269  return -1;
2270  }
2271 
2273 
2275  if (equ_detect_ == NULL && datapath_ != NULL) {
2276  equ_detect_ = new EquationDetect(datapath_->string(), NULL);
2277  }
2278  if (equ_detect_ == nullptr) {
2279  tprintf("Warning: Could not set equation detector\n");
2280  } else {
2282  }
2283  }
2284 
2285  Tesseract* osd_tess = osd_tesseract_;
2286  OSResults osr;
2288  osd_tess == nullptr) {
2289  if (strcmp(language_->string(), "osd") == 0) {
2290  osd_tess = tesseract_;
2291  } else {
2292  osd_tesseract_ = new Tesseract;
2293  TessdataManager mgr(reader_);
2294  if (datapath_ == nullptr) {
2295  tprintf("Warning: Auto orientation and script detection requested,"
2296  " but data path is undefined\n");
2297  delete osd_tesseract_;
2298  osd_tesseract_ = nullptr;
2299  } else if (osd_tesseract_->init_tesseract(datapath_->string(), nullptr,
2300  "osd", OEM_TESSERACT_ONLY,
2301  nullptr, 0, nullptr, nullptr,
2302  false, &mgr) == 0) {
2303  osd_tess = osd_tesseract_;
2306  } else {
2307  tprintf("Warning: Auto orientation and script detection requested,"
2308  " but osd language failed to load\n");
2309  delete osd_tesseract_;
2310  osd_tesseract_ = nullptr;
2311  }
2312  }
2313  }
2314 
2315  if (tesseract_->SegmentPage(input_file_, block_list_, osd_tess, &osr) < 0)
2316  return -1;
2317  // If Devanagari is being recognized, we use different images for page seg
2318  // and for OCR.
2319  tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
2320  return 0;
2321 }
2322 
2325  if (tesseract_ != NULL) {
2326  tesseract_->Clear();
2327  }
2328  if (page_res_ != NULL) {
2329  delete page_res_;
2330  page_res_ = NULL;
2331  }
2332  recognition_done_ = false;
2333  if (block_list_ == NULL)
2334  block_list_ = new BLOCK_LIST;
2335  else
2336  block_list_->clear();
2337  if (paragraph_models_ != NULL) {
2339  delete paragraph_models_;
2340  paragraph_models_ = NULL;
2341  }
2342  SavePixForCrash(0, NULL);
2343 }
2344 
2352 int TessBaseAPI::TextLength(int* blob_count) {
2353  if (tesseract_ == NULL || page_res_ == NULL)
2354  return 0;
2355 
2356  PAGE_RES_IT page_res_it(page_res_);
2357  int total_length = 2;
2358  int total_blobs = 0;
2359  // Iterate over the data structures to extract the recognition result.
2360  for (page_res_it.restart_page(); page_res_it.word () != NULL;
2361  page_res_it.forward()) {
2362  WERD_RES *word = page_res_it.word();
2363  WERD_CHOICE* choice = word->best_choice;
2364  if (choice != NULL) {
2365  total_blobs += choice->length() + 2;
2366  total_length += choice->unichar_string().length() + 2;
2367  for (int i = 0; i < word->reject_map.length(); ++i) {
2368  if (word->reject_map[i].rejected())
2369  ++total_length;
2370  }
2371  }
2372  }
2373  if (blob_count != NULL)
2374  *blob_count = total_blobs;
2375  return total_length;
2376 }
2377 
2383  if (tesseract_ == NULL)
2384  return false;
2385  ClearResults();
2386  if (tesseract_->pix_binary() == NULL &&
2388  return false;
2389  }
2390  if (input_file_ == NULL)
2391  input_file_ = new STRING(kInputFile);
2393 }
2394 
2396  tesseract_->min_orientation_margin.set_value(margin);
2397 }
2398 
2413 void TessBaseAPI::GetBlockTextOrientations(int** block_orientation,
2414  bool** vertical_writing) {
2415  delete[] *block_orientation;
2416  *block_orientation = NULL;
2417  delete[] *vertical_writing;
2418  *vertical_writing = NULL;
2419  BLOCK_IT block_it(block_list_);
2420 
2421  block_it.move_to_first();
2422  int num_blocks = 0;
2423  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2424  if (!block_it.data()->poly_block()->IsText()) {
2425  continue;
2426  }
2427  ++num_blocks;
2428  }
2429  if (!num_blocks) {
2430  tprintf("WARNING: Found no blocks\n");
2431  return;
2432  }
2433  *block_orientation = new int[num_blocks];
2434  *vertical_writing = new bool[num_blocks];
2435  block_it.move_to_first();
2436  int i = 0;
2437  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
2438  block_it.forward()) {
2439  if (!block_it.data()->poly_block()->IsText()) {
2440  continue;
2441  }
2442  FCOORD re_rotation = block_it.data()->re_rotation();
2443  float re_theta = re_rotation.angle();
2444  FCOORD classify_rotation = block_it.data()->classify_rotation();
2445  float classify_theta = classify_rotation.angle();
2446  double rot_theta = - (re_theta - classify_theta) * 2.0 / PI;
2447  if (rot_theta < 0) rot_theta += 4;
2448  int num_rotations = static_cast<int>(rot_theta + 0.5);
2449  (*block_orientation)[i] = num_rotations;
2450  // The classify_rotation is non-zero only if the text has vertical
2451  // writing direction.
2452  (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
2453  ++i;
2454  }
2455 }
2456 
2457 // ____________________________________________________________________________
2458 // Ocropus add-ons.
2459 
2462  FindLines();
2463  BLOCK_LIST* result = block_list_;
2464  block_list_ = NULL;
2465  return result;
2466 }
2467 
2473 void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
2474  delete block_list;
2475 }
2476 
2477 
2479  float xheight,
2480  float descender,
2481  float ascender) {
2482  inT32 xstarts[] = {-32000};
2483  double quad_coeffs[] = {0, 0, baseline};
2484  return new ROW(1,
2485  xstarts,
2486  quad_coeffs,
2487  xheight,
2488  ascender - (baseline + xheight),
2489  descender - baseline,
2490  0,
2491  0);
2492 }
2493 
2496  int width = pixGetWidth(pix);
2497  int height = pixGetHeight(pix);
2498  BLOCK block("a character", TRUE, 0, 0, 0, 0, width, height);
2499 
2500  // Create C_BLOBs from the page
2501  extract_edges(pix, &block);
2502 
2503  // Merge all C_BLOBs
2504  C_BLOB_LIST *list = block.blob_list();
2505  C_BLOB_IT c_blob_it(list);
2506  if (c_blob_it.empty())
2507  return NULL;
2508  // Move all the outlines to the first blob.
2509  C_OUTLINE_IT ol_it(c_blob_it.data()->out_list());
2510  for (c_blob_it.forward();
2511  !c_blob_it.at_first();
2512  c_blob_it.forward()) {
2513  C_BLOB *c_blob = c_blob_it.data();
2514  ol_it.add_list_after(c_blob->out_list());
2515  }
2516  // Convert the first blob to the output TBLOB.
2517  return TBLOB::PolygonalCopy(false, c_blob_it.data());
2518 }
2519 
2525 void TessBaseAPI::NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode) {
2526  TBOX box = tblob->bounding_box();
2527  float x_center = (box.left() + box.right()) / 2.0f;
2528  float baseline = row->base_line(x_center);
2529  float scale = kBlnXHeight / row->x_height();
2530  tblob->Normalize(NULL, NULL, NULL, x_center, baseline, scale, scale,
2531  0.0f, static_cast<float>(kBlnBaselineOffset), false, NULL);
2532 }
2533 
2538 TBLOB *make_tesseract_blob(float baseline, float xheight,
2539  float descender, float ascender,
2540  bool numeric_mode, Pix* pix) {
2541  TBLOB *tblob = TessBaseAPI::MakeTBLOB(pix);
2542 
2543  // Normalize TBLOB
2544  ROW *row =
2545  TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender);
2546  TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode);
2547  delete row;
2548  return tblob;
2549 }
2550 
2556 void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
2557  int length,
2558  float baseline,
2559  float xheight,
2560  float descender,
2561  float ascender) {
2562  UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length);
2563  TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender,
2565  tesseract_->pix_binary());
2566  float threshold;
2567  float best_rating = -100;
2568 
2569 
2570  // Classify to get a raw choice.
2571  BLOB_CHOICE_LIST choices;
2572  tesseract_->AdaptiveClassifier(blob, &choices);
2573  BLOB_CHOICE_IT choice_it;
2574  choice_it.set_to_list(&choices);
2575  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2576  choice_it.forward()) {
2577  if (choice_it.data()->rating() > best_rating) {
2578  best_rating = choice_it.data()->rating();
2579  }
2580  }
2581 
2582  threshold = tesseract_->matcher_good_threshold;
2583 
2584  if (blob->outlines)
2585  tesseract_->AdaptToChar(blob, id, kUnknownFontinfoId, threshold,
2587  delete blob;
2588 }
2589 
2590 
2591 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
2592  PAGE_RES *page_res = new PAGE_RES(false, block_list,
2594  tesseract_->recog_all_words(page_res, NULL, NULL, NULL, 1);
2595  return page_res;
2596 }
2597 
2598 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
2599  PAGE_RES* pass1_result) {
2600  if (!pass1_result)
2601  pass1_result = new PAGE_RES(false, block_list,
2603  tesseract_->recog_all_words(pass1_result, NULL, NULL, NULL, 2);
2604  return pass1_result;
2605 }
2606 
2607 void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
2608  int debug_level = 0;
2609  GetIntVariable("paragraph_debug_level", &debug_level);
2610  if (paragraph_models_ == NULL)
2612  MutableIterator *result_it = GetMutableIterator();
2613  do { // Detect paragraphs for this block
2615  ::tesseract::DetectParagraphs(debug_level, after_text_recognition,
2616  result_it, &models);
2617  *paragraph_models_ += models;
2618  } while (result_it->Next(RIL_BLOCK));
2619  delete result_it;
2620 }
2621 
2624  int length; // of unicode_repr
2625  float cost;
2627 
2628  TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
2629  length = (len == -1 ? strlen(repr) : len);
2630  unicode_repr = new char[length + 1];
2631  strncpy(unicode_repr, repr, length);
2632  }
2633 
2634  TESS_CHAR() { // Satisfies ELISTIZE.
2635  }
2637  delete [] unicode_repr;
2638  }
2639 };
2640 
2643 
2644 static void add_space(TESS_CHAR_IT* it) {
2645  TESS_CHAR *t = new TESS_CHAR(0, " ");
2646  it->add_after_then_move(t);
2647 }
2648 
2649 
// Maps a rating onto a cost by shifting it up by 100 and clamping the result
// at zero, so ratings below -100 all yield a cost of 0.
static float rating_to_cost(float rating) {
  const float shifted = 100 + rating;
  // Ratings worse than -100 are not expected, but clamp them anyway.
  return shifted < 0 ? 0 : shifted;
}
2658 
2663 static void extract_result(TESS_CHAR_IT* out,
2664  PAGE_RES* page_res) {
2665  PAGE_RES_IT page_res_it(page_res);
2666  int word_count = 0;
2667  while (page_res_it.word() != NULL) {
2668  WERD_RES *word = page_res_it.word();
2669  const char *str = word->best_choice->unichar_string().string();
2670  const char *len = word->best_choice->unichar_lengths().string();
2671  TBOX real_rect = word->word->bounding_box();
2672 
2673  if (word_count)
2674  add_space(out);
2675  int n = strlen(len);
2676  for (int i = 0; i < n; i++) {
2677  TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()),
2678  str, *len);
2679  tc->box = real_rect.intersection(word->box_word->BlobBox(i));
2680  out->add_after_then_move(tc);
2681  str += *len;
2682  len++;
2683  }
2684  page_res_it.forward();
2685  word_count++;
2686  }
2687 }
2688 
2694  int** lengths,
2695  float** costs,
2696  int** x0,
2697  int** y0,
2698  int** x1,
2699  int** y1,
2700  PAGE_RES* page_res) {
2701  TESS_CHAR_LIST tess_chars;
2702  TESS_CHAR_IT tess_chars_it(&tess_chars);
2703  extract_result(&tess_chars_it, page_res);
2704  tess_chars_it.move_to_first();
2705  int n = tess_chars.length();
2706  int text_len = 0;
2707  *lengths = new int[n];
2708  *costs = new float[n];
2709  *x0 = new int[n];
2710  *y0 = new int[n];
2711  *x1 = new int[n];
2712  *y1 = new int[n];
2713  int i = 0;
2714  for (tess_chars_it.mark_cycle_pt();
2715  !tess_chars_it.cycled_list();
2716  tess_chars_it.forward(), i++) {
2717  TESS_CHAR *tc = tess_chars_it.data();
2718  text_len += (*lengths)[i] = tc->length;
2719  (*costs)[i] = tc->cost;
2720  (*x0)[i] = tc->box.left();
2721  (*y0)[i] = tc->box.bottom();
2722  (*x1)[i] = tc->box.right();
2723  (*y1)[i] = tc->box.top();
2724  }
2725  char *p = *text = new char[text_len];
2726 
2727  tess_chars_it.move_to_first();
2728  for (tess_chars_it.mark_cycle_pt();
2729  !tess_chars_it.cycled_list();
2730  tess_chars_it.forward()) {
2731  TESS_CHAR *tc = tess_chars_it.data();
2732  strncpy(p, tc->unicode_repr, tc->length);
2733  p += tc->length;
2734  }
2735  return n;
2736 }
2737 
2739 // The resulting features are returned in int_features, which must be
2740 // of size MAX_NUM_INT_FEATURES. The number of features is returned in
2741 // num_features (or 0 if there was a failure).
2742 // On return feature_outline_index is filled with an index of the outline
2743 // corresponding to each feature in int_features.
2744 // TODO(rays) Fix the caller to out outline_counts instead.
2746  INT_FEATURE_STRUCT* int_features,
2747  int* num_features,
2748  int* feature_outline_index) {
2749  GenericVector<int> outline_counts;
2752  INT_FX_RESULT_STRUCT fx_info;
2753  tesseract_->ExtractFeatures(*blob, false, &bl_features,
2754  &cn_features, &fx_info, &outline_counts);
2755  if (cn_features.empty() || cn_features.size() > MAX_NUM_INT_FEATURES) {
2756  *num_features = 0;
2757  return; // Feature extraction failed.
2758  }
2759  *num_features = cn_features.size();
2760  memcpy(int_features, &cn_features[0], *num_features * sizeof(cn_features[0]));
2761  // TODO(rays) Pass outline_counts back and simplify the calling code.
2762  if (feature_outline_index != NULL) {
2763  int f = 0;
2764  for (int i = 0; i < outline_counts.size(); ++i) {
2765  while (f < outline_counts[i])
2766  feature_outline_index[f++] = i;
2767  }
2768  }
2769 }
2770 
2771 // This method returns the row to which a box of specified dimensions would
2772 // belong. If no good match is found, it returns NULL.
2773 ROW* TessBaseAPI::FindRowForBox(BLOCK_LIST* blocks,
2774  int left, int top, int right, int bottom) {
2775  TBOX box(left, bottom, right, top);
2776  BLOCK_IT b_it(blocks);
2777  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
2778  BLOCK* block = b_it.data();
2779  if (!box.major_overlap(block->bounding_box()))
2780  continue;
2781  ROW_IT r_it(block->row_list());
2782  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
2783  ROW* row = r_it.data();
2784  if (!box.major_overlap(row->bounding_box()))
2785  continue;
2786  WERD_IT w_it(row->word_list());
2787  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
2788  WERD* word = w_it.data();
2789  if (box.major_overlap(word->bounding_box()))
2790  return row;
2791  }
2792  }
2793  }
2794  return NULL;
2795 }
2796 
2799  int num_max_matches,
2800  int* unichar_ids,
2801  float* ratings,
2802  int* num_matches_returned) {
2803  BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
2804  tesseract_->AdaptiveClassifier(blob, choices);
2805  BLOB_CHOICE_IT choices_it(choices);
2806  int& index = *num_matches_returned;
2807  index = 0;
2808  for (choices_it.mark_cycle_pt();
2809  !choices_it.cycled_list() && index < num_max_matches;
2810  choices_it.forward()) {
2811  BLOB_CHOICE* choice = choices_it.data();
2812  unichar_ids[index] = choice->unichar_id();
2813  ratings[index] = choice->rating();
2814  ++index;
2815  }
2816  *num_matches_returned = index;
2817  delete choices;
2818 }
2819 
2821 const char* TessBaseAPI::GetUnichar(int unichar_id) {
2822  return tesseract_->unicharset.id_to_unichar(unichar_id);
2823 }
2824 
2826 const Dawg *TessBaseAPI::GetDawg(int i) const {
2827  if (tesseract_ == NULL || i >= NumDawgs()) return NULL;
2828  return tesseract_->getDict().GetDawg(i);
2829 }
2830 
2833  return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs();
2834 }
2835 
2837 STRING HOcrEscape(const char* text) {
2838  STRING ret;
2839  const char *ptr;
2840  for (ptr = text; *ptr; ptr++) {
2841  switch (*ptr) {
2842  case '<': ret += "&lt;"; break;
2843  case '>': ret += "&gt;"; break;
2844  case '&': ret += "&amp;"; break;
2845  case '"': ret += "&quot;"; break;
2846  case '\'': ret += "&#39;"; break;
2847  default: ret += *ptr;
2848  }
2849  }
2850  return ret;
2851 }
2852 
2853 } // namespace tesseract.
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:472
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:33
static void CatchSignals()
Definition: baseapi.cpp:238
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:416
float x_height() const
Definition: ocrrow.h:61
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
bool ProcessPagesInternal(const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1078
Definition: points.h:189
bool empty() const
Definition: genericvector.h:91
Tesseract * tesseract() const
Definition: baseapi.h:764
GenericVector< IntParam * > int_params
Definition: params.h:44
TESS_CHAR(float _cost, const char *repr, int len=-1)
Definition: baseapi.cpp:2628
void DeleteUnusedDawgs()
Definition: dawg_cache.h:43
C_BLOB_LIST * blob_list()
get blobs
Definition: ocrblock.h:132
int GetScaledEstimatedResolution() const
Definition: thresholder.h:106
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:191
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:82
void GetAvailableLanguagesAsVector(GenericVector< STRING > *langs) const
Definition: baseapi.cpp:445
Dict & getDict() override
bool AddImage(TessBaseAPI *api)
Definition: renderer.cpp:83
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:305
#define MAX_PATH
Definition: platform.h:49
float y() const
Definition: points.h:212
ROW_RES * row() const
Definition: pageres.h:739
Tesseract * osd_tesseract_
For orientation & script detection.
Definition: baseapi.h:866
bool DetectOrientationScript(int *orient_deg, float *orient_conf, const char **script_name, float *script_conf)
Definition: baseapi.cpp:1891
TESS_LOCAL PAGE_RES * RecognitionPass1(BLOCK_LIST *block_list)
Definition: baseapi.cpp:2591
#define TRUE
Definition: capi.h:45
GenericVector< DoubleParam * > double_params
Definition: params.h:47
void Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor, float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift, float final_yshift, bool inverse, Pix *pix)
Definition: blobs.cpp:413
Boxa * GetConnectedComponents(Pixa **cc)
Definition: baseapi.cpp:668
void GetFeaturesForBlob(TBLOB *blob, INT_FEATURE_STRUCT *int_features, int *num_features, int *feature_outline_index)
Definition: baseapi.cpp:2745
TESSLINE * outlines
Definition: blobs.h:377
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
WERD_RES * restart_page()
Definition: pageres.h:683
constexpr int kMinCredibleResolution
Definition: publictypes.h:38
Definition: werd.h:36
static ROW * MakeTessOCRRow(float baseline, float xheight, float descender, float ascender)
Definition: baseapi.cpp:2478
STRING * language_
Last initialized language.
Definition: baseapi.h:876
TESS_LOCAL PAGE_RES * RecognitionPass2(BLOCK_LIST *block_list, PAGE_RES *pass1_result)
Definition: baseapi.cpp:2598
const int kBytesPerNumber
Definition: baseapi.cpp:1703
UNICHARSET * unicharset
Definition: osdetect.h:78
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:217
BLOCK * block
Definition: pageres.h:99
bool GetIntVariable(const char *name, int *value) const
Definition: baseapi.cpp:284
GenericVector< ParagraphModel * > * paragraph_models_
Definition: baseapi.h:870
Pix * pix_binary() const
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:509
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:706
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:865
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:285
#define MAX(x, y)
Definition: ndminx.h:24
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
Definition: intfx.cpp:445
char * GetTSVText(int page_number)
Definition: baseapi.cpp:1599
bool SetDebugVariable(const char *name, const char *value)
Definition: baseapi.cpp:278
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:172
const char kTesseractReject
Definition: baseapi.cpp:89
EquationDetect * equ_detect_
The equation detector.
Definition: baseapi.h:867
#define PERF_COUNT_SUB(SUB)
bool GetTextDirection(int *out_offset, float *out_slope)
Definition: baseapi.cpp:2126
void Orientation(tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
Pix * pix_grey() const
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
void set_pix_original(Pix *original_pix)
const char * GetInputName()
Definition: baseapi.cpp:940
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:412
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:853
TruthCallback * truth_cb_
Definition: baseapi.h:879
STRING * output_file_
Name used by debug code.
Definition: baseapi.h:874
#define PERF_COUNT_END
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:414
void pgeditor_main(int width, int height, PAGE_RES *page_res)
Definition: pgedit.cpp:337
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:300
virtual bool Next(PageIteratorLevel level)
GenericVector< BoolParam * > bool_params
Definition: params.h:45
void add_str_int(const char *str, int number)
Definition: strngs.cpp:381
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_non_debug_params)
Definition: baseapi.cpp:332
Tesseract * get_sub_lang(int index) const
STRING * input_file_
Name used by training code.
Definition: baseapi.h:873
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
double(Dict::* ProbabilityInContextFunc)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Definition: baseapi.h:78
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void * cancel_this
called whenever progress increases
Definition: ocrclass.h:127
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:185
BLOCK_LIST * FindLinesCreateBlockList()
Definition: baseapi.cpp:2461
float Confidence(PageIteratorLevel level) const
int OrientationIdToValue(const int &id)
Definition: osdetect.cpp:562
void PrintVariables(FILE *fp) const
Definition: baseapi.cpp:320
FileReader reader_
Reads files from any filesystem.
Definition: baseapi.h:868
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:823
float sconfidence
Definition: osdetect.h:43
bool tessedit_resegment_from_line_boxes
void RunAdaptiveClassifier(TBLOB *blob, int num_max_matches, int *unichar_ids, float *ratings, int *num_matches_returned)
Definition: baseapi.cpp:2798
static size_t getOpenCLDevice(void **device)
Definition: baseapi.cpp:218
STRING HOcrEscape(const char *text)
Definition: baseapi.cpp:2837
float rating() const
Definition: ratngs.h:323
void SetPageSegMode(PageSegMode mode)
Definition: baseapi.cpp:498
const char * GetDatapath()
Definition: baseapi.cpp:946
FILE * init_recog_training(const STRING &fname)
const int kLatinChs[]
Definition: baseapi.cpp:1772
void SetEquationDetect(EquationDetect *detector)
Definition: ocrrow.h:32
TESS_LOCAL int TextLength(int *blob_count)
Definition: baseapi.cpp:2352
Pix * pix_original() const
static ROW * FindRowForBox(BLOCK_LIST *blocks, int left, int top, int right, int bottom)
Definition: baseapi.cpp:2773
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:122
float certainty() const
Definition: ratngs.h:326
int size() const
Definition: genericvector.h:72
WERD_CHOICE * best_choice
Definition: pageres.h:219
REJMAP reject_map
Definition: pageres.h:271
WERD_RES * word() const
Definition: pageres.h:736
Definition: ocrblock.h:30
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:872
void CorrectClassifyWords(PAGE_RES *page_res)
Definition: applybox.cpp:772
#define BOOL
Definition: capi.h:44
const int kBlnXHeight
Definition: normalis.h:28
#define MAX_NUM_INT_FEATURES
Definition: intproto.h:132
const char * GetUnichar(int unichar_id)
Definition: baseapi.cpp:2821
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:356
bool recognition_done_
page_res_ contains recognition data.
Definition: baseapi.h:878
void SetSourceYResolution(int ppi)
Definition: thresholder.h:86
Definition: werd.h:35
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
void delete_data_pointers()
TESS_LOCAL bool InternalSetImage()
Definition: baseapi.cpp:2192
PolyBlockType BlockType() const
void chomp_string(char *str)
Definition: helpers.h:82
bool AdaptToWordStr(PageSegMode mode, const char *wordstr)
Definition: baseapi.cpp:1997
void SetRectangle(int left, int top, int width, int height)
#define tprintf(...)
Definition: tprintf.h:31
bool WriteTRFile(const STRING &filename)
Definition: blobclass.cpp:97
char * GetBoxText(int page_number)
Definition: baseapi.cpp:1727
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
void assign(const char *cstr, int len)
Definition: strngs.cpp:422
virtual TESS_LOCAL bool Threshold(Pix **pix)
Definition: baseapi.cpp:2209
STRING datadir
Definition: ccutil.h:64
void SetInputName(const char *name)
Definition: baseapi.cpp:257
int RecognizeForChopTest(ETEXT_DESC *monitor)
Definition: baseapi.cpp:904
PageSegMode GetPageSegMode() const
Definition: baseapi.cpp:505
WERD * word
Definition: pageres.h:175
int InitLangMod(const char *datapath, const char *language)
Definition: baseapi.cpp:459
void SetSourceResolution(int ppi)
Definition: baseapi.cpp:571
float angle() const
find angle
Definition: points.h:249
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates)
Definition: adaptmatch.cpp:872
void ClearAdaptiveClassifier()
Definition: baseapi.cpp:547
void SetFillLatticeFunc(FillLatticeFunc f)
Definition: baseapi.cpp:2187
static void ClearPersistentCache()
Definition: baseapi.cpp:2107
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:668
void SetProbabilityInContextFunc(ProbabilityInContextFunc f)
Definition: baseapi.cpp:2175
void add_str_double(const char *str, double number)
Definition: strngs.cpp:391
int num_sub_langs() const
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
double matcher_good_threshold
Definition: classify.h:419
bool IsValidCharacter(const char *utf8_character)
Definition: baseapi.cpp:2119
UNICHARSET unicharset
Definition: ccutil.h:68
static TESS_LOCAL int TesseractExtractResult(char **text, int **lengths, float **costs, int **x0, int **y0, int **x1, int **y1, PAGE_RES *page_res)
Definition: baseapi.cpp:2693
const char * string() const
Definition: strngs.cpp:198
int push_back(T object)
char * GetUTF8Text(PageIteratorLevel level) const
void RowAttributes(float *row_height, float *descenders, float *ascenders) const
static void NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode)
Definition: baseapi.cpp:2525
const TBOX & BlobBox(int index) const
Definition: boxword.h:86
int(Dict::* DictFunc)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: baseapi.h:76
OSBestResult best_result
Definition: osdetect.h:79
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: baseapi.cpp:561
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
bool LoadMemBuffer(const char *name, const char *data, int size)
void set_deadline_msecs(inT32 deadline_msecs)
Definition: ocrclass.h:146
void SetRectangle(int left, int top, int width, int height)
Definition: baseapi.cpp:598
inT16 top() const
Definition: rect.h:54
int orientation_id
Definition: osdetect.h:41
void set_pix_thresholds(Pix *thresholds)
const int kNumbersPerBlob
Definition: baseapi.cpp:1698
TESS_LOCAL LTRResultIterator * GetLTRIterator()
Definition: baseapi.cpp:1235
static bool GetParamAsString(const char *name, const ParamsVectors *member_params, STRING *value)
Definition: params.cpp:135
virtual void Clear()
Destroy the Pix if there is one, freeing memory.
Definition: thresholder.cpp:45
char * TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height)
Definition: baseapi.cpp:525
#define FALSE
Definition: capi.h:46
Boxa * GetStrips(Pixa **pixa, int **blockids)
Definition: baseapi.cpp:649
float oconfidence
Definition: osdetect.h:44
void GetBlockTextOrientations(int **block_orientation, bool **vertical_writing)
Definition: baseapi.cpp:2413
Pix * GetThresholdedImage()
Definition: baseapi.cpp:609
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:198
STRING lang
Definition: ccutil.h:66
const char kUNLVReject
Definition: baseapi.cpp:91
bool GetDoubleVariable(const char *name, double *value) const
Definition: baseapi.cpp:306
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
const char * string() const
Definition: params.h:202
bool AnyLSTMLang() const
bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
OcrEngineMode last_oem_requested_
Last ocr language mode requested.
Definition: baseapi.h:877
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:869
#define PERF_COUNT_START(FUNCT_NAME)
const STRING & unichar_lengths() const
Definition: ratngs.h:544
STRING * datapath_
Current location of tessdata.
Definition: baseapi.h:875
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool IsEmpty() const
Return true if no image has been set.
Definition: thresholder.cpp:50
bool ProcessPage(Pix *pix, int page_index, const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1170
inT16 bottom() const
Definition: rect.h:61
void SetInputImage(Pix *pix)
Definition: baseapi.cpp:936
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
Definition: applybox.cpp:117
TESS_LOCAL void DetectParagraphs(bool after_text_recognition)
Definition: baseapi.cpp:2607
Definition: strngs.h:45
int32_t inT32
Definition: host.h:38
const Dawg * GetDawg(int i) const
Definition: baseapi.cpp:2826
Definition: rect.h:30
Automatic page segmentation, but no OSD, or OCR.
Definition: publictypes.h:167
void GetLoadedLanguagesAsVector(GenericVector< STRING > *langs) const
Definition: baseapi.cpp:431
#define PI
Definition: const.h:19
OcrEngineMode oem() const
Definition: baseapi.h:766
inT16 left() const
Definition: rect.h:68
Definition: blobs.h:261
bool DetectOS(OSResults *)
Definition: baseapi.cpp:2382
bool stream_filelist
Definition: baseapi.cpp:82
Boxa * GetComponentImages(const PageIteratorLevel level, const bool text_only, const bool raw_image, const int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:680
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:412
#define UNICHAR_LEN
Definition: unichar.h:31
bool ProcessPages(const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1052
#define ASSERT_HOST(x)
Definition: errcode.h:84
#define BOOL_VAR(name, val, comment)
Definition: params.h:279
static TBLOB * PolygonalCopy(bool allow_detailed_fx, C_BLOB *src)
Definition: blobs.cpp:344
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:91
BLOCK_LIST * block_list_
The page layout.
Definition: baseapi.h:871
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:125
void SetOutputName(const char *name)
Definition: baseapi.cpp:265
Pix * GetBinaryImage(PageIteratorLevel level) const
virtual bool ThresholdToPix(PageSegMode pageseg_mode, Pix **pix)
Returns false on error.
void DumpPGM(const char *filename)
Definition: baseapi.cpp:770
void BestChoiceToCorrectText()
Definition: pageres.cpp:918
Pix * GetImage(PageIteratorLevel level, int padding, Pix *original_img, int *left, int *top) const
TESS_LOCAL void AdaptToCharacter(const char *unichar_repr, int length, float baseline, float xheight, float descender, float ascender)
Definition: baseapi.cpp:2556
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
ROW * row
Definition: pageres.h:127
virtual R Run()=0
#define DIR
Definition: polyaprx.cpp:39
CMD_EVENTS mode
Definition: pgedit.cpp:116
WERD_RES * forward()
Definition: pageres.h:716
static const char * Version()
Definition: baseapi.cpp:198
void set_source_resolution(int ppi)
bool Empty(PageIteratorLevel level) const
inT32 length() const
Definition: rejctmap.h:226
virtual bool Next(PageIteratorLevel level)
bool BeginDocument(const char *title)
Definition: renderer.cpp:72
virtual void Run(A1, A2, A3, A4)=0
float base_line(float xpos) const
Definition: ocrrow.h:56
GenericVector< StringParam * > string_params
Definition: params.h:46
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:60
const char kUNLVSuspect
Definition: baseapi.cpp:93
unsigned char BOOL8
Definition: host.h:44
TBOX bounding_box() const
Definition: blobs.cpp:482
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:292
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
Definition: tessedit.cpp:447
const char * GetInitLanguagesAsString() const
Definition: baseapi.cpp:421
float rating() const
Definition: ratngs.h:79
int GetThresholdedImageScaleFactor() const
Definition: baseapi.cpp:762
char * GetOsdText(int page_number)
Definition: baseapi.cpp:1922
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:253
#define ELISTIZE(CLASSNAME)
Definition: elst.h:961
constexpr int kMaxCredibleResolution
Definition: publictypes.h:40
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:334
int first_uni() const
Definition: unichar.cpp:99
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:752
uinT8 space()
Definition: werd.h:104
const char * GetStringVariable(const char *name) const
Definition: baseapi.cpp:300
const int kBytesPer64BitNumber
Definition: baseapi.cpp:1711
const int kMaxIntSize
Definition: baseapi.cpp:104
tesseract::BoxWord * box_word
Definition: pageres.h:250
void SavePixForCrash(int resolution, Pix *pix)
Definition: globaloc.cpp:34
void ReadDebugConfigFile(const char *filename)
Definition: baseapi.cpp:489
void signal_exit(int signal_code)
Definition: globaloc.cpp:52
const int kMaxBytesPerLine
Definition: baseapi.cpp:1718
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
bool classify_bln_numeric_mode
Definition: classify.h:499
static void DeleteBlockList(BLOCK_LIST *block_list)
Definition: baseapi.cpp:2473
int SegmentPage(const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
void set_pix_grey(Pix *grey_pix)
void set_min_orientation_margin(double margin)
Definition: baseapi.cpp:2395
const char * WordRecognitionLanguage() const
Definition: werd.h:60
const int kBytesPerBoxFileLine
Definition: baseapi.cpp:1709
ParamsVectors * params()
Definition: ccutil.h:62
virtual void GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth, int *imageheight)
int IsValidWord(const char *word)
Definition: baseapi.cpp:2115
BLOCK_RES * block() const
Definition: pageres.h:742
PageIterator * AnalyseLayout()
Definition: baseapi.cpp:803
bool GetVariableAsString(const char *name, STRING *val)
Definition: baseapi.cpp:315
const int kBlnBaselineOffset
Definition: normalis.h:29
int NumDawgs() const
Definition: baseapi.cpp:2832
void ExtractFontName(const STRING &filename, STRING *fontname)
Definition: blobclass.cpp:46
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:62
int length() const
Definition: ratngs.h:299
void SetDictFunc(DictFunc f)
Definition: baseapi.cpp:2161
virtual Pix * GetPixRectThresholds()
int GetScaledYResolution() const
Definition: thresholder.h:93
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA *> *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel *> *models)
void(Wordrec::* FillLatticeFunc)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: baseapi.h:85
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:366
virtual bool IsAtBeginningOf(PageIteratorLevel level) const
bool wordrec_run_blamer
Definition: wordrec.h:168
const int kMinRectSize
Definition: baseapi.cpp:87
TBOX bounding_box() const
Definition: werd.cpp:160
TBLOB * make_tesseract_blob(float baseline, float xheight, float descender, float ascender, bool numeric_mode, Pix *pix)
Definition: baseapi.cpp:2538
#define GIT_REV
Definition: config_auto.h:14
void ReadConfigFile(const char *filename)
Definition: baseapi.cpp:484
void set_text(const char *new_text)
Definition: werd.h:126
inT16 right() const
Definition: rect.h:75
static TBLOB * MakeTBLOB(Pix *pix)
Definition: baseapi.cpp:2495
bool(* FileReader)(const STRING &filename, GenericVector< char > *data)
Boxa * GetRegions(Pixa **pixa)
Definition: baseapi.cpp:623
WERD_LIST * word_list()
Definition: ocrrow.h:52
int orientation_and_script_detection(STRING &filename, OSResults *osr, tesseract::Tesseract *tess)
Definition: osdetect.cpp:188
bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const
const char * WordFontAttributes(bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const
int IntCastRounded(double x)
Definition: helpers.h:179
bool SetVariable(const char *name, const char *value)
Definition: baseapi.cpp:272
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:527
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:948
const char * kOldVarsFile
Definition: baseapi.cpp:102
virtual ~TessBaseAPI()
Definition: baseapi.cpp:191
int GetSourceYResolution() const
Definition: thresholder.h:90
Boxa * GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:635
const char * c_str() const
Definition: strngs.cpp:209
void TrainLineRecognizer(const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
Definition: linerec.cpp:42
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
Definition: applybox.cpp:796
MutableIterator * GetMutableIterator()
Definition: baseapi.cpp:1269
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
Definition: baseapi.cpp:1427
TBOX bounding_box() const
Definition: ocrrow.h:85
StrongScriptDirection WordDirection() const
void split(const char c, GenericVector< STRING > *splited)
Definition: strngs.cpp:286
const char * kInputFile
Definition: baseapi.cpp:98
bool IsBinary() const
Returns true if the source image is binary.
Definition: thresholder.h:75
#define TESSERACT_VERSION_STR
Definition: version.h:8
static void ResetToDefaults(ParamsVectors *member_params)
Definition: params.cpp:198
inT32 length() const
Definition: strngs.cpp:193
TESS_LOCAL int FindLines()
Definition: baseapi.cpp:2253
const int kUniChs[]
Definition: baseapi.cpp:1768
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288
int UNICHAR_ID
Definition: unichar.h:35
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:173
Orientation and script detection only.
Definition: publictypes.h:164
TESS_API int get_best_script(int orientation_id) const
Definition: osdetect.cpp:111
ResultIterator * GetIterator()
Definition: baseapi.cpp:1252
const STRING & unichar_string() const
Definition: ratngs.h:537
virtual Pix * GetPixRectGrey()
Boxa * GetWords(Pixa **pixa)
Definition: baseapi.cpp:658