tesseract  4.0.0-beta.1-59-g2cc4
baseapi.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: baseapi.cpp
3  * Description: Simple API for calling tesseract.
4  * Author: Ray Smith
5  * Created: Fri Oct 06 15:35:01 PDT 2006
6  *
7  * (C) Copyright 2006, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #ifdef __linux__
26 #include <signal.h>
27 #endif
28 
29 #if defined(_WIN32)
30 #if defined(MINGW)
31 // workaround for stdlib.h with -std=c++11 for _splitpath and _MAX_FNAME
32 #undef __STRICT_ANSI__
33 #endif // MINGW
34 #include <fcntl.h>
35 #include <io.h>
36 #else
37 #include <dirent.h>
38 #include <libgen.h>
39 #include <string.h>
40 #include <sys/types.h>
41 #include <sys/stat.h>
42 #include <unistd.h>
43 #endif // _WIN32
44 
45 #include <fstream>
46 #include <iostream>
47 #include <iterator>
48 #include <memory> // std::unique_ptr
49 #include <string>
50 
51 #include "allheaders.h"
52 
53 #include "baseapi.h"
54 #include "blobclass.h"
55 #include "resultiterator.h"
56 #include "mutableiterator.h"
57 #include "thresholder.h"
58 #include "tesseractclass.h"
59 #include "pageres.h"
60 #include "paragraphs.h"
61 #include "tessvars.h"
62 #include "control.h"
63 #include "dict.h"
64 #include "pgedit.h"
65 #include "paramsd.h"
66 #include "output.h"
67 #include "globaloc.h"
68 #include "globals.h"
69 #include "edgblob.h"
70 #include "equationdetect.h"
71 #include "tessbox.h"
72 #include "makerow.h"
73 #include "otsuthr.h"
74 #include "osdetect.h"
75 #include "params.h"
76 #include "renderer.h"
77 #include "strngs.h"
78 #include "openclwrapper.h"
79 
// Boolean flag declared through the BOOL_VAR macro with default FALSE.
// ProcessPagesInternal tests it and, when it is set, hands stdin to
// ProcessPagesFileList as the list of image file names.
80 BOOL_VAR(stream_filelist, FALSE, "Stream a filelist from stdin");
81 
82 namespace tesseract {
83 
// Smallest width/height accepted by TesseractRect; anything smaller makes it
// return NULL without doing any work.
85 const int kMinRectSize = 10;
// Not referenced in the visible part of this file; presumably the character
// emitted for a rejected result - TODO confirm against its users.
87 const char kTesseractReject = '~';
// Not referenced in the visible part of this file; presumably the UNLV-format
// reject marker - TODO confirm.
89 const char kUNLVReject = '~';
// Not referenced in the visible part of this file; presumably the UNLV-format
// suspect marker - TODO confirm.
91 const char kUNLVSuspect = '^';
// Not referenced in the visible part of this file; presumably a placeholder
// input file name - TODO confirm.
96 const char* kInputFile = "noname.tif";
// File that ProcessPage writes the current variables to before a retry with
// an alternate config, and reads back afterwards to restore them.
100 const char* kOldVarsFile = "failed_vars.txt";
// Size of the character buffer ProcessPagesMultipageTiff uses to format a
// page number as decimal text.
102 const int kMaxIntSize = 22;
103 
104 /* Add all available languages recursively.
105 */
106 static void addAvailableLanguages(const STRING &datadir, const STRING &base,
107  GenericVector<STRING>* langs)
108 {
109  const STRING base2 = (base.string()[0] == '\0') ? base : base + "/";
110  const size_t extlen = sizeof(kTrainedDataSuffix);
111 #ifdef _WIN32
112  WIN32_FIND_DATA data;
113  HANDLE handle = FindFirstFile((datadir + base2 + "*").string(), &data);
114  if (handle != INVALID_HANDLE_VALUE) {
115  BOOL result = TRUE;
116  for (; result;) {
117  char *name = data.cFileName;
118  // Skip '.', '..', and hidden files
119  if (name[0] != '.') {
120  if ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) ==
121  FILE_ATTRIBUTE_DIRECTORY) {
122  addAvailableLanguages(datadir, base2 + name, langs);
123  } else {
124  size_t len = strlen(name);
125  if (len > extlen && name[len - extlen] == '.' &&
126  strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
127  name[len - extlen] = '\0';
128  langs->push_back(base2 + name);
129  }
130  }
131  }
132  result = FindNextFile(handle, &data);
133  }
134  FindClose(handle);
135  }
136 #else // _WIN32
137  DIR* dir = opendir((datadir + base).string());
138  if (dir != NULL) {
139  dirent *de;
140  while ((de = readdir(dir))) {
141  char *name = de->d_name;
142  // Skip '.', '..', and hidden files
143  if (name[0] != '.') {
144  struct stat st;
145  if (stat((datadir + base2 + name).string(), &st) == 0 &&
146  (st.st_mode & S_IFDIR) == S_IFDIR) {
147  addAvailableLanguages(datadir, base2 + name, langs);
148  } else {
149  size_t len = strlen(name);
150  if (len > extlen && name[len - extlen] == '.' &&
151  strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
152  name[len - extlen] = '\0';
153  langs->push_back(base2 + name);
154  }
155  }
156  }
157  }
158  closedir(dir);
159  }
160 #endif
161 }
162 
164  : tesseract_(nullptr),
165  osd_tesseract_(nullptr),
166  equ_detect_(nullptr),
167  reader_(nullptr),
168  // Thresholder is initialized to NULL here, but will be set before use by:
169  // A constructor of a derived API, SetThresholder(), or
170  // created implicitly when used in InternalSetImage.
171  thresholder_(nullptr),
172  paragraph_models_(nullptr),
173  block_list_(nullptr),
174  page_res_(nullptr),
175  input_file_(nullptr),
176  output_file_(nullptr),
177  datapath_(nullptr),
178  language_(nullptr),
179  last_oem_requested_(OEM_DEFAULT),
180  recognition_done_(false),
181  truth_cb_(NULL),
182  rect_left_(0),
183  rect_top_(0),
184  rect_width_(0),
185  rect_height_(0),
186  image_width_(0),
187  image_height_(0) {}
188 
190  End();
191 }
192 
196 const char* TessBaseAPI::Version() {
197  return PACKAGE_VERSION;
198 }
199 
207 #ifdef USE_OPENCL
208 #if USE_DEVICE_SELECTION
209 #include "opencl_device_selection.h"
210 #endif
211 #endif
212 size_t TessBaseAPI::getOpenCLDevice(void **data) {
213 #ifdef USE_OPENCL
214 #if USE_DEVICE_SELECTION
215  ds_device device = OpenclDevice::getDeviceSelection();
216  if (device.type == DS_DEVICE_OPENCL_DEVICE) {
217  *data = new cl_device_id;
218  memcpy(*data, &device.oclDeviceID, sizeof(cl_device_id));
219  return sizeof(cl_device_id);
220  }
221 #endif
222 #endif
223 
224  *data = NULL;
225  return 0;
226 }
227 
233 #ifdef __linux__
234  struct sigaction action;
235  memset(&action, 0, sizeof(action));
236  action.sa_handler = &signal_exit;
237  action.sa_flags = SA_RESETHAND;
238  sigaction(SIGSEGV, &action, NULL);
239  sigaction(SIGFPE, &action, NULL);
240  sigaction(SIGBUS, &action, NULL);
241 #else
242  // Warn API users that an implementation is needed.
243  tprintf("CatchSignals has no non-linux implementation!\n");
244 #endif
245 }
246 
251 void TessBaseAPI::SetInputName(const char* name) {
252  if (input_file_ == NULL)
253  input_file_ = new STRING(name);
254  else
255  *input_file_ = name;
256 }
257 
259 void TessBaseAPI::SetOutputName(const char* name) {
260  if (output_file_ == NULL)
261  output_file_ = new STRING(name);
262  else
263  *output_file_ = name;
264 }
265 
// Makes sure a Tesseract instance exists (creating one if necessary) and then
// presumably applies value to the parameter called name in
// tesseract_->params().
// NOTE(review): the line that opens the statement ending in
// "tesseract_->params());" is missing from this listing.
266 bool TessBaseAPI::SetVariable(const char* name, const char* value) {
267  if (tesseract_ == NULL) tesseract_ = new Tesseract;
269  tesseract_->params());
270 }
271 
// Makes sure a Tesseract instance exists (creating one if necessary) and then
// presumably applies value to the parameter called name in
// tesseract_->params().
// NOTE(review): the line that opens the statement ending in
// "tesseract_->params());" is missing from this listing, so how this differs
// from SetVariable cannot be seen here.
272 bool TessBaseAPI::SetDebugVariable(const char* name, const char* value) {
273  if (tesseract_ == NULL) tesseract_ = new Tesseract;
275  tesseract_->params());
276 }
277 
// Looks up an IntParam via ParamUtils::FindParam. Returns false if none is
// found; otherwise stores its value in *value and returns true.
// NOTE(review): the line carrying the FindParam arguments is missing from
// this listing.
278 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
279  IntParam *p = ParamUtils::FindParam<IntParam>(
281  if (p == NULL) return false;
282  *value = (int32_t)(*p);
283  return true;
284 }
285 
// Looks up a BoolParam via ParamUtils::FindParam. Returns false if none is
// found; otherwise stores its value in *value and returns true.
// NOTE(review): the line carrying the FindParam arguments is missing from
// this listing.
286 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
287  BoolParam *p = ParamUtils::FindParam<BoolParam>(
289  if (p == NULL) return false;
290  *value = (BOOL8)(*p);
291  return true;
292 }
293 
// Looks up a StringParam via ParamUtils::FindParam and returns its string(),
// or NULL when no such parameter is found.
// NOTE(review): the line carrying the FindParam arguments is missing from
// this listing.
294 const char *TessBaseAPI::GetStringVariable(const char *name) const {
295  StringParam *p = ParamUtils::FindParam<StringParam>(
297  return (p != NULL) ? p->string() : NULL;
298 }
299 
// Looks up a DoubleParam via ParamUtils::FindParam. Returns false if none is
// found; otherwise stores its value in *value and returns true.
// NOTE(review): the line carrying the FindParam arguments is missing from
// this listing.
300 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
301  DoubleParam *p = ParamUtils::FindParam<DoubleParam>(
303  if (p == NULL) return false;
304  *value = (double)(*p);
305  return true;
306 }
307 
309 bool TessBaseAPI::GetVariableAsString(const char *name, STRING *val) {
310  return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
311 }
312 
// Takes the FILE* to print to. ProcessPage calls this to dump the current
// variables into kOldVarsFile.
// NOTE(review): the body statement is missing from this listing.
314 void TessBaseAPI::PrintVariables(FILE *fp) const {
316 }
317 
326 int TessBaseAPI::Init(const char* datapath, const char* language,
327  OcrEngineMode oem, char **configs, int configs_size,
328  const GenericVector<STRING> *vars_vec,
329  const GenericVector<STRING> *vars_values,
330  bool set_only_non_debug_params) {
331  return Init(datapath, 0, language, oem, configs, configs_size, vars_vec,
332  vars_values, set_only_non_debug_params, nullptr);
333 }
334 
335 // In-memory version reads the traineddata file directly from the given
336 // data[data_size] array. Also implements the version with a datapath in data,
337 // flagged by data_size = 0.
// Returns 0 on success and -1 when the initialization call inside the
// "new Tesseract" branch reports failure.
// NOTE(review): several statement lines are missing from this listing (they
// are flagged individually below), so this block does not read as complete
// code.
338 int TessBaseAPI::Init(const char* data, int data_size, const char* language,
339  OcrEngineMode oem, char** configs, int configs_size,
340  const GenericVector<STRING>* vars_vec,
341  const GenericVector<STRING>* vars_values,
342  bool set_only_non_debug_params, FileReader reader) {
343  PERF_COUNT_START("TessBaseAPI::Init")
344  // Default language is "eng".
345  if (language == nullptr) language = "eng";
// With a memory buffer (data_size != 0) the language name stands in for the
// datapath.
346  STRING datapath = data_size == 0 ? data : language;
347  // If the datapath, OcrEngineMode or the language have changed - start again.
348  // Note that the language_ field stores the last requested language that was
349  // initialized successfully, while tesseract_->lang stores the language
350  // actually used. They differ only if the requested language was NULL, in
351  // which case tesseract_->lang is set to the Tesseract default ("eng").
352  if (tesseract_ != nullptr &&
353  (datapath_ == nullptr || language_ == nullptr || *datapath_ != datapath ||
354  last_oem_requested_ != oem ||
355  (*language_ != language && tesseract_->lang != language))) {
356  delete tesseract_;
357  tesseract_ = nullptr;
358  }
359  // PERF_COUNT_SUB("delete tesseract_")
360 #ifdef USE_OPENCL
361  OpenclDevice od;
362  od.InitEnv();
363 #endif
364  PERF_COUNT_SUB("OD::InitEnv()")
// reset_classifier stays true only when an existing instance is being reused.
365  bool reset_classifier = true;
366  if (tesseract_ == nullptr) {
367  reset_classifier = false;
368  tesseract_ = new Tesseract;
369  if (reader != nullptr) reader_ = reader;
// NOTE(review): the declaration of mgr is missing from this listing.
371  if (data_size != 0) {
372  mgr.LoadMemBuffer(language, data, data_size);
373  }
// NOTE(review): the line that opens the if-condition whose arguments follow is
// missing from this listing.
375  datapath.string(),
376  output_file_ != nullptr ? output_file_->string() : nullptr,
377  language, oem, configs, configs_size, vars_vec, vars_values,
378  set_only_non_debug_params, &mgr) != 0) {
379  return -1;
380  }
381  }
382  PERF_COUNT_SUB("update tesseract_")
383  // Update datapath and language requested for the last valid initialization.
384  if (datapath_ == nullptr)
385  datapath_ = new STRING(datapath);
386  else
387  *datapath_ = datapath;
// NOTE(review): the statement controlled by this if is missing from this
// listing; the condition holds when the stored datapath is empty but
// tesseract_->datadir is not.
388  if ((strcmp(datapath_->string(), "") == 0) &&
389  (strcmp(tesseract_->datadir.string(), "") != 0))
391 
392  if (language_ == nullptr)
393  language_ = new STRING(language);
394  else
395  *language_ = language;
// NOTE(review): a line is missing here in this listing.
397  // PERF_COUNT_SUB("update last_oem_requested_")
398  // For same language and datapath, just reset the adaptive classifier.
399  if (reset_classifier) {
// NOTE(review): the statement executed in this branch is missing from this
// listing.
401  PERF_COUNT_SUB("tesseract_->ResetAdaptiveClassifier()")
402  }
// NOTE(review): a line is missing here in this listing.
404  return 0;
405 }
406 
416  return (language_ == NULL || language_->string() == NULL) ?
417  "" : language_->string();
418 }
419 
426  GenericVector<STRING>* langs) const {
427  langs->clear();
428  if (tesseract_ != NULL) {
429  langs->push_back(tesseract_->lang);
430  int num_subs = tesseract_->num_sub_langs();
431  for (int i = 0; i < num_subs; ++i)
432  langs->push_back(tesseract_->get_sub_lang(i)->lang);
433  }
434 }
435 
440  GenericVector<STRING>* langs) const {
441  langs->clear();
442  if (tesseract_ != NULL) {
443  addAvailableLanguages(tesseract_->datadir, "", langs);
444  }
445 }
446 
// Creates the Tesseract instance if there is none, then returns the result of
// init_tesseract_lm for the given datapath and language using a local
// TessdataManager.
// NOTE(review): the statement belonging to the else branch is missing from
// this listing; as shown, the else would bind to the mgr declaration.
453 int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
454  if (tesseract_ == NULL)
455  tesseract_ = new Tesseract;
456  else
458  TessdataManager mgr;
459  return tesseract_->init_tesseract_lm(datapath, NULL, language, &mgr);
460 }
461 
467  if (tesseract_ == NULL) {
468  tesseract_ = new Tesseract;
470  }
471 }
472 
480 }
481 
485 }
486 
493  if (tesseract_ == NULL)
494  tesseract_ = new Tesseract;
495  tesseract_->tessedit_pageseg_mode.set_value(mode);
496 }
497 
500  if (tesseract_ == NULL)
501  return PSM_SINGLE_BLOCK;
502  return static_cast<PageSegMode>(
503  static_cast<int>(tesseract_->tessedit_pageseg_mode));
504 }
505 
519 char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
520  int bytes_per_pixel,
521  int bytes_per_line,
522  int left, int top,
523  int width, int height) {
524  if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize)
525  return NULL; // Nothing worth doing.
526 
527  // Since this original api didn't give the exact size of the image,
528  // we have to invent a reasonable value.
529  int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
530  SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top,
531  bytes_per_pixel, bytes_per_line);
532  SetRectangle(left, top, width, height);
533 
534  return GetUTF8Text();
535 }
536 
542  if (tesseract_ == NULL)
543  return;
546 }
547 
// Hands a raw image buffer to the thresholder, but only when
// InternalSetImage() succeeds.
// NOTE(review): one statement after the thresholder_->SetImage call is missing
// from this listing.
555 void TessBaseAPI::SetImage(const unsigned char* imagedata,
556  int width, int height,
557  int bytes_per_pixel, int bytes_per_line) {
558  if (InternalSetImage()) {
559  thresholder_->SetImage(imagedata, width, height,
560  bytes_per_pixel, bytes_per_line);
562  }
563 }
564 
566  if (thresholder_)
568  else
569  tprintf("Please call SetImage before SetSourceResolution.\n");
570 }
571 
// Hands a Pix to the thresholder, but only when InternalSetImage() succeeds.
// NOTE(review): one statement after the thresholder_->SetImage call is missing
// from this listing.
580 void TessBaseAPI::SetImage(Pix* pix) {
581  if (InternalSetImage()) {
582  thresholder_->SetImage(pix);
584  }
585 }
586 
592 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
593  if (thresholder_ == NULL)
594  return;
595  thresholder_->SetRectangle(left, top, width, height);
596  ClearResults();
597 }
598 
604  if (tesseract_ == nullptr || thresholder_ == nullptr) return nullptr;
605  if (tesseract_->pix_binary() == nullptr &&
607  return nullptr;
608  }
609  return pixClone(tesseract_->pix_binary());
610 }
611 
617 Boxa* TessBaseAPI::GetRegions(Pixa** pixa) {
618  return GetComponentImages(RIL_BLOCK, false, pixa, NULL);
619 }
620 
629 Boxa* TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding,
630  Pixa** pixa, int** blockids, int** paraids) {
631  return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding,
632  pixa, blockids, paraids);
633 }
634 
643 Boxa* TessBaseAPI::GetStrips(Pixa** pixa, int** blockids) {
644  return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
645 }
646 
652 Boxa* TessBaseAPI::GetWords(Pixa** pixa) {
653  return GetComponentImages(RIL_WORD, true, pixa, NULL);
654 }
655 
663  return GetComponentImages(RIL_SYMBOL, true, pixa, NULL);
664 }
665 
675  bool text_only, bool raw_image,
676  const int raw_padding,
677  Pixa** pixa, int** blockids,
678  int** paraids) {
679  PageIterator* page_it = GetIterator();
680  if (page_it == NULL)
681  page_it = AnalyseLayout();
682  if (page_it == NULL)
683  return NULL; // Failed.
684 
685  // Count the components to get a size for the arrays.
686  int component_count = 0;
687  int left, top, right, bottom;
688 
689  TessResultCallback<bool>* get_bbox = NULL;
690  if (raw_image) {
691  // Get bounding box in original raw image with padding.
693  level, raw_padding,
694  &left, &top, &right, &bottom);
695  } else {
696  // Get bounding box from the binarized image. Note that this could be
697  // scaled differently from the original image.
698  get_bbox = NewPermanentTessCallback(page_it,
700  level, &left, &top, &right, &bottom);
701  }
702  do {
703  if (get_bbox->Run() &&
704  (!text_only || PTIsTextType(page_it->BlockType())))
705  ++component_count;
706  } while (page_it->Next(level));
707 
708  Boxa* boxa = boxaCreate(component_count);
709  if (pixa != NULL)
710  *pixa = pixaCreate(component_count);
711  if (blockids != NULL)
712  *blockids = new int[component_count];
713  if (paraids != NULL)
714  *paraids = new int[component_count];
715 
716  int blockid = 0;
717  int paraid = 0;
718  int component_index = 0;
719  page_it->Begin();
720  do {
721  if (get_bbox->Run() &&
722  (!text_only || PTIsTextType(page_it->BlockType()))) {
723  Box* lbox = boxCreate(left, top, right - left, bottom - top);
724  boxaAddBox(boxa, lbox, L_INSERT);
725  if (pixa != NULL) {
726  Pix* pix = NULL;
727  if (raw_image) {
728  pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left,
729  &top);
730  } else {
731  pix = page_it->GetBinaryImage(level);
732  }
733  pixaAddPix(*pixa, pix, L_INSERT);
734  pixaAddBox(*pixa, lbox, L_CLONE);
735  }
736  if (paraids != NULL) {
737  (*paraids)[component_index] = paraid;
738  if (page_it->IsAtFinalElement(RIL_PARA, level))
739  ++paraid;
740  }
741  if (blockids != NULL) {
742  (*blockids)[component_index] = blockid;
743  if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
744  ++blockid;
745  paraid = 0;
746  }
747  }
748  ++component_index;
749  }
750  } while (page_it->Next(level));
751  delete page_it;
752  delete get_bbox;
753  return boxa;
754 }
755 
757  if (thresholder_ == NULL) {
758  return 0;
759  }
760  return thresholder_->GetScaleFactor();
761 }
762 
779 
// Runs FindLines() and, if it succeeds on a non-empty page, builds page_res_
// from block_list_, runs DetectParagraphs(false) and returns a newly allocated
// PageIterator. Returns NULL when FindLines() fails or the page is empty.
// NOTE(review): the PageIterator constructor arguments are missing from this
// listing.
780 PageIterator* TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
781  if (FindLines() == 0) {
782  if (block_list_->empty())
783  return NULL; // The page was empty.
784  page_res_ = new PAGE_RES(merge_similar_words, block_list_, NULL);
785  DetectParagraphs(false);
786  return new PageIterator(
790  }
791  return NULL;
792 }
793 
799  if (tesseract_ == NULL)
800  return -1;
801  if (FindLines() != 0)
802  return -1;
803  delete page_res_;
804  if (block_list_->empty()) {
805  page_res_ = new PAGE_RES(false, block_list_,
807  return 0; // Empty page.
808  }
809 
811  recognition_done_ = true;
816  } else {
819  }
820  if (page_res_ == NULL) {
821  return -1;
822  }
826  return 0;
827  }
830  return 0;
831  }
832 
833  if (truth_cb_ != NULL) {
834  tesseract_->wordrec_run_blamer.set_value(true);
835  PageIterator *page_it = new PageIterator(
840  image_height_, page_it, this->tesseract()->pix_grey());
841  delete page_it;
842  }
843 
844  int result = 0;
846  #ifndef GRAPHICS_DISABLED
848  #endif // GRAPHICS_DISABLED
849  // The page_res is invalid after an interactive session, so cleanup
850  // in a way that lets us continue to the next page without crashing.
851  delete page_res_;
852  page_res_ = NULL;
853  return -1;
855  STRING fontname;
856  ExtractFontName(*output_file_, &fontname);
858  } else if (tesseract_->tessedit_ambigs_training) {
859  FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
860  // OCR the page segmented into words by tesseract.
862  *input_file_, page_res_, monitor, training_output_file);
863  fclose(training_output_file);
864  } else {
865  // Now run the main recognition.
866  bool wait_for_text = true;
867  GetBoolVariable("paragraph_text_based", &wait_for_text);
868  if (!wait_for_text) DetectParagraphs(false);
869  if (tesseract_->recog_all_words(page_res_, monitor, NULL, NULL, 0)) {
870  if (wait_for_text) DetectParagraphs(true);
871  } else {
872  result = -1;
873  }
874  }
875  return result;
876 }
877 
880  if (tesseract_ == NULL)
881  return -1;
882  if (thresholder_ == NULL || thresholder_->IsEmpty()) {
883  tprintf("Please call SetImage before attempting recognition.\n");
884  return -1;
885  }
886  if (page_res_ != NULL)
887  ClearResults();
888  if (FindLines() != 0)
889  return -1;
890  // Additional conditions under which chopper test cannot be run
891  if (tesseract_->interactive_display_mode) return -1;
892 
893  recognition_done_ = true;
894 
895  page_res_ = new PAGE_RES(false, block_list_,
897 
898  PAGE_RES_IT page_res_it(page_res_);
899 
900  while (page_res_it.word() != NULL) {
901  WERD_RES *word_res = page_res_it.word();
902  GenericVector<TBOX> boxes;
903  tesseract_->MaximallyChopWord(boxes, page_res_it.block()->block,
904  page_res_it.row()->row, word_res);
905  page_res_it.forward();
906  }
907  return 0;
908 }
909 
910 // Takes ownership of the input pix.
912 
914 
916  if (input_file_)
917  return input_file_->c_str();
918  return NULL;
919 }
920 
921 const char * TessBaseAPI::GetDatapath() {
922  return tesseract_->datadir.c_str();
923 }
924 
927 }
928 
929 // If flist exists, get data from there. Otherwise get data from buf.
930 // Seems convoluted, but is the easiest way I know of to meet multiple
931 // goals. Support streaming from stdin, and also work on platforms
932 // lacking fmemopen.
933 bool TessBaseAPI::ProcessPagesFileList(FILE *flist,
934  STRING *buf,
935  const char* retry_config,
936  int timeout_millisec,
937  TessResultRenderer* renderer,
938  int tessedit_page_number) {
939  if (!flist && !buf) return false;
940  int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
941  char pagename[MAX_PATH];
942 
943  GenericVector<STRING> lines;
944  if (!flist) {
945  buf->split('\n', &lines);
946  if (lines.empty()) return false;
947  }
948 
949  // Skip to the requested page number.
950  for (int i = 0; i < page; i++) {
951  if (flist) {
952  if (fgets(pagename, sizeof(pagename), flist) == NULL) break;
953  }
954  }
955 
956  // Begin producing output
957  if (renderer && !renderer->BeginDocument(unknown_title_)) {
958  return false;
959  }
960 
961  // Loop over all pages - or just the requested one
962  while (true) {
963  if (flist) {
964  if (fgets(pagename, sizeof(pagename), flist) == NULL) break;
965  } else {
966  if (page >= lines.size()) break;
967  snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str());
968  }
969  chomp_string(pagename);
970  Pix *pix = pixRead(pagename);
971  if (pix == NULL) {
972  tprintf("Image file %s cannot be read!\n", pagename);
973  return false;
974  }
975  tprintf("Page %d : %s\n", page, pagename);
976  bool r = ProcessPage(pix, page, pagename, retry_config,
977  timeout_millisec, renderer);
978  pixDestroy(&pix);
979  if (!r) return false;
980  if (tessedit_page_number >= 0) break;
981  ++page;
982  }
983 
984  // Finish producing output
985  if (renderer && !renderer->EndDocument()) {
986  return false;
987  }
988  return true;
989 }
990 
991 bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
992  size_t size,
993  const char* filename,
994  const char* retry_config,
995  int timeout_millisec,
996  TessResultRenderer* renderer,
997  int tessedit_page_number) {
998 #ifndef ANDROID_BUILD
999  Pix *pix = NULL;
1000  int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
1001  size_t offset = 0;
1002  for (; ; ++page) {
1003  if (tessedit_page_number >= 0)
1004  page = tessedit_page_number;
1005  pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset)
1006  : pixReadFromMultipageTiff(filename, &offset);
1007  if (pix == NULL) break;
1008  tprintf("Page %d\n", page + 1);
1009  char page_str[kMaxIntSize];
1010  snprintf(page_str, kMaxIntSize - 1, "%d", page);
1011  SetVariable("applybox_page", page_str);
1012  bool r = ProcessPage(pix, page, filename, retry_config,
1013  timeout_millisec, renderer);
1014  pixDestroy(&pix);
1015  if (!r) return false;
1016  if (tessedit_page_number >= 0) break;
1017  if (!offset) break;
1018  }
1019  return true;
1020 #else
1021  return false;
1022 #endif
1023 }
1024 
1025 // Master ProcessPages calls ProcessPagesInternal and then does any post-
1026 // processing required due to being in a training mode.
// Returns the result of ProcessPagesInternal, except that a failed TR-file
// write after a successful run turns the result into false.
// NOTE(review): the condition lines that guard the "Write of TR file failed"
// message are missing from this listing.
1027 bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
1028  int timeout_millisec,
1029  TessResultRenderer* renderer) {
1030  bool result =
1031  ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
1032  if (result) {
1035  tprintf("Write of TR file failed: %s\n", output_file_->string());
1036  return false;
1037  }
1038  }
1039  return result;
1040 }
1041 
1042 // In the ideal scenario, Tesseract will start working on data as soon
1043 // as it can. For example, if you stream a filelist through stdin, we
1044 // should start the OCR process as soon as the first filename is
1045 // available. This is particularly useful when hooking Tesseract up to
1046 // slow hardware such as a book scanning machine.
1047 //
1048 // Unfortunately there are tradeoffs. You can't seek on stdin. That
1049 // makes automatic detection of datatype (TIFF? filelist? PNG?)
1050 // impractical. So we support a command line flag to explicitly
1051 // identify the scenario that really matters: filelists on
1052 // stdin. We'll still do our best if the user likes pipes.
//
// Flow of the visible code: a filename of "stdin" or "-" selects standard
// input; with stream_filelist set, stdin is treated as a file list; otherwise
// the input format is detected, falling back to a file list when detection
// fails or reports IFF_UNKNOWN. TIFF formats go to ProcessPagesMultipageTiff,
// anything else is read as a single image and passed to ProcessPage.
// NOTE(review): the last argument line of three calls and one line before the
// final return are missing from this listing (flagged below).
1053 bool TessBaseAPI::ProcessPagesInternal(const char* filename,
1054  const char* retry_config,
1055  int timeout_millisec,
1056  TessResultRenderer* renderer) {
1057  PERF_COUNT_START("ProcessPages")
1058  bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
1059  if (stdInput) {
1060 #ifdef WIN32
// Switch stdin to binary mode so image bytes are not altered on Windows.
1061  if (_setmode(_fileno(stdin), _O_BINARY) == -1)
1062  tprintf("ERROR: cin to binary: %s", strerror(errno));
1063 #endif // WIN32
1064  }
1065 
1066  if (stream_filelist) {
// NOTE(review): the final argument line of this call is missing from this
// listing.
1067  return ProcessPagesFileList(stdin, NULL, retry_config,
1068  timeout_millisec, renderer,
1070  }
1071 
1072  // At this point we are officially in autodetection territory.
1073  // That means any data in stdin must be buffered, to make it
1074  // seekable.
1075  std::string buf;
1076  const l_uint8 *data = NULL;
1077  if (stdInput) {
1078  buf.assign((std::istreambuf_iterator<char>(std::cin)),
1079  (std::istreambuf_iterator<char>()));
1080  data = reinterpret_cast<const l_uint8 *>(buf.data());
1081  }
1082 
1083  // Here is our autodetection
1084  int format;
1085  int r = (stdInput) ?
1086  findFileFormatBuffer(data, &format) :
1087  findFileFormat(filename, &format);
1088 
1089  // Maybe we have a filelist
1090  if (r != 0 || format == IFF_UNKNOWN) {
1091  STRING s;
1092  if (stdInput) {
1093  s = buf.c_str();
1094  } else {
// Read the whole file into memory so it can be split into lines.
1095  std::ifstream t(filename);
1096  std::string u((std::istreambuf_iterator<char>(t)),
1097  std::istreambuf_iterator<char>());
1098  s = u.c_str();
1099  }
// NOTE(review): the final argument line of this call is missing from this
// listing.
1100  return ProcessPagesFileList(NULL, &s, retry_config,
1101  timeout_millisec, renderer,
1103  }
1104 
1105  // Maybe we have a TIFF which is potentially multipage
1106  bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS ||
1107  format == IFF_TIFF_RLE || format == IFF_TIFF_G3 ||
1108  format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
1109  format == IFF_TIFF_ZIP);
1110 
1111  // Fail early if we can, before producing any output
1112  Pix *pix = NULL;
1113  if (!tiff) {
1114  pix = (stdInput) ? pixReadMem(data, buf.size()) : pixRead(filename);
1115  if (pix == NULL) {
1116  return false;
1117  }
1118  }
1119 
1120  // Begin the output
1121  if (renderer && !renderer->BeginDocument(unknown_title_)) {
1122  pixDestroy(&pix);
1123  return false;
1124  }
1125 
1126  // Produce output
// NOTE(review): the final argument line of the ProcessPagesMultipageTiff call
// (and the ':' of the conditional) is missing from this listing.
1127  r = (tiff) ?
1128  ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config,
1129  timeout_millisec, renderer,
1131  ProcessPage(pix, 0, filename, retry_config,
1132  timeout_millisec, renderer);
1133 
1134  // Clean up memory as needed
1135  pixDestroy(&pix);
1136 
1137  // End the output
1138  if (!r || (renderer && !renderer->EndDocument())) {
1139  return false;
1140  }
// NOTE(review): a line is missing here in this listing.
1142  return true;
1143 }
1144 
// Processes a single page image: records filename as the input name, installs
// pix, runs layout analysis and/or recognition (optionally bounded by
// timeout_millisec), optionally retries with retry_config after a failure,
// and finally hands the page to the renderer. Returns true unless a step
// failed.
// NOTE(review): four condition/statement lines are missing from this listing
// (flagged below), so the branch structure shown is incomplete.
1145 bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
1146  const char* retry_config, int timeout_millisec,
1147  TessResultRenderer* renderer) {
1148  PERF_COUNT_START("ProcessPage")
1149  SetInputName(filename);
1150  SetImage(pix);
1151  bool failed = false;
1152 
// NOTE(review): the if-condition opening this branch is missing from this
// listing.
1154  // Disabled character recognition
1155  PageIterator* it = AnalyseLayout();
1156 
1157  if (it == NULL) {
1158  failed = true;
1159  } else {
1160  delete it;
1161  }
// NOTE(review): the else-if condition opening this branch is missing from this
// listing.
1163  failed = FindLines() != 0;
1164  } else if (timeout_millisec > 0) {
1165  // Running with a timeout.
1166  ETEXT_DESC monitor;
1167  monitor.cancel = NULL;
1168  monitor.cancel_this = NULL;
1169  monitor.set_deadline_msecs(timeout_millisec);
1170 
1171  // Now run the main recognition.
1172  failed = Recognize(&monitor) < 0;
1173  } else {
1174  // Normal layout and character recognition with no timeout.
1175  failed = Recognize(NULL) < 0;
1176  }
1177 
// NOTE(review): the if-condition guarding this block is missing from this
// listing.
1179 #ifndef ANDROID_BUILD
// NOTE(review): page_pix is never destroyed in the visible code; if
// GetThresholdedImage returns an owned Pix this leaks it - confirm.
1180  Pix* page_pix = GetThresholdedImage();
1181  pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4);
1182 #endif // ANDROID_BUILD
1183  }
1184 
1185  if (failed && retry_config != NULL && retry_config[0] != '\0') {
1186  // Save current config variables before switching modes.
// NOTE(review): the result of fopen is not checked before it is used.
1187  FILE* fp = fopen(kOldVarsFile, "wb");
1188  PrintVariables(fp);
1189  fclose(fp);
1190  // Switch to alternate mode for retry.
1191  ReadConfigFile(retry_config);
1192  SetImage(pix);
// The outcome of the retry is not stored, so failed keeps its earlier value.
1193  Recognize(NULL);
1194  // Restore saved config variables.
1195  ReadConfigFile(kOldVarsFile);
1196  }
1197 
1198  if (renderer && !failed) {
1199  failed = !renderer->AddImage(this);
1200  }
1201 
// NOTE(review): a line is missing here in this listing.
1203  return !failed;
1204 }
1205 
1211  if (tesseract_ == NULL || page_res_ == NULL)
1212  return NULL;
1213  return new LTRResultIterator(
1217 }
1218 
1228  if (tesseract_ == NULL || page_res_ == NULL)
1229  return NULL;
1234 }
1235 
1245  if (tesseract_ == NULL || page_res_ == NULL)
1246  return NULL;
1247  return new MutableIterator(page_res_, tesseract_,
1251 }
1252 
1255  if (tesseract_ == NULL ||
1256  (!recognition_done_ && Recognize(NULL) < 0))
1257  return NULL;
1258  STRING text("");
1259  ResultIterator *it = GetIterator();
1260  do {
1261  if (it->Empty(RIL_PARA)) continue;
1262  const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
1263  text += para_text.get();
1264  } while (it->Next(RIL_PARA));
1265  char* result = new char[text.length() + 1];
1266  strncpy(result, text.string(), text.length() + 1);
1267  delete it;
1268  return result;
1269 }
1270 
1274 static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
1275  tesseract::Orientation orientation;
1276  tesseract::WritingDirection writing_direction;
1277  tesseract::TextlineOrder textline_order;
1278  float deskew_angle;
1279  it->Orientation(&orientation, &writing_direction, &textline_order,
1280  &deskew_angle);
1281  return orientation;
1282 }
1283 
1292 static void AddBaselineCoordsTohOCR(const PageIterator *it,
1293  PageIteratorLevel level,
1294  STRING* hocr_str) {
1295  tesseract::Orientation orientation = GetBlockTextOrientation(it);
1296  if (orientation != ORIENTATION_PAGE_UP) {
1297  hocr_str->add_str_int("; textangle ", 360 - orientation * 90);
1298  return;
1299  }
1300 
1301  int left, top, right, bottom;
1302  it->BoundingBox(level, &left, &top, &right, &bottom);
1303 
1304  // Try to get the baseline coordinates at this level.
1305  int x1, y1, x2, y2;
1306  if (!it->Baseline(level, &x1, &y1, &x2, &y2))
1307  return;
1308  // Following the description of this field of the hOCR spec, we convert the
1309  // baseline coordinates so that "the bottom left of the bounding box is the
1310  // origin".
1311  x1 -= left;
1312  x2 -= left;
1313  y1 -= bottom;
1314  y2 -= bottom;
1315 
1316  // Now fit a line through the points so we can extract coefficients for the
1317  // equation: y = p1 x + p0
1318  double p1 = 0;
1319  double p0 = 0;
1320  if (x1 == x2) {
1321  // Problem computing the polynomial coefficients.
1322  return;
1323  }
1324  p1 = (y2 - y1) / static_cast<double>(x2 - x1);
1325  p0 = y1 - static_cast<double>(p1 * x1);
1326 
1327  hocr_str->add_str_double("; baseline ", round(p1 * 1000.0) / 1000.0);
1328  hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0);
1329 }
1330 
1331 static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
1332  int num2) {
1333  const size_t BUFSIZE = 64;
1334  char id_buffer[BUFSIZE];
1335  if (num2 >= 0) {
1336  snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d", base.c_str(), num1, num2);
1337  } else {
1338  snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1);
1339  }
1340  id_buffer[BUFSIZE - 1] = '\0';
1341  *hocr_str += " id='";
1342  *hocr_str += id_buffer;
1343  *hocr_str += "'";
1344 }
1345 
1346 static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
1347  STRING* hocr_str) {
1348  int left, top, right, bottom;
1349  it->BoundingBox(level, &left, &top, &right, &bottom);
1350  // This is the only place we use double quotes instead of single quotes,
1351  // but it may too late to change for consistency
1352  hocr_str->add_str_int(" title=\"bbox ", left);
1353  hocr_str->add_str_int(" ", top);
1354  hocr_str->add_str_int(" ", right);
1355  hocr_str->add_str_int(" ", bottom);
1356  // Add baseline coordinates & heights for textlines only.
1357  if (level == RIL_TEXTLINE) {
1358  AddBaselineCoordsTohOCR(it, level, hocr_str);
1359  // add custom height measures
1360  float row_height, descenders, ascenders; // row attributes
1361  it->RowAttributes(&row_height, &descenders, &ascenders);
1362  // TODO(rays): Do we want to limit these to a single decimal place?
1363  hocr_str->add_str_double("; x_size ", row_height);
1364  hocr_str->add_str_double("; x_descenders ", descenders * -1);
1365  hocr_str->add_str_double("; x_ascenders ", ascenders);
1366  }
1367  *hocr_str += "\">";
1368 }
1369 
1370 static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level,
1371  STRING* hocr_str) {
1372  int left, top, right, bottom;
1373  it->BoundingBox(level, &left, &top, &right, &bottom);
1374  hocr_str->add_str_int("\t", left);
1375  hocr_str->add_str_int("\t", top);
1376  hocr_str->add_str_int("\t", right - left);
1377  hocr_str->add_str_int("\t", bottom - top);
1378 }
1379 
1389 char* TessBaseAPI::GetHOCRText(int page_number) {
1390  return GetHOCRText(NULL, page_number);
1391 }
1392 
// Builds the hOCR markup for the current page and returns it as a
// NUL-terminated buffer allocated with new[] (see the copy at the end).
//
// |monitor| is only forwarded to Recognize() when no page results exist yet.
// |page_number| is 0-based; element ids use page_number + 1, while the
// "ppageno" property is written with the 0-based value.
//
// Returns NULL when tesseract_ is NULL, or when there are no page results
// and Recognize() reports failure (a negative value).
1402 char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
1403  if (tesseract_ == NULL || (page_res_ == NULL && Recognize(monitor) < 0))
1404  return NULL;
1405 
  // Running 1-based counters used to build the ids of lines, blocks,
  // paragraphs and words.
1406  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1407  int page_id = page_number + 1; // hOCR uses 1-based page numbers.
1408  bool para_is_ltr = true; // Default direction is LTR
1409  const char* paragraph_lang = NULL;
  // Font name/size are only emitted when the "hocr_font_info" variable is set.
1410  bool font_info = false;
1411  GetBoolVariable("hocr_font_info", &font_info);
1412 
1413  STRING hocr_str("");
1414 
1415  if (input_file_ == NULL)
1416  SetInputName(NULL);
1417 
  // NOTE(review): this branch dereferences input_file_ unconditionally and
  // overwrites *input_file_ in place, whereas the code just below still
  // guards with `if (input_file_)`. Confirm SetInputName(NULL) always leaves
  // input_file_ non-NULL.
1418 #ifdef _WIN32
1419  // convert input name from ANSI encoding to utf-8
1420  int str16_len =
1421  MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, NULL, 0);
1422  wchar_t *uni16_str = new WCHAR[str16_len];
1423  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
1424  uni16_str, str16_len);
1425  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL, 0,
1426  NULL, NULL);
1427  char *utf8_str = new char[utf8_len];
1428  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
1429  utf8_len, NULL, NULL);
1430  *input_file_ = utf8_str;
1431  delete[] uni16_str;
1432  delete[] utf8_str;
1433 #endif
1434 
  // Page header: id, escaped image name, the rect_* members as bbox, and the
  // 0-based page number.
1435  hocr_str += " <div class='ocr_page'";
1436  AddIdTohOCR(&hocr_str, "page", page_id, -1);
1437  hocr_str += " title='image \"";
1438  if (input_file_) {
1439  hocr_str += HOcrEscape(input_file_->string());
1440  } else {
1441  hocr_str += "unknown";
1442  }
1443  hocr_str.add_str_int("\"; bbox ", rect_left_);
1444  hocr_str.add_str_int(" ", rect_top_);
1445  hocr_str.add_str_int(" ", rect_width_);
1446  hocr_str.add_str_int(" ", rect_height_);
1447  hocr_str.add_str_int("; ppageno ", page_number);
1448  hocr_str += "'>\n";
1449 
  // Walk the results word by word; the iterator is deleted before returning.
1450  ResultIterator *res_it = GetIterator();
1451  while (!res_it->Empty(RIL_BLOCK)) {
  // Skip positions that hold no word.
1452  if (res_it->Empty(RIL_WORD)) {
1453  res_it->Next(RIL_WORD);
1454  continue;
1455  }
1456 
1457  // Open any new block/paragraph/textline.
1458  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1459  para_is_ltr = true; // reset to default direction
1460  hocr_str += " <div class='ocr_carea'";
1461  AddIdTohOCR(&hocr_str, "block", page_id, bcnt);
1462  AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
1463  }
1464  if (res_it->IsAtBeginningOf(RIL_PARA)) {
1465  hocr_str += "\n <p class='ocr_par'";
1466  para_is_ltr = res_it->ParagraphIsLtr();
1467  if (!para_is_ltr) {
1468  hocr_str += " dir='rtl'";
1469  }
1470  AddIdTohOCR(&hocr_str, "par", page_id, pcnt);
  // The language of the paragraph's first word becomes the paragraph
  // language; words are later tagged only when they differ from it.
1471  paragraph_lang = res_it->WordRecognitionLanguage();
1472  if (paragraph_lang) {
1473  hocr_str += " lang='";
1474  hocr_str += paragraph_lang;
1475  hocr_str += "'";
1476  }
1477  AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
1478  }
1479  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1480  hocr_str += "\n <span class='ocr_line'";
1481  AddIdTohOCR(&hocr_str, "line", page_id, lcnt);
1482  AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
1483  }
1484 
1485  // Now, process the word...
1486  hocr_str += "<span class='ocrx_word'";
1487  AddIdTohOCR(&hocr_str, "word", page_id, wcnt);
1488  int left, top, right, bottom;
1489  bool bold, italic, underlined, monospace, serif, smallcaps;
1490  int pointsize, font_id;
1491  const char *font_name;
1492  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1493  font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
1494  &monospace, &serif, &smallcaps,
1495  &pointsize, &font_id);
1496  hocr_str.add_str_int(" title='bbox ", left);
1497  hocr_str.add_str_int(" ", top);
1498  hocr_str.add_str_int(" ", right);
1499  hocr_str.add_str_int(" ", bottom);
1500  hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
1501  if (font_info) {
1502  if (font_name) {
1503  hocr_str += "; x_font ";
1504  hocr_str += HOcrEscape(font_name);
1505  }
1506  hocr_str.add_str_int("; x_fsize ", pointsize);
1507  }
1508  hocr_str += "'";
  // Tag the word with its language only when it differs from the
  // paragraph's language (or when the paragraph has none).
1509  const char* lang = res_it->WordRecognitionLanguage();
1510  if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
1511  hocr_str += " lang='";
1512  hocr_str += lang;
1513  hocr_str += "'";
1514  }
1515  switch (res_it->WordDirection()) {
1516  // Only emit direction if different from current paragraph direction
1517  case DIR_LEFT_TO_RIGHT:
1518  if (!para_is_ltr) hocr_str += " dir='ltr'";
1519  break;
1520  case DIR_RIGHT_TO_LEFT:
1521  if (para_is_ltr) hocr_str += " dir='rtl'";
1522  break;
1523  case DIR_MIX:
1524  case DIR_NEUTRAL:
1525  default: // Do nothing.
1526  break;
1527  }
1528  hocr_str += ">";
  // Capture the "last word in ..." flags now: the symbol loop below moves
  // the iterator off this word, so they cannot be queried afterwards.
1529  bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
1530  bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
1531  bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
1532  if (bold) hocr_str += "<strong>";
1533  if (italic) hocr_str += "<em>";
  // Emit the word symbol by symbol, escaping each piece, until the iterator
  // reaches the start of the next word or runs out of blocks.
1534  do {
1535  const std::unique_ptr<const char[]> grapheme(
1536  res_it->GetUTF8Text(RIL_SYMBOL));
1537  if (grapheme && grapheme[0] != 0) {
1538  hocr_str += HOcrEscape(grapheme.get());
1539  }
1540  res_it->Next(RIL_SYMBOL);
1541  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
  // Close the emphasis tags in the reverse order of opening.
1542  if (italic) hocr_str += "</em>";
1543  if (bold) hocr_str += "</strong>";
1544  hocr_str += "</span> ";
1545  wcnt++;
1546  // Close any ending block/paragraph/textline.
1547  if (last_word_in_line) {
1548  hocr_str += "\n </span>";
1549  lcnt++;
1550  }
1551  if (last_word_in_para) {
1552  hocr_str += "\n </p>\n";
1553  pcnt++;
1554  para_is_ltr = true; // back to default direction
1555  }
1556  if (last_word_in_block) {
1557  hocr_str += " </div>\n";
1558  bcnt++;
1559  }
1560  }
  // Closes the ocr_page div opened above.
1561  hocr_str += " </div>\n";
1562 
  // Hand the text back in a new[] buffer.
1563  char *ret = new char[hocr_str.length() + 1];
1564  strcpy(ret, hocr_str.string());
1565  delete res_it;
1566  return ret;
1567 }
1568 
1574 char* TessBaseAPI::GetTSVText(int page_number) {
1575  if (tesseract_ == NULL || (page_res_ == NULL && Recognize(NULL) < 0))
1576  return NULL;
1577 
1578  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1579  int page_id = page_number + 1; // we use 1-based page numbers.
1580 
1581  STRING tsv_str("");
1582 
1583  int page_num = page_id, block_num = 0, par_num = 0, line_num = 0,
1584  word_num = 0;
1585 
1586  tsv_str.add_str_int("1\t", page_num); // level 1 - page
1587  tsv_str.add_str_int("\t", block_num);
1588  tsv_str.add_str_int("\t", par_num);
1589  tsv_str.add_str_int("\t", line_num);
1590  tsv_str.add_str_int("\t", word_num);
1591  tsv_str.add_str_int("\t", rect_left_);
1592  tsv_str.add_str_int("\t", rect_top_);
1593  tsv_str.add_str_int("\t", rect_width_);
1594  tsv_str.add_str_int("\t", rect_height_);
1595  tsv_str += "\t-1\t\n";
1596 
1597  ResultIterator* res_it = GetIterator();
1598  while (!res_it->Empty(RIL_BLOCK)) {
1599  if (res_it->Empty(RIL_WORD)) {
1600  res_it->Next(RIL_WORD);
1601  continue;
1602  }
1603 
1604  // Add rows for any new block/paragraph/textline.
1605  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1606  block_num++, par_num = 0, line_num = 0, word_num = 0;
1607  tsv_str.add_str_int("2\t", page_num); // level 2 - block
1608  tsv_str.add_str_int("\t", block_num);
1609  tsv_str.add_str_int("\t", par_num);
1610  tsv_str.add_str_int("\t", line_num);
1611  tsv_str.add_str_int("\t", word_num);
1612  AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str);
1613  tsv_str += "\t-1\t\n"; // end of row for block
1614  }
1615  if (res_it->IsAtBeginningOf(RIL_PARA)) {
1616  par_num++, line_num = 0, word_num = 0;
1617  tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph
1618  tsv_str.add_str_int("\t", block_num);
1619  tsv_str.add_str_int("\t", par_num);
1620  tsv_str.add_str_int("\t", line_num);
1621  tsv_str.add_str_int("\t", word_num);
1622  AddBoxToTSV(res_it, RIL_PARA, &tsv_str);
1623  tsv_str += "\t-1\t\n"; // end of row for para
1624  }
1625  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1626  line_num++, word_num = 0;
1627  tsv_str.add_str_int("4\t", page_num); // level 4 - line
1628  tsv_str.add_str_int("\t", block_num);
1629  tsv_str.add_str_int("\t", par_num);
1630  tsv_str.add_str_int("\t", line_num);
1631  tsv_str.add_str_int("\t", word_num);
1632  AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str);
1633  tsv_str += "\t-1\t\n"; // end of row for line
1634  }
1635 
1636  // Now, process the word...
1637  int left, top, right, bottom;
1638  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1639  word_num++;
1640  tsv_str.add_str_int("5\t", page_num); // level 5 - word
1641  tsv_str.add_str_int("\t", block_num);
1642  tsv_str.add_str_int("\t", par_num);
1643  tsv_str.add_str_int("\t", line_num);
1644  tsv_str.add_str_int("\t", word_num);
1645  tsv_str.add_str_int("\t", left);
1646  tsv_str.add_str_int("\t", top);
1647  tsv_str.add_str_int("\t", right - left);
1648  tsv_str.add_str_int("\t", bottom - top);
1649  tsv_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
1650  tsv_str += "\t";
1651 
1652  // Increment counts if at end of block/paragraph/textline.
1653  if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) lcnt++;
1654  if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) pcnt++;
1655  if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++;
1656 
1657  do {
1658  tsv_str +=
1659  std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
1660  res_it->Next(RIL_SYMBOL);
1661  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1662  tsv_str += "\n"; // end of row
1663  wcnt++;
1664  }
1665 
1666  char* ret = new char[tsv_str.length() + 1];
1667  strcpy(ret, tsv_str.string());
1668  delete res_it;
1669  return ret;
1670 }
1671 
// Sizing constants for the text buffer assembled by TessBaseAPI::GetBoxText.

// Count of numeric fields budgeted per box-file line (GetBoxText prints five
// integers after the symbol text).
1673 const int kNumbersPerBlob = 5;
// Characters budgeted for each of those numbers in the typical case.
1678 const int kBytesPerNumber = 5;
// Per-blob budget for one box-file line excluding the symbol text: every
// number plus one separator character, plus one more byte (31 in total).
1684 const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1;
// Characters budgeted for a single number in the worst case; presumably the
// decimal width of a 64-bit value, going by the name - TODO confirm.
1686 const int kBytesPer64BitNumber = 20;
// Worst-case budget for one complete line: every number at its worst-case
// width plus a separator, one more byte, and UNICHAR_LEN bytes for the
// symbol text. GetBoxText compares the remaining space against this after
// each line it writes.
1693 const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 +
1694  UNICHAR_LEN;
1695 
1702 char* TessBaseAPI::GetBoxText(int page_number) {
1703  if (tesseract_ == NULL ||
1704  (!recognition_done_ && Recognize(NULL) < 0))
1705  return NULL;
1706  int blob_count;
1707  int utf8_length = TextLength(&blob_count);
1708  int total_length = blob_count * kBytesPerBoxFileLine + utf8_length +
1710  char* result = new char[total_length];
1711  result[0] = '\0';
1712  int output_length = 0;
1714  do {
1715  int left, top, right, bottom;
1716  if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
1717  const std::unique_ptr</*non-const*/ char[]> text(
1718  it->GetUTF8Text(RIL_SYMBOL));
1719  // Tesseract uses space for recognition failure. Fix to a reject
1720  // character, kTesseractReject so we don't create illegal box files.
1721  for (int i = 0; text[i] != '\0'; ++i) {
1722  if (text[i] == ' ')
1723  text[i] = kTesseractReject;
1724  }
1725  snprintf(result + output_length, total_length - output_length,
1726  "%s %d %d %d %d %d\n", text.get(), left, image_height_ - bottom,
1727  right, image_height_ - top, page_number);
1728  output_length += strlen(result + output_length);
1729  // Just in case...
1730  if (output_length + kMaxBytesPerLine > total_length)
1731  break;
1732  }
1733  } while (it->Next(RIL_SYMBOL));
1734  delete it;
1735  return result;
1736 }
1737 
// Zero-terminated list of Unicode code points that have a stand-in below:
// euro sign, left/right double quotation marks, left/right single quotation
// marks, bullet and em dash. Entry i here is replaced by entry i of
// kLatinChs.
1743 const int kUniChs[] = {
1744  0x20ac, 0x201c, 0x201d, 0x2018, 0x2019, 0x2022, 0x2014, 0
1745 };
// Replacement code points, all <= 0xff, paired by index with kUniChs: cent
// sign, straight double quote (twice), apostrophe (twice), middle dot and
// hyphen-minus. The trailing 0 mirrors the terminator of kUniChs.
1747 const int kLatinChs[] = {
1748  0x00a2, 0x0022, 0x0022, 0x0027, 0x0027, 0x00b7, 0x002d, 0
1749 };
1750 
1757  if (tesseract_ == NULL ||
1758  (!recognition_done_ && Recognize(NULL) < 0))
1759  return NULL;
1760  bool tilde_crunch_written = false;
1761  bool last_char_was_newline = true;
1762  bool last_char_was_tilde = false;
1763 
1764  int total_length = TextLength(NULL);
1765  PAGE_RES_IT page_res_it(page_res_);
1766  char* result = new char[total_length];
1767  char* ptr = result;
1768  for (page_res_it.restart_page(); page_res_it.word () != NULL;
1769  page_res_it.forward()) {
1770  WERD_RES *word = page_res_it.word();
1771  // Process the current word.
1772  if (word->unlv_crunch_mode != CR_NONE) {
1773  if (word->unlv_crunch_mode != CR_DELETE &&
1774  (!tilde_crunch_written ||
1775  (word->unlv_crunch_mode == CR_KEEP_SPACE &&
1776  word->word->space() > 0 &&
1777  !word->word->flag(W_FUZZY_NON) &&
1778  !word->word->flag(W_FUZZY_SP)))) {
1779  if (!word->word->flag(W_BOL) &&
1780  word->word->space() > 0 &&
1781  !word->word->flag(W_FUZZY_NON) &&
1782  !word->word->flag(W_FUZZY_SP)) {
1783  /* Write a space to separate from preceding good text */
1784  *ptr++ = ' ';
1785  last_char_was_tilde = false;
1786  }
1787  if (!last_char_was_tilde) {
1788  // Write a reject char.
1789  last_char_was_tilde = true;
1790  *ptr++ = kUNLVReject;
1791  tilde_crunch_written = true;
1792  last_char_was_newline = false;
1793  }
1794  }
1795  } else {
1796  // NORMAL PROCESSING of non tilde crunched words.
1797  tilde_crunch_written = false;
1799  const char* wordstr = word->best_choice->unichar_string().string();
1800  const STRING& lengths = word->best_choice->unichar_lengths();
1801  int length = lengths.length();
1802  int i = 0;
1803  int offset = 0;
1804 
1805  if (last_char_was_tilde &&
1806  word->word->space() == 0 && wordstr[offset] == ' ') {
1807  // Prevent adjacent tilde across words - we know that adjacent tildes
1808  // within words have been removed.
1809  // Skip the first character.
1810  offset = lengths[i++];
1811  }
1812  if (i < length && wordstr[offset] != 0) {
1813  if (!last_char_was_newline)
1814  *ptr++ = ' ';
1815  else
1816  last_char_was_newline = false;
1817  for (; i < length; offset += lengths[i++]) {
1818  if (wordstr[offset] == ' ' ||
1819  wordstr[offset] == kTesseractReject) {
1820  *ptr++ = kUNLVReject;
1821  last_char_was_tilde = true;
1822  } else {
1823  if (word->reject_map[i].rejected())
1824  *ptr++ = kUNLVSuspect;
1825  UNICHAR ch(wordstr + offset, lengths[i]);
1826  int uni_ch = ch.first_uni();
1827  for (int j = 0; kUniChs[j] != 0; ++j) {
1828  if (kUniChs[j] == uni_ch) {
1829  uni_ch = kLatinChs[j];
1830  break;
1831  }
1832  }
1833  if (uni_ch <= 0xff) {
1834  *ptr++ = static_cast<char>(uni_ch);
1835  last_char_was_tilde = false;
1836  } else {
1837  *ptr++ = kUNLVReject;
1838  last_char_was_tilde = true;
1839  }
1840  }
1841  }
1842  }
1843  }
1844  if (word->word->flag(W_EOL) && !last_char_was_newline) {
1845  /* Add a new line output */
1846  *ptr++ = '\n';
1847  tilde_crunch_written = false;
1848  last_char_was_newline = true;
1849  last_char_was_tilde = false;
1850  }
1851  }
1852  *ptr++ = '\n';
1853  *ptr = '\0';
1854  return result;
1855 }
1856 
1866 bool TessBaseAPI::DetectOrientationScript(int* orient_deg, float* orient_conf,
1867  const char** script_name,
1868  float* script_conf) {
1869  OSResults osr;
1870 
1871  bool osd = DetectOS(&osr);
1872  if (!osd) {
1873  return false;
1874  }
1875 
1876  int orient_id = osr.best_result.orientation_id;
1877  int script_id = osr.get_best_script(orient_id);
1878  if (orient_conf) *orient_conf = osr.best_result.oconfidence;
1879  if (orient_deg) *orient_deg = orient_id * 90; // convert quadrant to degrees
1880 
1881  if (script_name) {
1882  const char* script = osr.unicharset->get_script_from_script_id(script_id);
1883 
1884  *script_name = script;
1885  }
1886 
1887  if (script_conf) *script_conf = osr.best_result.sconfidence;
1888 
1889  return true;
1890 }
1891 
1897 char* TessBaseAPI::GetOsdText(int page_number) {
1898  int orient_deg;
1899  float orient_conf;
1900  const char* script_name;
1901  float script_conf;
1902 
1903  if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name,
1904  &script_conf))
1905  return NULL;
1906 
1907  // clockwise rotation needed to make the page upright
1908  int rotate = OrientationIdToValue(orient_deg / 90);
1909 
1910  const int kOsdBufsize = 255;
1911  char* osd_buf = new char[kOsdBufsize];
1912  snprintf(osd_buf, kOsdBufsize,
1913  "Page number: %d\n"
1914  "Orientation in degrees: %d\n"
1915  "Rotate: %d\n"
1916  "Orientation confidence: %.2f\n"
1917  "Script: %s\n"
1918  "Script confidence: %.2f\n",
1919  page_number, orient_deg, rotate, orient_conf, script_name,
1920  script_conf);
1921 
1922  return osd_buf;
1923 }
1924 
1927  int* conf = AllWordConfidences();
1928  if (!conf) return 0;
1929  int sum = 0;
1930  int *pt = conf;
1931  while (*pt >= 0) sum += *pt++;
1932  if (pt != conf) sum /= pt - conf;
1933  delete [] conf;
1934  return sum;
1935 }
1936 
1939  if (tesseract_ == NULL ||
1940  (!recognition_done_ && Recognize(NULL) < 0))
1941  return NULL;
1942  int n_word = 0;
1943  PAGE_RES_IT res_it(page_res_);
1944  for (res_it.restart_page(); res_it.word() != NULL; res_it.forward())
1945  n_word++;
1946 
1947  int* conf = new int[n_word+1];
1948  n_word = 0;
1949  for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) {
1950  WERD_RES *word = res_it.word();
1951  WERD_CHOICE* choice = word->best_choice;
1952  int w_conf = static_cast<int>(100 + 5 * choice->certainty());
1953  // This is the eq for converting Tesseract confidence to 1..100
1954  if (w_conf < 0) w_conf = 0;
1955  if (w_conf > 100) w_conf = 100;
1956  conf[n_word++] = w_conf;
1957  }
1958  conf[n_word] = -1;
1959  return conf;
1960 }
1961 
1972 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char* wordstr) {
1973  int debug = 0;
1974  GetIntVariable("applybox_debug", &debug);
1975  bool success = true;
1976  PageSegMode current_psm = GetPageSegMode();
1977  SetPageSegMode(mode);
1978  SetVariable("classify_enable_learning", "0");
1979  const std::unique_ptr<const char[]> text(GetUTF8Text());
1980  if (debug) {
1981  tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr);
1982  }
1983  if (text != NULL) {
1984  PAGE_RES_IT it(page_res_);
1985  WERD_RES* word_res = it.word();
1986  if (word_res != NULL) {
1987  word_res->word->set_text(wordstr);
1988  } else {
1989  success = false;
1990  }
1991  // Check to see if text matches wordstr.
1992  int w = 0;
1993  int t = 0;
1994  for (t = 0; text[t] != '\0'; ++t) {
1995  if (text[t] == '\n' || text[t] == ' ')
1996  continue;
1997  while (wordstr[w] == ' ') ++w;
1998  if (text[t] != wordstr[w])
1999  break;
2000  ++w;
2001  }
2002  if (text[t] != '\0' || wordstr[w] != '\0') {
2003  // No match.
2004  delete page_res_;
2005  GenericVector<TBOX> boxes;
2009  PAGE_RES_IT pr_it(page_res_);
2010  if (pr_it.word() == NULL)
2011  success = false;
2012  else
2013  word_res = pr_it.word();
2014  } else {
2015  word_res->BestChoiceToCorrectText();
2016  }
2017  if (success) {
2018  tesseract_->EnableLearning = true;
2019  tesseract_->LearnWord(NULL, word_res);
2020  }
2021  } else {
2022  success = false;
2023  }
2024  SetPageSegMode(current_psm);
2025  return success;
2026 }
2027 
2035  if (thresholder_ != NULL)
2036  thresholder_->Clear();
2037  ClearResults();
2038  if (tesseract_ != NULL) SetInputImage(NULL);
2039 }
2040 
2048  Clear();
2049  delete thresholder_;
2050  thresholder_ = NULL;
2051  delete page_res_;
2052  page_res_ = NULL;
2053  delete block_list_;
2054  block_list_ = NULL;
2055  if (paragraph_models_ != NULL) {
2057  delete paragraph_models_;
2058  paragraph_models_ = NULL;
2059  }
2060  if (osd_tesseract_ == tesseract_) osd_tesseract_ = nullptr;
2061  delete tesseract_;
2062  tesseract_ = nullptr;
2063  delete osd_tesseract_;
2064  osd_tesseract_ = NULL;
2065  delete equ_detect_;
2066  equ_detect_ = NULL;
2067  delete input_file_;
2068  input_file_ = NULL;
2069  delete output_file_;
2070  output_file_ = NULL;
2071  delete datapath_;
2072  datapath_ = NULL;
2073  delete language_;
2074  language_ = NULL;
2075 }
2076 
2077 // Clear any library-level memory caches.
2078 // There are a variety of expensive-to-load constant data structures (mostly
2079 // language dictionaries) that are cached globally -- surviving the Init()
2080 // and End() of individual TessBaseAPI's. This function allows the clearing
2081 // of these caches.
2084 }
2085 
2090 int TessBaseAPI::IsValidWord(const char *word) {
2091  return tesseract_->getDict().valid_word(word);
2092 }
2093 // Returns true if utf8_character is defined in the UniCharset.
2094 bool TessBaseAPI::IsValidCharacter(const char *utf8_character) {
2095  return tesseract_->unicharset.contains_unichar(utf8_character);
2096 }
2097 
2098 
2099 // TODO(rays) Obsolete this function and replace with a more aptly named
2100 // function that returns image coordinates rather than tesseract coordinates.
2101 bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) {
2102  PageIterator* it = AnalyseLayout();
2103  if (it == NULL) {
2104  return false;
2105  }
2106  int x1, x2, y1, y2;
2107  it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
2108  // Calculate offset and slope (NOTE: Kind of ugly)
2109  if (x2 <= x1) x2 = x1 + 1;
2110  // Convert the point pair to slope/offset of the baseline (in image coords.)
2111  *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);
2112  *out_offset = static_cast<int>(y1 - *out_slope * x1);
2113  // Get the y-coord of the baseline at the left and right edges of the
2114  // textline's bounding box.
2115  int left, top, right, bottom;
2116  if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {
2117  delete it;
2118  return false;
2119  }
2120  int left_y = IntCastRounded(*out_slope * left + *out_offset);
2121  int right_y = IntCastRounded(*out_slope * right + *out_offset);
2122  // Shift the baseline down so it passes through the nearest bottom-corner
2123  // of the textline's bounding box. This is the difference between the y
2124  // at the lowest (max) edge of the box and the actual box bottom.
2125  *out_offset += bottom - MAX(left_y, right_y);
2126  // Switch back to bottom-up tesseract coordinates. Requires negation of
2127  // the slope and height - offset for the offset.
2128  *out_slope = -*out_slope;
2129  *out_offset = rect_height_ - *out_offset;
2130  delete it;
2131 
2132  return true;
2133 }
2134 
2137  if (tesseract_ != NULL) {
2139  }
2140 }
2141 
2151  if (tesseract_ != NULL) {
2153  // Set it for the sublangs too.
2154  int num_subs = tesseract_->num_sub_langs();
2155  for (int i = 0; i < num_subs; ++i) {
2157  }
2158  }
2159 }
2160 
2163  if (tesseract_ != NULL) tesseract_->fill_lattice_ = f;
2164 }
2165 
2168  if (tesseract_ == NULL) {
2169  tprintf("Please call Init before attempting to set an image.\n");
2170  return false;
2171  }
2172  if (thresholder_ == NULL)
2174  ClearResults();
2175  return true;
2176 }
2177 
2184 bool TessBaseAPI::Threshold(Pix** pix) {
2185  ASSERT_HOST(pix != NULL);
2186  if (*pix != NULL)
2187  pixDestroy(pix);
2188  // Zero resolution messes up the algorithms, so make sure it is credible.
2189  int y_res = thresholder_->GetScaledYResolution();
2190  if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
2191  // Use the minimum default resolution, as it is safer to under-estimate
2192  // than over-estimate resolution.
2193  tprintf("Warning. Invalid resolution %d dpi. Using %d instead.\n", y_res,
2196  }
2197  PageSegMode pageseg_mode =
2198  static_cast<PageSegMode>(
2199  static_cast<int>(tesseract_->tessedit_pageseg_mode));
2200  if (!thresholder_->ThresholdToPix(pageseg_mode, pix)) return false;
2204  if (!thresholder_->IsBinary()) {
2207  } else {
2209  tesseract_->set_pix_grey(NULL);
2210  }
2211  // Set the internal resolution that is used for layout parameters from the
2212  // estimated resolution, rather than the image resolution, which may be
2213  // fabricated, but we will use the image resolution, if there is one, to
2214  // report output point sizes.
2215  int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
2218  if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
2219  tprintf("Estimated resolution %d out of range! Corrected to %d\n",
2220  thresholder_->GetScaledEstimatedResolution(), estimated_res);
2221  }
2222  tesseract_->set_source_resolution(estimated_res);
2223  SavePixForCrash(estimated_res, *pix);
2224  return true;
2225 }
2226 
2229  if (thresholder_ == NULL || thresholder_->IsEmpty()) {
2230  tprintf("Please call SetImage before attempting recognition.\n");
2231  return -1;
2232  }
2233  if (recognition_done_)
2234  ClearResults();
2235  if (!block_list_->empty()) {
2236  return 0;
2237  }
2238  if (tesseract_ == NULL) {
2239  tesseract_ = new Tesseract;
2241  }
2242  if (tesseract_->pix_binary() == NULL &&
2244  return -1;
2245  }
2246 
2248 
2250  if (equ_detect_ == NULL && datapath_ != NULL) {
2251  equ_detect_ = new EquationDetect(datapath_->string(), NULL);
2252  }
2253  if (equ_detect_ == nullptr) {
2254  tprintf("Warning: Could not set equation detector\n");
2255  } else {
2257  }
2258  }
2259 
2260  Tesseract* osd_tess = osd_tesseract_;
2261  OSResults osr;
2263  osd_tess == nullptr) {
2264  if (strcmp(language_->string(), "osd") == 0) {
2265  osd_tess = tesseract_;
2266  } else {
2267  osd_tesseract_ = new Tesseract;
2268  TessdataManager mgr(reader_);
2269  if (datapath_ == nullptr) {
2270  tprintf("Warning: Auto orientation and script detection requested,"
2271  " but data path is undefined\n");
2272  delete osd_tesseract_;
2273  osd_tesseract_ = nullptr;
2274  } else if (osd_tesseract_->init_tesseract(datapath_->string(), nullptr,
2275  "osd", OEM_TESSERACT_ONLY,
2276  nullptr, 0, nullptr, nullptr,
2277  false, &mgr) == 0) {
2278  osd_tess = osd_tesseract_;
2281  } else {
2282  tprintf("Warning: Auto orientation and script detection requested,"
2283  " but osd language failed to load\n");
2284  delete osd_tesseract_;
2285  osd_tesseract_ = nullptr;
2286  }
2287  }
2288  }
2289 
2290  if (tesseract_->SegmentPage(input_file_, block_list_, osd_tess, &osr) < 0)
2291  return -1;
2292  // If Devanagari is being recognized, we use different images for page seg
2293  // and for OCR.
2294  tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
2295  return 0;
2296 }
2297 
2300  if (tesseract_ != NULL) {
2301  tesseract_->Clear();
2302  }
2303  if (page_res_ != NULL) {
2304  delete page_res_;
2305  page_res_ = NULL;
2306  }
2307  recognition_done_ = false;
2308  if (block_list_ == NULL)
2309  block_list_ = new BLOCK_LIST;
2310  else
2311  block_list_->clear();
2312  if (paragraph_models_ != NULL) {
2314  delete paragraph_models_;
2315  paragraph_models_ = NULL;
2316  }
2317  SavePixForCrash(0, NULL);
2318 }
2319 
2327 int TessBaseAPI::TextLength(int* blob_count) {
2328  if (tesseract_ == NULL || page_res_ == NULL)
2329  return 0;
2330 
2331  PAGE_RES_IT page_res_it(page_res_);
2332  int total_length = 2;
2333  int total_blobs = 0;
2334  // Iterate over the data structures to extract the recognition result.
2335  for (page_res_it.restart_page(); page_res_it.word () != NULL;
2336  page_res_it.forward()) {
2337  WERD_RES *word = page_res_it.word();
2338  WERD_CHOICE* choice = word->best_choice;
2339  if (choice != NULL) {
2340  total_blobs += choice->length() + 2;
2341  total_length += choice->unichar_string().length() + 2;
2342  for (int i = 0; i < word->reject_map.length(); ++i) {
2343  if (word->reject_map[i].rejected())
2344  ++total_length;
2345  }
2346  }
2347  }
2348  if (blob_count != NULL)
2349  *blob_count = total_blobs;
2350  return total_length;
2351 }
2352 
2358  if (tesseract_ == NULL)
2359  return false;
2360  ClearResults();
2361  if (tesseract_->pix_binary() == NULL &&
2363  return false;
2364  }
2365  if (input_file_ == NULL)
2366  input_file_ = new STRING(kInputFile);
2368 }
2369 
2371  tesseract_->min_orientation_margin.set_value(margin);
2372 }
2373 
2388 void TessBaseAPI::GetBlockTextOrientations(int** block_orientation,
2389  bool** vertical_writing) {
2390  delete[] *block_orientation;
2391  *block_orientation = NULL;
2392  delete[] *vertical_writing;
2393  *vertical_writing = NULL;
2394  BLOCK_IT block_it(block_list_);
2395 
2396  block_it.move_to_first();
2397  int num_blocks = 0;
2398  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2399  if (!block_it.data()->poly_block()->IsText()) {
2400  continue;
2401  }
2402  ++num_blocks;
2403  }
2404  if (!num_blocks) {
2405  tprintf("WARNING: Found no blocks\n");
2406  return;
2407  }
2408  *block_orientation = new int[num_blocks];
2409  *vertical_writing = new bool[num_blocks];
2410  block_it.move_to_first();
2411  int i = 0;
2412  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
2413  block_it.forward()) {
2414  if (!block_it.data()->poly_block()->IsText()) {
2415  continue;
2416  }
2417  FCOORD re_rotation = block_it.data()->re_rotation();
2418  float re_theta = re_rotation.angle();
2419  FCOORD classify_rotation = block_it.data()->classify_rotation();
2420  float classify_theta = classify_rotation.angle();
2421  double rot_theta = - (re_theta - classify_theta) * 2.0 / PI;
2422  if (rot_theta < 0) rot_theta += 4;
2423  int num_rotations = static_cast<int>(rot_theta + 0.5);
2424  (*block_orientation)[i] = num_rotations;
2425  // The classify_rotation is non-zero only if the text has vertical
2426  // writing direction.
2427  (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
2428  ++i;
2429  }
2430 }
2431 
2432 // ____________________________________________________________________________
2433 // Ocropus add-ons.
2434 
2437  FindLines();
2438  BLOCK_LIST* result = block_list_;
2439  block_list_ = NULL;
2440  return result;
2441 }
2442 
2448 void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
2449  delete block_list;
2450 }
2451 
2452 
2454  float xheight,
2455  float descender,
2456  float ascender) {
2457  int32_t xstarts[] = {-32000};
2458  double quad_coeffs[] = {0, 0, baseline};
2459  return new ROW(1,
2460  xstarts,
2461  quad_coeffs,
2462  xheight,
2463  ascender - (baseline + xheight),
2464  descender - baseline,
2465  0,
2466  0);
2467 }
2468 
2471  int width = pixGetWidth(pix);
2472  int height = pixGetHeight(pix);
2473  BLOCK block("a character", TRUE, 0, 0, 0, 0, width, height);
2474 
2475  // Create C_BLOBs from the page
2476  extract_edges(pix, &block);
2477 
2478  // Merge all C_BLOBs
2479  C_BLOB_LIST *list = block.blob_list();
2480  C_BLOB_IT c_blob_it(list);
2481  if (c_blob_it.empty())
2482  return NULL;
2483  // Move all the outlines to the first blob.
2484  C_OUTLINE_IT ol_it(c_blob_it.data()->out_list());
2485  for (c_blob_it.forward();
2486  !c_blob_it.at_first();
2487  c_blob_it.forward()) {
2488  C_BLOB *c_blob = c_blob_it.data();
2489  ol_it.add_list_after(c_blob->out_list());
2490  }
2491  // Convert the first blob to the output TBLOB.
2492  return TBLOB::PolygonalCopy(false, c_blob_it.data());
2493 }
2494 
2500 void TessBaseAPI::NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode) {
2501  TBOX box = tblob->bounding_box();
2502  float x_center = (box.left() + box.right()) / 2.0f;
2503  float baseline = row->base_line(x_center);
2504  float scale = kBlnXHeight / row->x_height();
2505  tblob->Normalize(NULL, NULL, NULL, x_center, baseline, scale, scale,
2506  0.0f, static_cast<float>(kBlnBaselineOffset), false, NULL);
2507 }
2508 
2513 TBLOB *make_tesseract_blob(float baseline, float xheight,
2514  float descender, float ascender,
2515  bool numeric_mode, Pix* pix) {
2516  TBLOB *tblob = TessBaseAPI::MakeTBLOB(pix);
2517 
2518  // Normalize TBLOB
2519  ROW *row =
2520  TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender);
2521  TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode);
2522  delete row;
2523  return tblob;
2524 }
2525 
2531 void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
2532  int length,
2533  float baseline,
2534  float xheight,
2535  float descender,
2536  float ascender) {
2537  UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length);
2538  TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender,
2540  tesseract_->pix_binary());
2541  float threshold;
2542  float best_rating = -100;
2543 
2544 
2545  // Classify to get a raw choice.
2546  BLOB_CHOICE_LIST choices;
2547  tesseract_->AdaptiveClassifier(blob, &choices);
2548  BLOB_CHOICE_IT choice_it;
2549  choice_it.set_to_list(&choices);
2550  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2551  choice_it.forward()) {
2552  if (choice_it.data()->rating() > best_rating) {
2553  best_rating = choice_it.data()->rating();
2554  }
2555  }
2556 
2557  threshold = tesseract_->matcher_good_threshold;
2558 
2559  if (blob->outlines)
2560  tesseract_->AdaptToChar(blob, id, kUnknownFontinfoId, threshold,
2562  delete blob;
2563 }
2564 
2565 
2566 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
2567  PAGE_RES *page_res = new PAGE_RES(false, block_list,
2569  tesseract_->recog_all_words(page_res, NULL, NULL, NULL, 1);
2570  return page_res;
2571 }
2572 
2573 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
2574  PAGE_RES* pass1_result) {
2575  if (!pass1_result)
2576  pass1_result = new PAGE_RES(false, block_list,
2578  tesseract_->recog_all_words(pass1_result, NULL, NULL, NULL, 2);
2579  return pass1_result;
2580 }
2581 
2582 void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
2583  int debug_level = 0;
2584  GetIntVariable("paragraph_debug_level", &debug_level);
2585  if (paragraph_models_ == NULL)
2587  MutableIterator *result_it = GetMutableIterator();
2588  do { // Detect paragraphs for this block
2590  ::tesseract::DetectParagraphs(debug_level, after_text_recognition,
2591  result_it, &models);
2592  *paragraph_models_ += models;
2593  } while (result_it->Next(RIL_BLOCK));
2594  delete result_it;
2595 }
2596 
2599  int length; // of unicode_repr
2600  float cost;
2602 
2603  TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
2604  length = (len == -1 ? strlen(repr) : len);
2605  unicode_repr = new char[length + 1];
2606  strncpy(unicode_repr, repr, length);
2607  }
2608 
2609  TESS_CHAR() { // Satisfies ELISTIZE.
2610  }
2612  delete [] unicode_repr;
2613  }
2614 };
2615 
2618 
2619 static void add_space(TESS_CHAR_IT* it) {
2620  TESS_CHAR *t = new TESS_CHAR(0, " ");
2621  it->add_after_then_move(t);
2622 }
2623 
2624 
// Maps a rating to a cost by adding 100 and clamping the result at zero.
// Ratings below -100 are not expected, but the clamp keeps the cost
// non-negative if one ever shows up. The clamp is written as a single
// expression so it does not add a separate branch line for coverage tools.
static float rating_to_cost(float rating) {
  const float shifted = 100 + rating;
  return shifted < 0 ? 0 : shifted;
}
2633 
2638 static void extract_result(TESS_CHAR_IT* out,
2639  PAGE_RES* page_res) {
2640  PAGE_RES_IT page_res_it(page_res);
2641  int word_count = 0;
2642  while (page_res_it.word() != NULL) {
2643  WERD_RES *word = page_res_it.word();
2644  const char *str = word->best_choice->unichar_string().string();
2645  const char *len = word->best_choice->unichar_lengths().string();
2646  TBOX real_rect = word->word->bounding_box();
2647 
2648  if (word_count)
2649  add_space(out);
2650  int n = strlen(len);
2651  for (int i = 0; i < n; i++) {
2652  TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()),
2653  str, *len);
2654  tc->box = real_rect.intersection(word->box_word->BlobBox(i));
2655  out->add_after_then_move(tc);
2656  str += *len;
2657  len++;
2658  }
2659  page_res_it.forward();
2660  word_count++;
2661  }
2662 }
2663 
2669  int** lengths,
2670  float** costs,
2671  int** x0,
2672  int** y0,
2673  int** x1,
2674  int** y1,
2675  PAGE_RES* page_res) {
2676  TESS_CHAR_LIST tess_chars;
2677  TESS_CHAR_IT tess_chars_it(&tess_chars);
2678  extract_result(&tess_chars_it, page_res);
2679  tess_chars_it.move_to_first();
2680  int n = tess_chars.length();
2681  int text_len = 0;
2682  *lengths = new int[n];
2683  *costs = new float[n];
2684  *x0 = new int[n];
2685  *y0 = new int[n];
2686  *x1 = new int[n];
2687  *y1 = new int[n];
2688  int i = 0;
2689  for (tess_chars_it.mark_cycle_pt();
2690  !tess_chars_it.cycled_list();
2691  tess_chars_it.forward(), i++) {
2692  TESS_CHAR *tc = tess_chars_it.data();
2693  text_len += (*lengths)[i] = tc->length;
2694  (*costs)[i] = tc->cost;
2695  (*x0)[i] = tc->box.left();
2696  (*y0)[i] = tc->box.bottom();
2697  (*x1)[i] = tc->box.right();
2698  (*y1)[i] = tc->box.top();
2699  }
2700  char *p = *text = new char[text_len];
2701 
2702  tess_chars_it.move_to_first();
2703  for (tess_chars_it.mark_cycle_pt();
2704  !tess_chars_it.cycled_list();
2705  tess_chars_it.forward()) {
2706  TESS_CHAR *tc = tess_chars_it.data();
2707  strncpy(p, tc->unicode_repr, tc->length);
2708  p += tc->length;
2709  }
2710  return n;
2711 }
2712 
2714 // The resulting features are returned in int_features, which must be
2715 // of size MAX_NUM_INT_FEATURES. The number of features is returned in
2716 // num_features (or 0 if there was a failure).
2717 // On return feature_outline_index is filled with an index of the outline
2718 // corresponding to each feature in int_features.
2719 // TODO(rays) Fix the caller to use outline_counts instead.
2721  INT_FEATURE_STRUCT* int_features,
2722  int* num_features,
2723  int* feature_outline_index) {
2724  GenericVector<int> outline_counts;
2727  INT_FX_RESULT_STRUCT fx_info;
2728  tesseract_->ExtractFeatures(*blob, false, &bl_features,
2729  &cn_features, &fx_info, &outline_counts);
2730  if (cn_features.empty() || cn_features.size() > MAX_NUM_INT_FEATURES) {
2731  *num_features = 0;
2732  return; // Feature extraction failed.
2733  }
2734  *num_features = cn_features.size();
2735  memcpy(int_features, &cn_features[0], *num_features * sizeof(cn_features[0]));
2736  // TODO(rays) Pass outline_counts back and simplify the calling code.
2737  if (feature_outline_index != NULL) {
2738  int f = 0;
2739  for (int i = 0; i < outline_counts.size(); ++i) {
2740  while (f < outline_counts[i])
2741  feature_outline_index[f++] = i;
2742  }
2743  }
2744 }
2745 
2746 // This method returns the row to which a box of specified dimensions would
2747 // belong. If no good match is found, it returns NULL.
2748 ROW* TessBaseAPI::FindRowForBox(BLOCK_LIST* blocks,
2749  int left, int top, int right, int bottom) {
2750  TBOX box(left, bottom, right, top);
2751  BLOCK_IT b_it(blocks);
2752  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
2753  BLOCK* block = b_it.data();
2754  if (!box.major_overlap(block->bounding_box()))
2755  continue;
2756  ROW_IT r_it(block->row_list());
2757  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
2758  ROW* row = r_it.data();
2759  if (!box.major_overlap(row->bounding_box()))
2760  continue;
2761  WERD_IT w_it(row->word_list());
2762  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
2763  WERD* word = w_it.data();
2764  if (box.major_overlap(word->bounding_box()))
2765  return row;
2766  }
2767  }
2768  }
2769  return NULL;
2770 }
2771 
2774  int num_max_matches,
2775  int* unichar_ids,
2776  float* ratings,
2777  int* num_matches_returned) {
2778  BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
2779  tesseract_->AdaptiveClassifier(blob, choices);
2780  BLOB_CHOICE_IT choices_it(choices);
2781  int& index = *num_matches_returned;
2782  index = 0;
2783  for (choices_it.mark_cycle_pt();
2784  !choices_it.cycled_list() && index < num_max_matches;
2785  choices_it.forward()) {
2786  BLOB_CHOICE* choice = choices_it.data();
2787  unichar_ids[index] = choice->unichar_id();
2788  ratings[index] = choice->rating();
2789  ++index;
2790  }
2791  *num_matches_returned = index;
2792  delete choices;
2793 }
2794 
2796 const char* TessBaseAPI::GetUnichar(int unichar_id) {
2797  return tesseract_->unicharset.id_to_unichar(unichar_id);
2798 }
2799 
2801 const Dawg *TessBaseAPI::GetDawg(int i) const {
2802  if (tesseract_ == NULL || i >= NumDawgs()) return NULL;
2803  return tesseract_->getDict().GetDawg(i);
2804 }
2805 
2808  return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs();
2809 }
2810 
2812 STRING HOcrEscape(const char* text) {
2813  STRING ret;
2814  const char *ptr;
2815  for (ptr = text; *ptr; ptr++) {
2816  switch (*ptr) {
2817  case '<': ret += "&lt;"; break;
2818  case '>': ret += "&gt;"; break;
2819  case '&': ret += "&amp;"; break;
2820  case '"': ret += "&quot;"; break;
2821  case '\'': ret += "&#39;"; break;
2822  default: ret += *ptr;
2823  }
2824  }
2825  return ret;
2826 }
2827 
2828 } // namespace tesseract.
const int kMinRectSize
Definition: baseapi.cpp:85
int orientation_id
Definition: osdetect.h:41
void signal_exit(int signal_code)
Definition: globaloc.cpp:52
const char * kOldVarsFile
Definition: baseapi.cpp:100
Dict & getDict() override
const int kLatinChs[]
Definition: baseapi.cpp:1747
const char * GetUnichar(int unichar_id)
Definition: baseapi.cpp:2796
float Confidence(PageIteratorLevel level) const
int(Dict::* DictFunc)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: baseapi.h:76
GenericVector< DoubleParam * > double_params
Definition: params.h:47
float angle() const
find angle
Definition: points.h:249
void SetFillLatticeFunc(FillLatticeFunc f)
Definition: baseapi.cpp:2162
virtual Pix * GetPixRectGrey()
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:82
const char * WordRecognitionLanguage() const
#define UNICHAR_LEN
Definition: unichar.h:31
TESSLINE * outlines
Definition: blobs.h:377
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:472
float base_line(float xpos) const
Definition: ocrrow.h:56
const char * string() const
Definition: params.h:202
bool empty() const
Definition: genericvector.h:91
void SetEquationDetect(EquationDetect *detector)
void SetRectangle(int left, int top, int width, int height)
WERD_CHOICE * best_choice
Definition: pageres.h:219
UNICHARSET * unicharset
Definition: osdetect.h:78
C_BLOB_LIST * blob_list()
get blobs
Definition: ocrblock.h:132
PageIterator * AnalyseLayout()
Definition: baseapi.cpp:778
REJMAP reject_map
Definition: pageres.h:271
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA *> *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel *> *models)
#define TRUE
Definition: capi.h:45
char * GetOsdText(int page_number)
Definition: baseapi.cpp:1897
void SetRectangle(int left, int top, int width, int height)
Definition: baseapi.cpp:592
void GetAvailableLanguagesAsVector(GenericVector< STRING > *langs) const
Definition: baseapi.cpp:439
const int kMaxBytesPerLine
Definition: baseapi.cpp:1693
constexpr int kMinCredibleResolution
Definition: publictypes.h:38
char * GetTSVText(int page_number)
Definition: baseapi.cpp:1574
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
Definition: werd.h:36
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
Definition: intfx.cpp:445
bool GetTextDirection(int *out_offset, float *out_slope)
Definition: baseapi.cpp:2101
void SetProbabilityInContextFunc(ProbabilityInContextFunc f)
Definition: baseapi.cpp:2150
Tesseract * osd_tesseract_
For orientation & script detection.
Definition: baseapi.h:859
WERD_LIST * word_list()
Definition: ocrrow.h:52
const char * kInputFile
Definition: baseapi.cpp:96
bool(* FileReader)(const STRING &filename, GenericVector< char > *data)
static TESS_LOCAL int TesseractExtractResult(char **text, int **lengths, float **costs, int **x0, int **y0, int **x1, int **y1, PAGE_RES *page_res)
Definition: baseapi.cpp:2668
const char * WordFontAttributes(bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:412
Pix * pix_grey() const
void delete_data_pointers()
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:527
PolyBlockType BlockType() const
Boxa * GetRegions(Pixa **pixa)
Definition: baseapi.cpp:617
double(Dict::* ProbabilityInContextFunc)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Definition: baseapi.h:78
unsigned char BOOL8
Definition: host.h:36
#define PERF_COUNT_START(FUNCT_NAME)
int GetScaledYResolution() const
Definition: thresholder.h:93
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:416
GenericVector< IntParam * > int_params
Definition: params.h:44
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:217
void split(const char c, GenericVector< STRING > *splited)
Definition: strngs.cpp:286
void assign(const char *cstr, int len)
Definition: strngs.cpp:422
GenericVector< ParagraphModel * > * paragraph_models_
Definition: baseapi.h:863
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_non_debug_params)
Definition: baseapi.cpp:326
static void ResetToDefaults(ParamsVectors *member_params)
Definition: params.cpp:198
Tesseract * get_sub_lang(int index) const
STRING * input_file_
Name used by training code.
Definition: baseapi.h:866
void * cancel_this
called whenever progress increases
Definition: ocrclass.h:127
void ExtractFontName(const STRING &filename, STRING *fontname)
Definition: blobclass.cpp:46
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:509
float y() const
Definition: points.h:212
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:185
int InitLangMod(const char *datapath, const char *language)
Definition: baseapi.cpp:453
#define PI
Definition: const.h:19
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:858
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates)
Definition: adaptmatch.cpp:872
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:173
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
EquationDetect * equ_detect_
The equation detector.
Definition: baseapi.h:860
STRING lang
Definition: ccutil.h:66
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: baseapi.cpp:555
void Orientation(tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
void add_str_double(const char *str, double number)
Definition: strngs.cpp:391
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
int16_t left() const
Definition: rect.h:68
int IsValidWord(const char *word)
Definition: baseapi.cpp:2090
MutableIterator * GetMutableIterator()
Definition: baseapi.cpp:1244
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
TBLOB * make_tesseract_blob(float baseline, float xheight, float descender, float ascender, bool numeric_mode, Pix *pix)
Definition: baseapi.cpp:2513
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:798
void SetSourceResolution(int ppi)
Definition: baseapi.cpp:565
void SetPageSegMode(PageSegMode mode)
Definition: baseapi.cpp:492
Definition: strngs.h:45
int OrientationIdToValue(const int &id)
Definition: osdetect.cpp:562
Definition: rect.h:30
void DeleteUnusedDawgs()
Definition: dawg_cache.h:43
char * GetUTF8Text(PageIteratorLevel level) const
virtual bool Next(PageIteratorLevel level)
void add_str_int(const char *str, int number)
Definition: strngs.cpp:381
const TBOX & BlobBox(int index) const
Definition: boxword.h:86
Definition: blobs.h:261
void CorrectClassifyWords(PAGE_RES *page_res)
Definition: applybox.cpp:772
void GetBlockTextOrientations(int **block_orientation, bool **vertical_writing)
Definition: baseapi.cpp:2388
bool DetectOS(OSResults *)
Definition: baseapi.cpp:2357
TESS_LOCAL LTRResultIterator * GetLTRIterator()
Definition: baseapi.cpp:1210
bool wordrec_run_blamer
Definition: wordrec.h:168
bool AdaptToWordStr(PageSegMode mode, const char *wordstr)
Definition: baseapi.cpp:1972
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:356
bool recognition_done_
page_res_ contains recognition data.
Definition: baseapi.h:871
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:305
FileReader reader_
Reads files from any filesystem.
Definition: baseapi.h:861
STRING HOcrEscape(const char *text)
Definition: baseapi.cpp:2812
int GetThresholdedImageScaleFactor() const
Definition: baseapi.cpp:756
ParamsVectors * params()
Definition: ccutil.h:62
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
bool GetDoubleVariable(const char *name, double *value) const
Definition: baseapi.cpp:300
#define BOOL
Definition: capi.h:44
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
int32_t length() const
Definition: rejctmap.h:226
const int kBlnXHeight
Definition: normalis.h:28
void GetFeaturesForBlob(TBLOB *blob, INT_FEATURE_STRUCT *int_features, int *num_features, int *feature_outline_index)
Definition: baseapi.cpp:2720
TESS_LOCAL void AdaptToCharacter(const char *unichar_repr, int length, float baseline, float xheight, float descender, float ascender)
Definition: baseapi.cpp:2531
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:62
const char * GetDatapath()
Definition: baseapi.cpp:921
FILE * init_recog_training(const STRING &fname)
TESS_LOCAL PAGE_RES * RecognitionPass1(BLOCK_LIST *block_list)
Definition: baseapi.cpp:2566
int length() const
Definition: ratngs.h:299
const STRING & unichar_lengths() const
Definition: ratngs.h:544
virtual Pix * GetPixRectThresholds()
int size() const
Definition: genericvector.h:72
STRING * language_
Last initialized language.
Definition: baseapi.h:869
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool WriteTRFile(const STRING &filename)
Definition: blobclass.cpp:97
bool IsEmpty() const
Return true if no image has been set.
Definition: thresholder.cpp:50
Definition: werd.h:35
BLOCK * block
Definition: pageres.h:99
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:172
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:334
bool IsValidCharacter(const char *utf8_character)
Definition: baseapi.cpp:2094
const char kUNLVReject
Definition: baseapi.cpp:89
OcrEngineMode oem() const
Definition: baseapi.h:759
Tesseract * tesseract() const
Definition: baseapi.h:757
PageSegMode GetPageSegMode() const
Definition: baseapi.cpp:499
void GetLoadedLanguagesAsVector(GenericVector< STRING > *langs) const
Definition: baseapi.cpp:425
void SavePixForCrash(int resolution, Pix *pix)
Definition: globaloc.cpp:34
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:285
const int kBytesPerNumber
Definition: baseapi.cpp:1678
bool SetDebugVariable(const char *name, const char *value)
Definition: baseapi.cpp:272
void set_deadline_msecs(int32_t deadline_msecs)
Definition: ocrclass.h:146
void Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor, float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift, float final_yshift, bool inverse, Pix *pix)
Definition: blobs.cpp:413
#define BOOL_VAR(name, val, comment)
Definition: params.h:279
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
void set_pix_original(Pix *original_pix)
int NumDawgs() const
Definition: baseapi.cpp:2807
const char * GetInputName()
Definition: baseapi.cpp:915
static TBLOB * PolygonalCopy(bool allow_detailed_fx, C_BLOB *src)
Definition: blobs.cpp:344
bool ProcessPagesInternal(const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1053
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:91
const int kBytesPerBoxFileLine
Definition: baseapi.cpp:1684
TruthCallback * truth_cb_
Definition: baseapi.h:872
STRING * output_file_
Name used by debug code.
Definition: baseapi.h:867
const int kMaxIntSize
Definition: baseapi.cpp:102
void chomp_string(char *str)
Definition: helpers.h:82
void pgeditor_main(int width, int height, PAGE_RES *page_res)
Definition: pgedit.cpp:337
int push_back(T object)
const int kUniChs[]
Definition: baseapi.cpp:1743
GenericVector< BoolParam * > bool_params
Definition: params.h:45
TBOX bounding_box() const
Definition: ocrrow.h:85
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
bool IsBinary() const
Returns true if the source image is binary.
Definition: thresholder.h:75
#define ELISTIZE(CLASSNAME)
Definition: elst.h:961
virtual TESS_LOCAL bool Threshold(Pix **pix)
Definition: baseapi.cpp:2184
void BestChoiceToCorrectText()
Definition: pageres.cpp:918
Pix * GetImage(PageIteratorLevel level, int padding, Pix *original_img, int *left, int *top) const
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool ProcessPages(const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1027
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
void SetInputName(const char *name)
Definition: baseapi.cpp:251
Definition: werd.h:60
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
void SetDictFunc(DictFunc f)
Definition: baseapi.cpp:2136
const char kUNLVSuspect
Definition: baseapi.cpp:91
WERD * word
Definition: pageres.h:175
int RecognizeForChopTest(ETEXT_DESC *monitor)
Definition: baseapi.cpp:879
void PrintVariables(FILE *fp) const
Definition: baseapi.cpp:314
const int kBytesPer64BitNumber
Definition: baseapi.cpp:1686
bool tessedit_resegment_from_line_boxes
Definition: ocrrow.h:32
void ClearAdaptiveClassifier()
Definition: baseapi.cpp:541
static size_t getOpenCLDevice(void **device)
Definition: baseapi.cpp:212
const STRING & unichar_string() const
Definition: ratngs.h:537
float x_height() const
Definition: ocrrow.h:61
void RunAdaptiveClassifier(TBLOB *blob, int num_max_matches, int *unichar_ids, float *ratings, int *num_matches_returned)
Definition: baseapi.cpp:2773
bool AnyLSTMLang() const
double matcher_good_threshold
Definition: classify.h:419
void set_min_orientation_margin(double margin)
Definition: baseapi.cpp:2370
OcrEngineMode last_oem_requested_
Last ocr language mode requested.
Definition: baseapi.h:870
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:862
#define FALSE
Definition: capi.h:46
TESS_LOCAL int TextLength(int *blob_count)
Definition: baseapi.cpp:2327
Boxa * GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:629
Pix * pix_original() const
float rating() const
Definition: ratngs.h:79
Orientation and script detection only.
Definition: publictypes.h:164
#define tprintf(...)
Definition: tprintf.h:31
void SetInputImage(Pix *pix)
Definition: baseapi.cpp:911
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
Definition: applybox.cpp:117
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
static void NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode)
Definition: baseapi.cpp:2500
bool LoadMemBuffer(const char *name, const char *data, int size)
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:865
tesseract::BoxWord * box_word
Definition: pageres.h:250
static bool GetParamAsString(const char *name, const ParamsVectors *member_params, STRING *value)
Definition: params.cpp:135
char * TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height)
Definition: baseapi.cpp:519
float oconfidence
Definition: osdetect.h:44
ROW_RES * row() const
Definition: pageres.h:739
bool classify_bln_numeric_mode
Definition: classify.h:499
int UNICHAR_ID
Definition: unichar.h:35
void set_pix_grey(Pix *grey_pix)
float sconfidence
Definition: osdetect.h:43
GenericVector< StringParam * > string_params
Definition: params.h:46
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:412
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:192
#define PERF_COUNT_SUB(SUB)
virtual void GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth, int *imageheight)
bool ProcessPage(Pix *pix, int page_index, const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1145
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:948
TESS_LOCAL bool InternalSetImage()
Definition: baseapi.cpp:2167
WERD_RES * restart_page()
Definition: pageres.h:683
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:191
bool DetectOrientationScript(int *orient_deg, float *orient_conf, const char **script_name, float *script_conf)
Definition: baseapi.cpp:1866
BLOCK_LIST * block_list_
The page layout.
Definition: baseapi.h:864
#define PERF_COUNT_END
STRING * datapath_
Current location of tessdata.
Definition: baseapi.h:868
Boxa * GetComponentImages(const PageIteratorLevel level, const bool text_only, const bool raw_image, const int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:674
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:122
Pix * GetBinaryImage(PageIteratorLevel level) const
bool stream_filelist
Definition: baseapi.cpp:80
char * GetBoxText(int page_number)
Definition: baseapi.cpp:1702
STRING datadir
Definition: ccutil.h:64
Boxa * GetConnectedComponents(Pixa **cc)
Definition: baseapi.cpp:662
Pix * GetThresholdedImage()
Definition: baseapi.cpp:603
Definition: points.h:189
WERD_RES * forward()
Definition: pageres.h:716
static const char * Version()
Definition: baseapi.cpp:196
void set_source_resolution(int ppi)
bool Empty(PageIteratorLevel level) const
BLOCK_LIST * FindLinesCreateBlockList()
Definition: baseapi.cpp:2436
int num_sub_langs() const
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
const int kNumbersPerBlob
Definition: baseapi.cpp:1673
virtual ~TessBaseAPI()
Definition: baseapi.cpp:189
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:286
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
Definition: tessedit.cpp:447
const Dawg * GetDawg(int i) const
Definition: baseapi.cpp:2801
#define DIR
Definition: polyaprx.cpp:39
CMD_EVENTS mode
Definition: pgedit.cpp:116
const char * c_str() const
Definition: strngs.cpp:209
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:414
int GetScaledEstimatedResolution() const
Definition: thresholder.h:106
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:125
const char * string() const
Definition: strngs.cpp:198
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
TBOX bounding_box() const
Definition: blobs.cpp:482
void SetOutputName(const char *name)
Definition: baseapi.cpp:259
void RowAttributes(float *row_height, float *descenders, float *ascenders) const
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:253
static void DeleteBlockList(BLOCK_LIST *block_list)
Definition: baseapi.cpp:2448
OSBestResult best_result
Definition: osdetect.h:79
void(Wordrec::* FillLatticeFunc)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: baseapi.h:85
#define MAX(x, y)
Definition: ndminx.h:24
bool AddImage(TessBaseAPI *api)
Definition: renderer.cpp:83
const char * GetStringVariable(const char *name) const
Definition: baseapi.cpp:294
void set_pix_thresholds(Pix *thresholds)
TESS_LOCAL int FindLines()
Definition: baseapi.cpp:2228
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288
Boxa * GetStrips(Pixa **pixa, int **blockids)
Definition: baseapi.cpp:643
int SegmentPage(const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
Boxa * GetWords(Pixa **pixa)
Definition: baseapi.cpp:652
TESS_LOCAL PAGE_RES * RecognitionPass2(BLOCK_LIST *block_list, PAGE_RES *pass1_result)
Definition: baseapi.cpp:2573
int IntCastRounded(double x)
Definition: helpers.h:179
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:33
int32_t length() const
Definition: strngs.cpp:193
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:668
constexpr int kMaxCredibleResolution
Definition: publictypes.h:40
BLOCK_RES * block() const
Definition: pageres.h:742
virtual void Run(A1, A2, A3, A4)=0
static void ClearPersistentCache()
Definition: baseapi.cpp:2082
float rating() const
Definition: ratngs.h:323
UNICHARSET unicharset
Definition: ccutil.h:68
bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
Automatic page segmentation, but no OSD, or OCR.
Definition: publictypes.h:167
ResultIterator * GetIterator()
Definition: baseapi.cpp:1227
const char * GetInitLanguagesAsString() const
Definition: baseapi.cpp:415
float certainty() const
Definition: ratngs.h:326
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:366
int16_t top() const
Definition: rect.h:54
const char kTesseractReject
Definition: baseapi.cpp:87
static ROW * MakeTessOCRRow(float baseline, float xheight, float descender, float ascender)
Definition: baseapi.cpp:2453
int first_uni() const
Definition: unichar.cpp:99
void set_text(const char *new_text)
Definition: werd.h:126
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:746
void ReadConfigFile(const char *filename)
Definition: baseapi.cpp:478
const int kBlnBaselineOffset
Definition: normalis.h:29
int16_t right() const
Definition: rect.h:75
int16_t bottom() const
Definition: rect.h:61
void SetSourceYResolution(int ppi)
Definition: thresholder.h:86
virtual void Clear()
Destroy the Pix if there is one, freeing memory.
Definition: thresholder.cpp:45
ROW * row
Definition: pageres.h:127
void ReadDebugConfigFile(const char *filename)
Definition: baseapi.cpp:483
uint8_t space()
Definition: werd.h:104
bool GetVariableAsString(const char *name, STRING *val)
Definition: baseapi.cpp:309
bool BeginDocument(const char *title)
Definition: renderer.cpp:72
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:853
static TBLOB * MakeTBLOB(Pix *pix)
Definition: baseapi.cpp:2470
void TrainLineRecognizer(const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
Definition: linerec.cpp:42
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
Definition: applybox.cpp:796
#define PACKAGE_VERSION
Definition: config_auto.h:131
int orientation_and_script_detection(STRING &filename, OSResults *osr, tesseract::Tesseract *tess)
Definition: osdetect.cpp:188
StrongScriptDirection WordDirection() const
virtual bool IsAtBeginningOf(PageIteratorLevel level) const
bool GetIntVariable(const char *name, int *value) const
Definition: baseapi.cpp:278
TBOX bounding_box() const
Definition: werd.cpp:160
Pix * pix_binary() const
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:706
virtual R Run()=0
TESS_LOCAL void DetectParagraphs(bool after_text_recognition)
Definition: baseapi.cpp:2582
TESS_API int get_best_script(int orientation_id) const
Definition: osdetect.cpp:111
bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const
#define MAX_PATH
Definition: platform.h:46
virtual bool Next(PageIteratorLevel level)
static void CatchSignals()
Definition: baseapi.cpp:232
Definition: ocrblock.h:30
bool SetVariable(const char *name, const char *value)
Definition: baseapi.cpp:266
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
Definition: baseapi.cpp:1402
static ROW * FindRowForBox(BLOCK_LIST *blocks, int left, int top, int right, int bottom)
Definition: baseapi.cpp:2748
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:60
TESS_CHAR(float _cost, const char *repr, int len=-1)
Definition: baseapi.cpp:2603
int GetSourceYResolution() const
Definition: thresholder.h:90
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:300
WERD_RES * word() const
Definition: pageres.h:736
#define MAX_NUM_INT_FEATURES
Definition: intproto.h:132
virtual bool ThresholdToPix(PageSegMode pageseg_mode, Pix **pix)
Returns false on error.