All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
equationdetect.cpp
Go to the documentation of this file.
1 // File: equationdetect.cpp
3 // Description: Helper classes to detect equations.
4 // Author: Zongyi (Joe) Liu (joeliu@google.com)
5 // Created: Fri Aug 31 11:13:01 PST 2011
6 //
7 // (C) Copyright 2011, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #include <mathfix.h>
23 #endif
24 
25 #ifdef __MINGW32__
26 #include <limits.h>
27 #endif
28 
29 #include <float.h>
30 
31 // Include automatically generated configuration file if running autoconf.
32 #ifdef HAVE_CONFIG_H
33 #include "config_auto.h"
34 #endif
35 
36 #include "equationdetect.h"
37 
38 #include "bbgrid.h"
39 #include "classify.h"
40 #include "colpartition.h"
41 #include "colpartitiongrid.h"
42 #include "colpartitionset.h"
43 #include "helpers.h"
44 #include "ratngs.h"
45 #include "tesseractclass.h"
46 
47 // Config variables.
48 BOOL_VAR(equationdetect_save_bi_image, false, "Save input bi image");
49 BOOL_VAR(equationdetect_save_spt_image, false, "Save special character image");
50 BOOL_VAR(equationdetect_save_seed_image, false, "Save the seed image");
51 BOOL_VAR(equationdetect_save_merged_image, false, "Save the merged image");
52 
53 namespace tesseract {
54 
56 // Utility ColParition sort functions.
58 static int SortCPByTopReverse(const void* p1, const void* p2) {
59  const ColPartition* cp1 = *reinterpret_cast<ColPartition* const*>(p1);
60  const ColPartition* cp2 = *reinterpret_cast<ColPartition* const*>(p2);
61  ASSERT_HOST(cp1 != NULL && cp2 != NULL);
62  const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
63  return box2.top() - box1.top();
64 }
65 
66 static int SortCPByBottom(const void* p1, const void* p2) {
67  const ColPartition* cp1 = *reinterpret_cast<ColPartition* const*>(p1);
68  const ColPartition* cp2 = *reinterpret_cast<ColPartition* const*>(p2);
69  ASSERT_HOST(cp1 != NULL && cp2 != NULL);
70  const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
71  return box1.bottom() - box2.bottom();
72 }
73 
74 static int SortCPByHeight(const void* p1, const void* p2) {
75  const ColPartition* cp1 = *reinterpret_cast<ColPartition* const*>(p1);
76  const ColPartition* cp2 = *reinterpret_cast<ColPartition* const*>(p2);
77  ASSERT_HOST(cp1 != NULL && cp2 != NULL);
78  const TBOX &box1(cp1->bounding_box()), &box2(cp2->bounding_box());
79  return box1.height() - box2.height();
80 }
81 
82 // TODO(joeliu): we may want to parameterize these constants.
83 const float kMathDigitDensityTh1 = 0.25;
84 const float kMathDigitDensityTh2 = 0.1;
85 const float kMathItalicDensityTh = 0.5;
86 const float kUnclearDensityTh = 0.25;
87 const int kSeedBlobsCountTh = 10;
89 
90 // Returns true if PolyBlockType is of text type or equation type.
92  return PTIsTextType(type) || type == PT_EQUATION;
93 }
94 
95 inline bool IsLeftIndented(const EquationDetect::IndentType type) {
96  return type == EquationDetect::LEFT_INDENT ||
98 }
99 
101  return type == EquationDetect::RIGHT_INDENT ||
103 }
104 
105 EquationDetect::EquationDetect(const char* equ_datapath,
106  const char* equ_name) {
107  const char* default_name = "equ";
108  if (equ_name == NULL) {
109  equ_name = default_name;
110  }
112  resolution_ = 0;
113  page_count_ = 0;
114 
115  // Construct equ_tesseract_.
116  equ_tesseract_ = new Tesseract();
117  if (equ_tesseract_->init_tesseract(equ_datapath, equ_name,
119  tprintf("Warning: equation region detection requested,"
120  " but %s failed to load from %s\n", equ_name, equ_datapath);
121  delete equ_tesseract_;
123  }
124 
126 }
127 
129  if (equ_tesseract_) {
130  delete (equ_tesseract_);
131  }
132  if (cps_super_bbox_) {
133  delete(cps_super_bbox_);
134  }
135 }
136 
138  lang_tesseract_ = lang_tesseract;
139 }
140 
141 void EquationDetect::SetResolution(const int resolution) {
142  resolution_ = resolution;
143 }
144 
146  if (to_block == NULL) {
147  tprintf("Warning: input to_block is NULL!\n");
148  return -1;
149  }
150 
152  blob_lists.push_back(&(to_block->blobs));
153  blob_lists.push_back(&(to_block->large_blobs));
154  for (int i = 0; i < blob_lists.size(); ++i) {
155  BLOBNBOX_IT bbox_it(blob_lists[i]);
156  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
157  bbox_it.forward()) {
158  bbox_it.data()->set_special_text_type(BSTT_NONE);
159  }
160  }
161 
162  return 0;
163 }
164 
166  BLOBNBOX *blobnbox, const int height_th) {
167  ASSERT_HOST(blobnbox != NULL);
168  if (blobnbox->bounding_box().height() < height_th && height_th > 0) {
169  // For small blob, we simply set to BSTT_NONE.
170  blobnbox->set_special_text_type(BSTT_NONE);
171  return;
172  }
173 
174  BLOB_CHOICE_LIST ratings_equ, ratings_lang;
175  C_BLOB* blob = blobnbox->cblob();
176  // TODO(joeliu/rays) Fix this. We may have to normalize separately for
177  // each classifier here, as they may require different PolygonalCopy.
178  TBLOB* tblob = TBLOB::PolygonalCopy(false, blob);
179  const TBOX& box = tblob->bounding_box();
180 
181  // Normalize the blob. Set the origin to the place we want to be the
182  // bottom-middle, and scaling is to make the height the x-height.
183  float scaling = static_cast<float>(kBlnXHeight) / box.height();
184  float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom();
185  TBLOB* normed_blob = new TBLOB(*tblob);
186  normed_blob->Normalize(NULL, NULL, NULL, x_orig, y_orig, scaling, scaling,
187  0.0f, static_cast<float>(kBlnBaselineOffset),
188  false, NULL);
189  equ_tesseract_->AdaptiveClassifier(normed_blob, &ratings_equ);
190  lang_tesseract_->AdaptiveClassifier(normed_blob, &ratings_lang);
191  delete normed_blob;
192  delete tblob;
193 
194  // Get the best choice from ratings_lang and rating_equ. As the choice in the
195  // list has already been sorted by the certainty, we simply use the first
196  // choice.
197  BLOB_CHOICE *lang_choice = NULL, *equ_choice = NULL;
198  if (ratings_lang.length() > 0) {
199  BLOB_CHOICE_IT choice_it(&ratings_lang);
200  lang_choice = choice_it.data();
201  }
202  if (ratings_equ.length() > 0) {
203  BLOB_CHOICE_IT choice_it(&ratings_equ);
204  equ_choice = choice_it.data();
205  }
206 
207  float lang_score = lang_choice ? lang_choice->certainty() : -FLT_MAX;
208  float equ_score = equ_choice ? equ_choice->certainty() : -FLT_MAX;
209 
210  const float kConfScoreTh = -5.0f, kConfDiffTh = 1.8;
211  // The scores here are negative, so the max/min == fabs(min/max).
212  // float ratio = fmax(lang_score, equ_score) / fmin(lang_score, equ_score);
213  float diff = fabs(lang_score - equ_score);
215 
216  // Classification.
217  if (fmax(lang_score, equ_score) < kConfScoreTh) {
218  // If both score are very small, then mark it as unclear.
219  type = BSTT_UNCLEAR;
220  } else if (diff > kConfDiffTh && equ_score > lang_score) {
221  // If equ_score is significantly higher, then we classify this character as
222  // math symbol.
223  type = BSTT_MATH;
224  } else if (lang_choice) {
225  // For other cases: lang_score is similar or significantly higher.
226  type = EstimateTypeForUnichar(
227  lang_tesseract_->unicharset, lang_choice->unichar_id());
228  }
229 
230  if (type == BSTT_NONE && lang_tesseract_->get_fontinfo_table().get(
231  lang_choice->fontinfo_id()).is_italic()) {
232  // For text symbol, we still check if it is italic.
234  } else {
235  blobnbox->set_special_text_type(type);
236  }
237 }
238 
240  const UNICHARSET& unicharset, const UNICHAR_ID id) const {
241  STRING s = unicharset.id_to_unichar(id);
242  if (unicharset.get_isalpha(id)) {
243  return BSTT_NONE;
244  }
245 
246  if (unicharset.get_ispunctuation(id)) {
247  // Exclude some special texts that are likely to be confused as math symbol.
248  static GenericVector<UNICHAR_ID> ids_to_exclude;
249  if (ids_to_exclude.empty()) {
250  static const STRING kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".",
251  "〈", "〉", "《", "》", "」", "「", ""};
252  int i = 0;
253  while (kCharsToEx[i] != "") {
254  ids_to_exclude.push_back(
255  unicharset.unichar_to_id(kCharsToEx[i++].string()));
256  }
257  ids_to_exclude.sort();
258  }
259  return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH;
260  }
261 
262  // Check if it is digit. In addition to the isdigit attribute, we also check
263  // if this character belongs to those likely to be confused with a digit.
264  static const STRING kDigitsChars = "|";
265  if (unicharset.get_isdigit(id) ||
266  (s.length() == 1 && kDigitsChars.contains(s[0]))) {
267  return BSTT_DIGIT;
268  } else {
269  return BSTT_MATH;
270  }
271 }
272 
274  // Set configuration for Tesseract::AdaptiveClassifier.
275  equ_tesseract_->tess_cn_matching.set_value(true); // turn it on
276  equ_tesseract_->tess_bn_matching.set_value(false);
277 
278  // Set the multiplier to zero for lang_tesseract_ to improve the accuracy.
279  int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier;
280  int classify_integer_matcher =
284 
286  ColPartition *part = NULL;
287  gsearch.StartFullSearch();
288  while ((part = gsearch.NextFullSearch()) != NULL) {
289  if (!IsTextOrEquationType(part->type())) {
290  continue;
291  }
292  IdentifyBlobsToSkip(part);
293  BLOBNBOX_C_IT bbox_it(part->boxes());
294  // Compute the height threshold.
295  GenericVector<int> blob_heights;
296  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
297  bbox_it.forward()) {
298  if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
299  blob_heights.push_back(bbox_it.data()->bounding_box().height());
300  }
301  }
302  blob_heights.sort();
303  int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2;
304  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
305  bbox_it.forward()) {
306  if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
307  IdentifySpecialText(bbox_it.data(), height_th);
308  }
309  }
310  }
311 
312  // Set the multiplier values back.
314  classify_class_pruner);
316  classify_integer_matcher);
317 
318  if (equationdetect_save_spt_image) { // For debug.
319  STRING outfile;
320  GetOutputTiffName("_spt", &outfile);
321  PaintSpecialTexts(outfile);
322  }
323 }
324 
326  ASSERT_HOST(part);
327  BLOBNBOX_C_IT blob_it(part->boxes());
328 
329  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
330  // At this moment, no blob should have been joined.
331  ASSERT_HOST(!blob_it.data()->joined_to_prev());
332  }
333  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
334  BLOBNBOX* blob = blob_it.data();
335  if (blob->joined_to_prev() || blob->special_text_type() == BSTT_SKIP) {
336  continue;
337  }
338  TBOX blob_box = blob->bounding_box();
339 
340  // Search if any blob can be merged into blob. If found, then we mark all
341  // these blobs as BSTT_SKIP.
342  BLOBNBOX_C_IT blob_it2 = blob_it;
343  bool found = false;
344  while (!blob_it2.at_last()) {
345  BLOBNBOX* nextblob = blob_it2.forward();
346  const TBOX& nextblob_box = nextblob->bounding_box();
347  if (nextblob_box.left() >= blob_box.right()) {
348  break;
349  }
350  const float kWidthR = 0.4, kHeightR = 0.3;
351  bool xoverlap = blob_box.major_x_overlap(nextblob_box),
352  yoverlap = blob_box.y_overlap(nextblob_box);
353  float widthR = static_cast<float>(
354  MIN(nextblob_box.width(), blob_box.width())) /
355  MAX(nextblob_box.width(), blob_box.width());
356  float heightR = static_cast<float>(
357  MIN(nextblob_box.height(), blob_box.height())) /
358  MAX(nextblob_box.height(), blob_box.height());
359 
360  if (xoverlap && yoverlap && widthR > kWidthR && heightR > kHeightR) {
361  // Found one, set nextblob type and recompute blob_box.
362  found = true;
363  nextblob->set_special_text_type(BSTT_SKIP);
364  blob_box += nextblob_box;
365  }
366  }
367  if (found) {
369  }
370  }
371 }
372 
374  ColPartitionGrid* part_grid, ColPartitionSet** best_columns) {
375  if (!equ_tesseract_ || !lang_tesseract_) {
376  tprintf("Warning: equ_tesseract_/lang_tesseract_ is NULL!\n");
377  return -1;
378  }
379  if (!part_grid || !best_columns) {
380  tprintf("part_grid/best_columns is NULL!!\n");
381  return -1;
382  }
383  cp_seeds_.clear();
384  part_grid_ = part_grid;
385  best_columns_ = best_columns;
387  STRING outfile;
388  page_count_++;
389 
391  GetOutputTiffName("_bi", &outfile);
392  pixWrite(outfile.string(), lang_tesseract_->pix_binary(), IFF_TIFF_G4);
393  }
394 
395  // Pass 0: Compute special text type for blobs.
397 
398  // Pass 1: Merge parts by overlap.
400 
401  // Pass 2: compute the math blob density and find the seed partition.
403  // We still need separate seed into block seed and inline seed partition.
405 
407  GetOutputTiffName("_seed", &outfile);
408  PaintColParts(outfile);
409  }
410 
411  // Pass 3: expand block equation seeds.
412  while (!cp_seeds_.empty()) {
413  GenericVector<ColPartition*> seeds_expanded;
414  for (int i = 0; i < cp_seeds_.size(); ++i) {
415  if (ExpandSeed(cp_seeds_[i])) {
416  // If this seed is expanded, then we add it into seeds_expanded. Note
417  // this seed has been removed from part_grid_ if it is expanded.
418  seeds_expanded.push_back(cp_seeds_[i]);
419  }
420  }
421  // Add seeds_expanded back into part_grid_ and reset cp_seeds_.
422  for (int i = 0; i < seeds_expanded.size(); ++i) {
423  InsertPartAfterAbsorb(seeds_expanded[i]);
424  }
425  cp_seeds_ = seeds_expanded;
426  }
427 
428  // Pass 4: find math block satellite text partitions and merge them.
430 
431  if (equationdetect_save_merged_image) { // For debug.
432  GetOutputTiffName("_merged", &outfile);
433  PaintColParts(outfile);
434  }
435 
436  return 0;
437 }
438 
440  while (true) {
441  ColPartition* part = NULL;
442  // partitions that have been updated.
443  GenericVector<ColPartition*> parts_updated;
445  gsearch.StartFullSearch();
446  while ((part = gsearch.NextFullSearch()) != NULL) {
447  if (!IsTextOrEquationType(part->type())) {
448  continue;
449  }
450  GenericVector<ColPartition*> parts_to_merge;
451  SearchByOverlap(part, &parts_to_merge);
452  if (parts_to_merge.empty()) {
453  continue;
454  }
455 
456  // Merge parts_to_merge with part, and remove them from part_grid_.
457  part_grid_->RemoveBBox(part);
458  for (int i = 0; i < parts_to_merge.size(); ++i) {
459  ASSERT_HOST(parts_to_merge[i] != NULL && parts_to_merge[i] != part);
460  part->Absorb(parts_to_merge[i], NULL);
461  }
462  gsearch.RepositionIterator();
463 
464  parts_updated.push_back(part);
465  }
466 
467  if (parts_updated.empty()) { // Exit the loop
468  break;
469  }
470 
471  // Re-insert parts_updated into part_grid_.
472  for (int i = 0; i < parts_updated.size(); ++i) {
473  InsertPartAfterAbsorb(parts_updated[i]);
474  }
475  }
476 }
477 
479  ColPartition* seed,
480  GenericVector<ColPartition*>* parts_overlap) {
481  ASSERT_HOST(seed != NULL && parts_overlap != NULL);
482  if (!IsTextOrEquationType(seed->type())) {
483  return;
484  }
486  const TBOX& seed_box(seed->bounding_box());
487  const int kRadNeighborCells = 30;
488  search.StartRadSearch((seed_box.left() + seed_box.right()) / 2,
489  (seed_box.top() + seed_box.bottom()) / 2,
490  kRadNeighborCells);
491  search.SetUniqueMode(true);
492 
493  // Search iteratively.
494  ColPartition *part;
496  const float kLargeOverlapTh = 0.95;
497  const float kEquXOverlap = 0.4, kEquYOverlap = 0.5;
498  while ((part = search.NextRadSearch()) != NULL) {
499  if (part == seed || !IsTextOrEquationType(part->type())) {
500  continue;
501  }
502  const TBOX& part_box(part->bounding_box());
503  bool merge = false;
504 
505  float x_overlap_fraction = part_box.x_overlap_fraction(seed_box),
506  y_overlap_fraction = part_box.y_overlap_fraction(seed_box);
507 
508  // If part is large overlapped with seed, then set merge to true.
509  if (x_overlap_fraction >= kLargeOverlapTh &&
510  y_overlap_fraction >= kLargeOverlapTh) {
511  merge = true;
512  } else if (seed->type() == PT_EQUATION &&
513  IsTextOrEquationType(part->type())) {
514  if ((x_overlap_fraction > kEquXOverlap && y_overlap_fraction > 0.0) ||
515  (x_overlap_fraction > 0.0 && y_overlap_fraction > kEquYOverlap)) {
516  merge = true;
517  }
518  }
519 
520  if (merge) { // Remove the part from search and put it into parts.
521  search.RemoveBBox();
522  parts_overlap->push_back(part);
523  }
524  }
525 }
526 
528  ASSERT_HOST(part);
529 
530  // Before insert part back into part_grid_, we will need re-compute some
531  // of its attributes such as first_column_, last_column_. However, we still
532  // want to preserve its type.
533  BlobTextFlowType flow_type = part->flow();
534  PolyBlockType part_type = part->type();
535  BlobRegionType blob_type = part->blob_type();
536 
537  // Call SetPartitionType to re-compute the attributes of part.
538  const TBOX& part_box(part->bounding_box());
539  int grid_x, grid_y;
541  part_box.left(), part_box.bottom(), &grid_x, &grid_y);
543 
544  // Reset the types back.
545  part->set_type(part_type);
546  part->set_blob_type(blob_type);
547  part->set_flow(flow_type);
548  part->SetBlobTypes();
549 
550  // Insert into part_grid_.
551  part_grid_->InsertBBox(true, true, part);
552 }
553 
556  ColPartition *part = NULL;
557  gsearch.StartFullSearch();
558 
559  GenericVector<ColPartition*> seeds1, seeds2;
560  // The left coordinates of indented text partitions.
561  GenericVector<int> indented_texts_left;
562  // The foreground density of text partitions.
563  GenericVector<float> texts_foreground_density;
564  while ((part = gsearch.NextFullSearch()) != NULL) {
565  if (!IsTextOrEquationType(part->type())) {
566  continue;
567  }
569  bool blobs_check = CheckSeedBlobsCount(part);
570  const int kTextBlobsTh = 20;
571 
572  if (CheckSeedDensity(kMathDigitDensityTh1, kMathDigitDensityTh2, part) &&
573  blobs_check) {
574  // Passed high density threshold test, save into seeds1.
575  seeds1.push_back(part);
576  } else {
577  IndentType indent = IsIndented(part);
578  if (IsLeftIndented(indent) && blobs_check &&
579  CheckSeedDensity(kMathDigitDensityTh2, kMathDigitDensityTh2, part)) {
580  // Passed low density threshold test and is indented, save into seeds2.
581  seeds2.push_back(part);
582  } else if (!IsRightIndented(indent) &&
583  part->boxes_count() > kTextBlobsTh) {
584  // This is likely to be a text part, save the features.
585  const TBOX&box = part->bounding_box();
586  if (IsLeftIndented(indent)) {
587  indented_texts_left.push_back(box.left());
588  }
589  texts_foreground_density.push_back(ComputeForegroundDensity(box));
590  }
591  }
592  }
593 
594  // Sort the features collected from text regions.
595  indented_texts_left.sort();
596  texts_foreground_density.sort();
597  float foreground_density_th = 0.15; // Default value.
598  if (!texts_foreground_density.empty()) {
599  // Use the median of the texts_foreground_density.
600  foreground_density_th = 0.8 * texts_foreground_density[
601  texts_foreground_density.size() / 2];
602  }
603 
604  for (int i = 0; i < seeds1.size(); ++i) {
605  const TBOX& box = seeds1[i]->bounding_box();
606  if (CheckSeedFgDensity(foreground_density_th, seeds1[i]) &&
607  !(IsLeftIndented(IsIndented(seeds1[i])) &&
608  CountAlignment(indented_texts_left, box.left()) >=
609  kLeftIndentAlignmentCountTh)) {
610  // Mark as PT_EQUATION type.
611  seeds1[i]->set_type(PT_EQUATION);
612  cp_seeds_.push_back(seeds1[i]);
613  } else { // Mark as PT_INLINE_EQUATION type.
614  seeds1[i]->set_type(PT_INLINE_EQUATION);
615  }
616  }
617 
618  for (int i = 0; i < seeds2.size(); ++i) {
619  if (CheckForSeed2(indented_texts_left, foreground_density_th, seeds2[i])) {
620  seeds2[i]->set_type(PT_EQUATION);
621  cp_seeds_.push_back(seeds2[i]);
622  }
623  }
624 }
625 
627 #if LIBLEPT_MINOR_VERSION < 69 && LIBLEPT_MAJOR_VERSION <= 1
628  // This will disable the detector because no seed will be identified.
629  return 1.0f;
630 #else
631  Pix *pix_bi = lang_tesseract_->pix_binary();
632  int pix_height = pixGetHeight(pix_bi);
633  Box* box = boxCreate(tbox.left(), pix_height - tbox.top(),
634  tbox.width(), tbox.height());
635  Pix *pix_sub = pixClipRectangle(pix_bi, box, NULL);
636  l_float32 fract;
637  pixForegroundFraction(pix_sub, &fract);
638  pixDestroy(&pix_sub);
639  boxDestroy(&box);
640 
641  return fract;
642 #endif
643 }
644 
645 bool EquationDetect::CheckSeedFgDensity(const float density_th,
646  ColPartition* part) {
647  ASSERT_HOST(part);
648 
649  // Split part horizontall, and check for each sub part.
650  GenericVector<TBOX> sub_boxes;
651  SplitCPHorLite(part, &sub_boxes);
652  float parts_passed = 0.0;
653  for (int i = 0; i < sub_boxes.size(); ++i) {
654  float density = ComputeForegroundDensity(sub_boxes[i]);
655  if (density < density_th) {
656  parts_passed++;
657  }
658  }
659 
660  // If most sub parts passed, then we return true.
661  const float kSeedPartRatioTh = 0.3;
662  bool retval = (parts_passed / sub_boxes.size() >= kSeedPartRatioTh);
663 
664  return retval;
665 }
666 
668  GenericVector<ColPartition*>* parts_splitted) {
669  ASSERT_HOST(part && parts_splitted);
670  if (part->median_width() == 0 || part->boxes_count() == 0) {
671  return;
672  }
673 
674  // Make a copy of part, and reset parts_splitted.
675  ColPartition* right_part = part->CopyButDontOwnBlobs();
676  parts_splitted->delete_data_pointers();
677  parts_splitted->clear();
678 
679  const double kThreshold = part->median_width() * 3.0;
680  bool found_split = true;
681  while (found_split) {
682  found_split = false;
683  BLOBNBOX_C_IT box_it(right_part->boxes());
684  // Blobs are sorted left side first. If blobs overlap,
685  // the previous blob may have a "more right" right side.
686  // Account for this by always keeping the largest "right"
687  // so far.
688  int previous_right = MIN_INT32;
689 
690  // Look for the next split in the partition.
691  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
692  const TBOX& box = box_it.data()->bounding_box();
693  if (previous_right != MIN_INT32 &&
694  box.left() - previous_right > kThreshold) {
695  // We have a split position. Split the partition in two pieces.
696  // Insert the left piece in the grid and keep processing the right.
697  int mid_x = (box.left() + previous_right) / 2;
698  ColPartition* left_part = right_part;
699  right_part = left_part->SplitAt(mid_x);
700 
701  parts_splitted->push_back(left_part);
702  left_part->ComputeSpecialBlobsDensity();
703  found_split = true;
704  break;
705  }
706 
707  // The right side of the previous blobs.
708  previous_right = MAX(previous_right, box.right());
709  }
710  }
711 
712  // Add the last piece.
713  right_part->ComputeSpecialBlobsDensity();
714  parts_splitted->push_back(right_part);
715 }
716 
718  GenericVector<TBOX>* splitted_boxes) {
719  ASSERT_HOST(part && splitted_boxes);
720  splitted_boxes->clear();
721  if (part->median_width() == 0) {
722  return;
723  }
724 
725  const double kThreshold = part->median_width() * 3.0;
726 
727  // Blobs are sorted left side first. If blobs overlap,
728  // the previous blob may have a "more right" right side.
729  // Account for this by always keeping the largest "right"
730  // so far.
731  TBOX union_box;
732  int previous_right = MIN_INT32;
733  BLOBNBOX_C_IT box_it(part->boxes());
734  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
735  const TBOX& box = box_it.data()->bounding_box();
736  if (previous_right != MIN_INT32 &&
737  box.left() - previous_right > kThreshold) {
738  // We have a split position.
739  splitted_boxes->push_back(union_box);
740  previous_right = MIN_INT32;
741  }
742  if (previous_right == MIN_INT32) {
743  union_box = box;
744  } else {
745  union_box += box;
746  }
747  // The right side of the previous blobs.
748  previous_right = MAX(previous_right, box.right());
749  }
750 
751  // Add the last piece.
752  if (previous_right != MIN_INT32) {
753  splitted_boxes->push_back(union_box);
754  }
755 }
756 
758  const GenericVector<int>& indented_texts_left,
759  const float foreground_density_th,
760  ColPartition* part) {
761  ASSERT_HOST(part);
762  const TBOX& box = part->bounding_box();
763 
764  // Check if it is aligned with any indented_texts_left.
765  if (!indented_texts_left.empty() &&
766  CountAlignment(indented_texts_left, box.left()) >=
767  kLeftIndentAlignmentCountTh) {
768  return false;
769  }
770 
771  // Check the foreground density.
772  if (ComputeForegroundDensity(box) > foreground_density_th) {
773  return false;
774  }
775 
776  return true;
777 }
778 
780  const GenericVector<int>& sorted_vec, const int val) const {
781  if (sorted_vec.empty()) {
782  return 0;
783  }
784  const int kDistTh = static_cast<int>(roundf(0.03 * resolution_));
785  int pos = sorted_vec.binary_search(val), count = 0;
786 
787  // Search left side.
788  int index = pos;
789  while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) {
790  count++;
791  }
792 
793  // Search right side.
794  index = pos + 1;
795  while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
796  count++;
797  }
798 
799  return count;
800 }
801 
805  int textparts_linespacing = EstimateTextPartLineSpacing();
806  IdentifyInlinePartsVertical(true, textparts_linespacing);
807  IdentifyInlinePartsVertical(false, textparts_linespacing);
808 }
809 
812  ColPartition *part = NULL;
813  gsearch.StartFullSearch();
814  if (cps_super_bbox_) {
815  delete cps_super_bbox_;
816  }
817  cps_super_bbox_ = new TBOX();
818  while ((part = gsearch.NextFullSearch()) != NULL) {
819  (*cps_super_bbox_) += part->bounding_box();
820  }
821 }
822 
826  const int kMarginDiffTh = IntCastRounded(
828  const int kGapTh = static_cast<int>(roundf(
831  search.SetUniqueMode(true);
832  // The center x coordinate of the cp_super_bbox_.
833  int cps_cx = cps_super_bbox_->left() + cps_super_bbox_->width() / 2;
834  for (int i = 0; i < cp_seeds_.size(); ++i) {
835  ColPartition* part = cp_seeds_[i];
836  const TBOX& part_box(part->bounding_box());
837  int left_margin = part_box.left() - cps_super_bbox_->left(),
838  right_margin = cps_super_bbox_->right() - part_box.right();
839  bool right_to_left;
840  if (left_margin + kMarginDiffTh < right_margin &&
841  left_margin < kMarginDiffTh) {
842  // part is left aligned, so we search if it has any right neighbor.
843  search.StartSideSearch(
844  part_box.right(), part_box.top(), part_box.bottom());
845  right_to_left = false;
846  } else if (left_margin > cps_cx) {
847  // part locates on the right half on image, so search if it has any left
848  // neighbor.
849  search.StartSideSearch(
850  part_box.left(), part_box.top(), part_box.bottom());
851  right_to_left = true;
852  } else { // part is not an inline equation.
853  new_seeds.push_back(part);
854  continue;
855  }
856  ColPartition* neighbor = NULL;
857  bool side_neighbor_found = false;
858  while ((neighbor = search.NextSideSearch(right_to_left)) != NULL) {
859  const TBOX& neighbor_box(neighbor->bounding_box());
860  if (!IsTextOrEquationType(neighbor->type()) ||
861  part_box.x_gap(neighbor_box) > kGapTh ||
862  !part_box.major_y_overlap(neighbor_box) ||
863  part_box.major_x_overlap(neighbor_box)) {
864  continue;
865  }
866  // We have found one. Set the side_neighbor_found flag.
867  side_neighbor_found = true;
868  break;
869  }
870  if (!side_neighbor_found) { // Mark part as PT_INLINE_EQUATION.
872  } else {
873  // Check the geometric feature of neighbor.
874  const TBOX& neighbor_box(neighbor->bounding_box());
875  if (neighbor_box.width() > part_box.width() &&
876  neighbor->type() != PT_EQUATION) { // Mark as PT_INLINE_EQUATION.
878  } else { // part is not an inline equation type.
879  new_seeds.push_back(part);
880  }
881  }
882  }
883 
884  // Reset the cp_seeds_ using the new_seeds.
885  cp_seeds_ = new_seeds;
886 }
887 
890 
891  // Get the y gap between text partitions;
892  ColPartition *current = NULL, *prev = NULL;
893  gsearch.StartFullSearch();
894  GenericVector<int> ygaps;
895  while ((current = gsearch.NextFullSearch()) != NULL) {
896  if (!PTIsTextType(current->type())) {
897  continue;
898  }
899  if (prev != NULL) {
900  const TBOX &current_box = current->bounding_box();
901  const TBOX &prev_box = prev->bounding_box();
902  // prev and current should be x major overlap and non y overlap.
903  if (current_box.major_x_overlap(prev_box) &&
904  !current_box.y_overlap(prev_box)) {
905  int gap = current_box.y_gap(prev_box);
906  if (gap < MIN(current_box.height(), prev_box.height())) {
907  // The gap should be smaller than the height of the bounding boxes.
908  ygaps.push_back(gap);
909  }
910  }
911  }
912  prev = current;
913  }
914 
915  if (ygaps.size() < 8) { // We do not have enough data.
916  return -1;
917  }
918 
919  // Compute the line spacing from ygaps: use the mean of the first half.
920  ygaps.sort();
921  int spacing = 0, count;
922  for (count = 0; count < ygaps.size() / 2; count++) {
923  spacing += ygaps[count];
924  }
925  return spacing / count;
926 }
927 
929  const bool top_to_bottom, const int textparts_linespacing) {
930  if (cp_seeds_.empty()) {
931  return;
932  }
933 
934  // Sort cp_seeds_.
935  if (top_to_bottom) { // From top to bottom.
936  cp_seeds_.sort(&SortCPByTopReverse);
937  } else { // From bottom to top.
938  cp_seeds_.sort(&SortCPByBottom);
939  }
940 
942  for (int i = 0; i < cp_seeds_.size(); ++i) {
943  ColPartition* part = cp_seeds_[i];
944  // If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look
945  // for its top neighbors, so that if two/more inline regions are connected
946  // to each other, then we will identify the top one, and then use it to
947  // identify the bottom one.
948  if (IsInline(!top_to_bottom, textparts_linespacing, part)) {
950  } else {
951  new_seeds.push_back(part);
952  }
953  }
954  cp_seeds_ = new_seeds;
955 }
956 
957 bool EquationDetect::IsInline(const bool search_bottom,
958  const int textparts_linespacing,
959  ColPartition* part) {
960  ASSERT_HOST(part != NULL);
961  // Look for its nearest vertical neighbor that hardly overlaps in y but
962  // largely overlaps in x.
964  ColPartition *neighbor = NULL;
965  const TBOX& part_box(part->bounding_box());
966  const float kYGapRatioTh = 1.0;
967 
968  if (search_bottom) {
969  search.StartVerticalSearch(part_box.left(), part_box.right(),
970  part_box.bottom());
971  } else {
972  search.StartVerticalSearch(part_box.left(), part_box.right(),
973  part_box.top());
974  }
975  search.SetUniqueMode(true);
976  while ((neighbor = search.NextVerticalSearch(search_bottom)) != NULL) {
977  const TBOX& neighbor_box(neighbor->bounding_box());
978  if (part_box.y_gap(neighbor_box) > kYGapRatioTh *
979  MIN(part_box.height(), neighbor_box.height())) {
980  // Finished searching.
981  break;
982  }
983  if (!PTIsTextType(neighbor->type())) {
984  continue;
985  }
986 
987  // Check if neighbor and part is inline similar.
988  const float kHeightRatioTh = 0.5;
989  const int kYGapTh = textparts_linespacing > 0 ?
990  textparts_linespacing + static_cast<int>(roundf(0.02 * resolution_)):
991  static_cast<int>(roundf(0.05 * resolution_)); // Default value.
992  if (part_box.x_overlap(neighbor_box) && // Location feature.
993  part_box.y_gap(neighbor_box) <= kYGapTh && // Line spacing.
994  // Geo feature.
995  static_cast<float>(MIN(part_box.height(), neighbor_box.height())) /
996  MAX(part_box.height(), neighbor_box.height()) > kHeightRatioTh) {
997  return true;
998  }
999  }
1000 
1001  return false;
1002 }
1003 
1005  if (!part) {
1006  return false;
1007  }
1008  const int kSeedMathBlobsCount = 2;
1009  const int kSeedMathDigitBlobsCount = 5;
1010 
1011  int blobs = part->boxes_count(),
1012  math_blobs = part->SpecialBlobsCount(BSTT_MATH),
1013  digit_blobs = part->SpecialBlobsCount(BSTT_DIGIT);
1014  if (blobs < kSeedBlobsCountTh || math_blobs <= kSeedMathBlobsCount ||
1015  math_blobs + digit_blobs <= kSeedMathDigitBlobsCount) {
1016  return false;
1017  }
1018 
1019  return true;
1020 }
1021 
1023  const float math_density_high,
1024  const float math_density_low,
1025  const ColPartition* part) const {
1026  ASSERT_HOST(part);
1027  float math_digit_density = part->SpecialBlobsDensity(BSTT_MATH)
1029  float italic_density = part->SpecialBlobsDensity(BSTT_ITALIC);
1030  if (math_digit_density > math_density_high) {
1031  return true;
1032  }
1033  if (math_digit_density + italic_density > kMathItalicDensityTh &&
1034  math_digit_density > math_density_low) {
1035  return true;
1036  }
1037 
1038  return false;
1039 }
1040 
1042  ASSERT_HOST(part);
1043 
1045  ColPartition *neighbor = NULL;
1046  const TBOX& part_box(part->bounding_box());
1047  const int kXGapTh = static_cast<int>(roundf(0.5 * resolution_));
1048  const int kRadiusTh = static_cast<int>(roundf(3.0 * resolution_));
1049  const int kYGapTh = static_cast<int>(roundf(0.5 * resolution_));
1050 
1051  // Here we use a simple approximation algorithm: from the center of part, We
1052  // perform the radius search, and check if we can find a neighboring parition
1053  // that locates on the top/bottom left of part.
1054  search.StartRadSearch((part_box.left() + part_box.right()) / 2,
1055  (part_box.top() + part_box.bottom()) / 2, kRadiusTh);
1056  search.SetUniqueMode(true);
1057  bool left_indented = false, right_indented = false;
1058  while ((neighbor = search.NextRadSearch()) != NULL &&
1059  (!left_indented || !right_indented)) {
1060  if (neighbor == part) {
1061  continue;
1062  }
1063  const TBOX& neighbor_box(neighbor->bounding_box());
1064 
1065  if (part_box.major_y_overlap(neighbor_box) &&
1066  part_box.x_gap(neighbor_box) < kXGapTh) {
1067  // When this happens, it is likely part is a fragment of an
1068  // over-segmented colpartition. So we return false.
1069  return NO_INDENT;
1070  }
1071 
1072  if (!IsTextOrEquationType(neighbor->type())) {
1073  continue;
1074  }
1075 
1076  // The neighbor should be above/below part, and overlap in x direction.
1077  if (!part_box.x_overlap(neighbor_box) || part_box.y_overlap(neighbor_box)) {
1078  continue;
1079  }
1080 
1081  if (part_box.y_gap(neighbor_box) < kYGapTh) {
1082  int left_gap = part_box.left() - neighbor_box.left();
1083  int right_gap = neighbor_box.right() - part_box.right();
1084  if (left_gap > kXGapTh) {
1085  left_indented = true;
1086  }
1087  if (right_gap > kXGapTh) {
1088  right_indented = true;
1089  }
1090  }
1091  }
1092 
1093  if (left_indented && right_indented) {
1094  return BOTH_INDENT;
1095  }
1096  if (left_indented) {
1097  return LEFT_INDENT;
1098  }
1099  if (right_indented) {
1100  return RIGHT_INDENT;
1101  }
1102  return NO_INDENT;
1103 }
1104 
1106  if (seed == NULL || // This seed has been absorbed by other seeds.
1107  seed->IsVerticalType()) { // We skip vertical type right now.
1108  return false;
1109  }
1110 
1111  // Expand in four directions.
1112  GenericVector<ColPartition*> parts_to_merge;
1113  ExpandSeedHorizontal(true, seed, &parts_to_merge);
1114  ExpandSeedHorizontal(false, seed, &parts_to_merge);
1115  ExpandSeedVertical(true, seed, &parts_to_merge);
1116  ExpandSeedVertical(false, seed, &parts_to_merge);
1117  SearchByOverlap(seed, &parts_to_merge);
1118 
1119  if (parts_to_merge.empty()) { // We don't find any partition to merge.
1120  return false;
1121  }
1122 
1123  // Merge all partitions in parts_to_merge with seed. We first remove seed
1124  // from part_grid_ as its bounding box is going to expand. Then we add it
1125  // back after it aborbs all parts_to_merge parititions.
1126  part_grid_->RemoveBBox(seed);
1127  for (int i = 0; i < parts_to_merge.size(); ++i) {
1128  ColPartition* part = parts_to_merge[i];
1129  if (part->type() == PT_EQUATION) {
1130  // If part is in cp_seeds_, then we mark it as NULL so that we won't
1131  // process it again.
1132  for (int j = 0; j < cp_seeds_.size(); ++j) {
1133  if (part == cp_seeds_[j]) {
1134  cp_seeds_[j] = NULL;
1135  break;
1136  }
1137  }
1138  }
1139 
1140  // part has already been removed from part_grid_ in function
1141  // ExpandSeedHorizontal/ExpandSeedVertical.
1142  seed->Absorb(part, NULL);
1143  }
1144 
1145  return true;
1146 }
1147 
1149  const bool search_left,
1150  ColPartition* seed,
1151  GenericVector<ColPartition*>* parts_to_merge) {
1152  ASSERT_HOST(seed != NULL && parts_to_merge != NULL);
1153  const float kYOverlapTh = 0.6;
1154  const int kXGapTh = static_cast<int>(roundf(0.2 * resolution_));
1155 
1157  const TBOX& seed_box(seed->bounding_box());
1158  int x = search_left ? seed_box.left() : seed_box.right();
1159  search.StartSideSearch(x, seed_box.bottom(), seed_box.top());
1160  search.SetUniqueMode(true);
1161 
1162  // Search iteratively.
1163  ColPartition *part = NULL;
1164  while ((part = search.NextSideSearch(search_left)) != NULL) {
1165  if (part == seed) {
1166  continue;
1167  }
1168  const TBOX& part_box(part->bounding_box());
1169  if (part_box.x_gap(seed_box) > kXGapTh) { // Out of scope.
1170  break;
1171  }
1172 
1173  // Check part location.
1174  if ((part_box.left() >= seed_box.left() && search_left) ||
1175  (part_box.right() <= seed_box.right() && !search_left)) {
1176  continue;
1177  }
1178 
1179  if (part->type() != PT_EQUATION) { // Non-equation type.
1180  // Skip PT_LINLINE_EQUATION and non text type.
1181  if (part->type() == PT_INLINE_EQUATION ||
1182  (!IsTextOrEquationType(part->type()) &&
1183  part->blob_type() != BRT_HLINE)) {
1184  continue;
1185  }
1186  // For other types, it should be the near small neighbor of seed.
1187  if (!IsNearSmallNeighbor(seed_box, part_box) ||
1188  !CheckSeedNeighborDensity(part)) {
1189  continue;
1190  }
1191  } else { // Equation type, check the y overlap.
1192  if (part_box.y_overlap_fraction(seed_box) < kYOverlapTh &&
1193  seed_box.y_overlap_fraction(part_box) < kYOverlapTh) {
1194  continue;
1195  }
1196  }
1197 
1198  // Passed the check, delete it from search and add into parts_to_merge.
1199  search.RemoveBBox();
1200  parts_to_merge->push_back(part);
1201  }
1202 }
1203 
1205  const bool search_bottom,
1206  ColPartition* seed,
1207  GenericVector<ColPartition*>* parts_to_merge) {
1208  ASSERT_HOST(seed != NULL && parts_to_merge != NULL &&
1209  cps_super_bbox_ != NULL);
1210  const float kXOverlapTh = 0.4;
1211  const int kYGapTh = static_cast<int>(roundf(0.2 * resolution_));
1212 
1214  const TBOX& seed_box(seed->bounding_box());
1215  int y = search_bottom ? seed_box.bottom() : seed_box.top();
1216  search.StartVerticalSearch(
1218  search.SetUniqueMode(true);
1219 
1220  // Search iteratively.
1221  ColPartition *part = NULL;
1223  int skipped_min_top = INT_MAX, skipped_max_bottom = -1;
1224  while ((part = search.NextVerticalSearch(search_bottom)) != NULL) {
1225  if (part == seed) {
1226  continue;
1227  }
1228  const TBOX& part_box(part->bounding_box());
1229 
1230  if (part_box.y_gap(seed_box) > kYGapTh) { // Out of scope.
1231  break;
1232  }
1233 
1234  // Check part location.
1235  if ((part_box.bottom() >= seed_box.bottom() && search_bottom) ||
1236  (part_box.top() <= seed_box.top() && !search_bottom)) {
1237  continue;
1238  }
1239 
1240  bool skip_part = false;
1241  if (part->type() != PT_EQUATION) { // Non-equation type.
1242  // Skip PT_LINLINE_EQUATION and non text type.
1243  if (part->type() == PT_INLINE_EQUATION ||
1244  (!IsTextOrEquationType(part->type()) &&
1245  part->blob_type() != BRT_HLINE)) {
1246  skip_part = true;
1247  } else if (!IsNearSmallNeighbor(seed_box, part_box) ||
1248  !CheckSeedNeighborDensity(part)) {
1249  // For other types, it should be the near small neighbor of seed.
1250  skip_part = true;
1251  }
1252  } else { // Equation type, check the x overlap.
1253  if (part_box.x_overlap_fraction(seed_box) < kXOverlapTh &&
1254  seed_box.x_overlap_fraction(part_box) < kXOverlapTh) {
1255  skip_part = true;
1256  }
1257  }
1258  if (skip_part) {
1259  if (part->type() != PT_EQUATION) {
1260  if (skipped_min_top > part_box.top()) {
1261  skipped_min_top = part_box.top();
1262  }
1263  if (skipped_max_bottom < part_box.bottom()) {
1264  skipped_max_bottom = part_box.bottom();
1265  }
1266  }
1267  } else {
1268  parts.push_back(part);
1269  }
1270  }
1271 
1272  // For every part in parts, we need verify it is not above skipped_min_top
1273  // when search top, or not below skipped_max_bottom when search bottom. I.e.,
1274  // we will skip a part if it looks like:
1275  // search bottom | search top
1276  // seed: ****************** | part: **********
1277  // skipped: xxx | skipped: xxx
1278  // part: ********** | seed: ***********
1279  for (int i = 0; i < parts.size(); i++) {
1280  const TBOX& part_box(parts[i]->bounding_box());
1281  if ((search_bottom && part_box.top() <= skipped_max_bottom) ||
1282  (!search_bottom && part_box.bottom() >= skipped_min_top)) {
1283  continue;
1284  }
1285  // Add parts[i] into parts_to_merge, and delete it from part_grid_.
1286  parts_to_merge->push_back(parts[i]);
1287  part_grid_->RemoveBBox(parts[i]);
1288  }
1289 }
1290 
1292  const TBOX& part_box) const {
1293  const int kXGapTh = static_cast<int>(roundf(0.25 * resolution_));
1294  const int kYGapTh = static_cast<int>(roundf(0.05 * resolution_));
1295 
1296  // Check geometric feature.
1297  if (part_box.height() > seed_box.height() ||
1298  part_box.width() > seed_box.width()) {
1299  return false;
1300  }
1301 
1302  // Check overlap and distance.
1303  if ((!part_box.major_x_overlap(seed_box) ||
1304  part_box.y_gap(seed_box) > kYGapTh) &&
1305  (!part_box.major_y_overlap(seed_box) ||
1306  part_box.x_gap(seed_box) > kXGapTh)) {
1307  return false;
1308  }
1309 
1310  return true;
1311 }
1312 
1314  ASSERT_HOST(part);
1315  if (part->boxes_count() < kSeedBlobsCountTh) {
1316  // Too few blobs, skip the check.
1317  return true;
1318  }
1319 
1320  // We check the math blobs density and the unclear blobs density.
1321  if (part->SpecialBlobsDensity(BSTT_MATH) +
1322  part->SpecialBlobsDensity(BSTT_DIGIT) > kMathDigitDensityTh1 ||
1324  return true;
1325  }
1326 
1327  return false;
1328 }
1329 
1331  // Iterate over part_grid_, and find all parts that are text type but not
1332  // equation type.
1333  ColPartition *part = NULL;
1334  GenericVector<ColPartition*> text_parts;
1336  gsearch.StartFullSearch();
1337  while ((part = gsearch.NextFullSearch()) != NULL) {
1338  if (part->type() == PT_FLOWING_TEXT || part->type() == PT_HEADING_TEXT) {
1339  text_parts.push_back(part);
1340  }
1341  }
1342  if (text_parts.empty()) {
1343  return;
1344  }
1345 
1346  // Compute the medium height of the text_parts.
1347  text_parts.sort(&SortCPByHeight);
1348  const TBOX& text_box = text_parts[text_parts.size() / 2]->bounding_box();
1349  int med_height = text_box.height();
1350  if (text_parts.size() % 2 == 0 && text_parts.size() > 1) {
1351  const TBOX& text_box =
1352  text_parts[text_parts.size() / 2 - 1]->bounding_box();
1353  med_height = static_cast<int>(roundf(
1354  0.5 * (text_box.height() + med_height)));
1355  }
1356 
1357  // Iterate every text_parts and check if it is a math block satellite.
1358  for (int i = 0; i < text_parts.size(); ++i) {
1359  const TBOX& text_box(text_parts[i]->bounding_box());
1360  if (text_box.height() > med_height) {
1361  continue;
1362  }
1363  GenericVector<ColPartition*> math_blocks;
1364  if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) {
1365  continue;
1366  }
1367 
1368  // Found. merge text_parts[i] with math_blocks.
1369  part_grid_->RemoveBBox(text_parts[i]);
1370  text_parts[i]->set_type(PT_EQUATION);
1371  for (int j = 0; j < math_blocks.size(); ++j) {
1372  part_grid_->RemoveBBox(math_blocks[j]);
1373  text_parts[i]->Absorb(math_blocks[j], NULL);
1374  }
1375  InsertPartAfterAbsorb(text_parts[i]);
1376  }
1377 }
1378 
1380  ColPartition* part, GenericVector<ColPartition*>* math_blocks) {
1381  ASSERT_HOST(part != NULL && math_blocks != NULL);
1382  math_blocks->clear();
1383  const TBOX& part_box(part->bounding_box());
1384  // Find the top/bottom nearest neighbor of part.
1385  ColPartition *neighbors[2];
1386  int y_gaps[2] = {INT_MAX, INT_MAX};
1387  // The horizontal boundary of the neighbors.
1388  int neighbors_left = INT_MAX, neighbors_right = 0;
1389  for (int i = 0; i < 2; ++i) {
1390  neighbors[i] = SearchNNVertical(i != 0, part);
1391  if (neighbors[i]) {
1392  const TBOX& neighbor_box = neighbors[i]->bounding_box();
1393  y_gaps[i] = neighbor_box.y_gap(part_box);
1394  if (neighbor_box.left() < neighbors_left) {
1395  neighbors_left = neighbor_box.left();
1396  }
1397  if (neighbor_box.right() > neighbors_right) {
1398  neighbors_right = neighbor_box.right();
1399  }
1400  }
1401  }
1402  if (neighbors[0] == neighbors[1]) {
1403  // This happens when part is inside neighbor.
1404  neighbors[1] = NULL;
1405  y_gaps[1] = INT_MAX;
1406  }
1407 
1408  // Check if part is within [neighbors_left, neighbors_right].
1409  if (part_box.left() < neighbors_left || part_box.right() > neighbors_right) {
1410  return false;
1411  }
1412 
1413  // Get the index of the near one in neighbors.
1414  int index = y_gaps[0] < y_gaps[1] ? 0 : 1;
1415 
1416  // Check the near one.
1417  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
1418  math_blocks->push_back(neighbors[index]);
1419  } else {
1420  // If the near one failed the check, then we skip checking the far one.
1421  return false;
1422  }
1423 
1424  // Check the far one.
1425  index = 1 - index;
1426  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
1427  math_blocks->push_back(neighbors[index]);
1428  }
1429 
1430  return true;
1431 }
1432 
1434  const bool search_bottom, const ColPartition* part) {
1435  ASSERT_HOST(part);
1436  ColPartition *nearest_neighbor = NULL, *neighbor = NULL;
1437  const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.5));
1438 
1440  search.SetUniqueMode(true);
1441  const TBOX& part_box(part->bounding_box());
1442  int y = search_bottom ? part_box.bottom() : part_box.top();
1443  search.StartVerticalSearch(part_box.left(), part_box.right(), y);
1444  int min_y_gap = INT_MAX;
1445  while ((neighbor = search.NextVerticalSearch(search_bottom)) != NULL) {
1446  if (neighbor == part || !IsTextOrEquationType(neighbor->type())) {
1447  continue;
1448  }
1449  const TBOX& neighbor_box(neighbor->bounding_box());
1450  int y_gap = neighbor_box.y_gap(part_box);
1451  if (y_gap > kYGapTh) { // Out of scope.
1452  break;
1453  }
1454  if (!neighbor_box.major_x_overlap(part_box) ||
1455  (search_bottom && neighbor_box.bottom() > part_box.bottom()) ||
1456  (!search_bottom && neighbor_box.top() < part_box.top())) {
1457  continue;
1458  }
1459  if (y_gap < min_y_gap) {
1460  min_y_gap = y_gap;
1461  nearest_neighbor = neighbor;
1462  }
1463  }
1464 
1465  return nearest_neighbor;
1466 }
1467 
1469  const int y_gap, const ColPartition *neighbor) const {
1470  if (!neighbor) {
1471  return false;
1472  }
1473  const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.1));
1474  return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
1475 }
1476 
1478  STRING* image_name) const {
1479  ASSERT_HOST(image_name && name);
1480  char page[50];
1481  snprintf(page, sizeof(page), "%04d", page_count_);
1482  *image_name = STRING(lang_tesseract_->imagebasename) + page + name + ".tif";
1483 }
1484 
1485 void EquationDetect::PaintSpecialTexts(const STRING& outfile) const {
1486  Pix *pix = NULL, *pixBi = lang_tesseract_->pix_binary();
1487  pix = pixConvertTo32(pixBi);
1489  ColPartition* part = NULL;
1490  gsearch.StartFullSearch();
1491  while ((part = gsearch.NextFullSearch()) != NULL) {
1492  BLOBNBOX_C_IT blob_it(part->boxes());
1493  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1494  RenderSpecialText(pix, blob_it.data());
1495  }
1496  }
1497 
1498  pixWrite(outfile.string(), pix, IFF_TIFF_LZW);
1499  pixDestroy(&pix);
1500 }
1501 
1502 void EquationDetect::PaintColParts(const STRING& outfile) const {
1503  Pix *pix = pixConvertTo32(lang_tesseract_->BestPix());
1505  gsearch.StartFullSearch();
1506  ColPartition* part = NULL;
1507  while ((part = gsearch.NextFullSearch()) != NULL) {
1508  const TBOX& tbox = part->bounding_box();
1509  Box *box = boxCreate(tbox.left(), pixGetHeight(pix) - tbox.top(),
1510  tbox.width(), tbox.height());
1511  if (part->type() == PT_EQUATION) {
1512  pixRenderBoxArb(pix, box, 5, 255, 0, 0);
1513  } else if (part->type() == PT_INLINE_EQUATION) {
1514  pixRenderBoxArb(pix, box, 5, 0, 255, 0);
1515  } else {
1516  pixRenderBoxArb(pix, box, 5, 0, 0, 255);
1517  }
1518  boxDestroy(&box);
1519  }
1520 
1521  pixWrite(outfile.string(), pix, IFF_TIFF_LZW);
1522  pixDestroy(&pix);
1523 }
1524 
1526  ASSERT_HOST(part);
1527  TBOX box(part->bounding_box());
1528  int h = pixGetHeight(lang_tesseract_->BestPix());
1529  tprintf("Printing special blobs density values for ColParition (t=%d,b=%d) ",
1530  h - box.top(), h - box.bottom());
1531  box.print();
1532  tprintf("blobs count = %d, density = ", part->boxes_count());
1533  for (int i = 0; i < BSTT_COUNT; ++i) {
1534  BlobSpecialTextType type = static_cast<BlobSpecialTextType>(i);
1535  tprintf("%d:%f ", i, part->SpecialBlobsDensity(type));
1536  }
1537  tprintf("\n");
1538 }
1539 
1540 }; // namespace tesseract
const int kBlnXHeight
Definition: normalis.h:28
void IdentifyInlinePartsVertical(const bool top_to_bottom, const int textPartsLineSpacing)
Definition: blobs.h:261
bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const
BLOBNBOX_CLIST * boxes()
Definition: colpartition.h:187
int size() const
Definition: genericvector.h:72
bool IsVerticalType() const
Definition: colpartition.h:435
int classify_integer_matcher_multiplier
Definition: classify.h:469
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
float SpecialBlobsDensity(const BlobSpecialTextType type) const
STRING imagebasename
Definition: ccutil.h:68
ColPartitionSet ** best_columns_
BlobRegionType
Definition: blobbox.h:57
#define MAX(x, y)
Definition: ndminx.h:24
ColPartitionGrid * part_grid_
ColPartition * SearchNNVertical(const bool search_bottom, const ColPartition *part)
const float kMathItalicDensityTh
BlobTextFlowType
Definition: blobbox.h:99
BlobSpecialTextType
Definition: blobbox.h:81
int push_back(T object)
bool equationdetect_save_seed_image
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:805
double x_overlap_fraction(const TBOX &box) const
Definition: rect.h:447
bool joined_to_prev() const
Definition: blobbox.h:241
const TBOX & bounding_box() const
Definition: colpartition.h:109
#define tprintf(...)
Definition: tprintf.h:31
#define MIN(x, y)
Definition: ndminx.h:28
UNICHARSET unicharset
Definition: ccutil.h:72
void InsertPartAfterAbsorb(ColPartition *part)
bool IsMathBlockSatellite(ColPartition *part, GenericVector< ColPartition * > *math_blocks)
void PaintSpecialTexts(const STRING &outfile) const
bool IsInline(const bool search_bottom, const int textPartsLineSpacing, ColPartition *part)
int median_width() const
Definition: colpartition.h:142
void RepositionIterator()
Definition: bbgrid.h:895
void GetOutputTiffName(const char *name, STRING *image_name) const
#define BOOL_VAR(name, val, comment)
Definition: params.h:280
BlobRegionType blob_type() const
Definition: colpartition.h:148
void Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor, float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift, float final_yshift, bool inverse, Pix *pix)
Definition: blobs.cpp:413
inT32 length() const
Definition: strngs.cpp:188
bool CheckForSeed2(const GenericVector< int > &indented_texts_left, const float foreground_density_th, ColPartition *part)
bool equationdetect_save_spt_image
const int kLeftIndentAlignmentCountTh
bool CheckSeedFgDensity(const float density_th, ColPartition *part)
bool IsRightIndented(const EquationDetect::IndentType type)
bool bool_binary_search(const T &target) const
int source_resolution() const
ColPartition * SplitAt(int split_x)
void set_special_text_type(BlobSpecialTextType new_type)
Definition: blobbox.h:277
inT16 right() const
Definition: rect.h:75
BBC * NextSideSearch(bool right_to_left)
Definition: bbgrid.h:764
#define MIN_INT32
Definition: host.h:128
void PaintColParts(const STRING &outfile) const
BlobSpecialTextType EstimateTypeForUnichar(const UNICHARSET &unicharset, const UNICHAR_ID id) const
#define ASSERT_HOST(x)
Definition: errcode.h:84
static TBLOB * PolygonalCopy(bool allow_detailed_fx, C_BLOB *src)
Definition: blobs.cpp:344
void StartFullSearch()
Definition: bbgrid.h:668
void IdentifyBlobsToSkip(ColPartition *part)
void SetResolution(const int resolution)
bool CheckSeedBlobsCount(ColPartition *part)
IndentType IsIndented(ColPartition *part)
bool equationdetect_save_bi_image
void ExpandSeedVertical(const bool search_bottom, ColPartition *seed, GenericVector< ColPartition * > *parts_to_merge)
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:489
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
void SplitCPHorLite(ColPartition *part, GenericVector< TBOX > *splitted_boxes)
void SetPartitionType(int resolution, ColPartitionSet *columns)
inT16 fontinfo_id() const
Definition: ratngs.h:85
void SetUniqueMode(bool mode)
Definition: bbgrid.h:254
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:413
void StartRadSearch(int x, int y, int max_radius)
Definition: bbgrid.h:701
inT16 left() const
Definition: rect.h:68
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:185
void delete_data_pointers()
void PrintSpecialBlobsDensity(const ColPartition *part) const
name_table name
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void ExpandSeedHorizontal(const bool search_left, ColPartition *seed, GenericVector< ColPartition * > *parts_to_merge)
int y_gap(const TBOX &box) const
Definition: rect.h:225
BlobTextFlowType flow() const
Definition: colpartition.h:154
int classify_class_pruner_multiplier
Definition: classify.h:465
float ComputeForegroundDensity(const TBOX &tbox)
const int kSeedBlobsCountTh
C_BLOB * cblob() const
Definition: blobbox.h:253
void SetLangTesseract(Tesseract *lang_tesseract)
int binary_search(const T &target) const
bool y_overlap(const TBOX &box) const
Definition: rect.h:418
const int kBlnBaselineOffset
Definition: normalis.h:29
static void RenderSpecialText(Pix *pix, BLOBNBOX *blob)
int UNICHAR_ID
Definition: unichar.h:33
int LabelSpecialText(TO_BLOCK *to_block)
void SplitCPHor(ColPartition *part, GenericVector< ColPartition * > *parts_splitted)
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:274
bool IsTextOrEquationType(PolyBlockType type)
Pix * BestPix() const
bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
void set_blob_type(BlobRegionType t)
Definition: colpartition.h:151
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
inT16 bottom() const
Definition: rect.h:61
const float kMathDigitDensityTh2
bool IsLeftIndented(const EquationDetect::IndentType type)
ColPartition * CopyButDontOwnBlobs()
bool empty() const
Definition: genericvector.h:84
PolyBlockType type() const
Definition: colpartition.h:181
inT16 height() const
Definition: rect.h:104
void StartSideSearch(int x, int ymin, int ymax)
Definition: bbgrid.h:749
const float kMathDigitDensityTh1
void Absorb(ColPartition *other, WidthCallback *cb)
inT16 width() const
Definition: rect.h:111
BBC * NextFullSearch()
Definition: bbgrid.h:678
void set_flow(BlobTextFlowType f)
Definition: colpartition.h:157
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params)
Definition: tessedit.cpp:285
int x_gap(const TBOX &box) const
Definition: rect.h:217
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:429
int count(LIST var_list)
Definition: oldlist.cpp:108
int IntCastRounded(double x)
Definition: helpers.h:172
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
BBC * NextRadSearch()
Definition: bbgrid.h:716
Definition: rect.h:30
bool equationdetect_save_merged_image
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:70
EquationDetect(const char *equ_datapath, const char *equ_language)
int FindEquationParts(ColPartitionGrid *part_grid, ColPartitionSet **best_columns)
int CountAlignment(const GenericVector< int > &sorted_vec, const int val) const
int SpecialBlobsCount(const BlobSpecialTextType type)
const float kUnclearDensityTh
Definition: strngs.h:44
GenericVector< ColPartition * > cp_seeds_
#define NULL
Definition: host.h:144
const TBOX & bounding_box() const
Definition: blobbox.h:215
PolyBlockType
Definition: publictypes.h:41
Pix * pix_binary() const
TBOX bounding_box() const
Definition: blobs.cpp:482
float roundf(float num)
Definition: mathfix.h:35
int boxes_count() const
Definition: colpartition.h:190
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:536
const char * string() const
Definition: strngs.cpp:193
inT16 top() const
Definition: rect.h:54
float certainty() const
Definition: ratngs.h:82
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:772
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
bool CheckSeedDensity(const float math_density_high, const float math_density_low, const ColPartition *part) const
#define fmax
Definition: mathfix.h:33
void SearchByOverlap(ColPartition *seed, GenericVector< ColPartition * > *parts_overlap)
bool CheckSeedNeighborDensity(const ColPartition *part) const
BOOL8 contains(const char c) const
Definition: strngs.cpp:184
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:54
bool ExpandSeed(ColPartition *seed)
BLOBNBOX_LIST blobs
Definition: blobbox.h:768
void set_type(PolyBlockType t)
Definition: colpartition.h:184
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:791