tesseract  4.00.00dev
tesseract::EquationDetect Class Reference

#include <equationdetect.h>

Inheritance diagram for tesseract::EquationDetect:
tesseract::EquationDetectBase

Public Types

enum  IndentType {
  NO_INDENT, LEFT_INDENT, RIGHT_INDENT, BOTH_INDENT,
  INDENT_TYPE_COUNT
}
 

Public Member Functions

 EquationDetect (const char *equ_datapath, const char *equ_language)
 
 ~EquationDetect ()
 
void SetLangTesseract (Tesseract *lang_tesseract)
 
int LabelSpecialText (TO_BLOCK *to_block)
 
int FindEquationParts (ColPartitionGrid *part_grid, ColPartitionSet **best_columns)
 
void SetResolution (const int resolution)
 
- Public Member Functions inherited from tesseract::EquationDetectBase
 EquationDetectBase ()
 
virtual ~EquationDetectBase ()
 

Protected Member Functions

void IdentifySpecialText (BLOBNBOX *blob, const int height_th)
 
BlobSpecialTextType EstimateTypeForUnichar (const UNICHARSET &unicharset, const UNICHAR_ID id) const
 
void IdentifySpecialText ()
 
void IdentifyBlobsToSkip (ColPartition *part)
 
void MergePartsByLocation ()
 
void SearchByOverlap (ColPartition *seed, GenericVector< ColPartition *> *parts_overlap)
 
void InsertPartAfterAbsorb (ColPartition *part)
 
void IdentifySeedParts ()
 
bool CheckSeedBlobsCount (ColPartition *part)
 
float ComputeForegroundDensity (const TBOX &tbox)
 
bool CheckForSeed2 (const GenericVector< int > &indented_texts_left, const float foreground_density_th, ColPartition *part)
 
int CountAlignment (const GenericVector< int > &sorted_vec, const int val) const
 
bool CheckSeedFgDensity (const float density_th, ColPartition *part)
 
void SplitCPHorLite (ColPartition *part, GenericVector< TBOX > *splitted_boxes)
 
void SplitCPHor (ColPartition *part, GenericVector< ColPartition *> *parts_splitted)
 
bool CheckSeedDensity (const float math_density_high, const float math_density_low, const ColPartition *part) const
 
IndentType IsIndented (ColPartition *part)
 
void IdentifyInlineParts ()
 
void ComputeCPsSuperBBox ()
 
void IdentifyInlinePartsHorizontal ()
 
int EstimateTextPartLineSpacing ()
 
void IdentifyInlinePartsVertical (const bool top_to_bottom, const int textPartsLineSpacing)
 
bool IsInline (const bool search_bottom, const int textPartsLineSpacing, ColPartition *part)
 
bool ExpandSeed (ColPartition *seed)
 
void ExpandSeedHorizontal (const bool search_left, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)
 
void ExpandSeedVertical (const bool search_bottom, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)
 
bool IsNearSmallNeighbor (const TBOX &seed_box, const TBOX &part_box) const
 
bool CheckSeedNeighborDensity (const ColPartition *part) const
 
void ProcessMathBlockSatelliteParts ()
 
bool IsMathBlockSatellite (ColPartition *part, GenericVector< ColPartition *> *math_blocks)
 
ColPartition * SearchNNVertical (const bool search_bottom, const ColPartition *part)
 
bool IsNearMathNeighbor (const int y_gap, const ColPartition *neighbor) const
 
void GetOutputTiffName (const char *name, STRING *image_name) const
 
void PaintColParts (const STRING &outfile) const
 
void PaintSpecialTexts (const STRING &outfile) const
 
void PrintSpecialBlobsDensity (const ColPartition *part) const
 

Protected Attributes

Tesseract equ_tesseract_
 
Tesseract * lang_tesseract_
 
ColPartitionGrid * part_grid_
 
ColPartitionSet ** best_columns_
 
TBOX * cps_super_bbox_
 
GenericVector< ColPartition * > cp_seeds_
 
int resolution_
 
int page_count_
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::EquationDetectBase
static void RenderSpecialText (Pix *pix, BLOBNBOX *blob)
 

Detailed Description

Definition at line 43 of file equationdetect.h.

Member Enumeration Documentation

◆ IndentType

Constructor & Destructor Documentation

◆ EquationDetect()

tesseract::EquationDetect::EquationDetect ( const char *  equ_datapath,
const char *  equ_language 
)

Definition at line 100 of file equationdetect.cpp.

101  {
102  const char* default_name = "equ";
103  if (equ_name == NULL) {
104  equ_name = default_name;
105  }
106  lang_tesseract_ = NULL;
107  resolution_ = 0;
108  page_count_ = 0;
109 
110  if (equ_tesseract_.init_tesseract(equ_datapath, equ_name,
112  tprintf("Warning: equation region detection requested,"
113  " but %s failed to load from %s\n", equ_name, equ_datapath);
114  }
115 
116  cps_super_bbox_ = NULL;
117 }
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:285
#define tprintf(...)
Definition: tprintf.h:31

◆ ~EquationDetect()

tesseract::EquationDetect::~EquationDetect ( )

Definition at line 119 of file equationdetect.cpp.

119 { delete (cps_super_bbox_); }

Member Function Documentation

◆ CheckForSeed2()

bool tesseract::EquationDetect::CheckForSeed2 ( const GenericVector< int > &  indented_texts_left,
const float  foreground_density_th,
ColPartition *  part 
)
protected

Definition at line 736 of file equationdetect.cpp.

739  {
740  ASSERT_HOST(part);
741  const TBOX& box = part->bounding_box();
742 
743  // Check if it is aligned with any indented_texts_left.
744  if (!indented_texts_left.empty() &&
745  CountAlignment(indented_texts_left, box.left()) >=
747  return false;
748  }
749 
750  // Check the foreground density.
751  if (ComputeForegroundDensity(box) > foreground_density_th) {
752  return false;
753  }
754 
755  return true;
756 }
bool empty() const
Definition: genericvector.h:91
float ComputeForegroundDensity(const TBOX &tbox)
int CountAlignment(const GenericVector< int > &sorted_vec, const int val) const
Definition: rect.h:30
inT16 left() const
Definition: rect.h:68
#define ASSERT_HOST(x)
Definition: errcode.h:84
const int kLeftIndentAlignmentCountTh

◆ CheckSeedBlobsCount()

bool tesseract::EquationDetect::CheckSeedBlobsCount ( ColPartition *  part)
protected

Definition at line 983 of file equationdetect.cpp.

983  {
984  if (!part) {
985  return false;
986  }
987  const int kSeedMathBlobsCount = 2;
988  const int kSeedMathDigitBlobsCount = 5;
989 
990  int blobs = part->boxes_count(),
991  math_blobs = part->SpecialBlobsCount(BSTT_MATH),
992  digit_blobs = part->SpecialBlobsCount(BSTT_DIGIT);
993  if (blobs < kSeedBlobsCountTh || math_blobs <= kSeedMathBlobsCount ||
994  math_blobs + digit_blobs <= kSeedMathDigitBlobsCount) {
995  return false;
996  }
997 
998  return true;
999 }
const int kSeedBlobsCountTh

◆ CheckSeedDensity()

bool tesseract::EquationDetect::CheckSeedDensity ( const float  math_density_high,
const float  math_density_low,
const ColPartition *  part 
) const
protected

Definition at line 1001 of file equationdetect.cpp.

1004  {
1005  ASSERT_HOST(part);
1006  float math_digit_density = part->SpecialBlobsDensity(BSTT_MATH)
1007  + part->SpecialBlobsDensity(BSTT_DIGIT);
1008  float italic_density = part->SpecialBlobsDensity(BSTT_ITALIC);
1009  if (math_digit_density > math_density_high) {
1010  return true;
1011  }
1012  if (math_digit_density + italic_density > kMathItalicDensityTh &&
1013  math_digit_density > math_density_low) {
1014  return true;
1015  }
1016 
1017  return false;
1018 }
const float kMathItalicDensityTh
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ CheckSeedFgDensity()

bool tesseract::EquationDetect::CheckSeedFgDensity ( const float  density_th,
ColPartition *  part 
)
protected

Definition at line 624 of file equationdetect.cpp.

625  {
626  ASSERT_HOST(part);
627 
628  // Split part horizontally, and check for each sub part.
629  GenericVector<TBOX> sub_boxes;
630  SplitCPHorLite(part, &sub_boxes);
631  float parts_passed = 0.0;
632  for (int i = 0; i < sub_boxes.size(); ++i) {
633  float density = ComputeForegroundDensity(sub_boxes[i]);
634  if (density < density_th) {
635  parts_passed++;
636  }
637  }
638 
639  // If most sub parts passed, then we return true.
640  const float kSeedPartRatioTh = 0.3;
641  bool retval = (parts_passed / sub_boxes.size() >= kSeedPartRatioTh);
642 
643  return retval;
644 }
float ComputeForegroundDensity(const TBOX &tbox)
void SplitCPHorLite(ColPartition *part, GenericVector< TBOX > *splitted_boxes)
int size() const
Definition: genericvector.h:72
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ CheckSeedNeighborDensity()

bool tesseract::EquationDetect::CheckSeedNeighborDensity ( const ColPartition *  part) const
protected

Definition at line 1292 of file equationdetect.cpp.

1292  {
1293  ASSERT_HOST(part);
1294  if (part->boxes_count() < kSeedBlobsCountTh) {
1295  // Too few blobs, skip the check.
1296  return true;
1297  }
1298 
1299  // We check the math blobs density and the unclear blobs density.
1300  if (part->SpecialBlobsDensity(BSTT_MATH) +
1301  part->SpecialBlobsDensity(BSTT_DIGIT) > kMathDigitDensityTh1 ||
1302  part->SpecialBlobsDensity(BSTT_UNCLEAR) > kUnclearDensityTh) {
1303  return true;
1304  }
1305 
1306  return false;
1307 }
const float kMathDigitDensityTh1
#define ASSERT_HOST(x)
Definition: errcode.h:84
const float kUnclearDensityTh
const int kSeedBlobsCountTh

◆ ComputeCPsSuperBBox()

void tesseract::EquationDetect::ComputeCPsSuperBBox ( )
protected

Definition at line 789 of file equationdetect.cpp.

789  {
791  ColPartition *part = NULL;
792  gsearch.StartFullSearch();
793  if (cps_super_bbox_) {
794  delete cps_super_bbox_;
795  }
796  cps_super_bbox_ = new TBOX();
797  while ((part = gsearch.NextFullSearch()) != NULL) {
798  (*cps_super_bbox_) += part->bounding_box();
799  }
800 }
Definition: rect.h:30
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
ColPartitionGrid * part_grid_

◆ ComputeForegroundDensity()

float tesseract::EquationDetect::ComputeForegroundDensity ( const TBOX &  tbox)
protected

Definition at line 610 of file equationdetect.cpp.

610  {
611  Pix *pix_bi = lang_tesseract_->pix_binary();
612  int pix_height = pixGetHeight(pix_bi);
613  Box* box = boxCreate(tbox.left(), pix_height - tbox.top(),
614  tbox.width(), tbox.height());
615  Pix *pix_sub = pixClipRectangle(pix_bi, box, NULL);
616  l_float32 fract;
617  pixForegroundFraction(pix_sub, &fract);
618  pixDestroy(&pix_sub);
619  boxDestroy(&box);
620 
621  return fract;
622 }
Pix * pix_binary() const
inT16 top() const
Definition: rect.h:54
inT16 height() const
Definition: rect.h:104
inT16 left() const
Definition: rect.h:68
inT16 width() const
Definition: rect.h:111

◆ CountAlignment()

int tesseract::EquationDetect::CountAlignment ( const GenericVector< int > &  sorted_vec,
const int  val 
) const
protected

Definition at line 758 of file equationdetect.cpp.

759  {
760  if (sorted_vec.empty()) {
761  return 0;
762  }
763  const int kDistTh = static_cast<int>(roundf(0.03 * resolution_));
764  int pos = sorted_vec.binary_search(val), count = 0;
765 
766  // Search left side.
767  int index = pos;
768  while (index >= 0 && abs(val - sorted_vec[index--]) < kDistTh) {
769  count++;
770  }
771 
772  // Search right side.
773  index = pos + 1;
774  while (index < sorted_vec.size() && sorted_vec[index++] - val < kDistTh) {
775  count++;
776  }
777 
778  return count;
779 }
bool empty() const
Definition: genericvector.h:91
int size() const
Definition: genericvector.h:72
int count(LIST var_list)
Definition: oldlist.cpp:103
int binary_search(const T &target) const

◆ EstimateTextPartLineSpacing()

int tesseract::EquationDetect::EstimateTextPartLineSpacing ( )
protected

Definition at line 867 of file equationdetect.cpp.

867  {
869 
870  // Get the y gap between text partitions;
871  ColPartition *current = NULL, *prev = NULL;
872  gsearch.StartFullSearch();
873  GenericVector<int> ygaps;
874  while ((current = gsearch.NextFullSearch()) != NULL) {
875  if (!PTIsTextType(current->type())) {
876  continue;
877  }
878  if (prev != NULL) {
879  const TBOX &current_box = current->bounding_box();
880  const TBOX &prev_box = prev->bounding_box();
881  // prev and current should be x major overlap and non y overlap.
882  if (current_box.major_x_overlap(prev_box) &&
883  !current_box.y_overlap(prev_box)) {
884  int gap = current_box.y_gap(prev_box);
885  if (gap < MIN(current_box.height(), prev_box.height())) {
886  // The gap should be smaller than the height of the bounding boxes.
887  ygaps.push_back(gap);
888  }
889  }
890  }
891  prev = current;
892  }
893 
894  if (ygaps.size() < 8) { // We do not have enough data.
895  return -1;
896  }
897 
898  // Compute the line spacing from ygaps: use the mean of the first half.
899  ygaps.sort();
900  int spacing = 0, count;
901  for (count = 0; count < ygaps.size() / 2; count++) {
902  spacing += ygaps[count];
903  }
904  return spacing / count;
905 }
#define MIN(x, y)
Definition: ndminx.h:28
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:82
bool y_overlap(const TBOX &box) const
Definition: rect.h:418
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
int size() const
Definition: genericvector.h:72
int y_gap(const TBOX &box) const
Definition: rect.h:225
int count(LIST var_list)
Definition: oldlist.cpp:103
int push_back(T object)
inT16 height() const
Definition: rect.h:104
Definition: rect.h:30
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
ColPartitionGrid * part_grid_

◆ EstimateTypeForUnichar()

BlobSpecialTextType tesseract::EquationDetect::EstimateTypeForUnichar ( const UNICHARSET &  unicharset,
const UNICHAR_ID  id 
) const
protected

Definition at line 223 of file equationdetect.cpp.

224  {
225  STRING s = unicharset.id_to_unichar(id);
226  if (unicharset.get_isalpha(id)) {
227  return BSTT_NONE;
228  }
229 
230  if (unicharset.get_ispunctuation(id)) {
231  // Exclude some special texts that are likely to be confused as math symbol.
232  static GenericVector<UNICHAR_ID> ids_to_exclude;
233  if (ids_to_exclude.empty()) {
234  static const STRING kCharsToEx[] = {"'", "`", "\"", "\\", ",", ".",
235  "〈", "〉", "《", "》", "」", "「", ""};
236  int i = 0;
237  while (kCharsToEx[i] != "") {
238  ids_to_exclude.push_back(
239  unicharset.unichar_to_id(kCharsToEx[i++].string()));
240  }
241  ids_to_exclude.sort();
242  }
243  return ids_to_exclude.bool_binary_search(id) ? BSTT_NONE : BSTT_MATH;
244  }
245 
246  // Check if it is digit. In addition to the isdigit attribute, we also check
247  // if this character belongs to those likely to be confused with a digit.
248  static const STRING kDigitsChars = "|";
249  if (unicharset.get_isdigit(id) ||
250  (s.length() == 1 && kDigitsChars.contains(s[0]))) {
251  return BSTT_DIGIT;
252  } else {
253  return BSTT_MATH;
254  }
255 }
bool empty() const
Definition: genericvector.h:91
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
const char * string() const
Definition: strngs.cpp:198
int push_back(T object)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
Definition: strngs.h:45
bool bool_binary_search(const T &target) const
BOOL8 contains(const char c) const
Definition: strngs.cpp:189
inT32 length() const
Definition: strngs.cpp:193
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ ExpandSeed()

bool tesseract::EquationDetect::ExpandSeed ( ColPartition *  seed)
protected

Definition at line 1084 of file equationdetect.cpp.

1084  {
1085  if (seed == NULL || // This seed has been absorbed by other seeds.
1086  seed->IsVerticalType()) { // We skip vertical type right now.
1087  return false;
1088  }
1089 
1090  // Expand in four directions.
1091  GenericVector<ColPartition*> parts_to_merge;
1092  ExpandSeedHorizontal(true, seed, &parts_to_merge);
1093  ExpandSeedHorizontal(false, seed, &parts_to_merge);
1094  ExpandSeedVertical(true, seed, &parts_to_merge);
1095  ExpandSeedVertical(false, seed, &parts_to_merge);
1096  SearchByOverlap(seed, &parts_to_merge);
1097 
1098  if (parts_to_merge.empty()) { // We don't find any partition to merge.
1099  return false;
1100  }
1101 
1102  // Merge all partitions in parts_to_merge with seed. We first remove seed
1103  // from part_grid_ as its bounding box is going to expand. Then we add it
1104  // back after it absorbs all parts_to_merge partitions.
1105  part_grid_->RemoveBBox(seed);
1106  for (int i = 0; i < parts_to_merge.size(); ++i) {
1107  ColPartition* part = parts_to_merge[i];
1108  if (part->type() == PT_EQUATION) {
1109  // If part is in cp_seeds_, then we mark it as NULL so that we won't
1110  // process it again.
1111  for (int j = 0; j < cp_seeds_.size(); ++j) {
1112  if (part == cp_seeds_[j]) {
1113  cp_seeds_[j] = NULL;
1114  break;
1115  }
1116  }
1117  }
1118 
1119  // part has already been removed from part_grid_ in function
1120  // ExpandSeedHorizontal/ExpandSeedVertical.
1121  seed->Absorb(part, NULL);
1122  }
1123 
1124  return true;
1125 }
bool empty() const
Definition: genericvector.h:91
void ExpandSeedVertical(const bool search_bottom, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:537
void ExpandSeedHorizontal(const bool search_left, ColPartition *seed, GenericVector< ColPartition *> *parts_to_merge)
void SearchByOverlap(ColPartition *seed, GenericVector< ColPartition *> *parts_overlap)
int size() const
Definition: genericvector.h:72
GenericVector< ColPartition * > cp_seeds_
ColPartitionGrid * part_grid_

◆ ExpandSeedHorizontal()

void tesseract::EquationDetect::ExpandSeedHorizontal ( const bool  search_left,
ColPartition *  seed,
GenericVector< ColPartition *> *  parts_to_merge 
)
protected

Definition at line 1127 of file equationdetect.cpp.

1130  {
1131  ASSERT_HOST(seed != NULL && parts_to_merge != NULL);
1132  const float kYOverlapTh = 0.6;
1133  const int kXGapTh = static_cast<int>(roundf(0.2 * resolution_));
1134 
1136  const TBOX& seed_box(seed->bounding_box());
1137  int x = search_left ? seed_box.left() : seed_box.right();
1138  search.StartSideSearch(x, seed_box.bottom(), seed_box.top());
1139  search.SetUniqueMode(true);
1140 
1141  // Search iteratively.
1142  ColPartition *part = NULL;
1143  while ((part = search.NextSideSearch(search_left)) != NULL) {
1144  if (part == seed) {
1145  continue;
1146  }
1147  const TBOX& part_box(part->bounding_box());
1148  if (part_box.x_gap(seed_box) > kXGapTh) { // Out of scope.
1149  break;
1150  }
1151 
1152  // Check part location.
1153  if ((part_box.left() >= seed_box.left() && search_left) ||
1154  (part_box.right() <= seed_box.right() && !search_left)) {
1155  continue;
1156  }
1157 
1158  if (part->type() != PT_EQUATION) { // Non-equation type.
1159  // Skip PT_INLINE_EQUATION and non text type.
1160  if (part->type() == PT_INLINE_EQUATION ||
1161  (!IsTextOrEquationType(part->type()) &&
1162  part->blob_type() != BRT_HLINE)) {
1163  continue;
1164  }
1165  // For other types, it should be the near small neighbor of seed.
1166  if (!IsNearSmallNeighbor(seed_box, part_box) ||
1167  !CheckSeedNeighborDensity(part)) {
1168  continue;
1169  }
1170  } else { // Equation type, check the y overlap.
1171  if (part_box.y_overlap_fraction(seed_box) < kYOverlapTh &&
1172  seed_box.y_overlap_fraction(part_box) < kYOverlapTh) {
1173  continue;
1174  }
1175  }
1176 
1177  // Passed the check, delete it from search and add into parts_to_merge.
1178  search.RemoveBBox();
1179  parts_to_merge->push_back(part);
1180  }
1181 }
bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const
bool IsTextOrEquationType(PolyBlockType type)
int push_back(T object)
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:371
bool CheckSeedNeighborDensity(const ColPartition *part) const
Definition: rect.h:30
inT16 left() const
Definition: rect.h:68
#define ASSERT_HOST(x)
Definition: errcode.h:84
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
ColPartitionGrid * part_grid_

◆ ExpandSeedVertical()

void tesseract::EquationDetect::ExpandSeedVertical ( const bool  search_bottom,
ColPartition *  seed,
GenericVector< ColPartition *> *  parts_to_merge 
)
protected

Definition at line 1183 of file equationdetect.cpp.

1186  {
1187  ASSERT_HOST(seed != NULL && parts_to_merge != NULL &&
1188  cps_super_bbox_ != NULL);
1189  const float kXOverlapTh = 0.4;
1190  const int kYGapTh = static_cast<int>(roundf(0.2 * resolution_));
1191 
1193  const TBOX& seed_box(seed->bounding_box());
1194  int y = search_bottom ? seed_box.bottom() : seed_box.top();
1195  search.StartVerticalSearch(
1197  search.SetUniqueMode(true);
1198 
1199  // Search iteratively.
1200  ColPartition *part = NULL;
1202  int skipped_min_top = INT_MAX, skipped_max_bottom = -1;
1203  while ((part = search.NextVerticalSearch(search_bottom)) != NULL) {
1204  if (part == seed) {
1205  continue;
1206  }
1207  const TBOX& part_box(part->bounding_box());
1208 
1209  if (part_box.y_gap(seed_box) > kYGapTh) { // Out of scope.
1210  break;
1211  }
1212 
1213  // Check part location.
1214  if ((part_box.bottom() >= seed_box.bottom() && search_bottom) ||
1215  (part_box.top() <= seed_box.top() && !search_bottom)) {
1216  continue;
1217  }
1218 
1219  bool skip_part = false;
1220  if (part->type() != PT_EQUATION) { // Non-equation type.
1221  // Skip PT_INLINE_EQUATION and non text type.
1222  if (part->type() == PT_INLINE_EQUATION ||
1223  (!IsTextOrEquationType(part->type()) &&
1224  part->blob_type() != BRT_HLINE)) {
1225  skip_part = true;
1226  } else if (!IsNearSmallNeighbor(seed_box, part_box) ||
1227  !CheckSeedNeighborDensity(part)) {
1228  // For other types, it should be the near small neighbor of seed.
1229  skip_part = true;
1230  }
1231  } else { // Equation type, check the x overlap.
1232  if (part_box.x_overlap_fraction(seed_box) < kXOverlapTh &&
1233  seed_box.x_overlap_fraction(part_box) < kXOverlapTh) {
1234  skip_part = true;
1235  }
1236  }
1237  if (skip_part) {
1238  if (part->type() != PT_EQUATION) {
1239  if (skipped_min_top > part_box.top()) {
1240  skipped_min_top = part_box.top();
1241  }
1242  if (skipped_max_bottom < part_box.bottom()) {
1243  skipped_max_bottom = part_box.bottom();
1244  }
1245  }
1246  } else {
1247  parts.push_back(part);
1248  }
1249  }
1250 
1251  // For every part in parts, we need verify it is not above skipped_min_top
1252  // when search top, or not below skipped_max_bottom when search bottom. I.e.,
1253  // we will skip a part if it looks like:
1254  // search bottom | search top
1255  // seed: ****************** | part: **********
1256  // skipped: xxx | skipped: xxx
1257  // part: ********** | seed: ***********
1258  for (int i = 0; i < parts.size(); i++) {
1259  const TBOX& part_box(parts[i]->bounding_box());
1260  if ((search_bottom && part_box.top() <= skipped_max_bottom) ||
1261  (!search_bottom && part_box.bottom() >= skipped_min_top)) {
1262  continue;
1263  }
1264  // Add parts[i] into parts_to_merge, and delete it from part_grid_.
1265  parts_to_merge->push_back(parts[i]);
1266  part_grid_->RemoveBBox(parts[i]);
1267  }
1268 }
bool IsNearSmallNeighbor(const TBOX &seed_box, const TBOX &part_box) const
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:537
bool IsTextOrEquationType(PolyBlockType type)
int size() const
Definition: genericvector.h:72
int push_back(T object)
inT16 top() const
Definition: rect.h:54
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:371
bool CheckSeedNeighborDensity(const ColPartition *part) const
inT16 bottom() const
Definition: rect.h:61
Definition: rect.h:30
inT16 left() const
Definition: rect.h:68
#define ASSERT_HOST(x)
Definition: errcode.h:84
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
inT16 right() const
Definition: rect.h:75
ColPartitionGrid * part_grid_

◆ FindEquationParts()

int tesseract::EquationDetect::FindEquationParts ( ColPartitionGrid *  part_grid,
ColPartitionSet **  best_columns 
)
virtual

Implements tesseract::EquationDetectBase.

Definition at line 357 of file equationdetect.cpp.

358  {
359  if (!lang_tesseract_) {
360  tprintf("Warning: lang_tesseract_ is NULL!\n");
361  return -1;
362  }
363  if (!part_grid || !best_columns) {
364  tprintf("part_grid/best_columns is NULL!!\n");
365  return -1;
366  }
367  cp_seeds_.clear();
368  part_grid_ = part_grid;
369  best_columns_ = best_columns;
371  STRING outfile;
372  page_count_++;
373 
375  GetOutputTiffName("_bi", &outfile);
376  pixWrite(outfile.string(), lang_tesseract_->pix_binary(), IFF_TIFF_G4);
377  }
378 
379  // Pass 0: Compute special text type for blobs.
381 
382  // Pass 1: Merge parts by overlap.
384 
385  // Pass 2: compute the math blob density and find the seed partition.
387  // We still need separate seed into block seed and inline seed partition.
389 
391  GetOutputTiffName("_seed", &outfile);
392  PaintColParts(outfile);
393  }
394 
395  // Pass 3: expand block equation seeds.
396  while (!cp_seeds_.empty()) {
397  GenericVector<ColPartition*> seeds_expanded;
398  for (int i = 0; i < cp_seeds_.size(); ++i) {
399  if (ExpandSeed(cp_seeds_[i])) {
400  // If this seed is expanded, then we add it into seeds_expanded. Note
401  // this seed has been removed from part_grid_ if it is expanded.
402  seeds_expanded.push_back(cp_seeds_[i]);
403  }
404  }
405  // Add seeds_expanded back into part_grid_ and reset cp_seeds_.
406  for (int i = 0; i < seeds_expanded.size(); ++i) {
407  InsertPartAfterAbsorb(seeds_expanded[i]);
408  }
409  cp_seeds_ = seeds_expanded;
410  }
411 
412  // Pass 4: find math block satellite text partitions and merge them.
414 
415  if (equationdetect_save_merged_image) { // For debug.
416  GetOutputTiffName("_merged", &outfile);
417  PaintColParts(outfile);
418  }
419 
420  return 0;
421 }
Pix * pix_binary() const
bool equationdetect_save_bi_image
bool equationdetect_save_merged_image
void PaintColParts(const STRING &outfile) const
int size() const
Definition: genericvector.h:72
GenericVector< ColPartition * > cp_seeds_
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
int push_back(T object)
bool ExpandSeed(ColPartition *seed)
int source_resolution() const
Definition: strngs.h:45
ColPartitionSet ** best_columns_
bool equationdetect_save_seed_image
void InsertPartAfterAbsorb(ColPartition *part)
ColPartitionGrid * part_grid_
void GetOutputTiffName(const char *name, STRING *image_name) const

◆ GetOutputTiffName()

void tesseract::EquationDetect::GetOutputTiffName ( const char *  name,
STRING *  image_name 
) const
protected

Definition at line 1456 of file equationdetect.cpp.

1457  {
1458  ASSERT_HOST(image_name && name);
1459  char page[50];
1460  snprintf(page, sizeof(page), "%04d", page_count_);
1461  *image_name = STRING(lang_tesseract_->imagebasename) + page + name + ".tif";
1462 }
Definition: strngs.h:45
#define ASSERT_HOST(x)
Definition: errcode.h:84
STRING imagebasename
Definition: ccutil.h:65

◆ IdentifyBlobsToSkip()

void tesseract::EquationDetect::IdentifyBlobsToSkip ( ColPartition *  part)
protected

Definition at line 309 of file equationdetect.cpp.

309  {
310  ASSERT_HOST(part);
311  BLOBNBOX_C_IT blob_it(part->boxes());
312 
313  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
314  // At this moment, no blob should have been joined.
315  ASSERT_HOST(!blob_it.data()->joined_to_prev());
316  }
317  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
318  BLOBNBOX* blob = blob_it.data();
319  if (blob->joined_to_prev() || blob->special_text_type() == BSTT_SKIP) {
320  continue;
321  }
322  TBOX blob_box = blob->bounding_box();
323 
324  // Search if any blob can be merged into blob. If found, then we mark all
325  // these blobs as BSTT_SKIP.
326  BLOBNBOX_C_IT blob_it2 = blob_it;
327  bool found = false;
328  while (!blob_it2.at_last()) {
329  BLOBNBOX* nextblob = blob_it2.forward();
330  const TBOX& nextblob_box = nextblob->bounding_box();
331  if (nextblob_box.left() >= blob_box.right()) {
332  break;
333  }
334  const float kWidthR = 0.4, kHeightR = 0.3;
335  bool xoverlap = blob_box.major_x_overlap(nextblob_box),
336  yoverlap = blob_box.y_overlap(nextblob_box);
337  float widthR = static_cast<float>(
338  MIN(nextblob_box.width(), blob_box.width())) /
339  MAX(nextblob_box.width(), blob_box.width());
340  float heightR = static_cast<float>(
341  MIN(nextblob_box.height(), blob_box.height())) /
342  MAX(nextblob_box.height(), blob_box.height());
343 
344  if (xoverlap && yoverlap && widthR > kWidthR && heightR > kHeightR) {
345  // Found one, set nextblob type and recompute blob_box.
346  found = true;
347  nextblob->set_special_text_type(BSTT_SKIP);
348  blob_box += nextblob_box;
349  }
350  }
351  if (found) {
353  }
354  }
355 }
#define MIN(x, y)
Definition: ndminx.h:28
bool y_overlap(const TBOX &box) const
Definition: rect.h:418
void set_special_text_type(BlobSpecialTextType new_type)
Definition: blobbox.h:277
#define MAX(x, y)
Definition: ndminx.h:24
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:274
const TBOX & bounding_box() const
Definition: blobbox.h:215
inT16 height() const
Definition: rect.h:104
Definition: rect.h:30
bool joined_to_prev() const
Definition: blobbox.h:241
inT16 left() const
Definition: rect.h:68
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 right() const
Definition: rect.h:75
inT16 width() const
Definition: rect.h:111

◆ IdentifyInlineParts()

void tesseract::EquationDetect::IdentifyInlineParts ( )
protected

Definition at line 781 of file equationdetect.cpp.

781  {
784  int textparts_linespacing = EstimateTextPartLineSpacing();
785  IdentifyInlinePartsVertical(true, textparts_linespacing);
786  IdentifyInlinePartsVertical(false, textparts_linespacing);
787 }
void IdentifyInlinePartsVertical(const bool top_to_bottom, const int textPartsLineSpacing)

◆ IdentifyInlinePartsHorizontal()

void tesseract::EquationDetect::IdentifyInlinePartsHorizontal ( )
protected

Definition at line 802 of file equationdetect.cpp.

802  {
805  const int kMarginDiffTh = IntCastRounded(
807  const int kGapTh = static_cast<int>(roundf(
810  search.SetUniqueMode(true);
811  // The center x coordinate of the cps_super_bbox_.
812  int cps_cx = cps_super_bbox_->left() + cps_super_bbox_->width() / 2;
813  for (int i = 0; i < cp_seeds_.size(); ++i) {
814  ColPartition* part = cp_seeds_[i];
815  const TBOX& part_box(part->bounding_box());
816  int left_margin = part_box.left() - cps_super_bbox_->left(),
817  right_margin = cps_super_bbox_->right() - part_box.right();
818  bool right_to_left;
819  if (left_margin + kMarginDiffTh < right_margin &&
820  left_margin < kMarginDiffTh) {
821  // part is left aligned, so we search if it has any right neighbor.
822  search.StartSideSearch(
823  part_box.right(), part_box.top(), part_box.bottom());
824  right_to_left = false;
825  } else if (left_margin > cps_cx) {
826  // part locates on the right half on image, so search if it has any left
827  // neighbor.
828  search.StartSideSearch(
829  part_box.left(), part_box.top(), part_box.bottom());
830  right_to_left = true;
831  } else { // part is not an inline equation.
832  new_seeds.push_back(part);
833  continue;
834  }
835  ColPartition* neighbor = NULL;
836  bool side_neighbor_found = false;
837  while ((neighbor = search.NextSideSearch(right_to_left)) != NULL) {
838  const TBOX& neighbor_box(neighbor->bounding_box());
839  if (!IsTextOrEquationType(neighbor->type()) ||
840  part_box.x_gap(neighbor_box) > kGapTh ||
841  !part_box.major_y_overlap(neighbor_box) ||
842  part_box.major_x_overlap(neighbor_box)) {
843  continue;
844  }
845  // We have found one. Set the side_neighbor_found flag.
846  side_neighbor_found = true;
847  break;
848  }
849  if (!side_neighbor_found) { // Mark part as PT_INLINE_EQUATION.
850  part->set_type(PT_INLINE_EQUATION);
851  } else {
852  // Check the geometric feature of neighbor.
853  const TBOX& neighbor_box(neighbor->bounding_box());
854  if (neighbor_box.width() > part_box.width() &&
855  neighbor->type() != PT_EQUATION) { // Mark as PT_INLINE_EQUATION.
856  part->set_type(PT_INLINE_EQUATION);
857  } else { // part is not an inline equation type.
858  new_seeds.push_back(part);
859  }
860  }
861  }
862 
863  // Reset the cp_seeds_ using the new_seeds.
864  cp_seeds_ = new_seeds;
865 }
bool IsTextOrEquationType(PolyBlockType type)
GenericVector< ColPartition * > cp_seeds_
int push_back(T object)
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:371
int source_resolution() const
Definition: rect.h:30
inT16 left() const
Definition: rect.h:68
#define ASSERT_HOST(x)
Definition: errcode.h:84
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
inT16 right() const
Definition: rect.h:75
int IntCastRounded(double x)
Definition: helpers.h:179
ColPartitionGrid * part_grid_
inT16 width() const
Definition: rect.h:111

◆ IdentifyInlinePartsVertical()

void tesseract::EquationDetect::IdentifyInlinePartsVertical ( const bool  top_to_bottom,
const int  textPartsLineSpacing 
)
protected

Definition at line 907 of file equationdetect.cpp.

908  {
909  if (cp_seeds_.empty()) {
910  return;
911  }
912 
913  // Sort cp_seeds_.
914  if (top_to_bottom) { // From top to bottom.
915  cp_seeds_.sort(&SortCPByTopReverse);
916  } else { // From bottom to top.
917  cp_seeds_.sort(&SortCPByBottom);
918  }
919 
920  GenericVector<ColPartition*> new_seeds;
921  for (int i = 0; i < cp_seeds_.size(); ++i) {
922  ColPartition* part = cp_seeds_[i];
923  // If we sort cp_seeds_ from top to bottom, then for each cp_seeds_, we look
924  // for its top neighbors, so that if two/more inline regions are connected
925  // to each other, then we will identify the top one, and then use it to
926  // identify the bottom one.
927  if (IsInline(!top_to_bottom, textparts_linespacing, part)) {
928  part->set_type(PT_INLINE_EQUATION);
929  } else {
930  new_seeds.push_back(part);
931  }
932  }
933  cp_seeds_ = new_seeds;
934 }
GenericVector< ColPartition * > cp_seeds_
int push_back(T object)
bool IsInline(const bool search_bottom, const int textPartsLineSpacing, ColPartition *part)

◆ IdentifySeedParts()

void tesseract::EquationDetect::IdentifySeedParts ( )
protected

Definition at line 538 of file equationdetect.cpp.

538  {
539  ColPartitionGridSearch gsearch(part_grid_);
540  ColPartition *part = NULL;
541  gsearch.StartFullSearch();
542 
543  GenericVector<ColPartition*> seeds1, seeds2;
544  // The left coordinates of indented text partitions.
545  GenericVector<int> indented_texts_left;
546  // The foreground density of text partitions.
547  GenericVector<float> texts_foreground_density;
548  while ((part = gsearch.NextFullSearch()) != NULL) {
549  if (!IsTextOrEquationType(part->type())) {
550  continue;
551  }
552  part->ComputeSpecialBlobsDensity();
553  bool blobs_check = CheckSeedBlobsCount(part);
554  const int kTextBlobsTh = 20;
555 
556  if (CheckSeedDensity(kMathDigitDensityTh1, kMathDigitDensityTh2, part) &&
557  blobs_check) {
558  // Passed high density threshold test, save into seeds1.
559  seeds1.push_back(part);
560  } else {
561  IndentType indent = IsIndented(part);
562  if (IsLeftIndented(indent) && blobs_check &&
563  CheckSeedDensity(kMathDigitDensityTh2, kMathDigitDensityTh2, part)) {
564  // Passed low density threshold test and is indented, save into seeds2.
565  seeds2.push_back(part);
566  } else if (!IsRightIndented(indent) &&
567  part->boxes_count() > kTextBlobsTh) {
568  // This is likely to be a text part, save the features.
569  const TBOX&box = part->bounding_box();
570  if (IsLeftIndented(indent)) {
571  indented_texts_left.push_back(box.left());
572  }
573  texts_foreground_density.push_back(ComputeForegroundDensity(box));
574  }
575  }
576  }
577 
578  // Sort the features collected from text regions.
579  indented_texts_left.sort();
580  texts_foreground_density.sort();
581  float foreground_density_th = 0.15; // Default value.
582  if (!texts_foreground_density.empty()) {
583  // Use the median of the texts_foreground_density.
584  foreground_density_th = 0.8 * texts_foreground_density[
585  texts_foreground_density.size() / 2];
586  }
587 
588  for (int i = 0; i < seeds1.size(); ++i) {
589  const TBOX& box = seeds1[i]->bounding_box();
590  if (CheckSeedFgDensity(foreground_density_th, seeds1[i]) &&
591  !(IsLeftIndented(IsIndented(seeds1[i])) &&
592  CountAlignment(indented_texts_left, box.left()) >=
593  kLeftIndentAlignmentCountTh)) {
594  // Mark as PT_EQUATION type.
595  seeds1[i]->set_type(PT_EQUATION);
596  cp_seeds_.push_back(seeds1[i]);
597  } else { // Mark as PT_INLINE_EQUATION type.
598  seeds1[i]->set_type(PT_INLINE_EQUATION);
599  }
600  }
601 
602  for (int i = 0; i < seeds2.size(); ++i) {
603  if (CheckForSeed2(indented_texts_left, foreground_density_th, seeds2[i])) {
604  seeds2[i]->set_type(PT_EQUATION);
605  cp_seeds_.push_back(seeds2[i]);
606  }
607  }
608 }
const float kMathDigitDensityTh1
bool empty() const
Definition: genericvector.h:91
bool IsLeftIndented(const EquationDetect::IndentType type)
bool CheckSeedBlobsCount(ColPartition *part)
float ComputeForegroundDensity(const TBOX &tbox)
const float kMathDigitDensityTh2
bool IsTextOrEquationType(PolyBlockType type)
IndentType IsIndented(ColPartition *part)
int size() const
Definition: genericvector.h:72
int CountAlignment(const GenericVector< int > &sorted_vec, const int val) const
GenericVector< ColPartition * > cp_seeds_
int push_back(T object)
Definition: rect.h:30
inT16 left() const
Definition: rect.h:68
bool IsRightIndented(const EquationDetect::IndentType type)
bool CheckForSeed2(const GenericVector< int > &indented_texts_left, const float foreground_density_th, ColPartition *part)
bool CheckSeedFgDensity(const float density_th, ColPartition *part)
const int kLeftIndentAlignmentCountTh
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
bool CheckSeedDensity(const float math_density_high, const float math_density_low, const ColPartition *part) const
ColPartitionGrid * part_grid_

◆ IdentifySpecialText() [1/2]

void tesseract::EquationDetect::IdentifySpecialText ( BLOBNBOX blob,
const int  height_th 
)
protected

Definition at line 149 of file equationdetect.cpp.

150  {
151  ASSERT_HOST(blobnbox != NULL);
152  if (blobnbox->bounding_box().height() < height_th && height_th > 0) {
153  // For small blob, we simply set to BSTT_NONE.
154  blobnbox->set_special_text_type(BSTT_NONE);
155  return;
156  }
157 
158  BLOB_CHOICE_LIST ratings_equ, ratings_lang;
159  C_BLOB* blob = blobnbox->cblob();
160  // TODO(joeliu/rays) Fix this. We may have to normalize separately for
161  // each classifier here, as they may require different PolygonalCopy.
162  TBLOB* tblob = TBLOB::PolygonalCopy(false, blob);
163  const TBOX& box = tblob->bounding_box();
164 
165  // Normalize the blob. Set the origin to the place we want to be the
166  // bottom-middle, and scaling is to make the height the x-height.
167  float scaling = static_cast<float>(kBlnXHeight) / box.height();
168  float x_orig = (box.left() + box.right()) / 2.0f, y_orig = box.bottom();
169  TBLOB* normed_blob = new TBLOB(*tblob);
170  normed_blob->Normalize(NULL, NULL, NULL, x_orig, y_orig, scaling, scaling,
171  0.0f, static_cast<float>(kBlnBaselineOffset),
172  false, NULL);
173  equ_tesseract_.AdaptiveClassifier(normed_blob, &ratings_equ);
174  lang_tesseract_->AdaptiveClassifier(normed_blob, &ratings_lang);
175  delete normed_blob;
176  delete tblob;
177 
178  // Get the best choice from ratings_lang and ratings_equ. As the choice in the
179  // list has already been sorted by the certainty, we simply use the first
180  // choice.
181  BLOB_CHOICE *lang_choice = NULL, *equ_choice = NULL;
182  if (ratings_lang.length() > 0) {
183  BLOB_CHOICE_IT choice_it(&ratings_lang);
184  lang_choice = choice_it.data();
185  }
186  if (ratings_equ.length() > 0) {
187  BLOB_CHOICE_IT choice_it(&ratings_equ);
188  equ_choice = choice_it.data();
189  }
190 
191  float lang_score = lang_choice ? lang_choice->certainty() : -FLT_MAX;
192  float equ_score = equ_choice ? equ_choice->certainty() : -FLT_MAX;
193 
194  const float kConfScoreTh = -5.0f, kConfDiffTh = 1.8;
195  // The scores here are negative, so the max/min == fabs(min/max).
196  // float ratio = fmax(lang_score, equ_score) / fmin(lang_score, equ_score);
197  float diff = fabs(lang_score - equ_score);
198  BlobSpecialTextType type = BSTT_NONE;
199 
200  // Classification.
201  if (fmax(lang_score, equ_score) < kConfScoreTh) {
202  // If both scores are very small, then mark it as unclear.
203  type = BSTT_UNCLEAR;
204  } else if (diff > kConfDiffTh && equ_score > lang_score) {
205  // If equ_score is significantly higher, then we classify this character as
206  // math symbol.
207  type = BSTT_MATH;
208  } else if (lang_choice) {
209  // For other cases: lang_score is similar or significantly higher.
210  type = EstimateTypeForUnichar(
211  lang_tesseract_->unicharset, lang_choice->unichar_id());
212  }
213 
214  if (type == BSTT_NONE && lang_tesseract_->get_fontinfo_table().get(
215  lang_choice->fontinfo_id()).is_italic()) {
216  // For text symbol, we still check if it is italic.
217  blobnbox->set_special_text_type(BSTT_ITALIC);
218  } else {
219  blobnbox->set_special_text_type(type);
220  }
221 }
void Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor, float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift, float final_yshift, bool inverse, Pix *pix)
Definition: blobs.cpp:413
BlobSpecialTextType EstimateTypeForUnichar(const UNICHARSET &unicharset, const UNICHAR_ID id) const
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:185
const int kBlnXHeight
Definition: normalis.h:28
C_BLOB * cblob() const
Definition: blobbox.h:253
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
UNICHARSET unicharset
Definition: ccutil.h:68
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:344
inT16 bottom() const
Definition: rect.h:61
inT16 height() const
Definition: rect.h:104
Definition: rect.h:30
inT16 left() const
Definition: rect.h:68
Definition: blobs.h:261
#define ASSERT_HOST(x)
Definition: errcode.h:84
static TBLOB * PolygonalCopy(bool allow_detailed_fx, C_BLOB *src)
Definition: blobs.cpp:344
inT16 fontinfo_id() const
Definition: ratngs.h:85
BlobSpecialTextType
Definition: blobbox.h:81
TBOX bounding_box() const
Definition: blobs.cpp:482
const int kBlnBaselineOffset
Definition: normalis.h:29
inT16 right() const
Definition: rect.h:75
float certainty() const
Definition: ratngs.h:82

◆ IdentifySpecialText() [2/2]

void tesseract::EquationDetect::IdentifySpecialText ( )
protected

Definition at line 257 of file equationdetect.cpp.

257  {
258  // Set configuration for Tesseract::AdaptiveClassifier.
259  equ_tesseract_.tess_cn_matching.set_value(1); // turn it on
260  equ_tesseract_.tess_bn_matching.set_value(0);
261 
262  // Set the multiplier to zero for lang_tesseract_ to improve the accuracy.
263  int classify_class_pruner = lang_tesseract_->classify_class_pruner_multiplier;
264  int classify_integer_matcher =
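265  lang_tesseract_->classify_integer_matcher_multiplier;
266  lang_tesseract_->classify_class_pruner_multiplier.set_value(0);
267  lang_tesseract_->classify_integer_matcher_multiplier.set_value(0);
268 
269  ColPartitionGridSearch gsearch(part_grid_);
270  ColPartition *part = NULL;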
271  gsearch.StartFullSearch();
272  while ((part = gsearch.NextFullSearch()) != NULL) {
273  if (!IsTextOrEquationType(part->type())) {
274  continue;
275  }
276  IdentifyBlobsToSkip(part);
277  BLOBNBOX_C_IT bbox_it(part->boxes());
278  // Compute the height threshold.
279  GenericVector<int> blob_heights;
280  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
281  bbox_it.forward()) {
282  if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
283  blob_heights.push_back(bbox_it.data()->bounding_box().height());
284  }
285  }
286  blob_heights.sort();
287  int height_th = blob_heights[blob_heights.size() / 2] / 3 * 2;
288  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
289  bbox_it.forward()) {
290  if (bbox_it.data()->special_text_type() != BSTT_SKIP) {
291  IdentifySpecialText(bbox_it.data(), height_th);
292  }
293  }
294  }
295 
296  // Set the multiplier values back.
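297  lang_tesseract_->classify_class_pruner_multiplier.set_value(
298  classify_class_pruner);
299  lang_tesseract_->classify_integer_matcher_multiplier.set_value(
300  classify_integer_matcher);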
301 
302  if (equationdetect_save_spt_image) { // For debug.
303  STRING outfile;
304  GetOutputTiffName("_spt", &outfile);
305  PaintSpecialTexts(outfile);
306  }
307 }
bool IsTextOrEquationType(PolyBlockType type)
int push_back(T object)
int classify_integer_matcher_multiplier
Definition: classify.h:468
Definition: strngs.h:45
void IdentifyBlobsToSkip(ColPartition *part)
bool equationdetect_save_spt_image
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
void PaintSpecialTexts(const STRING &outfile) const
ColPartitionGrid * part_grid_
void GetOutputTiffName(const char *name, STRING *image_name) const
int classify_class_pruner_multiplier
Definition: classify.h:464

◆ InsertPartAfterAbsorb()

void tesseract::EquationDetect::InsertPartAfterAbsorb ( ColPartition part)
protected

Definition at line 511 of file equationdetect.cpp.

511  {
512  ASSERT_HOST(part);
513 
514  // Before insert part back into part_grid_, we will need re-compute some
515  // of its attributes such as first_column_, last_column_. However, we still
516  // want to preserve its type.
517  BlobTextFlowType flow_type = part->flow();
518  PolyBlockType part_type = part->type();
519  BlobRegionType blob_type = part->blob_type();
520 
521  // Call SetPartitionType to re-compute the attributes of part.
522  const TBOX& part_box(part->bounding_box());
523  int grid_x, grid_y;
524  part_grid_->GridCoords(
525  part_box.left(), part_box.bottom(), &grid_x, &grid_y);
526  part->SetPartitionType(resolution_, best_columns_[grid_y]);
527 
528  // Reset the types back.
529  part->set_type(part_type);
530  part->set_blob_type(blob_type);
531  part->set_flow(flow_type);
532  part->SetBlobTypes();
533 
534  // Insert into part_grid_.
535  part_grid_->InsertBBox(true, true, part);
536 }
PolyBlockType
Definition: publictypes.h:53
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:490
ColPartitionSet ** best_columns_
Definition: rect.h:30
#define ASSERT_HOST(x)
Definition: errcode.h:84
BlobRegionType
Definition: blobbox.h:57
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:54
ColPartitionGrid * part_grid_
BlobTextFlowType
Definition: blobbox.h:99

◆ IsIndented()

EquationDetect::IndentType tesseract::EquationDetect::IsIndented ( ColPartition part)
protected

Definition at line 1020 of file equationdetect.cpp.

1020  {
1021  ASSERT_HOST(part);
1022 
1023  ColPartitionGridSearch search(part_grid_);
1024  ColPartition *neighbor = NULL;
1025  const TBOX& part_box(part->bounding_box());
1026  const int kXGapTh = static_cast<int>(roundf(0.5 * resolution_));
1027  const int kRadiusTh = static_cast<int>(roundf(3.0 * resolution_));
1028  const int kYGapTh = static_cast<int>(roundf(0.5 * resolution_));
1029 
1030  // Here we use a simple approximation algorithm: from the center of part, we
1031  // perform the radius search, and check if we can find a neighboring partition
1032  // that locates on the top/bottom left of part.
1033  search.StartRadSearch((part_box.left() + part_box.right()) / 2,
1034  (part_box.top() + part_box.bottom()) / 2, kRadiusTh);
1035  search.SetUniqueMode(true);
1036  bool left_indented = false, right_indented = false;
1037  while ((neighbor = search.NextRadSearch()) != NULL &&
1038  (!left_indented || !right_indented)) {
1039  if (neighbor == part) {
1040  continue;
1041  }
1042  const TBOX& neighbor_box(neighbor->bounding_box());
1043 
1044  if (part_box.major_y_overlap(neighbor_box) &&
1045  part_box.x_gap(neighbor_box) < kXGapTh) {
1046  // When this happens, it is likely part is a fragment of an
1047  // over-segmented colpartition. So we return NO_INDENT.
1048  return NO_INDENT;
1049  }
1050 
1051  if (!IsTextOrEquationType(neighbor->type())) {
1052  continue;
1053  }
1054 
1055  // The neighbor should be above/below part, and overlap in x direction.
1056  if (!part_box.x_overlap(neighbor_box) || part_box.y_overlap(neighbor_box)) {
1057  continue;
1058  }
1059 
1060  if (part_box.y_gap(neighbor_box) < kYGapTh) {
1061  int left_gap = part_box.left() - neighbor_box.left();
1062  int right_gap = neighbor_box.right() - part_box.right();
1063  if (left_gap > kXGapTh) {
1064  left_indented = true;
1065  }
1066  if (right_gap > kXGapTh) {
1067  right_indented = true;
1068  }
1069  }
1070  }
1071 
1072  if (left_indented && right_indented) {
1073  return BOTH_INDENT;
1074  }
1075  if (left_indented) {
1076  return LEFT_INDENT;
1077  }
1078  if (right_indented) {
1079  return RIGHT_INDENT;
1080  }
1081  return NO_INDENT;
1082 }
bool IsTextOrEquationType(PolyBlockType type)
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:371
Definition: rect.h:30
#define ASSERT_HOST(x)
Definition: errcode.h:84
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
ColPartitionGrid * part_grid_

◆ IsInline()

bool tesseract::EquationDetect::IsInline ( const bool  search_bottom,
const int  textPartsLineSpacing,
ColPartition part 
)
protected

Definition at line 936 of file equationdetect.cpp.

938  {
939  ASSERT_HOST(part != NULL);
940  // Look for its nearest vertical neighbor that hardly overlaps in y but
941  // largely overlaps in x.
942  ColPartitionGridSearch search(part_grid_);
943  ColPartition *neighbor = NULL;
944  const TBOX& part_box(part->bounding_box());
945  const float kYGapRatioTh = 1.0;
946 
947  if (search_bottom) {
948  search.StartVerticalSearch(part_box.left(), part_box.right(),
949  part_box.bottom());
950  } else {
951  search.StartVerticalSearch(part_box.left(), part_box.right(),
952  part_box.top());
953  }
954  search.SetUniqueMode(true);
955  while ((neighbor = search.NextVerticalSearch(search_bottom)) != NULL) {
956  const TBOX& neighbor_box(neighbor->bounding_box());
957  if (part_box.y_gap(neighbor_box) > kYGapRatioTh *
958  MIN(part_box.height(), neighbor_box.height())) {
959  // Finished searching.
960  break;
961  }
962  if (!PTIsTextType(neighbor->type())) {
963  continue;
964  }
965 
966  // Check if neighbor and part are inline similar.
967  const float kHeightRatioTh = 0.5;
968  const int kYGapTh = textparts_linespacing > 0 ?
969  textparts_linespacing + static_cast<int>(roundf(0.02 * resolution_)):
970  static_cast<int>(roundf(0.05 * resolution_)); // Default value.
971  if (part_box.x_overlap(neighbor_box) && // Location feature.
972  part_box.y_gap(neighbor_box) <= kYGapTh && // Line spacing.
973  // Geo feature.
974  static_cast<float>(MIN(part_box.height(), neighbor_box.height())) /
975  MAX(part_box.height(), neighbor_box.height()) > kHeightRatioTh) {
976  return true;
977  }
978  }
979 
980  return false;
981 }
#define MIN(x, y)
Definition: ndminx.h:28
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:82
#define MAX(x, y)
Definition: ndminx.h:24
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:371
Definition: rect.h:30
#define ASSERT_HOST(x)
Definition: errcode.h:84
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
ColPartitionGrid * part_grid_

◆ IsMathBlockSatellite()

bool tesseract::EquationDetect::IsMathBlockSatellite ( ColPartition part,
GenericVector< ColPartition *> *  math_blocks 
)
protected

Definition at line 1358 of file equationdetect.cpp.

1359  {
1360  ASSERT_HOST(part != NULL && math_blocks != NULL);
1361  math_blocks->clear();
1362  const TBOX& part_box(part->bounding_box());
1363  // Find the top/bottom nearest neighbor of part.
1364  ColPartition *neighbors[2];
1365  int y_gaps[2] = {INT_MAX, INT_MAX};
1366  // The horizontal boundary of the neighbors.
1367  int neighbors_left = INT_MAX, neighbors_right = 0;
1368  for (int i = 0; i < 2; ++i) {
1369  neighbors[i] = SearchNNVertical(i != 0, part);
1370  if (neighbors[i]) {
1371  const TBOX& neighbor_box = neighbors[i]->bounding_box();
1372  y_gaps[i] = neighbor_box.y_gap(part_box);
1373  if (neighbor_box.left() < neighbors_left) {
1374  neighbors_left = neighbor_box.left();
1375  }
1376  if (neighbor_box.right() > neighbors_right) {
1377  neighbors_right = neighbor_box.right();
1378  }
1379  }
1380  }
1381  if (neighbors[0] == neighbors[1]) {
1382  // This happens when part is inside neighbor.
1383  neighbors[1] = NULL;
1384  y_gaps[1] = INT_MAX;
1385  }
1386 
1387  // Check if part is within [neighbors_left, neighbors_right].
1388  if (part_box.left() < neighbors_left || part_box.right() > neighbors_right) {
1389  return false;
1390  }
1391 
1392  // Get the index of the near one in neighbors.
1393  int index = y_gaps[0] < y_gaps[1] ? 0 : 1;
1394 
1395  // Check the near one.
1396  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
1397  math_blocks->push_back(neighbors[index]);
1398  } else {
1399  // If the near one failed the check, then we skip checking the far one.
1400  return false;
1401  }
1402 
1403  // Check the far one.
1404  index = 1 - index;
1405  if (IsNearMathNeighbor(y_gaps[index], neighbors[index])) {
1406  math_blocks->push_back(neighbors[index]);
1407  }
1408 
1409  return true;
1410 }
int y_gap(const TBOX &box) const
Definition: rect.h:225
bool IsNearMathNeighbor(const int y_gap, const ColPartition *neighbor) const
int push_back(T object)
Definition: rect.h:30
inT16 left() const
Definition: rect.h:68
#define ASSERT_HOST(x)
Definition: errcode.h:84
ColPartition * SearchNNVertical(const bool search_bottom, const ColPartition *part)
inT16 right() const
Definition: rect.h:75

◆ IsNearMathNeighbor()

bool tesseract::EquationDetect::IsNearMathNeighbor ( const int  y_gap,
const ColPartition neighbor 
) const
protected

Definition at line 1447 of file equationdetect.cpp.

1448  {
1449  if (!neighbor) {
1450  return false;
1451  }
1452  const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.1));
1453  return neighbor->type() == PT_EQUATION && y_gap <= kYGapTh;
1454 }

◆ IsNearSmallNeighbor()

bool tesseract::EquationDetect::IsNearSmallNeighbor ( const TBOX seed_box,
const TBOX part_box 
) const
protected

Definition at line 1270 of file equationdetect.cpp.

1271  {
1272  const int kXGapTh = static_cast<int>(roundf(0.25 * resolution_));
1273  const int kYGapTh = static_cast<int>(roundf(0.05 * resolution_));
1274 
1275  // Check geometric feature.
1276  if (part_box.height() > seed_box.height() ||
1277  part_box.width() > seed_box.width()) {
1278  return false;
1279  }
1280 
1281  // Check overlap and distance.
1282  if ((!part_box.major_x_overlap(seed_box) ||
1283  part_box.y_gap(seed_box) > kYGapTh) &&
1284  (!part_box.major_y_overlap(seed_box) ||
1285  part_box.x_gap(seed_box) > kXGapTh)) {
1286  return false;
1287  }
1288 
1289  return true;
1290 }
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
int y_gap(const TBOX &box) const
Definition: rect.h:225
int x_gap(const TBOX &box) const
Definition: rect.h:217
inT16 height() const
Definition: rect.h:104
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:429
inT16 width() const
Definition: rect.h:111

◆ LabelSpecialText()

int tesseract::EquationDetect::LabelSpecialText ( TO_BLOCK to_block)
virtual

Implements tesseract::EquationDetectBase.

Definition at line 129 of file equationdetect.cpp.

129  {
130  if (to_block == NULL) {
131  tprintf("Warning: input to_block is NULL!\n");
132  return -1;
133  }
134 
135  GenericVector<BLOBNBOX_LIST*> blob_lists;
136  blob_lists.push_back(&(to_block->blobs));
137  blob_lists.push_back(&(to_block->large_blobs));
138  for (int i = 0; i < blob_lists.size(); ++i) {
139  BLOBNBOX_IT bbox_it(blob_lists[i]);
140  for (bbox_it.mark_cycle_pt (); !bbox_it.cycled_list();
141  bbox_it.forward()) {
142  bbox_it.data()->set_special_text_type(BSTT_NONE);
143  }
144  }
145 
146  return 0;
147 }
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31
BLOBNBOX_LIST blobs
Definition: blobbox.h:768
int push_back(T object)
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:772

◆ MergePartsByLocation()

void tesseract::EquationDetect::MergePartsByLocation ( )
protected

Definition at line 423 of file equationdetect.cpp.

423  {
424  while (true) {
425  ColPartition* part = NULL;
426  // partitions that have been updated.
427  GenericVector<ColPartition*> parts_updated;
428  ColPartitionGridSearch gsearch(part_grid_);
429  gsearch.StartFullSearch();
430  while ((part = gsearch.NextFullSearch()) != NULL) {
431  if (!IsTextOrEquationType(part->type())) {
432  continue;
433  }
434  GenericVector<ColPartition*> parts_to_merge;
435  SearchByOverlap(part, &parts_to_merge);
436  if (parts_to_merge.empty()) {
437  continue;
438  }
439 
440  // Merge parts_to_merge with part, and remove them from part_grid_.
441  part_grid_->RemoveBBox(part);
442  for (int i = 0; i < parts_to_merge.size(); ++i) {
443  ASSERT_HOST(parts_to_merge[i] != NULL && parts_to_merge[i] != part);
444  part->Absorb(parts_to_merge[i], NULL);
445  }
446  gsearch.RepositionIterator();
447 
448  parts_updated.push_back(part);
449  }
450 
451  if (parts_updated.empty()) { // Exit the loop
452  break;
453  }
454 
455  // Re-insert parts_updated into part_grid_.
456  for (int i = 0; i < parts_updated.size(); ++i) {
457  InsertPartAfterAbsorb(parts_updated[i]);
458  }
459  }
460 }
bool empty() const
Definition: genericvector.h:91
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:537
void SearchByOverlap(ColPartition *seed, GenericVector< ColPartition *> *parts_overlap)
bool IsTextOrEquationType(PolyBlockType type)
int size() const
Definition: genericvector.h:72
int push_back(T object)
#define ASSERT_HOST(x)
Definition: errcode.h:84
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
void InsertPartAfterAbsorb(ColPartition *part)
ColPartitionGrid * part_grid_

◆ PaintColParts()

void tesseract::EquationDetect::PaintColParts ( const STRING outfile) const
protected

Definition at line 1481 of file equationdetect.cpp.

1481  {
1482  Pix *pix = pixConvertTo32(lang_tesseract_->BestPix());
1483  ColPartitionGridSearch gsearch(part_grid_);
1484  gsearch.StartFullSearch();
1485  ColPartition* part = NULL;
1486  while ((part = gsearch.NextFullSearch()) != NULL) {
1487  const TBOX& tbox = part->bounding_box();
1488  Box *box = boxCreate(tbox.left(), pixGetHeight(pix) - tbox.top(),
1489  tbox.width(), tbox.height());
1490  if (part->type() == PT_EQUATION) {
1491  pixRenderBoxArb(pix, box, 5, 255, 0, 0);
1492  } else if (part->type() == PT_INLINE_EQUATION) {
1493  pixRenderBoxArb(pix, box, 5, 0, 255, 0);
1494  } else {
1495  pixRenderBoxArb(pix, box, 5, 0, 0, 255);
1496  }
1497  boxDestroy(&box);
1498  }
1499 
1500  pixWrite(outfile.string(), pix, IFF_TIFF_LZW);
1501  pixDestroy(&pix);
1502 }
const char * string() const
Definition: strngs.cpp:198
inT16 top() const
Definition: rect.h:54
inT16 height() const
Definition: rect.h:104
Definition: rect.h:30
inT16 left() const
Definition: rect.h:68
Pix * BestPix() const
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
ColPartitionGrid * part_grid_
inT16 width() const
Definition: rect.h:111

◆ PaintSpecialTexts()

void tesseract::EquationDetect::PaintSpecialTexts ( const STRING outfile) const
protected

Definition at line 1464 of file equationdetect.cpp.

1464  {
1465  Pix *pix = NULL, *pixBi = lang_tesseract_->pix_binary();
1466  pix = pixConvertTo32(pixBi);
1467  ColPartitionGridSearch gsearch(part_grid_);
1468  ColPartition* part = NULL;
1469  gsearch.StartFullSearch();
1470  while ((part = gsearch.NextFullSearch()) != NULL) {
1471  BLOBNBOX_C_IT blob_it(part->boxes());
1472  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1473  RenderSpecialText(pix, blob_it.data());
1474  }
1475  }
1476 
1477  pixWrite(outfile.string(), pix, IFF_TIFF_LZW);
1478  pixDestroy(&pix);
1479 }
Pix * pix_binary() const
const char * string() const
Definition: strngs.cpp:198
static void RenderSpecialText(Pix *pix, BLOBNBOX *blob)
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
ColPartitionGrid * part_grid_

◆ PrintSpecialBlobsDensity()

void tesseract::EquationDetect::PrintSpecialBlobsDensity ( const ColPartition part) const
protected

Definition at line 1504 of file equationdetect.cpp.

1504  {
1505  ASSERT_HOST(part);
1506  TBOX box(part->bounding_box());
1507  int h = pixGetHeight(lang_tesseract_->BestPix());
1508  tprintf("Printing special blobs density values for ColParition (t=%d,b=%d) ",
1509  h - box.top(), h - box.bottom());
1510  box.print();
1511  tprintf("blobs count = %d, density = ", part->boxes_count());
1512  for (int i = 0; i < BSTT_COUNT; ++i) {
1513  BlobSpecialTextType type = static_cast<BlobSpecialTextType>(i);
1514  tprintf("%d:%f ", i, part->SpecialBlobsDensity(type));
1515  }
1516  tprintf("\n");
1517 }
#define tprintf(...)
Definition: tprintf.h:31
Definition: rect.h:30
#define ASSERT_HOST(x)
Definition: errcode.h:84
BlobSpecialTextType
Definition: blobbox.h:81
Pix * BestPix() const

◆ ProcessMathBlockSatelliteParts()

void tesseract::EquationDetect::ProcessMathBlockSatelliteParts ( )
protected

Definition at line 1309 of file equationdetect.cpp.

1309  {
1310  // Iterate over part_grid_, and find all parts that are text type but not
1311  // equation type.
1312  ColPartition *part = NULL;
1313  GenericVector<ColPartition*> text_parts;
1314  ColPartitionGridSearch gsearch(part_grid_);
1315  gsearch.StartFullSearch();
1316  while ((part = gsearch.NextFullSearch()) != NULL) {
1317  if (part->type() == PT_FLOWING_TEXT || part->type() == PT_HEADING_TEXT) {
1318  text_parts.push_back(part);
1319  }
1320  }
1321  if (text_parts.empty()) {
1322  return;
1323  }
1324 
1325  // Compute the median height of the text_parts.
1326  text_parts.sort(&SortCPByHeight);
1327  const TBOX& text_box = text_parts[text_parts.size() / 2]->bounding_box();
1328  int med_height = text_box.height();
1329  if (text_parts.size() % 2 == 0 && text_parts.size() > 1) {
1330  const TBOX& text_box =
1331  text_parts[text_parts.size() / 2 - 1]->bounding_box();
1332  med_height = static_cast<int>(roundf(
1333  0.5 * (text_box.height() + med_height)));
1334  }
1335 
1336  // Iterate every text_parts and check if it is a math block satellite.
1337  for (int i = 0; i < text_parts.size(); ++i) {
1338  const TBOX& text_box(text_parts[i]->bounding_box());
1339  if (text_box.height() > med_height) {
1340  continue;
1341  }
1342  GenericVector<ColPartition*> math_blocks;
1343  if (!IsMathBlockSatellite(text_parts[i], &math_blocks)) {
1344  continue;
1345  }
1346 
1347  // Found. merge text_parts[i] with math_blocks.
1348  part_grid_->RemoveBBox(text_parts[i]);
1349  text_parts[i]->set_type(PT_EQUATION);
1350  for (int j = 0; j < math_blocks.size(); ++j) {
1351  part_grid_->RemoveBBox(math_blocks[j]);
1352  text_parts[i]->Absorb(math_blocks[j], NULL);
1353  }
1354  InsertPartAfterAbsorb(text_parts[i]);
1355  }
1356 }
bool empty() const
Definition: genericvector.h:91
void RemoveBBox(BBC *bbox)
Definition: bbgrid.h:537
int size() const
Definition: genericvector.h:72
int push_back(T object)
inT16 height() const
Definition: rect.h:104
Definition: rect.h:30
bool IsMathBlockSatellite(ColPartition *part, GenericVector< ColPartition *> *math_blocks)
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
void InsertPartAfterAbsorb(ColPartition *part)
ColPartitionGrid * part_grid_

◆ SearchByOverlap()

void tesseract::EquationDetect::SearchByOverlap ( ColPartition seed,
GenericVector< ColPartition *> *  parts_overlap 
)
protected

Definition at line 462 of file equationdetect.cpp.

464  {
465  ASSERT_HOST(seed != NULL && parts_overlap != NULL);
466  if (!IsTextOrEquationType(seed->type())) {
467  return;
468  }
469  ColPartitionGridSearch search(part_grid_);
470  const TBOX& seed_box(seed->bounding_box());
471  const int kRadNeighborCells = 30;
472  search.StartRadSearch((seed_box.left() + seed_box.right()) / 2,
473  (seed_box.top() + seed_box.bottom()) / 2,
474  kRadNeighborCells);
475  search.SetUniqueMode(true);
476 
477  // Search iteratively.
478  ColPartition *part;
480  const float kLargeOverlapTh = 0.95;
481  const float kEquXOverlap = 0.4, kEquYOverlap = 0.5;
482  while ((part = search.NextRadSearch()) != NULL) {
483  if (part == seed || !IsTextOrEquationType(part->type())) {
484  continue;
485  }
486  const TBOX& part_box(part->bounding_box());
487  bool merge = false;
488 
489  float x_overlap_fraction = part_box.x_overlap_fraction(seed_box),
490  y_overlap_fraction = part_box.y_overlap_fraction(seed_box);
491 
492  // If part largely overlaps with seed, then set merge to true.
493  if (x_overlap_fraction >= kLargeOverlapTh &&
494  y_overlap_fraction >= kLargeOverlapTh) {
495  merge = true;
496  } else if (seed->type() == PT_EQUATION &&
497  IsTextOrEquationType(part->type())) {
498  if ((x_overlap_fraction > kEquXOverlap && y_overlap_fraction > 0.0) ||
499  (x_overlap_fraction > 0.0 && y_overlap_fraction > kEquYOverlap)) {
500  merge = true;
501  }
502  }
503 
504  if (merge) { // Remove the part from search and put it into parts.
505  search.RemoveBBox();
506  parts_overlap->push_back(part);
507  }
508  }
509 }
bool IsTextOrEquationType(PolyBlockType type)
double x_overlap_fraction(const TBOX &box) const
Definition: rect.h:447
int push_back(T object)
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:371
Definition: rect.h:30
#define ASSERT_HOST(x)
Definition: errcode.h:84
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
ColPartitionGrid * part_grid_

◆ SearchNNVertical()

ColPartition * tesseract::EquationDetect::SearchNNVertical ( const bool  search_bottom,
const ColPartition *  part 
)
protected

Definition at line 1412 of file equationdetect.cpp.

1413  {
1414  ASSERT_HOST(part);
1415  ColPartition *nearest_neighbor = NULL, *neighbor = NULL;
1416  const int kYGapTh = static_cast<int>(roundf(resolution_ * 0.5));
1417 
1418  ColPartitionGridSearch search(part_grid_);
1419  search.SetUniqueMode(true);
1420  const TBOX& part_box(part->bounding_box());
1421  int y = search_bottom ? part_box.bottom() : part_box.top();
1422  search.StartVerticalSearch(part_box.left(), part_box.right(), y);
1423  int min_y_gap = INT_MAX;
1424  while ((neighbor = search.NextVerticalSearch(search_bottom)) != NULL) {
1425  if (neighbor == part || !IsTextOrEquationType(neighbor->type())) {
1426  continue;
1427  }
1428  const TBOX& neighbor_box(neighbor->bounding_box());
1429  int y_gap = neighbor_box.y_gap(part_box);
1430  if (y_gap > kYGapTh) { // Out of scope.
1431  break;
1432  }
1433  if (!neighbor_box.major_x_overlap(part_box) ||
1434  (search_bottom && neighbor_box.bottom() > part_box.bottom()) ||
1435  (!search_bottom && neighbor_box.top() < part_box.top())) {
1436  continue;
1437  }
1438  if (y_gap < min_y_gap) {
1439  min_y_gap = y_gap;
1440  nearest_neighbor = neighbor;
1441  }
1442  }
1443 
1444  return nearest_neighbor;
1445 }
bool IsTextOrEquationType(PolyBlockType type)
int y_gap(const TBOX &box) const
Definition: rect.h:225
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:371
inT16 bottom() const
Definition: rect.h:61
Definition: rect.h:30
#define ASSERT_HOST(x)
Definition: errcode.h:84
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:930
ColPartitionGrid * part_grid_

◆ SetLangTesseract()

void tesseract::EquationDetect::SetLangTesseract ( Tesseract *  lang_tesseract)

Definition at line 121 of file equationdetect.cpp.

121  {
122  lang_tesseract_ = lang_tesseract;
123 }

◆ SetResolution()

void tesseract::EquationDetect::SetResolution ( const int  resolution)

Definition at line 125 of file equationdetect.cpp.

125  {
126  resolution_ = resolution;
127 }

◆ SplitCPHor()

void tesseract::EquationDetect::SplitCPHor ( ColPartition *  part,
GenericVector< ColPartition *> *  parts_splitted 
)
protected

Definition at line 646 of file equationdetect.cpp.

647  {
648  ASSERT_HOST(part && parts_splitted);
649  if (part->median_width() == 0 || part->boxes_count() == 0) {
650  return;
651  }
652 
653  // Make a copy of part, and reset parts_splitted.
654  ColPartition* right_part = part->CopyButDontOwnBlobs();
655  parts_splitted->delete_data_pointers();
656  parts_splitted->clear();
657 
658  const double kThreshold = part->median_width() * 3.0;
659  bool found_split = true;
660  while (found_split) {
661  found_split = false;
662  BLOBNBOX_C_IT box_it(right_part->boxes());
663  // Blobs are sorted left side first. If blobs overlap,
664  // the previous blob may have a "more right" right side.
665  // Account for this by always keeping the largest "right"
666  // so far.
667  int previous_right = MIN_INT32;
668 
669  // Look for the next split in the partition.
670  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
671  const TBOX& box = box_it.data()->bounding_box();
672  if (previous_right != MIN_INT32 &&
673  box.left() - previous_right > kThreshold) {
674  // We have a split position. Split the partition in two pieces.
675  // Insert the left piece in the grid and keep processing the right.
676  int mid_x = (box.left() + previous_right) / 2;
677  ColPartition* left_part = right_part;
678  right_part = left_part->SplitAt(mid_x);
679 
680  parts_splitted->push_back(left_part);
681  left_part->ComputeSpecialBlobsDensity();
682  found_split = true;
683  break;
684  }
685 
686  // The right side of the previous blobs.
687  previous_right = MAX(previous_right, box.right());
688  }
689  }
690 
691  // Add the last piece.
692  right_part->ComputeSpecialBlobsDensity();
693  parts_splitted->push_back(right_part);
694 }
#define MAX(x, y)
Definition: ndminx.h:24
void delete_data_pointers()
int push_back(T object)
#define MIN_INT32
Definition: host.h:70
Definition: rect.h:30
inT16 left() const
Definition: rect.h:68
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 right() const
Definition: rect.h:75

◆ SplitCPHorLite()

void tesseract::EquationDetect::SplitCPHorLite ( ColPartition *  part,
GenericVector< TBOX > *  splitted_boxes 
)
protected

Definition at line 696 of file equationdetect.cpp.

697  {
698  ASSERT_HOST(part && splitted_boxes);
699  splitted_boxes->clear();
700  if (part->median_width() == 0) {
701  return;
702  }
703 
704  const double kThreshold = part->median_width() * 3.0;
705 
706  // Blobs are sorted left side first. If blobs overlap,
707  // the previous blob may have a "more right" right side.
708  // Account for this by always keeping the largest "right"
709  // so far.
710  TBOX union_box;
711  int previous_right = MIN_INT32;
712  BLOBNBOX_C_IT box_it(part->boxes());
713  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
714  const TBOX& box = box_it.data()->bounding_box();
715  if (previous_right != MIN_INT32 &&
716  box.left() - previous_right > kThreshold) {
717  // We have a split position.
718  splitted_boxes->push_back(union_box);
719  previous_right = MIN_INT32;
720  }
721  if (previous_right == MIN_INT32) {
722  union_box = box;
723  } else {
724  union_box += box;
725  }
726  // The right side of the previous blobs.
727  previous_right = MAX(previous_right, box.right());
728  }
729 
730  // Add the last piece.
731  if (previous_right != MIN_INT32) {
732  splitted_boxes->push_back(union_box);
733  }
734 }
#define MAX(x, y)
Definition: ndminx.h:24
int push_back(T object)
#define MIN_INT32
Definition: host.h:70
Definition: rect.h:30
inT16 left() const
Definition: rect.h:68
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 right() const
Definition: rect.h:75

Member Data Documentation

◆ best_columns_

ColPartitionSet** tesseract::EquationDetect::best_columns_
protected

Definition at line 261 of file equationdetect.h.

◆ cp_seeds_

GenericVector<ColPartition*> tesseract::EquationDetect::cp_seeds_
protected

Definition at line 267 of file equationdetect.h.

◆ cps_super_bbox_

TBOX* tesseract::EquationDetect::cps_super_bbox_
protected

Definition at line 264 of file equationdetect.h.

◆ equ_tesseract_

Tesseract tesseract::EquationDetect::equ_tesseract_
protected

Definition at line 248 of file equationdetect.h.

◆ lang_tesseract_

Tesseract* tesseract::EquationDetect::lang_tesseract_
protected

Definition at line 252 of file equationdetect.h.

◆ page_count_

int tesseract::EquationDetect::page_count_
protected

Definition at line 273 of file equationdetect.h.

◆ part_grid_

ColPartitionGrid* tesseract::EquationDetect::part_grid_
protected

Definition at line 256 of file equationdetect.h.

◆ resolution_

int tesseract::EquationDetect::resolution_
protected

Definition at line 270 of file equationdetect.h.


The documentation for this class was generated from the following files: