tesseract v5.3.3.20231005
tesseract::TableFinder Class Reference

#include <tablefind.h>

Inheritance diagram for tesseract::TableFinder:
tesseract::TestableTableFinder

Public Member Functions

 TableFinder ()
 
 ~TableFinder ()
 
void set_resolution (int resolution)
 
void set_left_to_right_language (bool order)
 
void Init (int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
 
void InsertCleanPartitions (ColPartitionGrid *grid, TO_BLOCK *block)
 
void LocateTables (ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback width_cb, const FCOORD &reskew)
 

Protected Member Functions

int gridsize () const
 
int gridwidth () const
 
int gridheight () const
 
const ICOORDbleft () const
 
const ICOORDtright () const
 
ScrollViewMakeWindow (int x, int y, const char *window_name)
 
void InsertTextPartition (ColPartition *part)
 
void InsertFragmentedTextPartition (ColPartition *part)
 
void InsertLeaderPartition (ColPartition *part)
 
void InsertRulingPartition (ColPartition *part)
 
void InsertImagePartition (ColPartition *part)
 
void SplitAndInsertFragmentedTextPartition (ColPartition *part)
 
bool AllowTextPartition (const ColPartition &part) const
 
bool AllowBlob (const BLOBNBOX &blob) const
 
void MoveColSegmentsToGrid (ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
 
void InitializePartitions (ColPartitionSet **all_columns)
 
void SetVerticalSpacing (ColPartition *part)
 
void SetGlobalSpacings (ColPartitionGrid *grid)
 
void set_global_median_xheight (int xheight)
 
void set_global_median_blob_width (int width)
 
void set_global_median_ledding (int ledding)
 
void FindNeighbors ()
 
void MarkTablePartitions ()
 
void MarkPartitionsUsingLocalInformation ()
 
bool HasWideOrNoInterWordGap (ColPartition *part) const
 
bool HasLeaderAdjacent (const ColPartition &part)
 
void FilterFalseAlarms ()
 
void FilterParagraphEndings ()
 
void FilterHeaderAndFooter ()
 
void SmoothTablePartitionRuns ()
 
void GetColumnBlocks (ColPartitionSet **columns, ColSegment_LIST *col_segments)
 
void GroupColumnBlocks (ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
 
bool ConsecutiveBoxes (const TBOX &b1, const TBOX &b2)
 
void SetColumnsType (ColSegment_LIST *col_segments)
 
void GridMergeColumnBlocks ()
 
void GetTableColumns (ColSegment_LIST *table_columns)
 
void GetTableRegions (ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
 
void GridMergeTableRegions ()
 
bool BelongToOneTable (const TBOX &box1, const TBOX &box2)
 
void AdjustTableBoundaries ()
 
void GrowTableBox (const TBOX &table_box, TBOX *result_box)
 
void GrowTableToIncludePartials (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
void GrowTableToIncludeLines (const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
 
bool HLineBelongsToTable (const ColPartition &part, const TBOX &table_box)
 
void IncludeLeftOutColumnHeaders (TBOX *table_box)
 
void DeleteSingleColumnTables ()
 
bool GapInXProjection (int *xprojection, int length)
 
void RecognizeTables ()
 
void DisplayColSegments (ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
 
void DisplayColPartitions (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void DisplayColPartitionConnections (ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
 
void MakeTableBlocks (ColPartitionGrid *grid, ColPartitionSet **columns, const WidthCallback &width_cb)
 

Static Protected Member Functions

static void SetPartitionSpacings (ColPartitionGrid *grid, ColPartitionSet **all_columns)
 

Protected Attributes

int resolution_
 
int global_median_xheight_
 
int global_median_blob_width_
 
int global_median_ledding_
 
ColPartitionGrid clean_part_grid_
 
ColPartitionGrid leader_and_ruling_grid_
 
ColPartitionGrid fragmented_text_grid_
 
ColSegmentGrid col_seg_grid_
 
ColSegmentGrid table_grid_
 
bool left_to_right_language_
 

Detailed Description

Definition at line 121 of file tablefind.h.

Constructor & Destructor Documentation

◆ TableFinder()

tesseract::TableFinder::TableFinder ( )

◆ ~TableFinder()

tesseract::TableFinder::~TableFinder ( )

Definition at line 167 of file tablefind.cpp.

167 {
168 // ColPartitions and ColSegments created by this class for storage in grids
169 // need to be deleted explicitly.
170 clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
171 leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
172 fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
173 col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
174 table_grid_.ClearGridData(&DeleteObject<ColSegment>);
175}
void ClearGridData(void(*free_method)(BBC *))
Definition: bbgrid.h:506
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:397
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:403
ColSegmentGrid table_grid_
Definition: tablefind.h:405
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:401
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:395

Member Function Documentation

◆ AdjustTableBoundaries()

void tesseract::TableFinder::AdjustTableBoundaries ( )
protected

Definition at line 1540 of file tablefind.cpp.

1540 {
1541 // Iterate the table regions in the grid
1542 ColSegment_CLIST adjusted_tables;
1543 ColSegment_C_IT it(&adjusted_tables);
1545 gsearch.StartFullSearch();
1546 ColSegment *table = nullptr;
1547 while ((table = gsearch.NextFullSearch()) != nullptr) {
1548 const TBOX &table_box = table->bounding_box();
1549 TBOX grown_box = table_box;
1550 GrowTableBox(table_box, &grown_box);
1551 // To prevent a table from expanding again, do not insert the
1552 // modified box back to the grid. Instead move it to a list and
1553 // and remove it from the grid. The list is moved later back to the grid.
1554 if (!grown_box.null_box()) {
1555 auto *col = new ColSegment();
1556 col->InsertBox(grown_box);
1557 it.add_after_then_move(col);
1558 }
1559 gsearch.RemoveBBox();
1560 delete table;
1561 }
1562 // clear table grid to move final tables in it
1563 // TODO(nbeato): table_grid_ should already be empty. The above loop
1564 // removed everything. Maybe just assert it is empty?
1566 it.move_to_first();
1567 // move back final tables to table_grid_
1568 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1569 ColSegment *seg = it.extract();
1570 table_grid_.InsertBBox(true, true, seg);
1571 }
1572}
@ TBOX
GridSearch< ColSegment, ColSegment_CLIST, ColSegment_C_IT > ColSegmentGridSearch
Definition: tablefind.h:111
void Clear()
Definition: bbgrid.h:497
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:529
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
Definition: tablefind.cpp:1574

◆ AllowBlob()

bool tesseract::TableFinder::AllowBlob ( const BLOBNBOX blob) const
protected

Definition at line 503 of file tablefind.cpp.

503 {
504 const TBOX &box = blob.bounding_box();
505 const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
506 const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
507 const int median_area = global_median_xheight_ * global_median_blob_width_;
508 const double kAreaRequired = median_area * kAllowBlobArea;
509 // Keep comparisons strictly greater to disallow 0!
510 return box.height() > kHeightRequired && box.width() > kWidthRequired &&
511 box.area() > kAreaRequired;
512}
const double kAllowBlobHeight
Definition: tablefind.cpp:56
const double kAllowBlobArea
Definition: tablefind.cpp:58
const double kAllowBlobWidth
Definition: tablefind.cpp:57

◆ AllowTextPartition()

bool tesseract::TableFinder::AllowTextPartition ( const ColPartition part) const
protected

Definition at line 490 of file tablefind.cpp.

490 {
491 const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
492 const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
493 const int median_area = global_median_xheight_ * global_median_blob_width_;
494 const double kAreaPerBlobRequired = median_area * kAllowTextArea;
495 // Keep comparisons strictly greater to disallow 0!
496 return part.median_height() > kHeightRequired &&
497 part.median_width() > kWidthRequired &&
498 part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
499}
const double kAllowTextArea
Definition: tablefind.cpp:51
const double kAllowTextWidth
Definition: tablefind.cpp:50
const double kAllowTextHeight
Definition: tablefind.cpp:49

◆ BelongToOneTable()

bool tesseract::TableFinder::BelongToOneTable ( const TBOX box1,
const TBOX box2 
)
protected

Definition at line 1496 of file tablefind.cpp.

1496 {
1497 // Check the obvious case. Most likely not true because overlapping boxes
1498 // should already be merged, but seems like a good thing to do in case things
1499 // change.
1500 if (box1.overlap(box2)) {
1501 return true;
1502 }
1503 // Check for ColPartitions spanning both table regions
1504 TBOX bbox = box1.bounding_union(box2);
1505 // Start a rect search on bbox
1506 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
1508 rectsearch.StartRectSearch(bbox);
1509 ColPartition *part = nullptr;
1510 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1511 const TBOX &part_box = part->bounding_box();
1512 // return true if a colpartition spanning both table regions is found
1513 if (part_box.overlap(box1) && part_box.overlap(box2) &&
1514 !part->IsImageType()) {
1515 return true;
1516 }
1517 }
1518 return false;
1519}

◆ bleft()

const ICOORD & tesseract::TableFinder::bleft ( ) const
protected

Definition at line 388 of file tablefind.cpp.

388 {
389 return clean_part_grid_.bleft();
390}
const ICOORD & bleft() const
Definition: bbgrid.h:72

◆ ConsecutiveBoxes()

bool tesseract::TableFinder::ConsecutiveBoxes ( const TBOX b1,
const TBOX b2 
)
protected

Definition at line 571 of file tablefind.cpp.

571 {
572 int x_margin = 20;
573 int y_margin = 5;
574 return (abs(b1.left() - b2.left()) < x_margin) &&
575 (abs(b1.right() - b2.right()) < x_margin) &&
576 (abs(b1.top() - b2.bottom()) < y_margin ||
577 abs(b2.top() - b1.bottom()) < y_margin);
578}

◆ DeleteSingleColumnTables()

void tesseract::TableFinder::DeleteSingleColumnTables ( )
protected

Definition at line 1769 of file tablefind.cpp.

1769 {
1770 int page_width = tright().x() - bleft().x();
1771 ASSERT_HOST(page_width > 0);
1772 // create an integer array to hold projection on x-axis
1773 int *table_xprojection = new int[page_width];
1774 // Iterate through all tables in the table grid
1775 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search(
1776 &table_grid_);
1777 table_search.StartFullSearch();
1778 ColSegment *table;
1779 while ((table = table_search.NextFullSearch()) != nullptr) {
1780 TBOX table_box = table->bounding_box();
1781 // reset the projection array
1782 for (int i = 0; i < page_width; i++) {
1783 table_xprojection[i] = 0;
1784 }
1785 // Start a rect search on table_box
1786 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
1788 rectsearch.SetUniqueMode(true);
1789 rectsearch.StartRectSearch(table_box);
1790 ColPartition *part;
1791 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1792 if (!part->IsTextType()) {
1793 continue; // Do not consider non-text partitions
1794 }
1795 if (part->flow() == BTFT_LEADER) {
1796 continue; // Assume leaders are in tables
1797 }
1798 TBOX part_box = part->bounding_box();
1799 // Do not consider partitions partially covered by the table
1800 if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable) {
1801 continue;
1802 }
1803 BLOBNBOX_CLIST *part_boxes = part->boxes();
1804 BLOBNBOX_C_IT pit(part_boxes);
1805
1806 // Make sure overlapping blobs don't artificially inflate the number
1807 // of rows in the table. This happens frequently with things such as
1808 // decimals and split characters. Do this by assuming the column
1809 // partition is sorted mostly left to right and just clip
1810 // bounding boxes by the previous box's extent.
1811 int next_position_to_write = 0;
1812
1813 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1814 BLOBNBOX *pblob = pit.data();
1815 // ignore blob height for the purpose of projection since we
1816 // are only interested in finding valleys
1817 int xstart = pblob->bounding_box().left();
1818 int xend = pblob->bounding_box().right();
1819
1820 xstart = std::max(xstart, next_position_to_write);
1821 for (int i = xstart; i < xend; i++) {
1822 table_xprojection[i - bleft().x()]++;
1823 }
1824 next_position_to_write = xend;
1825 }
1826 }
1827 // Find largest valley between two reasonable peaks in the table
1828 if (!GapInXProjection(table_xprojection, page_width)) {
1829 table_search.RemoveBBox();
1830 delete table;
1831 }
1832 }
1833 delete[] table_xprojection;
1834}
#define ASSERT_HOST(x)
Definition: errcode.h:54
const double kMinOverlapWithTable
Definition: tablefind.cpp:97
@ BTFT_LEADER
Definition: blobbox.h:117
TDimension x() const
access function
Definition: points.h:58
const ICOORD & bleft() const
Definition: tablefind.cpp:388
const ICOORD & tright() const
Definition: tablefind.cpp:391
bool GapInXProjection(int *xprojection, int length)
Definition: tablefind.cpp:1838

◆ DisplayColPartitionConnections()

void tesseract::TableFinder::DisplayColPartitionConnections ( ScrollView win,
ColPartitionGrid grid,
ScrollView::Color  default_color 
)
protected

Definition at line 2001 of file tablefind.cpp.

2003 {
2004 // Iterate the ColPartitions in the grid.
2005 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(grid);
2006 gsearch.StartFullSearch();
2007 ColPartition *part = nullptr;
2008 while ((part = gsearch.NextFullSearch()) != nullptr) {
2009 const TBOX &box = part->bounding_box();
2010 int left_x = box.left();
2011 int right_x = box.right();
2012 int top_y = box.top();
2013 int bottom_y = box.bottom();
2014
2015 ColPartition *upper_part = part->nearest_neighbor_above();
2016 if (upper_part) {
2017 const TBOX &upper_box = upper_part->bounding_box();
2018 int mid_x = (left_x + right_x) / 2;
2019 int mid_y = (top_y + bottom_y) / 2;
2020 int other_x = (upper_box.left() + upper_box.right()) / 2;
2021 int other_y = (upper_box.top() + upper_box.bottom()) / 2;
2022 win->Brush(ScrollView::NONE);
2023 win->Pen(color);
2024 win->Line(mid_x, mid_y, other_x, other_y);
2025 }
2026 ColPartition *lower_part = part->nearest_neighbor_below();
2027 if (lower_part) {
2028 const TBOX &lower_box = lower_part->bounding_box();
2029 int mid_x = (left_x + right_x) / 2;
2030 int mid_y = (top_y + bottom_y) / 2;
2031 int other_x = (lower_box.left() + lower_box.right()) / 2;
2032 int other_y = (lower_box.top() + lower_box.bottom()) / 2;
2033 win->Brush(ScrollView::NONE);
2034 win->Pen(color);
2035 win->Line(mid_x, mid_y, other_x, other_y);
2036 }
2037 }
2038 win->UpdateWindow();
2039}

◆ DisplayColPartitions() [1/2]

void tesseract::TableFinder::DisplayColPartitions ( ScrollView win,
ColPartitionGrid grid,
ScrollView::Color  default_color 
)
protected

Definition at line 1996 of file tablefind.cpp.

1997 {
1998 DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1999}
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1970

◆ DisplayColPartitions() [2/2]

void tesseract::TableFinder::DisplayColPartitions ( ScrollView win,
ColPartitionGrid grid,
ScrollView::Color  text_color,
ScrollView::Color  table_color 
)
protected

Definition at line 1970 of file tablefind.cpp.

1972 {
1973 ScrollView::Color color = default_color;
1974 // Iterate the ColPartitions in the grid.
1975 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(grid);
1976 gsearch.StartFullSearch();
1977 ColPartition *part = nullptr;
1978 while ((part = gsearch.NextFullSearch()) != nullptr) {
1979 color = default_color;
1980 if (part->type() == PT_TABLE) {
1981 color = table_color;
1982 }
1983
1984 const TBOX &box = part->bounding_box();
1985 int left_x = box.left();
1986 int right_x = box.right();
1987 int top_y = box.top();
1988 int bottom_y = box.bottom();
1989 win->Brush(ScrollView::NONE);
1990 win->Pen(color);
1991 win->Rectangle(left_x, bottom_y, right_x, top_y);
1992 }
1993 win->UpdateWindow();
1994}

◆ DisplayColSegments()

void tesseract::TableFinder::DisplayColSegments ( ScrollView win,
ColSegment_LIST *  cols,
ScrollView::Color  color 
)
protected

Definition at line 1950 of file tablefind.cpp.

1951 {
1952 win->Pen(color);
1953 win->Brush(ScrollView::NONE);
1954 ColSegment_IT it(segments);
1955 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1956 ColSegment *col = it.data();
1957 const TBOX &box = col->bounding_box();
1958 int left_x = box.left();
1959 int right_x = box.right();
1960 int top_y = box.top();
1961 int bottom_y = box.bottom();
1962 win->Rectangle(left_x, bottom_y, right_x, top_y);
1963 }
1964 win->UpdateWindow();
1965}

◆ FilterFalseAlarms()

void tesseract::TableFinder::FilterFalseAlarms ( )
protected

Definition at line 1015 of file tablefind.cpp.

1015 {
1018 // TODO(nbeato): Fully justified text as non-table?
1019}

◆ FilterHeaderAndFooter()

void tesseract::TableFinder::FilterHeaderAndFooter ( )
protected

Definition at line 1110 of file tablefind.cpp.

1110 {
1111 // Consider top-most text colpartition as header and bottom most as footer
1112 ColPartition *header = nullptr;
1113 ColPartition *footer = nullptr;
1114 int max_top = INT32_MIN;
1115 int min_bottom = INT32_MAX;
1117 gsearch.StartFullSearch();
1118 ColPartition *part = nullptr;
1119 while ((part = gsearch.NextFullSearch()) != nullptr) {
1120 if (!part->IsTextType()) {
1121 continue; // Consider only text partitions
1122 }
1123 int top = part->bounding_box().top();
1124 int bottom = part->bounding_box().bottom();
1125 if (top > max_top) {
1126 max_top = top;
1127 header = part;
1128 }
1129 if (bottom < min_bottom) {
1130 min_bottom = bottom;
1131 footer = part;
1132 }
1133 }
1134 if (header) {
1135 header->clear_table_type();
1136 }
1137 if (footer) {
1138 footer->clear_table_type();
1139 }
1140}
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:919

◆ FilterParagraphEndings()

void tesseract::TableFinder::FilterParagraphEndings ( )
protected

Definition at line 1021 of file tablefind.cpp.

1021 {
1022 // Detect last line of paragraph
1023 // Iterate the ColPartitions in the grid.
1025 gsearch.StartFullSearch();
1026 ColPartition *part = nullptr;
1027 while ((part = gsearch.NextFullSearch()) != nullptr) {
1028 if (part->type() != PT_TABLE) {
1029 continue; // Consider only table partitions
1030 }
1031
1032 // Paragraph ending should have flowing text above it.
1033 ColPartition *upper_part = part->nearest_neighbor_above();
1034 if (!upper_part) {
1035 continue;
1036 }
1037 if (upper_part->type() != PT_FLOWING_TEXT) {
1038 continue;
1039 }
1040 if (upper_part->bounding_box().width() < 2 * part->bounding_box().width()) {
1041 continue;
1042 }
1043 // Check if its the last line of a paragraph.
1044 // In most cases, a paragraph ending should be left-aligned to text line
1045 // above it. Sometimes, it could be a 2 line paragraph, in which case
1046 // the line above it is indented.
1047 // To account for that, check if the partition center is to
1048 // the left of the one above it.
1049 int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1050 int upper_mid = (upper_part->bounding_box().left() +
1051 upper_part->bounding_box().right()) /
1052 2;
1053 int current_spacing = 0; // spacing of the current line to margin
1054 int upper_spacing = 0; // spacing of the previous line to the margin
1056 // Left to right languages, use mid - left to figure out the distance
1057 // the middle is from the left margin.
1058 int left = std::min(part->bounding_box().left(),
1059 upper_part->bounding_box().left());
1060 current_spacing = mid - left;
1061 upper_spacing = upper_mid - left;
1062 } else {
1063 // Right to left languages, use right - mid to figure out the distance
1064 // the middle is from the right margin.
1065 int right = std::max(part->bounding_box().right(),
1066 upper_part->bounding_box().right());
1067 current_spacing = right - mid;
1068 upper_spacing = right - upper_mid;
1069 }
1070 if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing) {
1071 continue;
1072 }
1073
1074 // Paragraphs should have similar fonts.
1075 if (!part->MatchingSizes(*upper_part) ||
1076 !part->MatchingStrokeWidth(*upper_part, kStrokeWidthFractionalTolerance,
1078 continue;
1079 }
1080
1081 // The last line of a paragraph should be left aligned.
1082 // TODO(nbeato): This would be untrue if the text was right aligned.
1083 // How often is that?
1084 if (part->space_to_left() >
1085 kMaxParagraphEndingLeftSpaceMultiple * part->median_height()) {
1086 continue;
1087 }
1088 // The line above it should be right aligned (assuming justified format).
1089 // Since we can't assume justified text, we compare whitespace to text.
1090 // The above line should have majority spanning text (or the current
1091 // line could have fit on the previous line). So compare
1092 // whitespace to text.
1093 if (upper_part->bounding_box().width() <
1095 upper_part->space_to_right()) {
1096 continue;
1097 }
1098
1099 // Ledding above the line should be less than ledding below
1100 if (part->space_above() >= part->space_below() ||
1101 part->space_above() > 2 * global_median_ledding_) {
1102 continue;
1103 }
1104
1105 // If all checks failed, it is probably text.
1106 part->clear_table_type();
1107 }
1108}
const double kMaxParagraphEndingLeftSpaceMultiple
Definition: tablefind.cpp:126
const double kStrokeWidthFractionalTolerance
Definition: tablefind.cpp:140
const double kMinParagraphEndingTextToWhitespaceRatio
Definition: tablefind.cpp:132
const double kStrokeWidthConstantTolerance
Definition: tablefind.cpp:141
const double kParagraphEndingPreviousLineRatio
Definition: tablefind.cpp:122
@ PT_FLOWING_TEXT
Definition: publictypes.h:53

◆ FindNeighbors()

void tesseract::TableFinder::FindNeighbors ( )
protected

Definition at line 773 of file tablefind.cpp.

773 {
775 gsearch.StartFullSearch();
776 ColPartition *part = nullptr;
777 while ((part = gsearch.NextFullSearch()) != nullptr) {
778 // TODO(nbeato): Rename this function, meaning is different now.
779 // IT is finding nearest neighbors its own way
780 // SetVerticalSpacing(part);
781
782 ColPartition *upper = part->SingletonPartner(true);
783 if (upper) {
784 part->set_nearest_neighbor_above(upper);
785 }
786
787 ColPartition *lower = part->SingletonPartner(false);
788 if (lower) {
789 part->set_nearest_neighbor_below(lower);
790 }
791 }
792}

◆ GapInXProjection()

bool tesseract::TableFinder::GapInXProjection ( int *  xprojection,
int  length 
)
protected

Definition at line 1838 of file tablefind.cpp.

1838 {
1839 // Find peak value of the histogram
1840 int peak_value = 0;
1841 for (int i = 0; i < length; i++) {
1842 if (xprojection[i] > peak_value) {
1843 peak_value = xprojection[i];
1844 }
1845 }
1846 // Peak value represents the maximum number of horizontally
1847 // overlapping colpartitions, so this can be considered as the
1848 // number of rows in the table
1849 if (peak_value < kMinRowsInTable) {
1850 return false;
1851 }
1852 double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1853 if (peak_value >= kLargeTableRowCount) {
1854 projection_threshold = kLargeTableProjectionThreshold * peak_value;
1855 }
1856 // Threshold the histogram
1857 for (int i = 0; i < length; i++) {
1858 xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1859 }
1860 // Find the largest run of zeros between two ones
1861 int largest_gap = 0;
1862 int run_start = -1;
1863 for (int i = 1; i < length; i++) {
1864 // detect start of a run of zeros
1865 if (xprojection[i - 1] && !xprojection[i]) {
1866 run_start = i;
1867 }
1868 // detect end of a run of zeros and update the value of largest gap
1869 if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1870 int gap = i - run_start;
1871 if (gap > largest_gap) {
1872 largest_gap = gap;
1873 }
1874 run_start = -1;
1875 }
1876 }
1877 return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1878}
const double kLargeTableProjectionThreshold
Definition: tablefind.cpp:107
const int kMinRowsInTable
Definition: tablefind.cpp:112
const int kLargeTableRowCount
Definition: tablefind.cpp:109
const double kSmallTableProjectionThreshold
Definition: tablefind.cpp:106
const double kMaxXProjectionGapFactor
Definition: tablefind.cpp:136

◆ GetColumnBlocks()

void tesseract::TableFinder::GetColumnBlocks ( ColPartitionSet **  columns,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 525 of file tablefind.cpp.

526 {
527 for (int i = 0; i < gridheight(); ++i) {
528 ColPartitionSet *columns = all_columns[i];
529 if (columns != nullptr) {
530 ColSegment_LIST new_blocks;
531 // Get boxes from the current vertical position on the grid
532 columns->GetColumnBoxes(i * gridsize(), (i + 1) * gridsize(),
533 &new_blocks);
534 // Merge the new_blocks boxes into column_blocks if they are well-aligned
535 GroupColumnBlocks(&new_blocks, column_blocks);
536 }
537 }
538}
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:541
int gridheight() const
Definition: tablefind.cpp:385

◆ GetTableColumns()

void tesseract::TableFinder::GetTableColumns ( ColSegment_LIST *  table_columns)
protected

Definition at line 1320 of file tablefind.cpp.

1320 {
1321 ColSegment_IT it(table_columns);
1322 // Iterate the ColPartitions in the grid.
1323 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(
1325 gsearch.StartFullSearch();
1326 ColPartition *part;
1327 while ((part = gsearch.NextFullSearch()) != nullptr) {
1328 if (part->inside_table_column() || part->type() != PT_TABLE) {
1329 continue; // prevent a partition to be assigned to multiple columns
1330 }
1331 const TBOX &box = part->bounding_box();
1332 auto *col = new ColSegment();
1333 col->InsertBox(box);
1334 part->set_inside_table_column(true);
1335 // Start a search below the current cell to find bottom neighbours
1336 // Note: a full search will always process things above it first, so
1337 // this should be starting at the highest cell and working its way down.
1338 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> vsearch(
1340 vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1341 ColPartition *neighbor = nullptr;
1342 bool found_neighbours = false;
1343 while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) {
1344 // only consider neighbors not assigned to any column yet
1345 if (neighbor->inside_table_column()) {
1346 continue;
1347 }
1348 // Horizontal lines should not break the flow
1349 if (neighbor->IsHorizontalLine()) {
1350 continue;
1351 }
1352 // presence of a non-table neighbor marks the end of current
1353 // table column
1354 if (neighbor->type() != PT_TABLE) {
1355 break;
1356 }
1357 // add the neighbor partition to the table column
1358 const TBOX &neighbor_box = neighbor->bounding_box();
1359 col->InsertBox(neighbor_box);
1360 neighbor->set_inside_table_column(true);
1361 found_neighbours = true;
1362 }
1363 if (found_neighbours) {
1364 it.add_after_then_move(col);
1365 } else {
1366 part->set_inside_table_column(false);
1367 delete col;
1368 }
1369 }
1370}

◆ GetTableRegions()

void tesseract::TableFinder::GetTableRegions ( ColSegment_LIST *  table_columns,
ColSegment_LIST *  table_regions 
)
protected

Definition at line 1374 of file tablefind.cpp.

1375 {
1376 ColSegment_IT cit(table_columns);
1377 ColSegment_IT rit(table_regions);
1378 // Iterate through column blocks
1379 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1380 &col_seg_grid_);
1381 gsearch.StartFullSearch();
1382 ColSegment *part;
1383 int page_height = tright().y() - bleft().y();
1384 ASSERT_HOST(page_height > 0);
1385 // create a bool array to hold projection on y-axis
1386 bool *table_region = new bool[page_height];
1387 while ((part = gsearch.NextFullSearch()) != nullptr) {
1388 const TBOX &part_box = part->bounding_box();
1389 // reset the projection array
1390 for (int i = 0; i < page_height; i++) {
1391 table_region[i] = false;
1392 }
1393 // iterate through all table columns to find regions in the current
1394 // page column block
1395 cit.move_to_first();
1396 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1397 TBOX col_box = cit.data()->bounding_box();
1398 // find intersection region of table column and page column
1399 TBOX intersection_box = col_box.intersection(part_box);
1400 // project table column on the y-axis
1401 for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1402 table_region[i - bleft().y()] = true;
1403 }
1404 }
1405 // set x-limits of table regions to page column width
1406 TBOX current_table_box;
1407 current_table_box.set_left(part_box.left());
1408 current_table_box.set_right(part_box.right());
1409 // go through the y-axis projection to find runs of table
1410 // regions. Each run makes one table region.
1411 for (int i = 1; i < page_height; i++) {
1412 // detect start of a table region
1413 if (!table_region[i - 1] && table_region[i]) {
1414 current_table_box.set_bottom(i + bleft().y());
1415 }
1416 // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1417 // detect end of a table region
1418 if (table_region[i - 1] && !table_region[i]) {
1419 current_table_box.set_top(i + bleft().y());
1420 if (!current_table_box.null_box()) {
1421 auto *seg = new ColSegment();
1422 seg->InsertBox(current_table_box);
1423 rit.add_after_then_move(seg);
1424 }
1425 }
1426 }
1427 }
1428 delete[] table_region;
1429}
const double y
TDimension y() const
access_function
Definition: points.h:62

◆ gridheight()

int tesseract::TableFinder::gridheight ( ) const
protected

Definition at line 385 of file tablefind.cpp.

385 {
387}
int gridheight() const
Definition: bbgrid.h:69

◆ GridMergeColumnBlocks()

void tesseract::TableFinder::GridMergeColumnBlocks ( )
protected

Definition at line 1238 of file tablefind.cpp.

1238 {
1239 int margin = gridsize();
1240
1241 // Iterate the Column Blocks in the grid.
1242 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1243 &col_seg_grid_);
1244 gsearch.StartFullSearch();
1245 ColSegment *seg;
1246 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1247 if (seg->type() != COL_TEXT) {
1248 continue; // only consider text blocks for split detection
1249 }
1250 bool neighbor_found = false;
1251 bool modified = false; // Modified at least once
1252 // keep expanding current box as long as neighboring table columns
1253 // are found above or below it.
1254 do {
1255 TBOX box = seg->bounding_box();
1256 // slightly expand the search region vertically
1257 int top_range =
1258 std::min(box.top() + margin, static_cast<int>(tright().y()));
1259 int bottom_range =
1260 std::max(box.bottom() - margin, static_cast<int>(bleft().y()));
1261 box.set_top(top_range);
1262 box.set_bottom(bottom_range);
1263 neighbor_found = false;
1264 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch(
1265 &col_seg_grid_);
1266 rectsearch.StartRectSearch(box);
1267 ColSegment *neighbor = nullptr;
1268 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1269 if (neighbor == seg) {
1270 continue;
1271 }
1272 const TBOX &neighbor_box = neighbor->bounding_box();
1273 // If the neighbor box significantly overlaps with the current
1274 // box (due to the expansion of the current box in the
1275 // previous iteration of this loop), remove the neighbor box
1276 // and expand the current box to include it.
1277 if (neighbor_box.overlap_fraction(box) >= 0.9) {
1278 seg->InsertBox(neighbor_box);
1279 modified = true;
1280 rectsearch.RemoveBBox();
1281 gsearch.RepositionIterator();
1282 delete neighbor;
1283 continue;
1284 }
1285 // Only expand if the neighbor box is of table type
1286 if (neighbor->type() != COL_TABLE) {
1287 continue;
1288 }
1289 // Insert the neighbor box into the current column block
1290 if (neighbor_box.major_x_overlap(box) && !box.contains(neighbor_box)) {
1291 seg->InsertBox(neighbor_box);
1292 neighbor_found = true;
1293 modified = true;
1294 rectsearch.RemoveBBox();
1295 gsearch.RepositionIterator();
1296 delete neighbor;
1297 }
1298 }
1299 } while (neighbor_found);
1300 if (modified) {
1301 // Because the box has changed, it has to be removed first.
1302 gsearch.RemoveBBox();
1303 col_seg_grid_.InsertBBox(true, true, seg);
1304 gsearch.RepositionIterator();
1305 }
1306 }
1307}
@ COL_TEXT
Definition: tablefind.h:29
@ COL_TABLE
Definition: tablefind.h:29

◆ GridMergeTableRegions()

void tesseract::TableFinder::GridMergeTableRegions ( )
protected

Definition at line 1437 of file tablefind.cpp.

1437 {
1438 // Iterate the table regions in the grid.
1439 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> gsearch(
1440 &table_grid_);
1441 gsearch.StartFullSearch();
1442 ColSegment *seg = nullptr;
1443 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1444 bool neighbor_found = false;
1445 bool modified = false; // Modified at least once
1446 do {
1447 // Start a rectangle search x-bounded by the image and y by the table
1448 const TBOX &box = seg->bounding_box();
1449 TBOX search_region(box);
1450 search_region.set_left(bleft().x());
1451 search_region.set_right(tright().x());
1452 neighbor_found = false;
1453 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> rectsearch(
1454 &table_grid_);
1455 rectsearch.StartRectSearch(search_region);
1456 ColSegment *neighbor = nullptr;
1457 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1458 if (neighbor == seg) {
1459 continue;
1460 }
1461 const TBOX &neighbor_box = neighbor->bounding_box();
1462 // Check if a neighbor box has a large overlap with the table
1463 // region. This may happen as a result of merging two table
1464 // regions in the previous iteration.
1465 if (neighbor_box.overlap_fraction(box) >= 0.9) {
1466 seg->InsertBox(neighbor_box);
1467 rectsearch.RemoveBBox();
1468 gsearch.RepositionIterator();
1469 delete neighbor;
1470 modified = true;
1471 continue;
1472 }
1473 // Check if two table regions belong together based on a common
1474 // horizontal ruling line
1475 if (BelongToOneTable(box, neighbor_box)) {
1476 seg->InsertBox(neighbor_box);
1477 neighbor_found = true;
1478 modified = true;
1479 rectsearch.RemoveBBox();
1480 gsearch.RepositionIterator();
1481 delete neighbor;
1482 }
1483 }
1484 } while (neighbor_found);
1485 if (modified) {
1486 // Because the box has changed, it has to be removed first.
1487 gsearch.RemoveBBox();
1488 table_grid_.InsertBBox(true, true, seg);
1489 gsearch.RepositionIterator();
1490 }
1491 }
1492}
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
Definition: tablefind.cpp:1496

◆ gridsize()

int tesseract::TableFinder::gridsize ( ) const
protected

Definition at line 379 of file tablefind.cpp.

379 {
380 return clean_part_grid_.gridsize();
381}
int gridsize() const
Definition: bbgrid.h:63

◆ gridwidth()

int tesseract::TableFinder::gridwidth ( ) const
protected

Definition at line 382 of file tablefind.cpp.

382 {
384}
int gridwidth() const
Definition: bbgrid.h:66

◆ GroupColumnBlocks()

void tesseract::TableFinder::GroupColumnBlocks ( ColSegment_LIST *  current_segments,
ColSegment_LIST *  col_segments 
)
protected

Definition at line 541 of file tablefind.cpp.

542 {
543 ColSegment_IT src_it(new_blocks);
544 ColSegment_IT dest_it(column_blocks);
545 // iterate through the source list
546 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
547 ColSegment *src_seg = src_it.data();
548 const TBOX &src_box = src_seg->bounding_box();
549 bool match_found = false;
550 // iterate through the destination list to find a matching column block
551 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
552 ColSegment *dest_seg = dest_it.data();
553 TBOX dest_box = dest_seg->bounding_box();
554 if (ConsecutiveBoxes(src_box, dest_box)) {
555 // If matching block is found, insert the current block into it
556 // and delete the source block.
557 dest_seg->InsertBox(src_box);
558 match_found = true;
559 delete src_it.extract();
560 break;
561 }
562 }
563 // If no match is found, just append the source block to column_blocks
564 if (!match_found) {
565 dest_it.add_after_then_move(src_it.extract());
566 }
567 }
568}
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
Definition: tablefind.cpp:571

◆ GrowTableBox()

void tesseract::TableFinder::GrowTableBox ( const TBOX table_box,
TBOX result_box 
)
protected

Definition at line 1574 of file tablefind.cpp.

1574 {
1575 // TODO(nbeato): The growing code is a bit excessive right now.
1576 // By removing these lines, the partitions considered need
1577 // to have some overlap or be special cases. These lines could
1578 // be added again once a check is put in place to make sure that
1579 // growing tables don't stomp on a lot of non-table partitions.
1580
1581 // search for horizontal ruling lines within the vertical margin
1582 // int vertical_margin = kRulingVerticalMargin * gridsize();
1583 TBOX search_box = table_box;
1584 // int top = MIN(search_box.top() + vertical_margin, tright().y());
1585 // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1586 // search_box.set_top(top);
1587 // search_box.set_bottom(bottom);
1588
1589 GrowTableToIncludePartials(table_box, search_box, result_box);
1590 GrowTableToIncludeLines(table_box, search_box, result_box);
1591 IncludeLeftOutColumnHeaders(result_box);
1592}
void IncludeLeftOutColumnHeaders(TBOX *table_box)
Definition: tablefind.cpp:1728
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1625
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1596

◆ GrowTableToIncludeLines()

void tesseract::TableFinder::GrowTableToIncludeLines ( const TBOX table_box,
const TBOX search_range,
TBOX result_box 
)
protected

Definition at line 1625 of file tablefind.cpp.

1627 {
1629 rsearch.SetUniqueMode(true);
1630 rsearch.StartRectSearch(search_range);
1631 ColPartition *part = nullptr;
1632 while ((part = rsearch.NextRectSearch()) != nullptr) {
1633 // TODO(nbeato) This should also do vertical, but column
1634 // boundaries are breaking things. This function needs to be
1635 // updated to allow vertical lines as well.
1636 if (!part->IsLineType()) {
1637 continue;
1638 }
1639 // Avoid the following function call if the result of the
1640 // function is irrelevant.
1641 const TBOX &part_box = part->bounding_box();
1642 if (result_box->contains(part_box)) {
1643 continue;
1644 }
1645 // Include a partially overlapping horizontal line only if the
1646 // extra ColPartitions that will be included due to expansion
1647 // have large side spacing w.r.t. columns containing them.
1648 if (HLineBelongsToTable(*part, table_box)) {
1649 *result_box = result_box->bounding_union(part_box);
1650 }
1651 // TODO(nbeato): Vertical
1652 }
1653}
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
Definition: tablefind.cpp:1658

◆ GrowTableToIncludePartials()

void tesseract::TableFinder::GrowTableToIncludePartials ( const TBOX table_box,
const TBOX search_range,
TBOX result_box 
)
protected

Definition at line 1596 of file tablefind.cpp.

1598 {
1599 // Rulings are in a different grid, so search 2 grids for rulings, text,
1600 // and table partitions that are not entirely within the new box.
1601 for (int i = 0; i < 2; ++i) {
1602 ColPartitionGrid *grid =
1604 ColPartitionGridSearch rectsearch(grid);
1605 rectsearch.StartRectSearch(search_range);
1606 ColPartition *part = nullptr;
1607 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1608 // Only include text and table types.
1609 if (part->IsImageType()) {
1610 continue;
1611 }
1612 const TBOX &part_box = part->bounding_box();
1613 // Include partition in the table if more than half of it
1614 // is covered by the table
1615 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1616 *result_box = result_box->bounding_union(part_box);
1617 continue;
1618 }
1619 }
1620 }
1621}

◆ HasLeaderAdjacent()

bool tesseract::TableFinder::HasLeaderAdjacent ( const ColPartition part)
protected

Definition at line 969 of file tablefind.cpp.

969 {
970 if (part.flow() == BTFT_LEADER) {
971 return true;
972 }
973 // Search range is left and right bounded by an offset of the
974 // median xheight. This offset is to allow some tolerance to the
975 // the leaders on the page in the event that the alignment is still
976 // a bit off.
977 const TBOX &box = part.bounding_box();
979 const int top = box.top() + search_size;
980 const int bottom = box.bottom() - search_size;
982 for (int direction = 0; direction < 2; ++direction) {
983 bool right_to_left = (direction == 0);
984 int x = right_to_left ? box.right() : box.left();
985 hsearch.StartSideSearch(x, bottom, top);
986 ColPartition *leader = nullptr;
987 while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) {
988 // The leader could be a horizontal ruling in the grid.
989 // Make sure it is actually a leader.
990 if (leader->flow() != BTFT_LEADER) {
991 continue;
992 }
993 // This should not happen, they are in different grids.
994 ASSERT_HOST(&part != leader);
995 // Make sure the leader shares a page column with the partition,
996 // otherwise we are spreading across columns.
997 if (!part.IsInSameColumnAs(*leader)) {
998 break;
999 }
1000 // There should be a significant vertical overlap
1001 if (!leader->VSignificantCoreOverlap(part)) {
1002 continue;
1003 }
1004 // Leader passed all tests, so it is adjacent.
1005 return true;
1006 }
1007 }
1008 // No leaders are adjacent to the given partition.
1009 return false;
1010}
const int kAdjacentLeaderSearchPadding
Definition: tablefind.cpp:117

◆ HasWideOrNoInterWordGap()

bool tesseract::TableFinder::HasWideOrNoInterWordGap ( ColPartition part) const
protected

Definition at line 875 of file tablefind.cpp.

875 {
876 // Should only get text partitions.
877 ASSERT_HOST(part->IsTextType());
878 // Blob access
879 BLOBNBOX_CLIST *part_boxes = part->boxes();
880 BLOBNBOX_C_IT it(part_boxes);
881 // Check if this is a relatively small partition (such as a single word)
882 if (part->bounding_box().width() <
883 kMinBoxesInTextPartition * part->median_height() &&
884 part_boxes->length() < kMinBoxesInTextPartition) {
885 return true;
886 }
887
888 // Variables used to compute inter-blob spacing.
889 int current_x0 = -1;
890 int current_x1 = -1;
891 int previous_x1 = -1;
892 // Stores the maximum gap detected.
893 int largest_partition_gap_found = -1;
894 // Text partition gap limits. If this is text (and not a table),
895 // there should be at least one gap larger than min_gap and no gap
896 // larger than max_gap.
897 const double max_gap = kMaxGapInTextPartition * part->median_height();
898 const double min_gap = kMinMaxGapInTextPartition * part->median_height();
899
900 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
901 BLOBNBOX *blob = it.data();
902 current_x0 = blob->bounding_box().left();
903 current_x1 = blob->bounding_box().right();
904 if (previous_x1 != -1) {
905 int gap = current_x0 - previous_x1;
906
907 // TODO(nbeato): Boxes may overlap? Huh?
908 // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
909 // on the top right of the page are filtered out with this line.
910 // Note 2: Iterating over blobs in a partition, so we are looking for
911 // spacing between the words.
912 if (gap < 0) {
913 // More likely case, the blobs slightly overlap. This can happen
914 // with diacritics (accents) or broken alphabet symbols (characters).
915 // Merge boxes together by taking max of right sides.
916 if (-gap < part->median_height() * kMaxBlobOverlapFactor) {
917 previous_x1 = std::max(previous_x1, current_x1);
918 continue;
919 }
920 // Extreme case, blobs overlap significantly in the same partition...
921 // This should not happen often (if at all), but it does.
922 // TODO(nbeato): investigate cases when this happens.
923 else {
924 // The behavior before was to completely ignore this case.
925 }
926 }
927
928 // If a large enough gap is found, mark it as a table cell (return true)
929 if (gap > max_gap) {
930 return true;
931 }
932 if (gap > largest_partition_gap_found) {
933 largest_partition_gap_found = gap;
934 }
935 }
936 previous_x1 = current_x1;
937 }
938 // Since no large gap was found, return false if the partition is too
939 // long to be a data cell
940 if (part->bounding_box().width() >
941 kMaxBoxesInDataPartition * part->median_height() ||
942 part_boxes->length() > kMaxBoxesInDataPartition) {
943 return false;
944 }
945
946 // A partition may be a single blob. In this case, it's an isolated symbol
947 // or non-text (such as a ruling or image).
948 // Detect these as table partitions? Shouldn't this be case by case?
949 // The behavior before was to ignore this, making max_partition_gap < 0
950 // and implicitly return true. Just making it explicit.
951 if (largest_partition_gap_found == -1) {
952 return true;
953 }
954
955 // return true if the maximum gap found is smaller than the minimum allowed
956 // max_gap in a text partition. This indicates that there is no significant
957 // space in the partition, hence it is likely a single word.
958 return largest_partition_gap_found < min_gap;
959}
const double kMinMaxGapInTextPartition
Definition: tablefind.cpp:73
const int kMinBoxesInTextPartition
Definition: tablefind.cpp:63
const double kMaxGapInTextPartition
Definition: tablefind.cpp:69
const int kMaxBoxesInDataPartition
Definition: tablefind.cpp:66
const double kMaxBlobOverlapFactor
Definition: tablefind.cpp:77

◆ HLineBelongsToTable()

bool tesseract::TableFinder::HLineBelongsToTable ( const ColPartition part,
const TBOX table_box 
)
protected

Definition at line 1658 of file tablefind.cpp.

1659 {
1660 if (!part.IsHorizontalLine()) {
1661 return false;
1662 }
1663 const TBOX &part_box = part.bounding_box();
1664 if (!part_box.major_x_overlap(table_box)) {
1665 return false;
1666 }
1667 // Do not consider top-most horizontal line since it usually
1668 // originates from noise.
1669 // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1670 // have neighbors solved.
1671 // if (!part.nearest_neighbor_above())
1672 // return false;
1673 const TBOX bbox = part_box.bounding_union(table_box);
1674 // In the "unioned table" box (the table extents expanded by the line),
1675 // keep track of how many partitions have significant padding to the left
1676 // and right. If more than half of the partitions covered by the new table
1677 // have significant spacing, the line belongs to the table and the table
1678 // grows to include all of the partitions.
1679 int num_extra_partitions = 0;
1680 int extra_space_to_right = 0;
1681 int extra_space_to_left = 0;
1682 // Rulings are in a different grid, so search 2 grids for rulings, text,
1683 // and table partitions that are introduced by the new box.
1684 for (int i = 0; i < 2; ++i) {
1685 ColPartitionGrid *grid =
1687 // Start a rect search on bbox
1688 ColPartitionGridSearch rectsearch(grid);
1689 rectsearch.SetUniqueMode(true);
1690 rectsearch.StartRectSearch(bbox);
1691 ColPartition *extra_part = nullptr;
1692 while ((extra_part = rectsearch.NextRectSearch()) != nullptr) {
1693 // ColPartition already in table
1694 const TBOX &extra_part_box = extra_part->bounding_box();
1695 if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1696 continue;
1697 }
1698 // Non-text ColPartitions do not contribute
1699 if (extra_part->IsImageType()) {
1700 continue;
1701 }
1702 // Consider this partition.
1703 num_extra_partitions++;
1704 // presence of a table cell is a strong hint, so just increment the scores
1705 // without looking at the spacing.
1706 if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1707 extra_space_to_right++;
1708 extra_space_to_left++;
1709 continue;
1710 }
1711 int space_threshold = kSideSpaceMargin * part.median_height();
1712 if (extra_part->space_to_right() > space_threshold) {
1713 extra_space_to_right++;
1714 }
1715 if (extra_part->space_to_left() > space_threshold) {
1716 extra_space_to_left++;
1717 }
1718 }
1719 }
1720 // tprintf("%d %d %d\n",
1721 // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1722 return (extra_space_to_right > num_extra_partitions / 2) ||
1723 (extra_space_to_left > num_extra_partitions / 2);
1724}
const int kSideSpaceMargin
Definition: tablefind.cpp:102

◆ IncludeLeftOutColumnHeaders()

void tesseract::TableFinder::IncludeLeftOutColumnHeaders ( TBOX table_box)
protected

Definition at line 1728 of file tablefind.cpp.

1728 {
1729 // Start a search above the current table to look for column headers
1731 vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1732 table_box->top());
1733 ColPartition *neighbor = nullptr;
1734 ColPartition *previous_neighbor = nullptr;
1735 while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) {
1736 // Max distance to find a table heading.
1737 const int max_distance =
1738 kMaxColumnHeaderDistance * neighbor->median_height();
1739 int table_top = table_box->top();
1740 const TBOX &box = neighbor->bounding_box();
1741 // Do not continue if the next box is way above
1742 if (box.bottom() - table_top > max_distance) {
1743 break;
1744 }
1745 // Unconditionally include partitions of type TABLE or LINE
1746 // TODO(faisal): add some reasonable conditions here
1747 if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1748 table_box->set_top(box.top());
1749 previous_neighbor = nullptr;
1750 continue;
1751 }
1752 // If there are two text partitions, one above the other, without a table
1753 // cell on their left or right side, consider them a barrier and quit
1754 if (previous_neighbor == nullptr) {
1755 previous_neighbor = neighbor;
1756 } else {
1757 const TBOX &previous_box = previous_neighbor->bounding_box();
1758 if (!box.major_y_overlap(previous_box)) {
1759 break;
1760 }
1761 }
1762 }
1763}
const int kMaxColumnHeaderDistance
Definition: tablefind.cpp:85

◆ Init()

void tesseract::TableFinder::Init ( int  grid_size,
const ICOORD bottom_left,
const ICOORD top_right 
)

Definition at line 181 of file tablefind.cpp.

182 {
183 // Initialize clean partitions list and grid
184 clean_part_grid_.Init(grid_size, bottom_left, top_right);
185 leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
186 fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
187 col_seg_grid_.Init(grid_size, bottom_left, top_right);
188 table_grid_.Init(grid_size, bottom_left, top_right);
189}
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:488

◆ InitializePartitions()

void tesseract::TableFinder::InitializePartitions ( ColPartitionSet **  all_columns)
protected

Definition at line 582 of file tablefind.cpp.

582 {
586}
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:716
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
Definition: tablefind.cpp:589

◆ InsertCleanPartitions()

void tesseract::TableFinder::InsertCleanPartitions ( ColPartitionGrid grid,
TO_BLOCK block 
)

Definition at line 193 of file tablefind.cpp.

194 {
195 // Calculate stats. This lets us filter partitions in AllowTextPartition()
196 // and filter blobs in AllowBlob().
197 SetGlobalSpacings(grid);
198
199 // Iterate the ColPartitions in the grid.
200 ColPartitionGridSearch gsearch(grid);
201 gsearch.SetUniqueMode(true);
202 gsearch.StartFullSearch();
203 ColPartition *part = nullptr;
204 while ((part = gsearch.NextFullSearch()) != nullptr) {
205 // Reject partitions with nothing useful inside of them.
206 if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0) {
207 continue;
208 }
209 ColPartition *clean_part = part->ShallowCopy();
210 ColPartition *leader_part = nullptr;
211 if (part->IsLineType()) {
212 InsertRulingPartition(clean_part);
213 continue;
214 }
215 // Insert all non-text partitions to clean_parts
216 if (!part->IsTextType()) {
217 InsertImagePartition(clean_part);
218 continue;
219 }
220 // Insert text colpartitions after removing noisy components from them
221 // The leaders are split into a separate grid.
222 BLOBNBOX_CLIST *part_boxes = part->boxes();
223 BLOBNBOX_C_IT pit(part_boxes);
224 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
225 BLOBNBOX *pblob = pit.data();
226 // Bad blobs... happens in UNLV set.
227 // news.3G1, page 17 (around x=6)
228 if (!AllowBlob(*pblob)) {
229 continue;
230 }
231 if (pblob->flow() == BTFT_LEADER) {
232 if (leader_part == nullptr) {
233 leader_part = part->ShallowCopy();
234 leader_part->set_flow(BTFT_LEADER);
235 }
236 leader_part->AddBox(pblob);
237 } else if (pblob->region_type() != BRT_NOISE) {
238 clean_part->AddBox(pblob);
239 }
240 }
241 clean_part->ComputeLimits();
242 ColPartition *fragmented = clean_part->CopyButDontOwnBlobs();
243 InsertTextPartition(clean_part);
245 if (leader_part != nullptr) {
246 // TODO(nbeato): Note that ComputeLimits does not update the column
247 // information. So the leader may appear to span more columns than it
248 // really does later on when IsInSameColumnAs gets called to test
249 // for adjacent leaders.
250 leader_part->ComputeLimits();
251 InsertLeaderPartition(leader_part);
252 }
253 }
254
255 // Make the partition partners better for upper and lower neighbors.
258}
@ BRT_NOISE
Definition: blobbox.h:75
void RefinePartitionPartners(bool get_desperate)
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:437
void InsertLeaderPartition(ColPartition *part)
Definition: tablefind.cpp:411
void InsertRulingPartition(ColPartition *part)
Definition: tablefind.cpp:419
bool AllowBlob(const BLOBNBOX &blob) const
Definition: tablefind.cpp:503
void InsertTextPartition(ColPartition *part)
Definition: tablefind.cpp:395
void InsertImagePartition(ColPartition *part)
Definition: tablefind.cpp:422

◆ InsertFragmentedTextPartition()

void tesseract::TableFinder::InsertFragmentedTextPartition ( ColPartition part)
protected

Definition at line 403 of file tablefind.cpp.

403 {
404 ASSERT_HOST(part != nullptr);
405 if (AllowTextPartition(*part)) {
406 fragmented_text_grid_.InsertBBox(true, true, part);
407 } else {
408 delete part;
409 }
410}
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:490

◆ InsertImagePartition()

void tesseract::TableFinder::InsertImagePartition ( ColPartition part)
protected

Definition at line 422 of file tablefind.cpp.

422 {
423 // NOTE: If images are placed into a different grid in the future,
424 // the function SetPartitionSpacings needs to be updated. It should
425 // be the only thing that cares about image partitions.
426 clean_part_grid_.InsertBBox(true, true, part);
427}

◆ InsertLeaderPartition()

void tesseract::TableFinder::InsertLeaderPartition ( ColPartition part)
protected

Definition at line 411 of file tablefind.cpp.

411 {
412 ASSERT_HOST(part != nullptr);
413 if (!part->IsEmpty() && part->bounding_box().area() > 0) {
414 leader_and_ruling_grid_.InsertBBox(true, true, part);
415 } else {
416 delete part;
417 }
418}

◆ InsertRulingPartition()

void tesseract::TableFinder::InsertRulingPartition ( ColPartition part)
protected

Definition at line 419 of file tablefind.cpp.

419 {
420 leader_and_ruling_grid_.InsertBBox(true, true, part);
421}

◆ InsertTextPartition()

void tesseract::TableFinder::InsertTextPartition ( ColPartition part)
protected

Definition at line 395 of file tablefind.cpp.

395 {
396 ASSERT_HOST(part != nullptr);
397 if (AllowTextPartition(*part)) {
398 clean_part_grid_.InsertBBox(true, true, part);
399 } else {
400 delete part;
401 }
402}

◆ LocateTables()

void tesseract::TableFinder::LocateTables ( ColPartitionGrid grid,
ColPartitionSet **  columns,
WidthCallback  width_cb,
const FCOORD reskew 
)

Definition at line 261 of file tablefind.cpp.

263 {
264 // initialize spacing, neighbors, and columns
265 InitializePartitions(all_columns);
266
267#ifndef GRAPHICS_DISABLED
268 if (textord_show_tables) {
269 ScrollView *table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
275
276 table_win = MakeWindow(100, 300, "Fragmented Text");
278 }
279#endif // !GRAPHICS_DISABLED
280
281 // mark, filter, and smooth candidate table partitions
283
284 // Make single-column blocks from good_columns_ partitions. col_segments are
285 // moved to a grid later which takes the ownership
286 ColSegment_LIST column_blocks;
287 GetColumnBlocks(all_columns, &column_blocks);
288 // Set the ratio of candidate table partitions in each column
289 SetColumnsType(&column_blocks);
290
291 // Move column segments to col_seg_grid_
292 MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
293
294 // Detect split in column layout that might have occurred due to the
295 // presence of a table. In such a case, merge the corresponding columns.
297
298 // Group horizontally overlapping table partitions into table columns.
299 // table_columns created here get deleted at the end of this method.
300 ColSegment_LIST table_columns;
301 GetTableColumns(&table_columns);
302
303 // Within each column, mark the range table regions occupy based on the
304 // table columns detected. table_regions are moved to a grid later which
305 // takes the ownership
306 ColSegment_LIST table_regions;
307 GetTableRegions(&table_columns, &table_regions);
308
309#ifndef GRAPHICS_DISABLED
310 if (textord_tablefind_show_mark) {
311 ScrollView *table_win = MakeWindow(1200, 300, "Table Columns and Regions");
312 DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
313 DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
314 }
315#endif // !GRAPHICS_DISABLED
316
317 // Merge table regions across columns for tables spanning multiple
318 // columns
319 MoveColSegmentsToGrid(&table_regions, &table_grid_);
321
322 // Adjust table boundaries by including nearby horizontal lines and left
323 // out column headers
326
327 if (textord_tablefind_recognize_tables) {
328 // Remove false alarms consisting of a single column
330
331#ifndef GRAPHICS_DISABLED
332 if (textord_show_tables) {
333 ScrollView *table_win = MakeWindow(1200, 300, "Detected Table Locations");
335 DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
336 table_grid_.DisplayBoxes(table_win);
337 }
338#endif // !GRAPHICS_DISABLED
339
340 // Find table grid structure and reject tables that are malformed.
344
345#ifndef GRAPHICS_DISABLED
346 if (textord_show_tables) {
347 ScrollView *table_win = MakeWindow(1400, 600, "Recognized Tables");
350 table_grid_.DisplayBoxes(table_win);
351 }
352#endif // !GRAPHICS_DISABLED
353 } else {
354 // Remove false alarms consisting of a single column
355 // TODO(nbeato): verify this is a NOP after structured table rejection.
356 // Right now it isn't. If the recognize function is doing what it is
357 // supposed to do, this function is obsolete.
359
360#ifndef GRAPHICS_DISABLED
361 if (textord_show_tables) {
362 ScrollView *table_win = MakeWindow(1500, 300, "Detected Tables");
365 table_grid_.DisplayBoxes(table_win);
366 }
367#endif // !GRAPHICS_DISABLED
368 }
369
370 // Merge all colpartitions in table regions to make them a single
371 // colpartition and revert types of isolated table cells not
372 // assigned to any table to their original types.
373 MakeTableBlocks(grid, all_columns, width_cb);
374}
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:649
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
Definition: tablefind.cpp:1950
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:519
void GetTableColumns(ColSegment_LIST *table_columns)
Definition: tablefind.cpp:1320
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, const WidthCallback &width_cb)
Definition: tablefind.cpp:2046
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
Definition: tablefind.cpp:1374
void InitializePartitions(ColPartitionSet **all_columns)
Definition: tablefind.cpp:582
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:525
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
Definition: tablefind.cpp:1219
void SetColumnsType(ColSegment_LIST *col_segments)
Definition: tablefind.cpp:1186
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
Definition: tablefind.cpp:2001

◆ MakeTableBlocks()

void tesseract::TableFinder::MakeTableBlocks ( ColPartitionGrid grid,
ColPartitionSet **  columns,
const WidthCallback width_cb 
)
protected

Definition at line 2046 of file tablefind.cpp.

2048 {
2049 // Since we have table blocks already, remove table tags from all
2050 // colpartitions
2051 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(grid);
2052 gsearch.StartFullSearch();
2053 ColPartition *part = nullptr;
2054
2055 while ((part = gsearch.NextFullSearch()) != nullptr) {
2056 if (part->type() == PT_TABLE) {
2057 part->clear_table_type();
2058 }
2059 }
2060 // Now make a single colpartition out of each table block and remove
2061 // all colpartitions contained within a table
2062 GridSearch<ColSegment, ColSegment_CLIST, ColSegment_C_IT> table_search(
2063 &table_grid_);
2064 table_search.StartFullSearch();
2065 ColSegment *table;
2066 while ((table = table_search.NextFullSearch()) != nullptr) {
2067 const TBOX &table_box = table->bounding_box();
2068 // Start a rect search on table_box
2069 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
2070 grid);
2071 rectsearch.StartRectSearch(table_box);
2072 ColPartition *part;
2073 ColPartition *table_partition = nullptr;
2074 while ((part = rectsearch.NextRectSearch()) != nullptr) {
2075 // Do not consider image partitions
2076 if (!part->IsTextType()) {
2077 continue;
2078 }
2079 TBOX part_box = part->bounding_box();
2080 // Include partition in the table if more than half of it
2081 // is covered by the table
2082 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2083 rectsearch.RemoveBBox();
2084 if (table_partition) {
2085 table_partition->Absorb(part, width_cb);
2086 } else {
2087 table_partition = part;
2088 }
2089 }
2090 }
2091 // Insert table colpartition back to part_grid_
2092 if (table_partition) {
2093 // To match the columns used when transforming to blocks, the new table
2094 // partition must have its first and last column set at the grid y that
2095 // corresponds to its bottom.
2096 const TBOX &table_box = table_partition->bounding_box();
2097 int grid_x, grid_y;
2098 grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2099 table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2100 table_partition->set_table_type();
2101 table_partition->set_blob_type(BRT_TEXT);
2102 table_partition->set_flow(BTFT_CHAIN);
2103 table_partition->SetBlobTypes();
2104 grid->InsertBBox(true, true, table_partition);
2105 }
2106 }
2107}
@ BRT_TEXT
Definition: blobbox.h:82
@ BTFT_CHAIN
Definition: blobbox.h:114

◆ MakeWindow()

ScrollView * tesseract::TableFinder::MakeWindow ( int  x,
int  y,
const char *  window_name 
)
protected

Definition at line 519 of file tablefind.cpp.

519 {
520 return clean_part_grid_.MakeWindow(x, y, window_name);
521}
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:633

◆ MarkPartitionsUsingLocalInformation()

void tesseract::TableFinder::MarkPartitionsUsingLocalInformation ( )
protected

Definition at line 844 of file tablefind.cpp.

844 {
845 // Iterate the ColPartitions in the grid.
846 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> gsearch(
848 gsearch.StartFullSearch();
849 ColPartition *part = nullptr;
850 while ((part = gsearch.NextFullSearch()) != nullptr) {
851 if (!part->IsTextType()) { // Only consider text partitions
852 continue;
853 }
854 // Only consider partitions in dominant font size or smaller
855 if (part->median_height() > kMaxTableCellXheight * global_median_xheight_) {
856 continue;
857 }
858 // Mark partitions with a large gap, or no significant gap as
859 // table partitions.
860 // Comments: It produces several false alarms at:
861 // - last line of a paragraph (fixed)
862 // - single word section headings
863 // - page headers and footers
864 // - numbered equations
865 // - line drawing regions
866 // TODO(faisal): detect and fix above-mentioned cases
867 if (HasWideOrNoInterWordGap(part) || HasLeaderAdjacent(*part)) {
868 part->set_table_type();
869 }
870 }
871}
const double kMaxTableCellXheight
Definition: tablefind.cpp:81
bool HasWideOrNoInterWordGap(ColPartition *part) const
Definition: tablefind.cpp:875
bool HasLeaderAdjacent(const ColPartition &part)
Definition: tablefind.cpp:969

◆ MarkTablePartitions()

void tesseract::TableFinder::MarkTablePartitions ( )
protected

Definition at line 798 of file tablefind.cpp.

798 {
800#ifndef GRAPHICS_DISABLED
801 if (textord_tablefind_show_mark) {
802 ScrollView *table_win = MakeWindow(300, 300, "Initial Table Partitions");
806 }
807#endif
809#ifndef GRAPHICS_DISABLED
810 if (textord_tablefind_show_mark) {
811 ScrollView *table_win = MakeWindow(600, 300, "Filtered Table Partitions");
815 }
816#endif
818#ifndef GRAPHICS_DISABLED
819 if (textord_tablefind_show_mark) {
820 ScrollView *table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
824 }
825#endif
827#ifndef GRAPHICS_DISABLED
828 if (textord_tablefind_show_mark || textord_show_tables) {
829 ScrollView *table_win = MakeWindow(900, 300, "Final Table Partitions");
833 }
834#endif
835}
void MarkPartitionsUsingLocalInformation()
Definition: tablefind.cpp:844

◆ MoveColSegmentsToGrid()

void tesseract::TableFinder::MoveColSegmentsToGrid ( ColSegment_LIST *  segments,
ColSegmentGrid col_seg_grid 
)
protected

Definition at line 1219 of file tablefind.cpp.

1220 {
1221 ColSegment_IT it(segments);
1222 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1223 ColSegment *seg = it.extract();
1224 col_seg_grid->InsertBBox(true, true, seg);
1225 }
1226}

◆ RecognizeTables()

void tesseract::TableFinder::RecognizeTables ( )
protected

Definition at line 1890 of file tablefind.cpp.

1890 {
1891#ifndef GRAPHICS_DISABLED
1892 ScrollView *table_win = nullptr;
1893 if (textord_show_tables) {
1894 table_win = MakeWindow(0, 0, "Table Structure");
1897 // table_grid_.DisplayBoxes(table_win);
1898 }
1899#endif
1900
1901 TableRecognizer recognizer;
1902 recognizer.Init();
1903 recognizer.set_line_grid(&leader_and_ruling_grid_);
1904 recognizer.set_text_grid(&fragmented_text_grid_);
1905 recognizer.set_max_text_height(global_median_xheight_ * 2.0);
1906 recognizer.set_min_height(1.5 * gridheight());
1907 // Loop over all of the tables and try to fit them.
1908 // Store the good tables here.
1909 ColSegment_CLIST good_tables;
1910 ColSegment_C_IT good_it(&good_tables);
1911
1913 gsearch.StartFullSearch();
1914 ColSegment *found_table = nullptr;
1915 while ((found_table = gsearch.NextFullSearch()) != nullptr) {
1916 gsearch.RemoveBBox();
1917
1918 // The goal is to make the tables persistent in a list.
1919 // When that happens, this will move into the search loop.
1920 const TBOX &found_box = found_table->bounding_box();
1921 StructuredTable *table_structure = recognizer.RecognizeTable(found_box);
1922
1923 // Process a table. Good tables are inserted into the grid again later on
1924 // We can't change boxes in the grid while it is running a search.
1925 if (table_structure != nullptr) {
1926#ifndef GRAPHICS_DISABLED
1927 if (textord_show_tables) {
1928 table_structure->Display(table_win, ScrollView::LIME_GREEN);
1929 }
1930#endif
1931 found_table->set_bounding_box(table_structure->bounding_box());
1932 delete table_structure;
1933 good_it.add_after_then_move(found_table);
1934 } else {
1935 delete found_table;
1936 }
1937 }
1938 // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1939
1940 // At this point, the grid is empty. We can safely insert the good tables
1941 // back into grid.
1942 for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) {
1943 table_grid_.InsertBBox(true, true, good_it.extract());
1944 }
1945}

◆ set_global_median_blob_width()

void tesseract::TableFinder::set_global_median_blob_width ( int  width)
protected

Definition at line 766 of file tablefind.cpp.

766 {
768}

◆ set_global_median_ledding()

void tesseract::TableFinder::set_global_median_ledding ( int  ledding)
protected

Definition at line 769 of file tablefind.cpp.

769 {
770 global_median_ledding_ = ledding;
771}

◆ set_global_median_xheight()

void tesseract::TableFinder::set_global_median_xheight ( int  xheight)
protected

Definition at line 763 of file tablefind.cpp.

763 {
764 global_median_xheight_ = xheight;
765}

◆ set_left_to_right_language()

void tesseract::TableFinder::set_left_to_right_language ( bool  order)

Definition at line 177 of file tablefind.cpp.

177 {
179}

◆ set_resolution()

void tesseract::TableFinder::set_resolution ( int  resolution)
inline

Definition at line 128 of file tablefind.h.

128 {
129 resolution_ = resolution;
130 }

◆ SetColumnsType()

void tesseract::TableFinder::SetColumnsType ( ColSegment_LIST *  col_segments)
protected

Definition at line 1186 of file tablefind.cpp.

1186 {
1187 ColSegment_IT it(column_blocks);
1188 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1189 ColSegment *seg = it.data();
1190 TBOX box = seg->bounding_box();
1191 int num_table_cells = 0;
1192 int num_text_cells = 0;
1193 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rsearch(
1195 rsearch.SetUniqueMode(true);
1196 rsearch.StartRectSearch(box);
1197 ColPartition *part = nullptr;
1198 while ((part = rsearch.NextRectSearch()) != nullptr) {
1199 if (part->type() == PT_TABLE) {
1200 num_table_cells++;
1201 } else if (part->type() == PT_FLOWING_TEXT) {
1202 num_text_cells++;
1203 }
1204 }
1205 // If a column block has no text or table partition in it, it is not needed
1206 // for table detection.
1207 if (!num_table_cells && !num_text_cells) {
1208 delete it.extract();
1209 } else {
1210 seg->set_num_table_cells(num_table_cells);
1211 seg->set_num_text_cells(num_text_cells);
1212 // set column type based on the ratio of table to text cells
1213 seg->set_type();
1214 }
1215 }
1216}

◆ SetGlobalSpacings()

void tesseract::TableFinder::SetGlobalSpacings ( ColPartitionGrid grid)
protected

Definition at line 716 of file tablefind.cpp.

716 {
717 STATS xheight_stats(0, kMaxVerticalSpacing);
718 STATS width_stats(0, kMaxBlobWidth);
719 STATS ledding_stats(0, kMaxVerticalSpacing);
720 // Iterate the ColPartitions in the grid.
721 ColPartitionGridSearch gsearch(grid);
722 gsearch.SetUniqueMode(true);
723 gsearch.StartFullSearch();
724 ColPartition *part = nullptr;
725 while ((part = gsearch.NextFullSearch()) != nullptr) {
726 // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
727 // ComputeLimits needs to get called somewhere outside of TableFinder
728 // to make sure the partitions are properly initialized.
729 // When this is called, SmoothPartitionPartners dies in an assert after
730 // table find runs. Alternative solution.
731 // part->ComputeLimits();
732 if (part->IsTextType()) {
733 // xheight_stats.add(part->median_height(), part->boxes_count());
734 // width_stats.add(part->median_width(), part->boxes_count());
735
736 // This loop can be removed when above issues are fixed.
737 // Replace it with the 2 lines commented out above.
738 BLOBNBOX_C_IT it(part->boxes());
739 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
740 xheight_stats.add(it.data()->bounding_box().height(), 1);
741 width_stats.add(it.data()->bounding_box().width(), 1);
742 }
743
744 ledding_stats.add(part->space_above(), 1);
745 ledding_stats.add(part->space_below(), 1);
746 }
747 }
748 // Set estimates based on median of statistics obtained
749 set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
750 set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
751 set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
752#ifndef GRAPHICS_DISABLED
753 if (textord_tablefind_show_stats) {
754 const char *kWindowName = "X-height (R), X-width (G), and ledding (B)";
755 ScrollView *stats_win = MakeWindow(500, 10, kWindowName);
756 xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
757 width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
758 ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
759 }
760#endif // !GRAPHICS_DISABLED
761}
const int kMaxVerticalSpacing
Definition: tablefind.cpp:38
const int kMaxBlobWidth
Definition: tablefind.cpp:40
void set_global_median_blob_width(int width)
Definition: tablefind.cpp:766
void set_global_median_xheight(int xheight)
Definition: tablefind.cpp:763
void set_global_median_ledding(int ledding)
Definition: tablefind.cpp:769

◆ SetPartitionSpacings()

void tesseract::TableFinder::SetPartitionSpacings ( ColPartitionGrid grid,
ColPartitionSet **  all_columns 
)
staticprotected

Definition at line 589 of file tablefind.cpp.

590 {
591 // Iterate the ColPartitions in the grid.
592 ColPartitionGridSearch gsearch(grid);
593 gsearch.StartFullSearch();
594 ColPartition *part = nullptr;
595 while ((part = gsearch.NextFullSearch()) != nullptr) {
596 ColPartitionSet *columns = all_columns[gsearch.GridY()];
597 TBOX box = part->bounding_box();
598 int y = part->MidY();
599 ColPartition *left_column = columns->ColumnContaining(box.left(), y);
600 ColPartition *right_column = columns->ColumnContaining(box.right(), y);
601 // set distance from left column as space to the left
602 if (left_column) {
603 int left_space = std::max(0, box.left() - left_column->LeftAtY(y));
604 part->set_space_to_left(left_space);
605 }
606 // set distance from right column as space to the right
607 if (right_column) {
608 int right_space = std::max(0, right_column->RightAtY(y) - box.right());
609 part->set_space_to_right(right_space);
610 }
611
612 // Look for images that may be closer.
613 // NOTE: used to be part_grid_, might cause issues now
614 ColPartitionGridSearch hsearch(grid);
615 hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
616 ColPartition *neighbor = nullptr;
617 while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) {
618 if (neighbor->type() == PT_PULLOUT_IMAGE ||
619 neighbor->type() == PT_FLOWING_IMAGE ||
620 neighbor->type() == PT_HEADING_IMAGE) {
621 int right = neighbor->bounding_box().right();
622 if (right < box.left()) {
623 int space = std::min(box.left() - right, part->space_to_left());
624 part->set_space_to_left(space);
625 }
626 }
627 }
628 hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
629 neighbor = nullptr;
630 while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) {
631 if (neighbor->type() == PT_PULLOUT_IMAGE ||
632 neighbor->type() == PT_FLOWING_IMAGE ||
633 neighbor->type() == PT_HEADING_IMAGE) {
634 int left = neighbor->bounding_box().left();
635 if (left > box.right()) {
636 int space = std::min(left - box.right(), part->space_to_right());
637 part->set_space_to_right(space);
638 }
639 }
640 }
641
642 ColPartition *upper_part = part->SingletonPartner(true);
643 if (upper_part) {
644 int space =
645 std::max(0, static_cast<int>(upper_part->bounding_box().bottom() -
646 part->bounding_box().bottom()));
647 part->set_space_above(space);
648 } else {
649 // TODO(nbeato): What constitutes a good value?
650 // 0 is the default value when not set, explicitly noting it needs to
651 // be something else.
652 part->set_space_above(INT32_MAX);
653 }
654
655 ColPartition *lower_part = part->SingletonPartner(false);
656 if (lower_part) {
657 int space =
658 std::max(0, static_cast<int>(part->bounding_box().bottom() -
659 lower_part->bounding_box().bottom()));
660 part->set_space_below(space);
661 } else {
662 // TODO(nbeato): What constitutes a good value?
663 // 0 is the default value when not set, explicitly noting it needs to
664 // be something else.
665 part->set_space_below(INT32_MAX);
666 }
667 }
668}
@ PT_PULLOUT_IMAGE
Definition: publictypes.h:63
@ PT_HEADING_IMAGE
Definition: publictypes.h:62
@ PT_FLOWING_IMAGE
Definition: publictypes.h:61

◆ SetVerticalSpacing()

void tesseract::TableFinder::SetVerticalSpacing ( ColPartition part)
protected

Definition at line 671 of file tablefind.cpp.

671 {
672 TBOX box = part->bounding_box();
673 int top_range =
674 std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y()));
675 int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing,
676 static_cast<int>(bleft().y()));
677 box.set_top(top_range);
678 box.set_bottom(bottom_range);
679
680 TBOX part_box = part->bounding_box();
681 // Start a rect search
682 GridSearch<ColPartition, ColPartition_CLIST, ColPartition_C_IT> rectsearch(
684 rectsearch.StartRectSearch(box);
685 ColPartition *neighbor;
686 int min_space_above = kMaxVerticalSpacing;
687 int min_space_below = kMaxVerticalSpacing;
688 ColPartition *above_neighbor = nullptr;
689 ColPartition *below_neighbor = nullptr;
690 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
691 if (neighbor == part) {
692 continue;
693 }
694 TBOX neighbor_box = neighbor->bounding_box();
695 if (neighbor_box.major_x_overlap(part_box)) {
696 int gap = abs(part->median_bottom() - neighbor->median_bottom());
697 // If neighbor is below current partition
698 if (neighbor_box.top() < part_box.bottom() && gap < min_space_below) {
699 min_space_below = gap;
700 below_neighbor = neighbor;
701 } // If neighbor is above current partition
702 else if (part_box.top() < neighbor_box.bottom() &&
703 gap < min_space_above) {
704 min_space_above = gap;
705 above_neighbor = neighbor;
706 }
707 }
708 }
709 part->set_space_above(min_space_above);
710 part->set_space_below(min_space_below);
711 part->set_nearest_neighbor_above(above_neighbor);
712 part->set_nearest_neighbor_below(below_neighbor);
713}

◆ SmoothTablePartitionRuns()

void tesseract::TableFinder::SmoothTablePartitionRuns ( )
protected

Definition at line 1147 of file tablefind.cpp.

1147 {
1148 // Iterate the ColPartitions in the grid.
1150 gsearch.StartFullSearch();
1151 ColPartition *part = nullptr;
1152 while ((part = gsearch.NextFullSearch()) != nullptr) {
1153 if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN) {
1154 continue; // Consider only text partitions
1155 }
1156 ColPartition *upper_part = part->nearest_neighbor_above();
1157 ColPartition *lower_part = part->nearest_neighbor_below();
1158 if (!upper_part || !lower_part) {
1159 continue;
1160 }
1161 if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE) {
1162 part->set_table_type();
1163 }
1164 }
1165
1166 // Pass 2, do the opposite. If both the upper and lower neighbors
1167 // exist and are not tables, this probably shouldn't be a table.
1168 gsearch.StartFullSearch();
1169 part = nullptr;
1170 while ((part = gsearch.NextFullSearch()) != nullptr) {
1171 if (part->type() != PT_TABLE) {
1172 continue; // Consider only text partitions
1173 }
1174 ColPartition *upper_part = part->nearest_neighbor_above();
1175 ColPartition *lower_part = part->nearest_neighbor_below();
1176
1177 // table can't be by itself
1178 if ((upper_part && upper_part->type() != PT_TABLE) &&
1179 (lower_part && lower_part->type() != PT_TABLE)) {
1180 part->clear_table_type();
1181 }
1182 }
1183}

◆ SplitAndInsertFragmentedTextPartition()

void tesseract::TableFinder::SplitAndInsertFragmentedTextPartition ( ColPartition part)
protected

Definition at line 437 of file tablefind.cpp.

437 {
438 ASSERT_HOST(part != nullptr);
439 // Bye bye empty partitions!
440 if (part->boxes()->empty()) {
441 delete part;
442 return;
443 }
444
445 // The AllowBlob function prevents this.
446 ASSERT_HOST(part->median_width() > 0);
447 const double kThreshold = part->median_width() * kSplitPartitionSize;
448
449 ColPartition *right_part = part;
450 bool found_split = true;
451 while (found_split) {
452 found_split = false;
453 BLOBNBOX_C_IT box_it(right_part->boxes());
454 // Blobs are sorted left side first. If blobs overlap,
455 // the previous blob may have a "more right" right side.
456 // Account for this by always keeping the largest "right"
457 // so far.
458 int previous_right = INT32_MIN;
459
460 // Look for the next split in the partition.
461 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
462 const TBOX &box = box_it.data()->bounding_box();
463 if (previous_right != INT32_MIN &&
464 box.left() - previous_right > kThreshold) {
465 // We have a split position. Split the partition in two pieces.
466 // Insert the left piece in the grid and keep processing the right.
467 int mid_x = (box.left() + previous_right) / 2;
468 ColPartition *left_part = right_part;
469 right_part = left_part->SplitAt(mid_x);
470
472 found_split = true;
473 break;
474 }
475
476 // The right side of the previous blobs.
477 previous_right = std::max(previous_right, static_cast<int>(box.right()));
478 }
479 }
480 // When a split is not found, the right part is minimized
481 // as much as possible, so process it.
483}
const double kSplitPartitionSize
Definition: tablefind.cpp:44
void InsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:403

◆ tright()

const ICOORD & tesseract::TableFinder::tright ( ) const
protected

Definition at line 391 of file tablefind.cpp.

391 {
392 return clean_part_grid_.tright();
393}
const ICOORD & tright() const
Definition: bbgrid.h:75

Member Data Documentation

◆ clean_part_grid_

ColPartitionGrid tesseract::TableFinder::clean_part_grid_
protected

Definition at line 395 of file tablefind.h.

◆ col_seg_grid_

ColSegmentGrid tesseract::TableFinder::col_seg_grid_
protected

Definition at line 403 of file tablefind.h.

◆ fragmented_text_grid_

ColPartitionGrid tesseract::TableFinder::fragmented_text_grid_
protected

Definition at line 401 of file tablefind.h.

◆ global_median_blob_width_

int tesseract::TableFinder::global_median_blob_width_
protected

Definition at line 389 of file tablefind.h.

◆ global_median_ledding_

int tesseract::TableFinder::global_median_ledding_
protected

Definition at line 391 of file tablefind.h.

◆ global_median_xheight_

int tesseract::TableFinder::global_median_xheight_
protected

Definition at line 387 of file tablefind.h.

◆ leader_and_ruling_grid_

ColPartitionGrid tesseract::TableFinder::leader_and_ruling_grid_
protected

Definition at line 397 of file tablefind.h.

◆ left_to_right_language_

bool tesseract::TableFinder::left_to_right_language_
protected

Definition at line 407 of file tablefind.h.

◆ resolution_

int tesseract::TableFinder::resolution_
protected

Definition at line 385 of file tablefind.h.

◆ table_grid_

ColSegmentGrid tesseract::TableFinder::table_grid_
protected

Definition at line 405 of file tablefind.h.


The documentation for this class was generated from the following files: