20# include "config_auto.h"
28#include <allheaders.h>
143#ifndef GRAPHICS_DISABLED
// File-local boolean flags for the table finder. Each is declared through
// BOOL_VAR(name, val, comment) with `false` as the value argument; the string
// is the flag's human-readable description.

// Checked before the table-related ScrollView windows ("Fragmented Text",
// "Detected Tables", "Table Structure") are created further down this file.
144static BOOL_VAR(textord_show_tables,
false,
"Show table regions (ScrollView)");
// Checked at the individual table-marking stages later in this file.
145static BOOL_VAR(textord_tablefind_show_mark,
false,
146 "Debug table marking steps in detail (ScrollView)");
// Checked before the "X-height (R), X-width (G), and ledding (B)" stats window
// name is set up.
147static BOOL_VAR(textord_tablefind_show_stats,
false,
148 "Show page stats used in table finding (ScrollView)");
// Checked once in the table-location code path; per its description it turns
// on the table recognizer used for layout and filtering.
150static BOOL_VAR(textord_tablefind_recognize_tables,
false,
151 "Enables the table recognizer for table layout and filtering.");
162 global_median_xheight_(0),
163 global_median_blob_width_(0),
164 global_median_ledding_(0),
165 left_to_right_language_(true) {}
182 const ICOORD &top_right) {
222 BLOBNBOX_CLIST *part_boxes = part->
boxes();
223 BLOBNBOX_C_IT pit(part_boxes);
224 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
232 if (leader_part ==
nullptr) {
236 leader_part->
AddBox(pblob);
238 clean_part->
AddBox(pblob);
245 if (leader_part !=
nullptr) {
267#ifndef GRAPHICS_DISABLED
268 if (textord_show_tables) {
276 table_win =
MakeWindow(100, 300,
"Fragmented Text");
286 ColSegment_LIST column_blocks;
300 ColSegment_LIST table_columns;
306 ColSegment_LIST table_regions;
309#ifndef GRAPHICS_DISABLED
310 if (textord_tablefind_show_mark) {
327 if (textord_tablefind_recognize_tables) {
331#ifndef GRAPHICS_DISABLED
332 if (textord_show_tables) {
345#ifndef GRAPHICS_DISABLED
346 if (textord_show_tables) {
360#ifndef GRAPHICS_DISABLED
361 if (textord_show_tables) {
362 ScrollView *table_win =
MakeWindow(1500, 300,
"Detected Tables");
440 if (part->
boxes()->empty()) {
450 bool found_split =
true;
451 while (found_split) {
453 BLOBNBOX_C_IT box_it(right_part->
boxes());
458 int previous_right = INT32_MIN;
461 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
462 const TBOX &box = box_it.data()->bounding_box();
463 if (previous_right != INT32_MIN &&
464 box.
left() - previous_right > kThreshold) {
467 int mid_x = (box.
left() + previous_right) / 2;
469 right_part = left_part->
SplitAt(mid_x);
477 previous_right = std::max(previous_right,
static_cast<int>(box.
right()));
510 return box.
height() > kHeightRequired && box.
width() > kWidthRequired &&
511 box.
area() > kAreaRequired;
518#ifndef GRAPHICS_DISABLED
526 ColSegment_LIST *column_blocks) {
529 if (columns !=
nullptr) {
530 ColSegment_LIST new_blocks;
542 ColSegment_LIST *column_blocks) {
543 ColSegment_IT src_it(new_blocks);
544 ColSegment_IT dest_it(column_blocks);
546 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
549 bool match_found =
false;
551 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
559 delete src_it.extract();
565 dest_it.add_after_then_move(src_it.extract());
574 return (abs(b1.
left() - b2.
left()) < x_margin) &&
576 (abs(b1.
top() - b2.
bottom()) < y_margin ||
598 int y = part->
MidY();
603 int left_space = std::max(0, box.
left() - left_column->
LeftAtY(
y));
608 int right_space = std::max(0, right_column->
RightAtY(
y) - box.
right());
622 if (right < box.
left()) {
635 if (left > box.
right()) {
676 static_cast<int>(
bleft().
y()));
691 if (neighbor == part) {
698 if (neighbor_box.
top() < part_box.
bottom() && gap < min_space_below) {
699 min_space_below = gap;
700 below_neighbor = neighbor;
702 else if (part_box.
top() < neighbor_box.
bottom() &&
703 gap < min_space_above) {
704 min_space_above = gap;
705 above_neighbor = neighbor;
738 BLOBNBOX_C_IT it(part->
boxes());
739 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
740 xheight_stats.
add(it.data()->bounding_box().height(), 1);
741 width_stats.
add(it.data()->bounding_box().width(), 1);
752#ifndef GRAPHICS_DISABLED
753 if (textord_tablefind_show_stats) {
754 const char *kWindowName =
"X-height (R), X-width (G), and ledding (B)";
800#ifndef GRAPHICS_DISABLED
801 if (textord_tablefind_show_mark) {
809#ifndef GRAPHICS_DISABLED
810 if (textord_tablefind_show_mark) {
818#ifndef GRAPHICS_DISABLED
819 if (textord_tablefind_show_mark) {
827#ifndef GRAPHICS_DISABLED
828 if (textord_tablefind_show_mark || textord_show_tables) {
879 BLOBNBOX_CLIST *part_boxes = part->
boxes();
880 BLOBNBOX_C_IT it(part_boxes);
891 int previous_x1 = -1;
893 int largest_partition_gap_found = -1;
900 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
904 if (previous_x1 != -1) {
905 int gap = current_x0 - previous_x1;
917 previous_x1 = std::max(previous_x1, current_x1);
932 if (gap > largest_partition_gap_found) {
933 largest_partition_gap_found = gap;
936 previous_x1 = current_x1;
951 if (largest_partition_gap_found == -1) {
958 return largest_partition_gap_found < min_gap;
979 const int top = box.
top() + search_size;
980 const int bottom = box.
bottom() - search_size;
982 for (
int direction = 0; direction < 2; ++direction) {
983 bool right_to_left = (direction == 0);
984 int x = right_to_left ? box.
right() : box.
left();
987 while ((leader = hsearch.
NextSideSearch(right_to_left)) !=
nullptr) {
1053 int current_spacing = 0;
1054 int upper_spacing = 0;
1060 current_spacing = mid - left;
1061 upper_spacing = upper_mid - left;
1067 current_spacing = right - mid;
1068 upper_spacing = right - upper_mid;
1114 int max_top = INT32_MIN;
1115 int min_bottom = INT32_MAX;
1125 if (top > max_top) {
1129 if (bottom < min_bottom) {
1130 min_bottom = bottom;
1158 if (!upper_part || !lower_part) {
1187 ColSegment_IT it(column_blocks);
1188 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1191 int num_table_cells = 0;
1192 int num_text_cells = 0;
1207 if (!num_table_cells && !num_text_cells) {
1208 delete it.extract();
1221 ColSegment_IT it(segments);
1222 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1250 bool neighbor_found =
false;
1251 bool modified =
false;
1258 std::min(box.
top() + margin,
static_cast<int>(
tright().
y()));
1260 std::max(box.
bottom() - margin,
static_cast<int>(
bleft().
y()));
1263 neighbor_found =
false;
1269 if (neighbor == seg) {
1292 neighbor_found =
true;
1299 }
while (neighbor_found);
1321 ColSegment_IT it(table_columns);
1333 col->InsertBox(box);
1342 bool found_neighbours =
false;
1359 col->InsertBox(neighbor_box);
1361 found_neighbours =
true;
1363 if (found_neighbours) {
1364 it.add_after_then_move(col);
1375 ColSegment_LIST *table_regions) {
1376 ColSegment_IT cit(table_columns);
1377 ColSegment_IT rit(table_regions);
1386 bool *table_region =
new bool[page_height];
1390 for (
int i = 0;
i < page_height;
i++) {
1391 table_region[
i] =
false;
1395 cit.move_to_first();
1396 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1397 TBOX col_box = cit.data()->bounding_box();
1401 for (
int i = intersection_box.
bottom();
i < intersection_box.
top();
i++) {
1402 table_region[
i -
bleft().
y()] =
true;
1406 TBOX current_table_box;
1411 for (
int i = 1;
i < page_height;
i++) {
1413 if (!table_region[
i - 1] && table_region[
i]) {
1418 if (table_region[
i - 1] && !table_region[
i]) {
1420 if (!current_table_box.
null_box()) {
1422 seg->InsertBox(current_table_box);
1423 rit.add_after_then_move(seg);
1428 delete[] table_region;
1444 bool neighbor_found =
false;
1445 bool modified =
false;
1449 TBOX search_region(box);
1452 neighbor_found =
false;
1458 if (neighbor == seg) {
1477 neighbor_found =
true;
1484 }
while (neighbor_found);
1542 ColSegment_CLIST adjusted_tables;
1543 ColSegment_C_IT it(&adjusted_tables);
1549 TBOX grown_box = table_box;
1556 col->InsertBox(grown_box);
1557 it.add_after_then_move(col);
1568 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1583 TBOX search_box = table_box;
1597 const TBOX &search_range,
1601 for (
int i = 0;
i < 2; ++
i) {
1626 const TBOX &search_range,
1642 if (result_box->
contains(part_box)) {
1659 const TBOX &table_box) {
1679 int num_extra_partitions = 0;
1680 int extra_space_to_right = 0;
1681 int extra_space_to_left = 0;
1684 for (
int i = 0;
i < 2; ++
i) {
1703 num_extra_partitions++;
1707 extra_space_to_right++;
1708 extra_space_to_left++;
1713 extra_space_to_right++;
1716 extra_space_to_left++;
1722 return (extra_space_to_right > num_extra_partitions / 2) ||
1723 (extra_space_to_left > num_extra_partitions / 2);
1737 const int max_distance =
1739 int table_top = table_box->
top();
1742 if (box.
bottom() - table_top > max_distance) {
1749 previous_neighbor =
nullptr;
1754 if (previous_neighbor ==
nullptr) {
1755 previous_neighbor = neighbor;
1773 int *table_xprojection =
new int[page_width];
1782 for (
int i = 0;
i < page_width;
i++) {
1783 table_xprojection[
i] = 0;
1803 BLOBNBOX_CLIST *part_boxes = part->
boxes();
1804 BLOBNBOX_C_IT pit(part_boxes);
1811 int next_position_to_write = 0;
1813 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1820 xstart = std::max(xstart, next_position_to_write);
1821 for (
int i = xstart;
i < xend;
i++) {
1822 table_xprojection[
i -
bleft().
x()]++;
1824 next_position_to_write = xend;
1833 delete[] table_xprojection;
1841 for (
int i = 0;
i < length;
i++) {
1842 if (xprojection[
i] > peak_value) {
1843 peak_value = xprojection[
i];
1857 for (
int i = 0;
i < length;
i++) {
1858 xprojection[
i] = (xprojection[
i] >= projection_threshold) ? 1 : 0;
1861 int largest_gap = 0;
1863 for (
int i = 1;
i < length;
i++) {
1865 if (xprojection[
i - 1] && !xprojection[
i]) {
1869 if (run_start != -1 && !xprojection[
i - 1] && xprojection[
i]) {
1870 int gap =
i - run_start;
1871 if (gap > largest_gap) {
1891#ifndef GRAPHICS_DISABLED
1893 if (textord_show_tables) {
1894 table_win =
MakeWindow(0, 0,
"Table Structure");
1909 ColSegment_CLIST good_tables;
1910 ColSegment_C_IT good_it(&good_tables);
1925 if (table_structure !=
nullptr) {
1926#ifndef GRAPHICS_DISABLED
1927 if (textord_show_tables) {
1932 delete table_structure;
1933 good_it.add_after_then_move(found_table);
1942 for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) {
1947#ifndef GRAPHICS_DISABLED
1954 ColSegment_IT it(segments);
1955 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1958 int left_x = box.
left();
1959 int right_x = box.
right();
1960 int top_y = box.
top();
1961 int bottom_y = box.
bottom();
1962 win->
Rectangle(left_x, bottom_y, right_x, top_y);
1979 color = default_color;
1981 color = table_color;
1985 int left_x = box.
left();
1986 int right_x = box.
right();
1987 int top_y = box.
top();
1988 int bottom_y = box.
bottom();
1991 win->
Rectangle(left_x, bottom_y, right_x, top_y);
2010 int left_x = box.
left();
2011 int right_x = box.
right();
2012 int top_y = box.
top();
2013 int bottom_y = box.
bottom();
2018 int mid_x = (left_x + right_x) / 2;
2019 int mid_y = (top_y + bottom_y) / 2;
2020 int other_x = (upper_box.
left() + upper_box.
right()) / 2;
2021 int other_y = (upper_box.
top() + upper_box.
bottom()) / 2;
2024 win->
Line(mid_x, mid_y, other_x, other_y);
2029 int mid_x = (left_x + right_x) / 2;
2030 int mid_y = (top_y + bottom_y) / 2;
2031 int other_x = (lower_box.
left() + lower_box.
right()) / 2;
2032 int other_y = (lower_box.
top() + lower_box.
bottom()) / 2;
2035 win->
Line(mid_x, mid_y, other_x, other_y);
2084 if (table_partition) {
2085 table_partition->
Absorb(part, width_cb);
2087 table_partition = part;
2092 if (table_partition) {
2104 grid->
InsertBBox(
true,
true, table_partition);
2113 num_table_cells_(0),
2125 return kBoxColors[type_];
2138 }
else if (num_text_cells_ > num_table_cells_) {
#define BOOL_VAR(name, val, comment)
const int kMaxVerticalSpacing
const double kAllowBlobHeight
const double kMinOverlapWithTable
const double kMaxTableCellXheight
const double kMaxParagraphEndingLeftSpaceMultiple
const double kMinMaxGapInTextPartition
const double kLargeTableProjectionThreshold
const int kMinBoxesInTextPartition
const double kStrokeWidthFractionalTolerance
const int kMinRowsInTable
std::function< bool(int)> WidthCallback
const double kMinParagraphEndingTextToWhitespaceRatio
const double kMaxGapInTextPartition
void DeleteObject(T *object)
const double kTableColumnThreshold
const double kAllowBlobArea
const double kAllowTextArea
const int kAdjacentLeaderSearchPadding
const double kStrokeWidthConstantTolerance
const double kAllowTextWidth
const double kAllowTextHeight
const double kAllowBlobWidth
const int kMaxColumnHeaderDistance
const int kMaxBoxesInDataPartition
const double kParagraphEndingPreviousLineRatio
const int kLargeTableRowCount
const double kSmallTableProjectionThreshold
const int kSideSpaceMargin
const double kSplitPartitionSize
const double kMaxBlobOverlapFactor
const double kMaxXProjectionGapFactor
const TBOX & bounding_box() const
BlobRegionType region_type() const
BlobTextFlowType flow() const
TDimension y() const
access function
TDimension x() const
access function
bool major_y_overlap(const TBOX &box) const
TDimension height() const
TBOX bounding_union(const TBOX &box) const
TBOX intersection(const TBOX &box) const
bool major_x_overlap(const TBOX &box) const
bool overlap(const TBOX &box) const
TDimension bottom() const
bool contains(const FCOORD pt) const
double overlap_fraction(const TBOX &box) const
void add(int32_t value, int32_t count)
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
void StartVerticalSearch(int xmin, int xmax, int y)
void SetUniqueMode(bool mode)
BBC * NextSideSearch(bool right_to_left)
void StartSideSearch(int x, int ymin, int ymax)
BBC * NextVerticalSearch(bool top_to_bottom)
void RepositionIterator()
void StartRectSearch(const TBOX &rect)
const ICOORD & bleft() const
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
const ICOORD & tright() const
void DisplayBoxes(ScrollView *window)
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
void ClearGridData(void(*free_method)(BBC *))
ScrollView * MakeWindow(int x, int y, const char *window_name)
void set_space_to_left(int space)
BlobTextFlowType flow() const
bool MatchingStrokeWidth(const ColPartition &other, double fractional_tolerance, double constant_tolerance) const
bool IsHorizontalLine() const
PolyBlockType type() const
ColPartition * CopyButDontOwnBlobs()
ColPartition * SplitAt(int split_x)
int median_bottom() const
bool inside_table_column()
bool VSignificantCoreOverlap(const ColPartition &other) const
void set_nearest_neighbor_above(ColPartition *part)
BlobRegionType blob_type() const
void AddBox(BLOBNBOX *box)
void set_blob_type(BlobRegionType t)
const TBOX & bounding_box() const
void set_nearest_neighbor_below(ColPartition *part)
int space_to_left() const
ColPartition * nearest_neighbor_above() const
void set_space_above(int space)
void set_inside_table_column(bool val)
bool MatchingSizes(const ColPartition &other) const
void set_space_to_right(int space)
ColPartition * ShallowCopy() const
bool IsInSameColumnAs(const ColPartition &part) const
int space_to_right() const
ColPartition * SingletonPartner(bool upper)
void SetPartitionType(int resolution, ColPartitionSet *columns)
void set_space_below(int space)
int RightAtY(int y) const
void Absorb(ColPartition *other, const WidthCallback &cb)
ColPartition * nearest_neighbor_below() const
int median_height() const
void set_flow(BlobTextFlowType f)
void RefinePartitionPartners(bool get_desperate)
void FindPartitionPartners()
ColPartition * ColumnContaining(int x, int y)
void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments)
void InsertBox(const TBOX &other)
void set_bounding_box(const TBOX &other)
void set_num_table_cells(int n)
void set_num_text_cells(int n)
ScrollView::Color BoxColor() const
const TBOX & bounding_box() const
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
ScrollView * MakeWindow(int x, int y, const char *window_name)
int global_median_ledding_
void InsertFragmentedTextPartition(ColPartition *part)
void IncludeLeftOutColumnHeaders(TBOX *table_box)
void FilterHeaderAndFooter()
void Init(int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
void AdjustTableBoundaries()
const ICOORD & bleft() const
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
void GetTableColumns(ColSegment_LIST *table_columns)
const ICOORD & tright() const
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, const WidthCallback &width_cb)
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
void SetGlobalSpacings(ColPartitionGrid *grid)
bool left_to_right_language_
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
bool HasWideOrNoInterWordGap(ColPartition *part) const
void FilterParagraphEndings()
void InitializePartitions(ColPartitionSet **all_columns)
bool HasLeaderAdjacent(const ColPartition &part)
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
int global_median_blob_width_
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
void set_global_median_blob_width(int width)
void GridMergeColumnBlocks()
void MarkTablePartitions()
ColPartitionGrid leader_and_ruling_grid_
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
void InsertLeaderPartition(ColPartition *part)
bool GapInXProjection(int *xprojection, int length)
void set_global_median_xheight(int xheight)
void GridMergeTableRegions()
ColSegmentGrid col_seg_grid_
void InsertCleanPartitions(ColPartitionGrid *grid, TO_BLOCK *block)
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
void set_global_median_ledding(int ledding)
void InsertRulingPartition(ColPartition *part)
void set_left_to_right_language(bool order)
bool AllowBlob(const BLOBNBOX &blob) const
int global_median_xheight_
ColSegmentGrid table_grid_
void SetColumnsType(ColSegment_LIST *col_segments)
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
void DeleteSingleColumnTables()
ColPartitionGrid fragmented_text_grid_
void InsertTextPartition(ColPartition *part)
void SetVerticalSpacing(ColPartition *part)
void LocateTables(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback width_cb, const FCOORD &reskew)
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
void SmoothTablePartitionRuns()
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
void MarkPartitionsUsingLocalInformation()
ColPartitionGrid clean_part_grid_
void InsertImagePartition(ColPartition *part)
bool AllowTextPartition(const ColPartition &part) const
const TBOX & bounding_box() const
void Display(ScrollView *window, ScrollView::Color color)
void set_max_text_height(int height)
void set_line_grid(ColPartitionGrid *lines)
void set_text_grid(ColPartitionGrid *text)
void set_min_height(int height)
StructuredTable * RecognizeTable(const TBOX &guess_box)
void Line(int x1, int y1, int x2, int y2)
void Rectangle(int x1, int y1, int x2, int y2)