tesseract v5.3.3.20231005
tablefind.cpp
Go to the documentation of this file.
1
2// File: tablefind.cpp
3// Description: Helper classes to find tables from ColPartitions.
4// Author: Faisal Shafait (faisal.shafait@dfki.de)
5//
6// (C) Copyright 2009, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifdef HAVE_CONFIG_H
20# include "config_auto.h"
21#endif
22
23#include <algorithm>
24#include <cmath>
25#include <utility>
26#include "tablefind.h"
27
28#include <allheaders.h>
29
30#include "colpartitionset.h"
31#include "tablerecog.h"
32
33namespace tesseract {
34
35// These numbers are used to calculate the global median stats.
36// They just set an upper bound on the stats objects.
37// Maximum vertical spacing between neighbor partitions.
38const int kMaxVerticalSpacing = 500;
39// Maximum width of a blob in a partition.
40const int kMaxBlobWidth = 500;
41
42// Minimum whitespace size to split a partition (measured as a multiple
43// of a partition's median width).
44const double kSplitPartitionSize = 2.0;
45// To insert text, the partition must satisfy these size constraints
46// in AllowTextPartition(). The idea is to filter noise partitions
47// determined by the size compared to the global medians.
48// TODO(nbeato): Need to find good numbers again.
49const double kAllowTextHeight = 0.5;
50const double kAllowTextWidth = 0.6;
51const double kAllowTextArea = 0.8;
52// The same thing applies to blobs (to filter noise).
53// TODO(nbeato): These numbers are a shot in the dark...
54// height and width are 0.5 * gridsize() in colfind.cpp
55// area is a rough guess for the size of a period.
56const double kAllowBlobHeight = 0.3;
57const double kAllowBlobWidth = 0.4;
58const double kAllowBlobArea = 0.05;
59
60// Minimum number of components in a text partition. A partition having fewer
61// components than that is more likely a data partition and is a candidate
62// table cell.
64
65// Maximum number of components that a data partition can have
67
68// Maximum allowed gap in a text partitions as a multiple of its median size.
69const double kMaxGapInTextPartition = 4.0;
70
71// Minimum value that the maximum gap in a text partition should have as a
72// factor of its median size.
73const double kMinMaxGapInTextPartition = 0.5;
74
75// The amount of overlap that is "normal" for adjacent blobs in a text
76// partition. This is used to calculate gap between overlapping blobs.
77const double kMaxBlobOverlapFactor = 4.0;
78
79// Maximum x-height a table partition can have as a multiple of global
80// median x-height
81const double kMaxTableCellXheight = 2.0;
82
83// Maximum line spacing between a table column header and column contents
84// for merging the two (as a multiple of the partition's median_height).
86
87// Minimum ratio of num_table_partitions to num_text_partitions in a column
88// block to be called it a table column
89const double kTableColumnThreshold = 3.0;
90
91// Search for horizontal ruling lines within the vertical margin as a
92// multiple of grid size
93// const int kRulingVerticalMargin = 3;
94
95// Minimum overlap that a colpartition must have with a table region
96// to become part of that table
97const double kMinOverlapWithTable = 0.6;
98
99// Maximum side space (distance from column boundary) that a typical
100// text-line in flowing text should have as a multiple of its x-height
101// (Median size).
102const int kSideSpaceMargin = 10;
103
104// Fraction of the peak of x-projection of a table region to set the
105// threshold for the x-projection histogram
108// Minimum number of rows required to look for more rows in the projection.
110
111// Minimum number of rows in a table
112const int kMinRowsInTable = 3;
113
114// The amount of padding (multiplied by global_median_xheight_ during use)
115// that is vertically added to the search adjacent leader search during
116// ColPartition marking.
118
119// Used when filtering false positives. When finding the last line
120// of a paragraph (typically left-aligned), the previous line should have
121// its center to the right of the last line by this scaled amount.
123
124// The maximum amount of whitespace allowed left of a paragraph ending.
125// Do not filter a ColPartition with more than this space left of it.
127
128// Used when filtering false positives. The last line of a paragraph
129// should be preceded by a line that is predominantly text. This is the
130// ratio of text to whitespace (to the right of the text) that is required
131// for the previous line to be a text.
133
134// When counting table columns, this is the required gap between two columns
135// (it is multiplied by global_median_xheight_).
136const double kMaxXProjectionGapFactor = 2.0;
137
138// Used for similarity in partitions using stroke width. Values copied
139// from ColFind.cpp in Ray's CL.
142
143#ifndef GRAPHICS_DISABLED
144static BOOL_VAR(textord_show_tables, false, "Show table regions (ScrollView)");
145static BOOL_VAR(textord_tablefind_show_mark, false,
146 "Debug table marking steps in detail (ScrollView)");
147static BOOL_VAR(textord_tablefind_show_stats, false,
148 "Show page stats used in table finding (ScrollView)");
149#endif
150static BOOL_VAR(textord_tablefind_recognize_tables, false,
151 "Enables the table recognizer for table layout and filtering.");
152
153// Templated helper function used to create destructor callbacks for the
154// BBGrid::ClearGridData() method.
155template <typename T>
156void DeleteObject(T *object) {
157 delete object;
158}
159
161 : resolution_(0),
162 global_median_xheight_(0),
163 global_median_blob_width_(0),
164 global_median_ledding_(0),
165 left_to_right_language_(true) {}
166
168 // ColPartitions and ColSegments created by this class for storage in grids
169 // need to be deleted explicitly.
170 clean_part_grid_.ClearGridData(&DeleteObject<ColPartition>);
171 leader_and_ruling_grid_.ClearGridData(&DeleteObject<ColPartition>);
172 fragmented_text_grid_.ClearGridData(&DeleteObject<ColPartition>);
173 col_seg_grid_.ClearGridData(&DeleteObject<ColSegment>);
174 table_grid_.ClearGridData(&DeleteObject<ColSegment>);
175}
176
179}
180
181void TableFinder::Init(int grid_size, const ICOORD &bottom_left,
182 const ICOORD &top_right) {
183 // Initialize clean partitions list and grid
184 clean_part_grid_.Init(grid_size, bottom_left, top_right);
185 leader_and_ruling_grid_.Init(grid_size, bottom_left, top_right);
186 fragmented_text_grid_.Init(grid_size, bottom_left, top_right);
187 col_seg_grid_.Init(grid_size, bottom_left, top_right);
188 table_grid_.Init(grid_size, bottom_left, top_right);
189}
190
191// Copy cleaned partitions from part_grid_ to clean_part_grid_ and
192// insert leaders and rulers into the leader_and_ruling_grid_
194 TO_BLOCK *block) {
195 // Calculate stats. This lets us filter partitions in AllowTextPartition()
196 // and filter blobs in AllowBlob().
197 SetGlobalSpacings(grid);
198
199 // Iterate the ColPartitions in the grid.
200 ColPartitionGridSearch gsearch(grid);
201 gsearch.SetUniqueMode(true);
202 gsearch.StartFullSearch();
203 ColPartition *part = nullptr;
204 while ((part = gsearch.NextFullSearch()) != nullptr) {
205 // Reject partitions with nothing useful inside of them.
206 if (part->blob_type() == BRT_NOISE || part->bounding_box().area() <= 0) {
207 continue;
208 }
209 ColPartition *clean_part = part->ShallowCopy();
210 ColPartition *leader_part = nullptr;
211 if (part->IsLineType()) {
212 InsertRulingPartition(clean_part);
213 continue;
215 // Insert all non-text partitions to clean_parts
216 if (!part->IsTextType()) {
217 InsertImagePartition(clean_part);
218 continue;
220 // Insert text colpartitions after removing noisy components from them
221 // The leaders are split into a separate grid.
222 BLOBNBOX_CLIST *part_boxes = part->boxes();
223 BLOBNBOX_C_IT pit(part_boxes);
224 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
225 BLOBNBOX *pblob = pit.data();
226 // Bad blobs... happens in UNLV set.
227 // news.3G1, page 17 (around x=6)
228 if (!AllowBlob(*pblob)) {
229 continue;
230 }
231 if (pblob->flow() == BTFT_LEADER) {
232 if (leader_part == nullptr) {
233 leader_part = part->ShallowCopy();
234 leader_part->set_flow(BTFT_LEADER);
235 }
236 leader_part->AddBox(pblob);
237 } else if (pblob->region_type() != BRT_NOISE) {
238 clean_part->AddBox(pblob);
239 }
240 }
241 clean_part->ComputeLimits();
242 ColPartition *fragmented = clean_part->CopyButDontOwnBlobs();
243 InsertTextPartition(clean_part);
245 if (leader_part != nullptr) {
246 // TODO(nbeato): Note that ComputeLimits does not update the column
247 // information. So the leader may appear to span more columns than it
248 // really does later on when IsInSameColumnAs gets called to test
249 // for adjacent leaders.
250 leader_part->ComputeLimits();
251 InsertLeaderPartition(leader_part);
252 }
253 }
254
255 // Make the partition partners better for upper and lower neighbors.
258}
259
260// High level function to perform table detection
262 ColPartitionSet **all_columns,
263 WidthCallback width_cb, const FCOORD &reskew) {
264 // initialize spacing, neighbors, and columns
265 InitializePartitions(all_columns);
266
267#ifndef GRAPHICS_DISABLED
268 if (textord_show_tables) {
269 ScrollView *table_win = MakeWindow(0, 300, "Column Partitions & Neighbors");
275
276 table_win = MakeWindow(100, 300, "Fragmented Text");
278 }
279#endif // !GRAPHICS_DISABLED
280
281 // mark, filter, and smooth candidate table partitions
283
284 // Make single-column blocks from good_columns_ partitions. col_segments are
285 // moved to a grid later which takes the ownership
286 ColSegment_LIST column_blocks;
287 GetColumnBlocks(all_columns, &column_blocks);
288 // Set the ratio of candidate table partitions in each column
289 SetColumnsType(&column_blocks);
290
291 // Move column segments to col_seg_grid_
292 MoveColSegmentsToGrid(&column_blocks, &col_seg_grid_);
293
294 // Detect split in column layout that might have occurred due to the
295 // presence of a table. In such a case, merge the corresponding columns.
297
298 // Group horizontally overlapping table partitions into table columns.
299 // table_columns created here get deleted at the end of this method.
300 ColSegment_LIST table_columns;
301 GetTableColumns(&table_columns);
302
303 // Within each column, mark the range table regions occupy based on the
304 // table columns detected. table_regions are moved to a grid later which
305 // takes the ownership
306 ColSegment_LIST table_regions;
307 GetTableRegions(&table_columns, &table_regions);
308
309#ifndef GRAPHICS_DISABLED
310 if (textord_tablefind_show_mark) {
311 ScrollView *table_win = MakeWindow(1200, 300, "Table Columns and Regions");
312 DisplayColSegments(table_win, &table_columns, ScrollView::DARK_TURQUOISE);
313 DisplayColSegments(table_win, &table_regions, ScrollView::YELLOW);
314 }
315#endif // !GRAPHICS_DISABLED
316
317 // Merge table regions across columns for tables spanning multiple
318 // columns
319 MoveColSegmentsToGrid(&table_regions, &table_grid_);
321
322 // Adjust table boundaries by including nearby horizontal lines and left
323 // out column headers
326
327 if (textord_tablefind_recognize_tables) {
328 // Remove false alarms consisting of a single column
330
331#ifndef GRAPHICS_DISABLED
332 if (textord_show_tables) {
333 ScrollView *table_win = MakeWindow(1200, 300, "Detected Table Locations");
335 DisplayColSegments(table_win, &table_columns, ScrollView::KHAKI);
336 table_grid_.DisplayBoxes(table_win);
337 }
338#endif // !GRAPHICS_DISABLED
339
340 // Find table grid structure and reject tables that are malformed.
344
345#ifndef GRAPHICS_DISABLED
346 if (textord_show_tables) {
347 ScrollView *table_win = MakeWindow(1400, 600, "Recognized Tables");
350 table_grid_.DisplayBoxes(table_win);
351 }
352#endif // !GRAPHICS_DISABLED
353 } else {
354 // Remove false alarms consisting of a single column
355 // TODO(nbeato): verify this is a NOP after structured table rejection.
356 // Right now it isn't. If the recognize function is doing what it is
357 // supposed to do, this function is obsolete.
359
360#ifndef GRAPHICS_DISABLED
361 if (textord_show_tables) {
362 ScrollView *table_win = MakeWindow(1500, 300, "Detected Tables");
365 table_grid_.DisplayBoxes(table_win);
366 }
367#endif // !GRAPHICS_DISABLED
368 }
369
370 // Merge all colpartitions in table regions to make them a single
371 // colpartition and revert types of isolated table cells not
372 // assigned to any table to their original types.
373 MakeTableBlocks(grid, all_columns, width_cb);
374}
375// All grids have the same dimensions. The clean_part_grid_ sizes are set from
376// the part_grid_ that is passed to InsertCleanPartitions, which was the same as
377// the grid that is the base of ColumnFinder. Just return the clean_part_grid_
378// dimensions instead of duplicated memory.
380 return clean_part_grid_.gridsize();
381}
384}
387}
389 return clean_part_grid_.bleft();
390}
392 return clean_part_grid_.tright();
393}
394
396 ASSERT_HOST(part != nullptr);
397 if (AllowTextPartition(*part)) {
398 clean_part_grid_.InsertBBox(true, true, part);
399 } else {
400 delete part;
401 }
402}
404 ASSERT_HOST(part != nullptr);
405 if (AllowTextPartition(*part)) {
406 fragmented_text_grid_.InsertBBox(true, true, part);
407 } else {
408 delete part;
409 }
410}
412 ASSERT_HOST(part != nullptr);
413 if (!part->IsEmpty() && part->bounding_box().area() > 0) {
414 leader_and_ruling_grid_.InsertBBox(true, true, part);
415 } else {
416 delete part;
417 }
418}
420 leader_and_ruling_grid_.InsertBBox(true, true, part);
421}
423 // NOTE: If images are placed into a different grid in the future,
424 // the function SetPartitionSpacings needs to be updated. It should
425 // be the only thing that cares about image partitions.
426 clean_part_grid_.InsertBBox(true, true, part);
427}
428
429// Splits a partition into its "words". The splits happen
430// at locations with wide inter-blob spacing. This is useful
431// because it allows the table recognize to "cut through" the
432// text lines on the page. The assumption is that a table
433// will have several lines with similar overlapping whitespace
434// whereas text will not have this type of property.
435// Note: The code assumes that blobs are sorted by the left side x!
436// This will not work (as well) if the blobs are sorted by center/right.
438 ASSERT_HOST(part != nullptr);
439 // Bye bye empty partitions!
440 if (part->boxes()->empty()) {
441 delete part;
442 return;
443 }
444
445 // The AllowBlob function prevents this.
446 ASSERT_HOST(part->median_width() > 0);
447 const double kThreshold = part->median_width() * kSplitPartitionSize;
448
449 ColPartition *right_part = part;
450 bool found_split = true;
451 while (found_split) {
452 found_split = false;
453 BLOBNBOX_C_IT box_it(right_part->boxes());
454 // Blobs are sorted left side first. If blobs overlap,
455 // the previous blob may have a "more right" right side.
456 // Account for this by always keeping the largest "right"
457 // so far.
458 int previous_right = INT32_MIN;
459
460 // Look for the next split in the partition.
461 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
462 const TBOX &box = box_it.data()->bounding_box();
463 if (previous_right != INT32_MIN &&
464 box.left() - previous_right > kThreshold) {
465 // We have a split position. Split the partition in two pieces.
466 // Insert the left piece in the grid and keep processing the right.
467 int mid_x = (box.left() + previous_right) / 2;
468 ColPartition *left_part = right_part;
469 right_part = left_part->SplitAt(mid_x);
470
472 found_split = true;
473 break;
474 }
475
476 // The right side of the previous blobs.
477 previous_right = std::max(previous_right, static_cast<int>(box.right()));
478 }
479 }
480 // When a split is not found, the right part is minimized
481 // as much as possible, so process it.
483}
484
485// Some simple criteria to filter out now. We want to make sure the
486// average blob size in the partition is consistent with the
487// global page stats.
488// The area metric will almost always pass for multi-blob partitions.
489// It is useful when filtering out noise caused by an isolated blob.
491 const double kHeightRequired = global_median_xheight_ * kAllowTextHeight;
492 const double kWidthRequired = global_median_blob_width_ * kAllowTextWidth;
493 const int median_area = global_median_xheight_ * global_median_blob_width_;
494 const double kAreaPerBlobRequired = median_area * kAllowTextArea;
495 // Keep comparisons strictly greater to disallow 0!
496 return part.median_height() > kHeightRequired &&
497 part.median_width() > kWidthRequired &&
498 part.bounding_box().area() > kAreaPerBlobRequired * part.boxes_count();
499}
500
501// Same as above, applied to blobs. Keep in mind that
502// leaders, commas, and periods are important in tables.
503bool TableFinder::AllowBlob(const BLOBNBOX &blob) const {
504 const TBOX &box = blob.bounding_box();
505 const double kHeightRequired = global_median_xheight_ * kAllowBlobHeight;
506 const double kWidthRequired = global_median_blob_width_ * kAllowBlobWidth;
507 const int median_area = global_median_xheight_ * global_median_blob_width_;
508 const double kAreaRequired = median_area * kAllowBlobArea;
509 // Keep comparisons strictly greater to disallow 0!
510 return box.height() > kHeightRequired && box.width() > kWidthRequired &&
511 box.area() > kAreaRequired;
512}
513
514// TODO(nbeato): The grid that makes the window doesn't seem to matter.
515// The only downside is that window messages will be caught by
516// clean_part_grid_ instead of a useful object. This is a temporary solution
517// for the debug windows created by the TableFinder.
518#ifndef GRAPHICS_DISABLED
519ScrollView *TableFinder::MakeWindow(int x, int y, const char *window_name) {
520 return clean_part_grid_.MakeWindow(x, y, window_name);
521}
522#endif
523
524// Make single-column blocks from good_columns_ partitions.
526 ColSegment_LIST *column_blocks) {
527 for (int i = 0; i < gridheight(); ++i) {
528 ColPartitionSet *columns = all_columns[i];
529 if (columns != nullptr) {
530 ColSegment_LIST new_blocks;
531 // Get boxes from the current vertical position on the grid
532 columns->GetColumnBoxes(i * gridsize(), (i + 1) * gridsize(),
533 &new_blocks);
534 // Merge the new_blocks boxes into column_blocks if they are well-aligned
535 GroupColumnBlocks(&new_blocks, column_blocks);
536 }
537 }
538}
539
540// Merge column segments into the current list if they are well aligned.
541void TableFinder::GroupColumnBlocks(ColSegment_LIST *new_blocks,
542 ColSegment_LIST *column_blocks) {
543 ColSegment_IT src_it(new_blocks);
544 ColSegment_IT dest_it(column_blocks);
545 // iterate through the source list
546 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
547 ColSegment *src_seg = src_it.data();
548 const TBOX &src_box = src_seg->bounding_box();
549 bool match_found = false;
550 // iterate through the destination list to find a matching column block
551 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
552 ColSegment *dest_seg = dest_it.data();
553 TBOX dest_box = dest_seg->bounding_box();
554 if (ConsecutiveBoxes(src_box, dest_box)) {
555 // If matching block is found, insert the current block into it
556 // and delete the source block.
557 dest_seg->InsertBox(src_box);
558 match_found = true;
559 delete src_it.extract();
560 break;
561 }
562 }
563 // If no match is found, just append the source block to column_blocks
564 if (!match_found) {
565 dest_it.add_after_then_move(src_it.extract());
566 }
567 }
568}
569
570// are the two boxes immediate neighbors along the vertical direction
571bool TableFinder::ConsecutiveBoxes(const TBOX &b1, const TBOX &b2) {
572 int x_margin = 20;
573 int y_margin = 5;
574 return (abs(b1.left() - b2.left()) < x_margin) &&
575 (abs(b1.right() - b2.right()) < x_margin) &&
576 (abs(b1.top() - b2.bottom()) < y_margin ||
577 abs(b2.top() - b1.bottom()) < y_margin);
578}
579
580// Set up info for clean_part_grid_ partitions to be valid during detection
581// code.
586}
587
588// Set left, right and top, bottom spacings of each colpartition.
590 ColPartitionSet **all_columns) {
591 // Iterate the ColPartitions in the grid.
592 ColPartitionGridSearch gsearch(grid);
593 gsearch.StartFullSearch();
594 ColPartition *part = nullptr;
595 while ((part = gsearch.NextFullSearch()) != nullptr) {
596 ColPartitionSet *columns = all_columns[gsearch.GridY()];
597 TBOX box = part->bounding_box();
598 int y = part->MidY();
599 ColPartition *left_column = columns->ColumnContaining(box.left(), y);
600 ColPartition *right_column = columns->ColumnContaining(box.right(), y);
601 // set distance from left column as space to the left
602 if (left_column) {
603 int left_space = std::max(0, box.left() - left_column->LeftAtY(y));
604 part->set_space_to_left(left_space);
605 }
606 // set distance from right column as space to the right
607 if (right_column) {
608 int right_space = std::max(0, right_column->RightAtY(y) - box.right());
609 part->set_space_to_right(right_space);
610 }
611
612 // Look for images that may be closer.
613 // NOTE: used to be part_grid_, might cause issues now
614 ColPartitionGridSearch hsearch(grid);
615 hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
616 ColPartition *neighbor = nullptr;
617 while ((neighbor = hsearch.NextSideSearch(true)) != nullptr) {
618 if (neighbor->type() == PT_PULLOUT_IMAGE ||
619 neighbor->type() == PT_FLOWING_IMAGE ||
620 neighbor->type() == PT_HEADING_IMAGE) {
621 int right = neighbor->bounding_box().right();
622 if (right < box.left()) {
623 int space = std::min(box.left() - right, part->space_to_left());
624 part->set_space_to_left(space);
625 }
626 }
627 }
628 hsearch.StartSideSearch(box.left(), box.bottom(), box.top());
629 neighbor = nullptr;
630 while ((neighbor = hsearch.NextSideSearch(false)) != nullptr) {
631 if (neighbor->type() == PT_PULLOUT_IMAGE ||
632 neighbor->type() == PT_FLOWING_IMAGE ||
633 neighbor->type() == PT_HEADING_IMAGE) {
634 int left = neighbor->bounding_box().left();
635 if (left > box.right()) {
636 int space = std::min(left - box.right(), part->space_to_right());
637 part->set_space_to_right(space);
638 }
639 }
640 }
641
642 ColPartition *upper_part = part->SingletonPartner(true);
643 if (upper_part) {
644 int space =
645 std::max(0, static_cast<int>(upper_part->bounding_box().bottom() -
646 part->bounding_box().bottom()));
647 part->set_space_above(space);
648 } else {
649 // TODO(nbeato): What constitutes a good value?
650 // 0 is the default value when not set, explicitly noting it needs to
651 // be something else.
652 part->set_space_above(INT32_MAX);
653 }
654
655 ColPartition *lower_part = part->SingletonPartner(false);
656 if (lower_part) {
657 int space =
658 std::max(0, static_cast<int>(part->bounding_box().bottom() -
659 lower_part->bounding_box().bottom()));
660 part->set_space_below(space);
661 } else {
662 // TODO(nbeato): What constitutes a good value?
663 // 0 is the default value when not set, explicitly noting it needs to
664 // be something else.
665 part->set_space_below(INT32_MAX);
666 }
667 }
668}
669
670// Set spacing and closest neighbors above and below a given colpartition.
672 TBOX box = part->bounding_box();
673 int top_range =
674 std::min(box.top() + kMaxVerticalSpacing, static_cast<int>(tright().y()));
675 int bottom_range = std::max(box.bottom() - kMaxVerticalSpacing,
676 static_cast<int>(bleft().y()));
677 box.set_top(top_range);
678 box.set_bottom(bottom_range);
679
680 TBOX part_box = part->bounding_box();
681 // Start a rect search
684 rectsearch.StartRectSearch(box);
685 ColPartition *neighbor;
686 int min_space_above = kMaxVerticalSpacing;
687 int min_space_below = kMaxVerticalSpacing;
688 ColPartition *above_neighbor = nullptr;
689 ColPartition *below_neighbor = nullptr;
690 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
691 if (neighbor == part) {
692 continue;
693 }
694 TBOX neighbor_box = neighbor->bounding_box();
695 if (neighbor_box.major_x_overlap(part_box)) {
696 int gap = abs(part->median_bottom() - neighbor->median_bottom());
697 // If neighbor is below current partition
698 if (neighbor_box.top() < part_box.bottom() && gap < min_space_below) {
699 min_space_below = gap;
700 below_neighbor = neighbor;
701 } // If neighbor is above current partition
702 else if (part_box.top() < neighbor_box.bottom() &&
703 gap < min_space_above) {
704 min_space_above = gap;
705 above_neighbor = neighbor;
706 }
707 }
708 }
709 part->set_space_above(min_space_above);
710 part->set_space_below(min_space_below);
711 part->set_nearest_neighbor_above(above_neighbor);
712 part->set_nearest_neighbor_below(below_neighbor);
713}
714
715// Set global spacing and x-height estimates
717 STATS xheight_stats(0, kMaxVerticalSpacing);
718 STATS width_stats(0, kMaxBlobWidth);
719 STATS ledding_stats(0, kMaxVerticalSpacing);
720 // Iterate the ColPartitions in the grid.
721 ColPartitionGridSearch gsearch(grid);
722 gsearch.SetUniqueMode(true);
723 gsearch.StartFullSearch();
724 ColPartition *part = nullptr;
725 while ((part = gsearch.NextFullSearch()) != nullptr) {
726 // TODO(nbeato): HACK HACK HACK! medians are equal to partition length.
727 // ComputeLimits needs to get called somewhere outside of TableFinder
728 // to make sure the partitions are properly initialized.
729 // When this is called, SmoothPartitionPartners dies in an assert after
730 // table find runs. Alternative solution.
731 // part->ComputeLimits();
732 if (part->IsTextType()) {
733 // xheight_stats.add(part->median_height(), part->boxes_count());
734 // width_stats.add(part->median_width(), part->boxes_count());
735
736 // This loop can be removed when above issues are fixed.
737 // Replace it with the 2 lines commented out above.
738 BLOBNBOX_C_IT it(part->boxes());
739 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
740 xheight_stats.add(it.data()->bounding_box().height(), 1);
741 width_stats.add(it.data()->bounding_box().width(), 1);
742 }
743
744 ledding_stats.add(part->space_above(), 1);
745 ledding_stats.add(part->space_below(), 1);
746 }
747 }
748 // Set estimates based on median of statistics obtained
749 set_global_median_xheight(static_cast<int>(xheight_stats.median() + 0.5));
750 set_global_median_blob_width(static_cast<int>(width_stats.median() + 0.5));
751 set_global_median_ledding(static_cast<int>(ledding_stats.median() + 0.5));
752#ifndef GRAPHICS_DISABLED
753 if (textord_tablefind_show_stats) {
754 const char *kWindowName = "X-height (R), X-width (G), and ledding (B)";
755 ScrollView *stats_win = MakeWindow(500, 10, kWindowName);
756 xheight_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::RED);
757 width_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::GREEN);
758 ledding_stats.plot(stats_win, 10, 200, 2, 15, ScrollView::BLUE);
759 }
760#endif // !GRAPHICS_DISABLED
761}
762
764 global_median_xheight_ = xheight;
765}
768}
770 global_median_ledding_ = ledding;
771}
772
775 gsearch.StartFullSearch();
776 ColPartition *part = nullptr;
777 while ((part = gsearch.NextFullSearch()) != nullptr) {
778 // TODO(nbeato): Rename this function, meaning is different now.
779 // IT is finding nearest neighbors its own way
780 // SetVerticalSpacing(part);
781
782 ColPartition *upper = part->SingletonPartner(true);
783 if (upper) {
784 part->set_nearest_neighbor_above(upper);
785 }
786
787 ColPartition *lower = part->SingletonPartner(false);
788 if (lower) {
789 part->set_nearest_neighbor_below(lower);
790 }
791 }
792}
793
794// High level interface. Input is an unmarked ColPartitionGrid
795// (namely, clean_part_grid_). Partitions are identified using local
796// information and filter/smoothed. The function exit should contain
797// a good sampling of the table partitions.
800#ifndef GRAPHICS_DISABLED
801 if (textord_tablefind_show_mark) {
802 ScrollView *table_win = MakeWindow(300, 300, "Initial Table Partitions");
806 }
807#endif
809#ifndef GRAPHICS_DISABLED
810 if (textord_tablefind_show_mark) {
811 ScrollView *table_win = MakeWindow(600, 300, "Filtered Table Partitions");
815 }
816#endif
818#ifndef GRAPHICS_DISABLED
819 if (textord_tablefind_show_mark) {
820 ScrollView *table_win = MakeWindow(900, 300, "Smoothed Table Partitions");
824 }
825#endif
827#ifndef GRAPHICS_DISABLED
828 if (textord_tablefind_show_mark || textord_show_tables) {
829 ScrollView *table_win = MakeWindow(900, 300, "Final Table Partitions");
833 }
834#endif
835}
836
837// These types of partitions are marked as table partitions:
838// 1- Partitions that have at lease one large gap between words
839// 2- Partitions that consist of only one word (no significant gap
840// between components)
841// 3- Partitions that vertically overlap with other partitions within the
842// same column.
843// 4- Partitions with leaders before/after them.
845 // Iterate the ColPartitions in the grid.
848 gsearch.StartFullSearch();
849 ColPartition *part = nullptr;
850 while ((part = gsearch.NextFullSearch()) != nullptr) {
851 if (!part->IsTextType()) { // Only consider text partitions
852 continue;
853 }
854 // Only consider partitions in dominant font size or smaller
856 continue;
857 }
858 // Mark partitions with a large gap, or no significant gap as
859 // table partitions.
860 // Comments: It produces several false alarms at:
861 // - last line of a paragraph (fixed)
862 // - single word section headings
863 // - page headers and footers
864 // - numbered equations
865 // - line drawing regions
866 // TODO(faisal): detect and fix above-mentioned cases
867 if (HasWideOrNoInterWordGap(part) || HasLeaderAdjacent(*part)) {
868 part->set_table_type();
869 }
870 }
871}
872
873// Check if the partition has at least one large gap between words or no
874// significant gap at all
876 // Should only get text partitions.
877 ASSERT_HOST(part->IsTextType());
878 // Blob access
879 BLOBNBOX_CLIST *part_boxes = part->boxes();
880 BLOBNBOX_C_IT it(part_boxes);
881 // Check if this is a relatively small partition (such as a single word)
882 if (part->bounding_box().width() <
884 part_boxes->length() < kMinBoxesInTextPartition) {
885 return true;
886 }
887
888 // Variables used to compute inter-blob spacing.
889 int current_x0 = -1;
890 int current_x1 = -1;
891 int previous_x1 = -1;
892 // Stores the maximum gap detected.
893 int largest_partition_gap_found = -1;
894 // Text partition gap limits. If this is text (and not a table),
895 // there should be at least one gap larger than min_gap and no gap
896 // larger than max_gap.
897 const double max_gap = kMaxGapInTextPartition * part->median_height();
898 const double min_gap = kMinMaxGapInTextPartition * part->median_height();
899
900 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
901 BLOBNBOX *blob = it.data();
902 current_x0 = blob->bounding_box().left();
903 current_x1 = blob->bounding_box().right();
904 if (previous_x1 != -1) {
905 int gap = current_x0 - previous_x1;
906
907 // TODO(nbeato): Boxes may overlap? Huh?
908 // For example, mag.3B 8003_033.3B.tif in UNLV data. The titles/authors
909 // on the top right of the page are filtered out with this line.
910 // Note 2: Iterating over blobs in a partition, so we are looking for
911 // spacing between the words.
912 if (gap < 0) {
913 // More likely case, the blobs slightly overlap. This can happen
914 // with diacritics (accents) or broken alphabet symbols (characters).
915 // Merge boxes together by taking max of right sides.
916 if (-gap < part->median_height() * kMaxBlobOverlapFactor) {
917 previous_x1 = std::max(previous_x1, current_x1);
918 continue;
919 }
920 // Extreme case, blobs overlap significantly in the same partition...
921 // This should not happen often (if at all), but it does.
922 // TODO(nbeato): investigate cases when this happens.
923 else {
924 // The behavior before was to completely ignore this case.
925 }
926 }
927
928 // If a large enough gap is found, mark it as a table cell (return true)
929 if (gap > max_gap) {
930 return true;
931 }
932 if (gap > largest_partition_gap_found) {
933 largest_partition_gap_found = gap;
934 }
935 }
936 previous_x1 = current_x1;
937 }
938 // Since no large gap was found, return false if the partition is too
939 // long to be a data cell
940 if (part->bounding_box().width() >
942 part_boxes->length() > kMaxBoxesInDataPartition) {
943 return false;
944 }
945
946 // A partition may be a single blob. In this case, it's an isolated symbol
947 // or non-text (such as a ruling or image).
948 // Detect these as table partitions? Shouldn't this be case by case?
949 // The behavior before was to ignore this, making max_partition_gap < 0
950 // and implicitly return true. Just making it explicit.
951 if (largest_partition_gap_found == -1) {
952 return true;
953 }
954
955 // return true if the maximum gap found is smaller than the minimum allowed
956 // max_gap in a text partition. This indicates that there is no significant
957 // space in the partition, hence it is likely a single word.
958 return largest_partition_gap_found < min_gap;
959}
960
961// A criteria for possible tables is that a table may have leaders
962// between data cells. An aggressive solution to find such tables is to
963// explicitly mark partitions that have adjacent leaders.
964// Note that this includes overlapping leaders. However, it does not
965// include leaders in different columns on the page.
966// Possible false-positive will include lists, such as a table of contents.
967// As these arise, the aggressive nature of this search may need to be
968// trimmed down.
970 if (part.flow() == BTFT_LEADER) {
971 return true;
972 }
973 // Search range is left and right bounded by an offset of the
974 // median xheight. This offset is to allow some tolerance to the
975 // the leaders on the page in the event that the alignment is still
976 // a bit off.
977 const TBOX &box = part.bounding_box();
979 const int top = box.top() + search_size;
980 const int bottom = box.bottom() - search_size;
982 for (int direction = 0; direction < 2; ++direction) {
983 bool right_to_left = (direction == 0);
984 int x = right_to_left ? box.right() : box.left();
985 hsearch.StartSideSearch(x, bottom, top);
986 ColPartition *leader = nullptr;
987 while ((leader = hsearch.NextSideSearch(right_to_left)) != nullptr) {
988 // The leader could be a horizontal ruling in the grid.
989 // Make sure it is actually a leader.
990 if (leader->flow() != BTFT_LEADER) {
991 continue;
992 }
993 // This should not happen, they are in different grids.
994 ASSERT_HOST(&part != leader);
995 // Make sure the leader shares a page column with the partition,
996 // otherwise we are spreading across columns.
997 if (!part.IsInSameColumnAs(*leader)) {
998 break;
999 }
1000 // There should be a significant vertical overlap
1001 if (!leader->VSignificantCoreOverlap(part)) {
1002 continue;
1003 }
1004 // Leader passed all tests, so it is adjacent.
1005 return true;
1006 }
1007 }
1008 // No leaders are adjacent to the given partition.
1009 return false;
1010}
1011
1012// Filter individual text partitions marked as table partitions
1013// consisting of paragraph endings, small section headings, and
1014// headers and footers.
1018 // TODO(nbeato): Fully justified text as non-table?
1019}
1020
1022 // Detect last line of paragraph
1023 // Iterate the ColPartitions in the grid.
1025 gsearch.StartFullSearch();
1026 ColPartition *part = nullptr;
1027 while ((part = gsearch.NextFullSearch()) != nullptr) {
1028 if (part->type() != PT_TABLE) {
1029 continue; // Consider only table partitions
1030 }
1031
1032 // Paragraph ending should have flowing text above it.
1033 ColPartition *upper_part = part->nearest_neighbor_above();
1034 if (!upper_part) {
1035 continue;
1036 }
1037 if (upper_part->type() != PT_FLOWING_TEXT) {
1038 continue;
1039 }
1040 if (upper_part->bounding_box().width() < 2 * part->bounding_box().width()) {
1041 continue;
1042 }
1043 // Check if its the last line of a paragraph.
1044 // In most cases, a paragraph ending should be left-aligned to text line
1045 // above it. Sometimes, it could be a 2 line paragraph, in which case
1046 // the line above it is indented.
1047 // To account for that, check if the partition center is to
1048 // the left of the one above it.
1049 int mid = (part->bounding_box().left() + part->bounding_box().right()) / 2;
1050 int upper_mid = (upper_part->bounding_box().left() +
1051 upper_part->bounding_box().right()) /
1052 2;
1053 int current_spacing = 0; // spacing of the current line to margin
1054 int upper_spacing = 0; // spacing of the previous line to the margin
1056 // Left to right languages, use mid - left to figure out the distance
1057 // the middle is from the left margin.
1058 int left = std::min(part->bounding_box().left(),
1059 upper_part->bounding_box().left());
1060 current_spacing = mid - left;
1061 upper_spacing = upper_mid - left;
1062 } else {
1063 // Right to left languages, use right - mid to figure out the distance
1064 // the middle is from the right margin.
1065 int right = std::max(part->bounding_box().right(),
1066 upper_part->bounding_box().right());
1067 current_spacing = right - mid;
1068 upper_spacing = right - upper_mid;
1069 }
1070 if (current_spacing * kParagraphEndingPreviousLineRatio > upper_spacing) {
1071 continue;
1072 }
1073
1074 // Paragraphs should have similar fonts.
1075 if (!part->MatchingSizes(*upper_part) ||
1078 continue;
1079 }
1080
1081 // The last line of a paragraph should be left aligned.
1082 // TODO(nbeato): This would be untrue if the text was right aligned.
1083 // How often is that?
1084 if (part->space_to_left() >
1086 continue;
1087 }
1088 // The line above it should be right aligned (assuming justified format).
1089 // Since we can't assume justified text, we compare whitespace to text.
1090 // The above line should have majority spanning text (or the current
1091 // line could have fit on the previous line). So compare
1092 // whitespace to text.
1093 if (upper_part->bounding_box().width() <
1095 upper_part->space_to_right()) {
1096 continue;
1097 }
1098
1099 // Ledding above the line should be less than ledding below
1100 if (part->space_above() >= part->space_below() ||
1101 part->space_above() > 2 * global_median_ledding_) {
1102 continue;
1103 }
1104
1105 // If all checks failed, it is probably text.
1106 part->clear_table_type();
1107 }
1108}
1109
1111 // Consider top-most text colpartition as header and bottom most as footer
1112 ColPartition *header = nullptr;
1113 ColPartition *footer = nullptr;
1114 int max_top = INT32_MIN;
1115 int min_bottom = INT32_MAX;
1117 gsearch.StartFullSearch();
1118 ColPartition *part = nullptr;
1119 while ((part = gsearch.NextFullSearch()) != nullptr) {
1120 if (!part->IsTextType()) {
1121 continue; // Consider only text partitions
1122 }
1123 int top = part->bounding_box().top();
1124 int bottom = part->bounding_box().bottom();
1125 if (top > max_top) {
1126 max_top = top;
1127 header = part;
1128 }
1129 if (bottom < min_bottom) {
1130 min_bottom = bottom;
1131 footer = part;
1132 }
1133 }
1134 if (header) {
1135 header->clear_table_type();
1136 }
1137 if (footer) {
1138 footer->clear_table_type();
1139 }
1140}
1141
1142// Mark all ColPartitions as table cells that have a table cell above
1143// and below them
1144// TODO(faisal): This is too aggressive at the moment. The method needs to
1145// consider spacing and alignment as well. Detection of false alarm table cells
1146// should also be done as part of it.
1148 // Iterate the ColPartitions in the grid.
1150 gsearch.StartFullSearch();
1151 ColPartition *part = nullptr;
1152 while ((part = gsearch.NextFullSearch()) != nullptr) {
1153 if (part->type() >= PT_TABLE || part->type() == PT_UNKNOWN) {
1154 continue; // Consider only text partitions
1155 }
1156 ColPartition *upper_part = part->nearest_neighbor_above();
1157 ColPartition *lower_part = part->nearest_neighbor_below();
1158 if (!upper_part || !lower_part) {
1159 continue;
1160 }
1161 if (upper_part->type() == PT_TABLE && lower_part->type() == PT_TABLE) {
1162 part->set_table_type();
1163 }
1164 }
1165
1166 // Pass 2, do the opposite. If both the upper and lower neighbors
1167 // exist and are not tables, this probably shouldn't be a table.
1168 gsearch.StartFullSearch();
1169 part = nullptr;
1170 while ((part = gsearch.NextFullSearch()) != nullptr) {
1171 if (part->type() != PT_TABLE) {
1172 continue; // Consider only text partitions
1173 }
1174 ColPartition *upper_part = part->nearest_neighbor_above();
1175 ColPartition *lower_part = part->nearest_neighbor_below();
1176
1177 // table can't be by itself
1178 if ((upper_part && upper_part->type() != PT_TABLE) &&
1179 (lower_part && lower_part->type() != PT_TABLE)) {
1180 part->clear_table_type();
1181 }
1182 }
1183}
1184
1185// Set the type of a column segment based on the ratio of table to text cells
1186void TableFinder::SetColumnsType(ColSegment_LIST *column_blocks) {
1187 ColSegment_IT it(column_blocks);
1188 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1189 ColSegment *seg = it.data();
1190 TBOX box = seg->bounding_box();
1191 int num_table_cells = 0;
1192 int num_text_cells = 0;
1195 rsearch.SetUniqueMode(true);
1196 rsearch.StartRectSearch(box);
1197 ColPartition *part = nullptr;
1198 while ((part = rsearch.NextRectSearch()) != nullptr) {
1199 if (part->type() == PT_TABLE) {
1200 num_table_cells++;
1201 } else if (part->type() == PT_FLOWING_TEXT) {
1202 num_text_cells++;
1203 }
1204 }
1205 // If a column block has no text or table partition in it, it is not needed
1206 // for table detection.
1207 if (!num_table_cells && !num_text_cells) {
1208 delete it.extract();
1209 } else {
1210 seg->set_num_table_cells(num_table_cells);
1211 seg->set_num_text_cells(num_text_cells);
1212 // set column type based on the ratio of table to text cells
1213 seg->set_type();
1214 }
1215 }
1216}
1217
1218// Move column blocks to grid
1219void TableFinder::MoveColSegmentsToGrid(ColSegment_LIST *segments,
1220 ColSegmentGrid *col_seg_grid) {
1221 ColSegment_IT it(segments);
1222 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1223 ColSegment *seg = it.extract();
1224 col_seg_grid->InsertBBox(true, true, seg);
1225 }
1226}
1227
1228// Merge column blocks if a split is detected due to the presence of a
1229// table. A text block is considered split if it has multiple
1230// neighboring blocks above/below it, and at least one of the
1231// neighboring blocks is of table type (has a high density of table
1232// partitions). In this case neighboring blocks in the direction
1233// (above/below) of the table block are merged with the text block.
1234
1235// Comment: This method does not handle split due to a full page table
1236// since table columns in this case do not have a text column on which
1237// split decision can be based.
1239 int margin = gridsize();
1240
1241 // Iterate the Column Blocks in the grid.
1243 &col_seg_grid_);
1244 gsearch.StartFullSearch();
1245 ColSegment *seg;
1246 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1247 if (seg->type() != COL_TEXT) {
1248 continue; // only consider text blocks for split detection
1249 }
1250 bool neighbor_found = false;
1251 bool modified = false; // Modified at least once
1252 // keep expanding current box as long as neighboring table columns
1253 // are found above or below it.
1254 do {
1255 TBOX box = seg->bounding_box();
1256 // slightly expand the search region vertically
1257 int top_range =
1258 std::min(box.top() + margin, static_cast<int>(tright().y()));
1259 int bottom_range =
1260 std::max(box.bottom() - margin, static_cast<int>(bleft().y()));
1261 box.set_top(top_range);
1262 box.set_bottom(bottom_range);
1263 neighbor_found = false;
1265 &col_seg_grid_);
1266 rectsearch.StartRectSearch(box);
1267 ColSegment *neighbor = nullptr;
1268 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1269 if (neighbor == seg) {
1270 continue;
1271 }
1272 const TBOX &neighbor_box = neighbor->bounding_box();
1273 // If the neighbor box significantly overlaps with the current
1274 // box (due to the expansion of the current box in the
1275 // previous iteration of this loop), remove the neighbor box
1276 // and expand the current box to include it.
1277 if (neighbor_box.overlap_fraction(box) >= 0.9) {
1278 seg->InsertBox(neighbor_box);
1279 modified = true;
1280 rectsearch.RemoveBBox();
1281 gsearch.RepositionIterator();
1282 delete neighbor;
1283 continue;
1284 }
1285 // Only expand if the neighbor box is of table type
1286 if (neighbor->type() != COL_TABLE) {
1287 continue;
1288 }
1289 // Insert the neighbor box into the current column block
1290 if (neighbor_box.major_x_overlap(box) && !box.contains(neighbor_box)) {
1291 seg->InsertBox(neighbor_box);
1292 neighbor_found = true;
1293 modified = true;
1294 rectsearch.RemoveBBox();
1295 gsearch.RepositionIterator();
1296 delete neighbor;
1297 }
1298 }
1299 } while (neighbor_found);
1300 if (modified) {
1301 // Because the box has changed, it has to be removed first.
1302 gsearch.RemoveBBox();
1303 col_seg_grid_.InsertBBox(true, true, seg);
1304 gsearch.RepositionIterator();
1305 }
1306 }
1307}
1308
1309// Group horizontally overlapping table partitions into table columns.
1310// TODO(faisal): This is too aggressive at the moment. The method should
1311// consider more attributes to group table partitions together. Some common
1312// errors are:
1313// 1- page number is merged with a table column above it even
1314// if there is a large vertical gap between them.
1315// 2- column headers go on to catch one of the columns arbitrarily
1316// 3- an isolated noise blob near page top or bottom merges with the table
1317// column below/above it
1318// 4- cells from two vertically adjacent tables merge together to make a
1319// single column resulting in merging of the two tables
1320void TableFinder::GetTableColumns(ColSegment_LIST *table_columns) {
1321 ColSegment_IT it(table_columns);
1322 // Iterate the ColPartitions in the grid.
1325 gsearch.StartFullSearch();
1326 ColPartition *part;
1327 while ((part = gsearch.NextFullSearch()) != nullptr) {
1328 if (part->inside_table_column() || part->type() != PT_TABLE) {
1329 continue; // prevent a partition to be assigned to multiple columns
1330 }
1331 const TBOX &box = part->bounding_box();
1332 auto *col = new ColSegment();
1333 col->InsertBox(box);
1334 part->set_inside_table_column(true);
1335 // Start a search below the current cell to find bottom neighbours
1336 // Note: a full search will always process things above it first, so
1337 // this should be starting at the highest cell and working its way down.
1340 vsearch.StartVerticalSearch(box.left(), box.right(), box.bottom());
1341 ColPartition *neighbor = nullptr;
1342 bool found_neighbours = false;
1343 while ((neighbor = vsearch.NextVerticalSearch(true)) != nullptr) {
1344 // only consider neighbors not assigned to any column yet
1345 if (neighbor->inside_table_column()) {
1346 continue;
1347 }
1348 // Horizontal lines should not break the flow
1349 if (neighbor->IsHorizontalLine()) {
1350 continue;
1351 }
1352 // presence of a non-table neighbor marks the end of current
1353 // table column
1354 if (neighbor->type() != PT_TABLE) {
1355 break;
1356 }
1357 // add the neighbor partition to the table column
1358 const TBOX &neighbor_box = neighbor->bounding_box();
1359 col->InsertBox(neighbor_box);
1360 neighbor->set_inside_table_column(true);
1361 found_neighbours = true;
1362 }
1363 if (found_neighbours) {
1364 it.add_after_then_move(col);
1365 } else {
1366 part->set_inside_table_column(false);
1367 delete col;
1368 }
1369 }
1370}
1371
1372// Mark regions in a column that are x-bounded by the column boundaries and
1373// y-bounded by the table columns' projection on the y-axis as table regions
1374void TableFinder::GetTableRegions(ColSegment_LIST *table_columns,
1375 ColSegment_LIST *table_regions) {
1376 ColSegment_IT cit(table_columns);
1377 ColSegment_IT rit(table_regions);
1378 // Iterate through column blocks
1380 &col_seg_grid_);
1381 gsearch.StartFullSearch();
1382 ColSegment *part;
1383 int page_height = tright().y() - bleft().y();
1384 ASSERT_HOST(page_height > 0);
1385 // create a bool array to hold projection on y-axis
1386 bool *table_region = new bool[page_height];
1387 while ((part = gsearch.NextFullSearch()) != nullptr) {
1388 const TBOX &part_box = part->bounding_box();
1389 // reset the projection array
1390 for (int i = 0; i < page_height; i++) {
1391 table_region[i] = false;
1392 }
1393 // iterate through all table columns to find regions in the current
1394 // page column block
1395 cit.move_to_first();
1396 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1397 TBOX col_box = cit.data()->bounding_box();
1398 // find intersection region of table column and page column
1399 TBOX intersection_box = col_box.intersection(part_box);
1400 // project table column on the y-axis
1401 for (int i = intersection_box.bottom(); i < intersection_box.top(); i++) {
1402 table_region[i - bleft().y()] = true;
1403 }
1404 }
1405 // set x-limits of table regions to page column width
1406 TBOX current_table_box;
1407 current_table_box.set_left(part_box.left());
1408 current_table_box.set_right(part_box.right());
1409 // go through the y-axis projection to find runs of table
1410 // regions. Each run makes one table region.
1411 for (int i = 1; i < page_height; i++) {
1412 // detect start of a table region
1413 if (!table_region[i - 1] && table_region[i]) {
1414 current_table_box.set_bottom(i + bleft().y());
1415 }
1416 // TODO(nbeato): Is it guaranteed that the last row is not a table region?
1417 // detect end of a table region
1418 if (table_region[i - 1] && !table_region[i]) {
1419 current_table_box.set_top(i + bleft().y());
1420 if (!current_table_box.null_box()) {
1421 auto *seg = new ColSegment();
1422 seg->InsertBox(current_table_box);
1423 rit.add_after_then_move(seg);
1424 }
1425 }
1426 }
1427 }
1428 delete[] table_region;
1429}
1430
1431// Merge table regions corresponding to tables spanning multiple columns if
1432// there is a colpartition (horizontal ruling line or normal text) that
1433// touches both regions.
1434// TODO(faisal): A rare error occurs if there are two horizontally adjacent
1435// tables with aligned ruling lines. In this case, line finder returns a
1436// single line and hence the tables get merged together
1438 // Iterate the table regions in the grid.
1440 &table_grid_);
1441 gsearch.StartFullSearch();
1442 ColSegment *seg = nullptr;
1443 while ((seg = gsearch.NextFullSearch()) != nullptr) {
1444 bool neighbor_found = false;
1445 bool modified = false; // Modified at least once
1446 do {
1447 // Start a rectangle search x-bounded by the image and y by the table
1448 const TBOX &box = seg->bounding_box();
1449 TBOX search_region(box);
1450 search_region.set_left(bleft().x());
1451 search_region.set_right(tright().x());
1452 neighbor_found = false;
1454 &table_grid_);
1455 rectsearch.StartRectSearch(search_region);
1456 ColSegment *neighbor = nullptr;
1457 while ((neighbor = rectsearch.NextRectSearch()) != nullptr) {
1458 if (neighbor == seg) {
1459 continue;
1460 }
1461 const TBOX &neighbor_box = neighbor->bounding_box();
1462 // Check if a neighbor box has a large overlap with the table
1463 // region. This may happen as a result of merging two table
1464 // regions in the previous iteration.
1465 if (neighbor_box.overlap_fraction(box) >= 0.9) {
1466 seg->InsertBox(neighbor_box);
1467 rectsearch.RemoveBBox();
1468 gsearch.RepositionIterator();
1469 delete neighbor;
1470 modified = true;
1471 continue;
1472 }
1473 // Check if two table regions belong together based on a common
1474 // horizontal ruling line
1475 if (BelongToOneTable(box, neighbor_box)) {
1476 seg->InsertBox(neighbor_box);
1477 neighbor_found = true;
1478 modified = true;
1479 rectsearch.RemoveBBox();
1480 gsearch.RepositionIterator();
1481 delete neighbor;
1482 }
1483 }
1484 } while (neighbor_found);
1485 if (modified) {
1486 // Because the box has changed, it has to be removed first.
1487 gsearch.RemoveBBox();
1488 table_grid_.InsertBBox(true, true, seg);
1489 gsearch.RepositionIterator();
1490 }
1491 }
1492}
1493
1494// Decide if two table regions belong to one table based on a common
1495// horizontal ruling line or another colpartition
1496bool TableFinder::BelongToOneTable(const TBOX &box1, const TBOX &box2) {
1497 // Check the obvious case. Most likely not true because overlapping boxes
1498 // should already be merged, but seems like a good thing to do in case things
1499 // change.
1500 if (box1.overlap(box2)) {
1501 return true;
1502 }
1503 // Check for ColPartitions spanning both table regions
1504 TBOX bbox = box1.bounding_union(box2);
1505 // Start a rect search on bbox
1508 rectsearch.StartRectSearch(bbox);
1509 ColPartition *part = nullptr;
1510 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1511 const TBOX &part_box = part->bounding_box();
1512 // return true if a colpartition spanning both table regions is found
1513 if (part_box.overlap(box1) && part_box.overlap(box2) &&
1514 !part->IsImageType()) {
1515 return true;
1516 }
1517 }
1518 return false;
1519}
1520
1521// Adjust table boundaries by:
1522// - building a tight bounding box around all ColPartitions contained in it.
1523// - expanding table boundaries to include all colpartitions that overlap the
1524// table by more than half of their area
1525// - expanding table boundaries to include nearby horizontal rule lines
1526// - expanding table vertically to include left out column headers
1527// TODO(faisal): Expansion of table boundaries is quite aggressive. It usually
1528// makes following errors:
1529// 1- horizontal lines consisting of underlines are included in the table if
1530// they are close enough
1531// 2- horizontal lines originating from noise tend to get merged with a table
1532// near the top of the page
1533// 3- the criteria for including horizontal lines is very generous. Many times
1534// horizontal lines separating headers and footers get merged with a
1535// single-column table in a multi-column page thereby including text
1536// from the neighboring column inside the table
1537// 4- the criteria for including left out column headers also tends to
1538// occasionally include text-lines above the tables, typically from
1539// table caption
1541 // Iterate the table regions in the grid
1542 ColSegment_CLIST adjusted_tables;
1543 ColSegment_C_IT it(&adjusted_tables);
1545 gsearch.StartFullSearch();
1546 ColSegment *table = nullptr;
1547 while ((table = gsearch.NextFullSearch()) != nullptr) {
1548 const TBOX &table_box = table->bounding_box();
1549 TBOX grown_box = table_box;
1550 GrowTableBox(table_box, &grown_box);
1551 // To prevent a table from expanding again, do not insert the
1552 // modified box back to the grid. Instead move it to a list and
1553 // and remove it from the grid. The list is moved later back to the grid.
1554 if (!grown_box.null_box()) {
1555 auto *col = new ColSegment();
1556 col->InsertBox(grown_box);
1557 it.add_after_then_move(col);
1558 }
1559 gsearch.RemoveBBox();
1560 delete table;
1561 }
1562 // clear table grid to move final tables in it
1563 // TODO(nbeato): table_grid_ should already be empty. The above loop
1564 // removed everything. Maybe just assert it is empty?
1566 it.move_to_first();
1567 // move back final tables to table_grid_
1568 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1569 ColSegment *seg = it.extract();
1570 table_grid_.InsertBBox(true, true, seg);
1571 }
1572}
1573
1574void TableFinder::GrowTableBox(const TBOX &table_box, TBOX *result_box) {
1575 // TODO(nbeato): The growing code is a bit excessive right now.
1576 // By removing these lines, the partitions considered need
1577 // to have some overlap or be special cases. These lines could
1578 // be added again once a check is put in place to make sure that
1579 // growing tables don't stomp on a lot of non-table partitions.
1580
1581 // search for horizontal ruling lines within the vertical margin
1582 // int vertical_margin = kRulingVerticalMargin * gridsize();
1583 TBOX search_box = table_box;
1584 // int top = MIN(search_box.top() + vertical_margin, tright().y());
1585 // int bottom = MAX(search_box.bottom() - vertical_margin, bleft().y());
1586 // search_box.set_top(top);
1587 // search_box.set_bottom(bottom);
1588
1589 GrowTableToIncludePartials(table_box, search_box, result_box);
1590 GrowTableToIncludeLines(table_box, search_box, result_box);
1591 IncludeLeftOutColumnHeaders(result_box);
1592}
1593
1594// Grow a table by increasing the size of the box to include
1595// partitions with significant overlap with the table.
1597 const TBOX &search_range,
1598 TBOX *result_box) {
1599 // Rulings are in a different grid, so search 2 grids for rulings, text,
1600 // and table partitions that are not entirely within the new box.
1601 for (int i = 0; i < 2; ++i) {
1602 ColPartitionGrid *grid =
1604 ColPartitionGridSearch rectsearch(grid);
1605 rectsearch.StartRectSearch(search_range);
1606 ColPartition *part = nullptr;
1607 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1608 // Only include text and table types.
1609 if (part->IsImageType()) {
1610 continue;
1611 }
1612 const TBOX &part_box = part->bounding_box();
1613 // Include partition in the table if more than half of it
1614 // is covered by the table
1615 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1616 *result_box = result_box->bounding_union(part_box);
1617 continue;
1618 }
1619 }
1620 }
1621}
1622
1623// Grow a table by expanding to the extents of significantly
1624// overlapping lines.
1626 const TBOX &search_range,
1627 TBOX *result_box) {
1629 rsearch.SetUniqueMode(true);
1630 rsearch.StartRectSearch(search_range);
1631 ColPartition *part = nullptr;
1632 while ((part = rsearch.NextRectSearch()) != nullptr) {
1633 // TODO(nbeato) This should also do vertical, but column
1634 // boundaries are breaking things. This function needs to be
1635 // updated to allow vertical lines as well.
1636 if (!part->IsLineType()) {
1637 continue;
1638 }
1639 // Avoid the following function call if the result of the
1640 // function is irrelevant.
1641 const TBOX &part_box = part->bounding_box();
1642 if (result_box->contains(part_box)) {
1643 continue;
1644 }
1645 // Include a partially overlapping horizontal line only if the
1646 // extra ColPartitions that will be included due to expansion
1647 // have large side spacing w.r.t. columns containing them.
1648 if (HLineBelongsToTable(*part, table_box)) {
1649 *result_box = result_box->bounding_union(part_box);
1650 }
1651 // TODO(nbeato): Vertical
1652 }
1653}
1654
1655// Checks whether the horizontal line belong to the table by looking at the
1656// side spacing of extra ColParitions that will be included in the table
1657// due to expansion
1659 const TBOX &table_box) {
1660 if (!part.IsHorizontalLine()) {
1661 return false;
1662 }
1663 const TBOX &part_box = part.bounding_box();
1664 if (!part_box.major_x_overlap(table_box)) {
1665 return false;
1666 }
1667 // Do not consider top-most horizontal line since it usually
1668 // originates from noise.
1669 // TODO(nbeato): I had to comment this out because the ruling grid doesn't
1670 // have neighbors solved.
1671 // if (!part.nearest_neighbor_above())
1672 // return false;
1673 const TBOX bbox = part_box.bounding_union(table_box);
1674 // In the "unioned table" box (the table extents expanded by the line),
1675 // keep track of how many partitions have significant padding to the left
1676 // and right. If more than half of the partitions covered by the new table
1677 // have significant spacing, the line belongs to the table and the table
1678 // grows to include all of the partitions.
1679 int num_extra_partitions = 0;
1680 int extra_space_to_right = 0;
1681 int extra_space_to_left = 0;
1682 // Rulings are in a different grid, so search 2 grids for rulings, text,
1683 // and table partitions that are introduced by the new box.
1684 for (int i = 0; i < 2; ++i) {
1685 ColPartitionGrid *grid =
1687 // Start a rect search on bbox
1688 ColPartitionGridSearch rectsearch(grid);
1689 rectsearch.SetUniqueMode(true);
1690 rectsearch.StartRectSearch(bbox);
1691 ColPartition *extra_part = nullptr;
1692 while ((extra_part = rectsearch.NextRectSearch()) != nullptr) {
1693 // ColPartition already in table
1694 const TBOX &extra_part_box = extra_part->bounding_box();
1695 if (extra_part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
1696 continue;
1697 }
1698 // Non-text ColPartitions do not contribute
1699 if (extra_part->IsImageType()) {
1700 continue;
1701 }
1702 // Consider this partition.
1703 num_extra_partitions++;
1704 // presence of a table cell is a strong hint, so just increment the scores
1705 // without looking at the spacing.
1706 if (extra_part->type() == PT_TABLE || extra_part->IsLineType()) {
1707 extra_space_to_right++;
1708 extra_space_to_left++;
1709 continue;
1710 }
1711 int space_threshold = kSideSpaceMargin * part.median_height();
1712 if (extra_part->space_to_right() > space_threshold) {
1713 extra_space_to_right++;
1714 }
1715 if (extra_part->space_to_left() > space_threshold) {
1716 extra_space_to_left++;
1717 }
1718 }
1719 }
1720 // tprintf("%d %d %d\n",
1721 // num_extra_partitions,extra_space_to_right,extra_space_to_left);
1722 return (extra_space_to_right > num_extra_partitions / 2) ||
1723 (extra_space_to_left > num_extra_partitions / 2);
1724}
1725
1726// Look for isolated column headers above the given table box and
1727// include them in the table
1729 // Start a search above the current table to look for column headers
1731 vsearch.StartVerticalSearch(table_box->left(), table_box->right(),
1732 table_box->top());
1733 ColPartition *neighbor = nullptr;
1734 ColPartition *previous_neighbor = nullptr;
1735 while ((neighbor = vsearch.NextVerticalSearch(false)) != nullptr) {
1736 // Max distance to find a table heading.
1737 const int max_distance =
1739 int table_top = table_box->top();
1740 const TBOX &box = neighbor->bounding_box();
1741 // Do not continue if the next box is way above
1742 if (box.bottom() - table_top > max_distance) {
1743 break;
1744 }
1745 // Unconditionally include partitions of type TABLE or LINE
1746 // TODO(faisal): add some reasonable conditions here
1747 if (neighbor->type() == PT_TABLE || neighbor->IsLineType()) {
1748 table_box->set_top(box.top());
1749 previous_neighbor = nullptr;
1750 continue;
1751 }
1752 // If there are two text partitions, one above the other, without a table
1753 // cell on their left or right side, consider them a barrier and quit
1754 if (previous_neighbor == nullptr) {
1755 previous_neighbor = neighbor;
1756 } else {
1757 const TBOX &previous_box = previous_neighbor->bounding_box();
1758 if (!box.major_y_overlap(previous_box)) {
1759 break;
1760 }
1761 }
1762 }
1763}
1764
1765// Remove false alarms consisting of a single column based on their
1766// projection on the x-axis. Projection of a real table on the x-axis
1767// should have at least one zero-valley larger than the global median
1768// x-height of the page.
1770 int page_width = tright().x() - bleft().x();
1771 ASSERT_HOST(page_width > 0);
1772 // create an integer array to hold projection on x-axis
1773 int *table_xprojection = new int[page_width];
1774 // Iterate through all tables in the table grid
1776 &table_grid_);
1777 table_search.StartFullSearch();
1778 ColSegment *table;
1779 while ((table = table_search.NextFullSearch()) != nullptr) {
1780 TBOX table_box = table->bounding_box();
1781 // reset the projection array
1782 for (int i = 0; i < page_width; i++) {
1783 table_xprojection[i] = 0;
1784 }
1785 // Start a rect search on table_box
1788 rectsearch.SetUniqueMode(true);
1789 rectsearch.StartRectSearch(table_box);
1790 ColPartition *part;
1791 while ((part = rectsearch.NextRectSearch()) != nullptr) {
1792 if (!part->IsTextType()) {
1793 continue; // Do not consider non-text partitions
1794 }
1795 if (part->flow() == BTFT_LEADER) {
1796 continue; // Assume leaders are in tables
1797 }
1798 TBOX part_box = part->bounding_box();
1799 // Do not consider partitions partially covered by the table
1800 if (part_box.overlap_fraction(table_box) < kMinOverlapWithTable) {
1801 continue;
1802 }
1803 BLOBNBOX_CLIST *part_boxes = part->boxes();
1804 BLOBNBOX_C_IT pit(part_boxes);
1805
1806 // Make sure overlapping blobs don't artificially inflate the number
1807 // of rows in the table. This happens frequently with things such as
1808 // decimals and split characters. Do this by assuming the column
1809 // partition is sorted mostly left to right and just clip
1810 // bounding boxes by the previous box's extent.
1811 int next_position_to_write = 0;
1812
1813 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1814 BLOBNBOX *pblob = pit.data();
1815 // ignore blob height for the purpose of projection since we
1816 // are only interested in finding valleys
1817 int xstart = pblob->bounding_box().left();
1818 int xend = pblob->bounding_box().right();
1819
1820 xstart = std::max(xstart, next_position_to_write);
1821 for (int i = xstart; i < xend; i++) {
1822 table_xprojection[i - bleft().x()]++;
1823 }
1824 next_position_to_write = xend;
1825 }
1826 }
1827 // Find largest valley between two reasonable peaks in the table
1828 if (!GapInXProjection(table_xprojection, page_width)) {
1829 table_search.RemoveBBox();
1830 delete table;
1831 }
1832 }
1833 delete[] table_xprojection;
1834}
1835
1836// Return true if at least one gap larger than the global x-height
1837// exists in the horizontal projection
1838bool TableFinder::GapInXProjection(int *xprojection, int length) {
1839 // Find peak value of the histogram
1840 int peak_value = 0;
1841 for (int i = 0; i < length; i++) {
1842 if (xprojection[i] > peak_value) {
1843 peak_value = xprojection[i];
1844 }
1845 }
1846 // Peak value represents the maximum number of horizontally
1847 // overlapping colpartitions, so this can be considered as the
1848 // number of rows in the table
1849 if (peak_value < kMinRowsInTable) {
1850 return false;
1851 }
1852 double projection_threshold = kSmallTableProjectionThreshold * peak_value;
1853 if (peak_value >= kLargeTableRowCount) {
1854 projection_threshold = kLargeTableProjectionThreshold * peak_value;
1855 }
1856 // Threshold the histogram
1857 for (int i = 0; i < length; i++) {
1858 xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1859 }
1860 // Find the largest run of zeros between two ones
1861 int largest_gap = 0;
1862 int run_start = -1;
1863 for (int i = 1; i < length; i++) {
1864 // detect start of a run of zeros
1865 if (xprojection[i - 1] && !xprojection[i]) {
1866 run_start = i;
1867 }
1868 // detect end of a run of zeros and update the value of largest gap
1869 if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1870 int gap = i - run_start;
1871 if (gap > largest_gap) {
1872 largest_gap = gap;
1873 }
1874 run_start = -1;
1875 }
1876 }
1877 return largest_gap > kMaxXProjectionGapFactor * global_median_xheight_;
1878}
1879
1880// Given the location of a table "guess", try to overlay a cellular
1881// grid in the location, adjusting the boundaries.
1882// TODO(nbeato): Falsely introduces:
1883// -headers/footers (not any worse, too much overlap destroys cells)
1884// -page numbers (not worse, included because maximize margins)
1885// -equations (nicely fit into a celluar grid, but more sparsely)
1886// -figures (random text box, also sparse)
1887// -small left-aligned text areas with overlapping positioned whitespace
1888// (rejected before)
1889// Overall, this just needs some more work.
1891#ifndef GRAPHICS_DISABLED
1892 ScrollView *table_win = nullptr;
1893 if (textord_show_tables) {
1894 table_win = MakeWindow(0, 0, "Table Structure");
1897 // table_grid_.DisplayBoxes(table_win);
1898 }
1899#endif
1900
1901 TableRecognizer recognizer;
1902 recognizer.Init();
1906 recognizer.set_min_height(1.5 * gridheight());
1907 // Loop over all of the tables and try to fit them.
1908 // Store the good tables here.
1909 ColSegment_CLIST good_tables;
1910 ColSegment_C_IT good_it(&good_tables);
1911
1913 gsearch.StartFullSearch();
1914 ColSegment *found_table = nullptr;
1915 while ((found_table = gsearch.NextFullSearch()) != nullptr) {
1916 gsearch.RemoveBBox();
1917
1918 // The goal is to make the tables persistent in a list.
1919 // When that happens, this will move into the search loop.
1920 const TBOX &found_box = found_table->bounding_box();
1921 StructuredTable *table_structure = recognizer.RecognizeTable(found_box);
1922
1923 // Process a table. Good tables are inserted into the grid again later on
1924 // We can't change boxes in the grid while it is running a search.
1925 if (table_structure != nullptr) {
1926#ifndef GRAPHICS_DISABLED
1927 if (textord_show_tables) {
1928 table_structure->Display(table_win, ScrollView::LIME_GREEN);
1929 }
1930#endif
1931 found_table->set_bounding_box(table_structure->bounding_box());
1932 delete table_structure;
1933 good_it.add_after_then_move(found_table);
1934 } else {
1935 delete found_table;
1936 }
1937 }
1938 // TODO(nbeato): MERGE!! There is awesome info now available for merging.
1939
1940 // At this point, the grid is empty. We can safely insert the good tables
1941 // back into grid.
1942 for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) {
1943 table_grid_.InsertBBox(true, true, good_it.extract());
1944 }
1945}
1946
1947#ifndef GRAPHICS_DISABLED
1948
1949// Displays the column segments in some window.
1950void TableFinder::DisplayColSegments(ScrollView *win, ColSegment_LIST *segments,
1951 ScrollView::Color color) {
1952 win->Pen(color);
1953 win->Brush(ScrollView::NONE);
1954 ColSegment_IT it(segments);
1955 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1956 ColSegment *col = it.data();
1957 const TBOX &box = col->bounding_box();
1958 int left_x = box.left();
1959 int right_x = box.right();
1960 int top_y = box.top();
1961 int bottom_y = box.bottom();
1962 win->Rectangle(left_x, bottom_y, right_x, top_y);
1963 }
1964 win->UpdateWindow();
1965}
1966
1967// Displays the colpartitions using a new coloring on an existing window.
1968// Note: This method is only for debug purpose during development and
1969// would not be part of checked in code
1971 ScrollView::Color default_color,
1972 ScrollView::Color table_color) {
1973 ScrollView::Color color = default_color;
1974 // Iterate the ColPartitions in the grid.
1976 gsearch.StartFullSearch();
1977 ColPartition *part = nullptr;
1978 while ((part = gsearch.NextFullSearch()) != nullptr) {
1979 color = default_color;
1980 if (part->type() == PT_TABLE) {
1981 color = table_color;
1982 }
1983
1984 const TBOX &box = part->bounding_box();
1985 int left_x = box.left();
1986 int right_x = box.right();
1987 int top_y = box.top();
1988 int bottom_y = box.bottom();
1989 win->Brush(ScrollView::NONE);
1990 win->Pen(color);
1991 win->Rectangle(left_x, bottom_y, right_x, top_y);
1992 }
1993 win->UpdateWindow();
1994}
1995
1997 ScrollView::Color default_color) {
1998 DisplayColPartitions(win, grid, default_color, ScrollView::YELLOW);
1999}
2000
2002 ColPartitionGrid *grid,
2003 ScrollView::Color color) {
2004 // Iterate the ColPartitions in the grid.
2006 gsearch.StartFullSearch();
2007 ColPartition *part = nullptr;
2008 while ((part = gsearch.NextFullSearch()) != nullptr) {
2009 const TBOX &box = part->bounding_box();
2010 int left_x = box.left();
2011 int right_x = box.right();
2012 int top_y = box.top();
2013 int bottom_y = box.bottom();
2014
2015 ColPartition *upper_part = part->nearest_neighbor_above();
2016 if (upper_part) {
2017 const TBOX &upper_box = upper_part->bounding_box();
2018 int mid_x = (left_x + right_x) / 2;
2019 int mid_y = (top_y + bottom_y) / 2;
2020 int other_x = (upper_box.left() + upper_box.right()) / 2;
2021 int other_y = (upper_box.top() + upper_box.bottom()) / 2;
2022 win->Brush(ScrollView::NONE);
2023 win->Pen(color);
2024 win->Line(mid_x, mid_y, other_x, other_y);
2025 }
2026 ColPartition *lower_part = part->nearest_neighbor_below();
2027 if (lower_part) {
2028 const TBOX &lower_box = lower_part->bounding_box();
2029 int mid_x = (left_x + right_x) / 2;
2030 int mid_y = (top_y + bottom_y) / 2;
2031 int other_x = (lower_box.left() + lower_box.right()) / 2;
2032 int other_y = (lower_box.top() + lower_box.bottom()) / 2;
2033 win->Brush(ScrollView::NONE);
2034 win->Pen(color);
2035 win->Line(mid_x, mid_y, other_x, other_y);
2036 }
2037 }
2038 win->UpdateWindow();
2039}
2040
2041#endif
2042
2043// Merge all colpartitions in table regions to make them a single
2044// colpartition and revert types of isolated table cells not
2045// assigned to any table to their original types.
2047 ColPartitionSet **all_columns,
2048 const WidthCallback &width_cb) {
2049 // Since we have table blocks already, remove table tags from all
2050 // colpartitions
2052 gsearch.StartFullSearch();
2053 ColPartition *part = nullptr;
2054
2055 while ((part = gsearch.NextFullSearch()) != nullptr) {
2056 if (part->type() == PT_TABLE) {
2057 part->clear_table_type();
2058 }
2059 }
2060 // Now make a single colpartition out of each table block and remove
2061 // all colpartitions contained within a table
2063 &table_grid_);
2064 table_search.StartFullSearch();
2065 ColSegment *table;
2066 while ((table = table_search.NextFullSearch()) != nullptr) {
2067 const TBOX &table_box = table->bounding_box();
2068 // Start a rect search on table_box
2070 grid);
2071 rectsearch.StartRectSearch(table_box);
2072 ColPartition *part;
2073 ColPartition *table_partition = nullptr;
2074 while ((part = rectsearch.NextRectSearch()) != nullptr) {
2075 // Do not consider image partitions
2076 if (!part->IsTextType()) {
2077 continue;
2078 }
2079 TBOX part_box = part->bounding_box();
2080 // Include partition in the table if more than half of it
2081 // is covered by the table
2082 if (part_box.overlap_fraction(table_box) > kMinOverlapWithTable) {
2083 rectsearch.RemoveBBox();
2084 if (table_partition) {
2085 table_partition->Absorb(part, width_cb);
2086 } else {
2087 table_partition = part;
2088 }
2089 }
2090 }
2091 // Insert table colpartition back to part_grid_
2092 if (table_partition) {
2093 // To match the columns used when transforming to blocks, the new table
2094 // partition must have its first and last column set at the grid y that
2095 // corresponds to its bottom.
2096 const TBOX &table_box = table_partition->bounding_box();
2097 int grid_x, grid_y;
2098 grid->GridCoords(table_box.left(), table_box.bottom(), &grid_x, &grid_y);
2099 table_partition->SetPartitionType(resolution_, all_columns[grid_y]);
2100 table_partition->set_table_type();
2101 table_partition->set_blob_type(BRT_TEXT);
2102 table_partition->set_flow(BTFT_CHAIN);
2103 table_partition->SetBlobTypes();
2104 grid->InsertBBox(true, true, table_partition);
2105 }
2106 }
2107}
2108
2112 : ELIST_LINK(),
2113 num_table_cells_(0),
2114 num_text_cells_(0),
2115 type_(COL_UNKNOWN) {}
2116
2117// Provides a color for BBGrid to draw the rectangle.
2119 const ScrollView::Color kBoxColors[PT_COUNT] = {
2124 };
2125 return kBoxColors[type_];
2126}
2127
2128// Insert a box into this column segment
2129void ColSegment::InsertBox(const TBOX &other) {
2130 bounding_box_ = bounding_box_.bounding_union(other);
2131}
2132
2133// Set column segment type based on the ratio of text and table partitions
2134// in it.
2136 if (num_table_cells_ > kTableColumnThreshold * num_text_cells_) {
2137 type_ = COL_TABLE;
2138 } else if (num_text_cells_ > num_table_cells_) {
2139 type_ = COL_TEXT;
2140 } else {
2141 type_ = COL_MIXED;
2142 }
2143}
2144
2145} // namespace tesseract.
#define BOOL_VAR(name, val, comment)
Definition: params.h:360
#define ASSERT_HOST(x)
Definition: errcode.h:54
const double y
const int kMaxVerticalSpacing
Definition: tablefind.cpp:38
const double kAllowBlobHeight
Definition: tablefind.cpp:56
const double kMinOverlapWithTable
Definition: tablefind.cpp:97
const double kMaxTableCellXheight
Definition: tablefind.cpp:81
const double kMaxParagraphEndingLeftSpaceMultiple
Definition: tablefind.cpp:126
@ BRT_TEXT
Definition: blobbox.h:82
@ BRT_NOISE
Definition: blobbox.h:75
@ COL_TEXT
Definition: tablefind.h:29
@ COL_MIXED
Definition: tablefind.h:29
@ COL_TABLE
Definition: tablefind.h:29
@ COL_UNKNOWN
Definition: tablefind.h:29
const double kMinMaxGapInTextPartition
Definition: tablefind.cpp:73
const double kLargeTableProjectionThreshold
Definition: tablefind.cpp:107
const int kMinBoxesInTextPartition
Definition: tablefind.cpp:63
const double kStrokeWidthFractionalTolerance
Definition: tablefind.cpp:140
const int kMinRowsInTable
Definition: tablefind.cpp:112
std::function< bool(int)> WidthCallback
Definition: tabfind.h:35
const double kMinParagraphEndingTextToWhitespaceRatio
Definition: tablefind.cpp:132
const double kMaxGapInTextPartition
Definition: tablefind.cpp:69
void DeleteObject(T *object)
Definition: tablefind.cpp:156
const double kTableColumnThreshold
Definition: tablefind.cpp:89
const double kAllowBlobArea
Definition: tablefind.cpp:58
const double kAllowTextArea
Definition: tablefind.cpp:51
const int kAdjacentLeaderSearchPadding
Definition: tablefind.cpp:117
const double kStrokeWidthConstantTolerance
Definition: tablefind.cpp:141
const int kMaxBlobWidth
Definition: tablefind.cpp:40
const double kAllowTextWidth
Definition: tablefind.cpp:50
const double kAllowTextHeight
Definition: tablefind.cpp:49
@ BTFT_CHAIN
Definition: blobbox.h:114
@ BTFT_LEADER
Definition: blobbox.h:117
const double kAllowBlobWidth
Definition: tablefind.cpp:57
const int kMaxColumnHeaderDistance
Definition: tablefind.cpp:85
const int kMaxBoxesInDataPartition
Definition: tablefind.cpp:66
const double kParagraphEndingPreviousLineRatio
Definition: tablefind.cpp:122
const int kLargeTableRowCount
Definition: tablefind.cpp:109
const double kSmallTableProjectionThreshold
Definition: tablefind.cpp:106
@ PT_PULLOUT_IMAGE
Definition: publictypes.h:63
@ PT_HEADING_IMAGE
Definition: publictypes.h:62
@ PT_FLOWING_IMAGE
Definition: publictypes.h:61
@ PT_FLOWING_TEXT
Definition: publictypes.h:53
const int kSideSpaceMargin
Definition: tablefind.cpp:102
const double kSplitPartitionSize
Definition: tablefind.cpp:44
const double kMaxBlobOverlapFactor
Definition: tablefind.cpp:77
const double kMaxXProjectionGapFactor
Definition: tablefind.cpp:136
const TBOX & bounding_box() const
Definition: blobbox.h:239
BlobRegionType region_type() const
Definition: blobbox.h:298
BlobTextFlowType flow() const
Definition: blobbox.h:310
integer coordinate
Definition: points.h:36
TDimension y() const
access_function
Definition: points.h:62
TDimension x() const
access function
Definition: points.h:58
TDimension left() const
Definition: rect.h:82
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:445
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
void set_right(int x)
Definition: rect.h:92
void set_left(int x)
Definition: rect.h:85
TDimension top() const
Definition: rect.h:68
TBOX bounding_union(const TBOX &box) const
Definition: rect.cpp:128
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:84
bool null_box() const
Definition: rect.h:60
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:419
void set_bottom(int y)
Definition: rect.h:78
TDimension right() const
Definition: rect.h:89
bool overlap(const TBOX &box) const
Definition: rect.h:363
TDimension bottom() const
Definition: rect.h:75
bool contains(const FCOORD pt) const
Definition: rect.h:344
int32_t area() const
Definition: rect.h:134
double overlap_fraction(const TBOX &box) const
Definition: rect.h:396
void set_top(int y)
Definition: rect.h:71
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:596
double median() const
Definition: statistc.cpp:241
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:837
void SetUniqueMode(bool mode)
Definition: bbgrid.h:249
int GridY() const
Definition: bbgrid.h:241
BBC * NextSideSearch(bool right_to_left)
Definition: bbgrid.h:802
BBC * NextRectSearch()
Definition: bbgrid.h:896
void StartSideSearch(int x, int ymin, int ymax)
Definition: bbgrid.h:788
void StartFullSearch()
Definition: bbgrid.h:701
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:849
void RepositionIterator()
Definition: bbgrid.h:954
void StartRectSearch(const TBOX &rect)
Definition: bbgrid.h:884
BBC * NextFullSearch()
Definition: bbgrid.h:711
int gridsize() const
Definition: bbgrid.h:63
int gridheight() const
Definition: bbgrid.h:69
const ICOORD & bleft() const
Definition: bbgrid.h:72
int gridwidth() const
Definition: bbgrid.h:66
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:53
const ICOORD & tright() const
Definition: bbgrid.h:75
void DisplayBoxes(ScrollView *window)
Definition: bbgrid.h:649
void Clear()
Definition: bbgrid.h:497
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:488
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:529
void ClearGridData(void(*free_method)(BBC *))
Definition: bbgrid.h:506
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:633
bool IsImageType() const
Definition: colpartition.h:429
void set_space_to_left(int space)
Definition: colpartition.h:276
BlobTextFlowType flow() const
Definition: colpartition.h:153
bool MatchingStrokeWidth(const ColPartition &other, double fractional_tolerance, double constant_tolerance) const
BLOBNBOX_CLIST * boxes()
Definition: colpartition.h:186
bool IsHorizontalLine() const
Definition: colpartition.h:459
PolyBlockType type() const
Definition: colpartition.h:180
ColPartition * CopyButDontOwnBlobs()
ColPartition * SplitAt(int split_x)
bool VSignificantCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:390
void set_nearest_neighbor_above(ColPartition *part)
Definition: colpartition.h:252
BlobRegionType blob_type() const
Definition: colpartition.h:147
void AddBox(BLOBNBOX *box)
void set_blob_type(BlobRegionType t)
Definition: colpartition.h:150
const TBOX & bounding_box() const
Definition: colpartition.h:108
void set_nearest_neighbor_below(ColPartition *part)
Definition: colpartition.h:258
ColPartition * nearest_neighbor_above() const
Definition: colpartition.h:249
void set_space_above(int space)
Definition: colpartition.h:264
void set_inside_table_column(bool val)
Definition: colpartition.h:246
bool MatchingSizes(const ColPartition &other) const
int LeftAtY(int y) const
Definition: colpartition.h:340
void set_space_to_right(int space)
Definition: colpartition.h:282
ColPartition * ShallowCopy() const
bool IsInSameColumnAs(const ColPartition &part) const
int space_to_right() const
Definition: colpartition.h:279
ColPartition * SingletonPartner(bool upper)
void SetPartitionType(int resolution, ColPartitionSet *columns)
void set_space_below(int space)
Definition: colpartition.h:270
int RightAtY(int y) const
Definition: colpartition.h:344
void Absorb(ColPartition *other, const WidthCallback &cb)
ColPartition * nearest_neighbor_below() const
Definition: colpartition.h:255
void set_flow(BlobTextFlowType f)
Definition: colpartition.h:156
void RefinePartitionPartners(bool get_desperate)
ColPartition * ColumnContaining(int x, int y)
void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments)
void InsertBox(const TBOX &other)
Definition: tablefind.cpp:2129
void set_bounding_box(const TBOX &other)
Definition: tablefind.h:65
void set_num_table_cells(int n)
Definition: tablefind.h:74
void set_num_text_cells(int n)
Definition: tablefind.h:83
ColSegType type() const
Definition: tablefind.h:87
ScrollView::Color BoxColor() const
Definition: tablefind.cpp:2118
const TBOX & bounding_box() const
Definition: tablefind.h:45
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
Definition: tablefind.cpp:1950
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
Definition: tablefind.cpp:1496
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
Definition: tablefind.cpp:1574
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: tablefind.cpp:519
void InsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:403
void IncludeLeftOutColumnHeaders(TBOX *table_box)
Definition: tablefind.cpp:1728
void Init(int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
Definition: tablefind.cpp:181
const ICOORD & bleft() const
Definition: tablefind.cpp:388
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1625
void GetTableColumns(ColSegment_LIST *table_columns)
Definition: tablefind.cpp:1320
const ICOORD & tright() const
Definition: tablefind.cpp:391
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
Definition: tablefind.cpp:437
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:541
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, const WidthCallback &width_cb)
Definition: tablefind.cpp:2046
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
Definition: tablefind.cpp:571
void SetGlobalSpacings(ColPartitionGrid *grid)
Definition: tablefind.cpp:716
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
Definition: tablefind.cpp:1374
bool HasWideOrNoInterWordGap(ColPartition *part) const
Definition: tablefind.cpp:875
void InitializePartitions(ColPartitionSet **all_columns)
Definition: tablefind.cpp:582
bool HasLeaderAdjacent(const ColPartition &part)
Definition: tablefind.cpp:969
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
Definition: tablefind.cpp:1658
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
Definition: tablefind.cpp:525
void set_global_median_blob_width(int width)
Definition: tablefind.cpp:766
ColPartitionGrid leader_and_ruling_grid_
Definition: tablefind.h:397
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
Definition: tablefind.cpp:1219
void InsertLeaderPartition(ColPartition *part)
Definition: tablefind.cpp:411
bool GapInXProjection(int *xprojection, int length)
Definition: tablefind.cpp:1838
void set_global_median_xheight(int xheight)
Definition: tablefind.cpp:763
ColSegmentGrid col_seg_grid_
Definition: tablefind.h:403
int gridheight() const
Definition: tablefind.cpp:385
void InsertCleanPartitions(ColPartitionGrid *grid, TO_BLOCK *block)
Definition: tablefind.cpp:193
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
Definition: tablefind.cpp:589
void set_global_median_ledding(int ledding)
Definition: tablefind.cpp:769
void InsertRulingPartition(ColPartition *part)
Definition: tablefind.cpp:419
void set_left_to_right_language(bool order)
Definition: tablefind.cpp:177
bool AllowBlob(const BLOBNBOX &blob) const
Definition: tablefind.cpp:503
ColSegmentGrid table_grid_
Definition: tablefind.h:405
void SetColumnsType(ColSegment_LIST *col_segments)
Definition: tablefind.cpp:1186
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
Definition: tablefind.cpp:1596
ColPartitionGrid fragmented_text_grid_
Definition: tablefind.h:401
void InsertTextPartition(ColPartition *part)
Definition: tablefind.cpp:395
void SetVerticalSpacing(ColPartition *part)
Definition: tablefind.cpp:671
void LocateTables(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback width_cb, const FCOORD &reskew)
Definition: tablefind.cpp:261
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
Definition: tablefind.cpp:2001
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
Definition: tablefind.cpp:1970
void MarkPartitionsUsingLocalInformation()
Definition: tablefind.cpp:844
ColPartitionGrid clean_part_grid_
Definition: tablefind.h:395
void InsertImagePartition(ColPartition *part)
Definition: tablefind.cpp:422
bool AllowTextPartition(const ColPartition &part) const
Definition: tablefind.cpp:490
const TBOX & bounding_box() const
Definition: tablerecog.cpp:126
void Display(ScrollView *window, ScrollView::Color color)
Definition: tablerecog.cpp:319
void set_max_text_height(int height)
Definition: tablerecog.cpp:759
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:750
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:747
void set_min_height(int height)
Definition: tablerecog.cpp:753
StructuredTable * RecognizeTable(const TBOX &guess_box)
Definition: tablerecog.cpp:763
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:498
void Pen(Color color)
Definition: scrollview.cpp:710
void Brush(Color color)
Definition: scrollview.cpp:716
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:576