tesseract v5.3.3.20231005
tablerecog.cpp
Go to the documentation of this file.
1
2// File: tablerecog.cpp
3// Description: Helper class to help structure table areas. Given an bounding
4// box from TableFinder, the TableRecognizer should give a
5// StructuredTable (maybe a list in the future) of "good" tables
6// in that area.
7// Author: Nicholas Beato
8//
9// (C) Copyright 2009, Google Inc.
10// Licensed under the Apache License, Version 2.0 (the "License");
11// you may not use this file except in compliance with the License.
12// You may obtain a copy of the License at
13// http://www.apache.org/licenses/LICENSE-2.0
14// Unless required by applicable law or agreed to in writing, software
15// distributed under the License is distributed on an "AS IS" BASIS,
16// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17// See the License for the specific language governing permissions and
18// limitations under the License.
19//
21
22#ifdef HAVE_CONFIG_H
23# include "config_auto.h"
24#endif
25
26#include "tablerecog.h"
27
28#include <algorithm>
29
30namespace tesseract {
31
32// The amount of space required between the ColPartitions in 2 columns
33// of a non-lined table as a multiple of the median width.
34const double kHorizontalSpacing = 0.30;
35// The amount of space required between the ColPartitions in 2 rows
36// of a non-lined table as multiples of the median height.
37const double kVerticalSpacing = -0.2;
38// The number of cells that the grid lines may intersect.
39// See FindCellSplitLocations for explanation.
42// For "lined tables", the number of required lines. Currently a guess.
45// Number of columns required, as a fraction of the most columns found.
46// None of these are tweaked at all.
47const double kRequiredColumns = 0.7;
48// The tolerance for comparing margins of potential tables.
49const double kMarginFactor = 1.1;
50// The first and last row should be consistent cell height.
51// This factor is the first and last row cell height max.
52const double kMaxRowSize = 2.5;
53// Number of filled columns required to form a strong table row.
54// For small tables, this is an absolute number.
55const double kGoodRowNumberOfColumnsSmall[] = {2, 2, 2, 2, 2, 3, 3};
56// For large tables, it is a relative number
58// The amount of area that must be covered in a cell by ColPartitions to
59// be considered "filled"
60const double kMinFilledArea = 0.35;
61
62// Indicates that a table row is weak. This means that it has
63// many missing data cells or very large cell heights compared.
64// to the rest of the table.
65// Code is buggy right now. It is disabled in the calling function.
66// It seems like sometimes the row that is passed in is not correct
67// sometimes (like a phantom row is introduced). There's something going
68// on in the cell_y_ data member before this is called... not certain.
69static bool IsWeakTableRow(StructuredTable *table, int row) {
70 if (!table->VerifyRowFilled(row)) {
71 return false;
72 }
73
74 double threshold;
76 threshold = kGoodRowNumberOfColumnsSmall[table->column_count()];
77 } else {
78 threshold = table->column_count() * kGoodRowNumberOfColumnsLarge;
79 }
80
81 return table->CountFilledCellsInRow(row) < threshold;
82}
83
87
89 : text_grid_(nullptr)
90 , line_grid_(nullptr)
91 , is_lined_(false)
92 , space_above_(0)
93 , space_below_(0)
94 , space_left_(0)
95 , space_right_(0)
96 , median_cell_height_(0)
97 , median_cell_width_(0)
98 , max_text_height_(INT32_MAX) {}
99
101
103 text_grid_ = text_grid;
104}
106 line_grid_ = line_grid;
107}
109 max_text_height_ = height;
112 return is_lined_;
113}
115 return cell_y_.empty() ? 0 : cell_y_.size() - 1;
116}
118 return cell_x_.empty() ? 0 : cell_x_.size() - 1;
119}
121 return row_count() * column_count();
122}
124 bounding_box_ = box;
125}
127 return bounding_box_;
128}
130 return median_cell_height_;
131}
133 return median_cell_width_;
134}
135int StructuredTable::row_height(unsigned row) const {
136 ASSERT_HOST(row < row_count());
137 return cell_y_[row + 1] - cell_y_[row];
138}
139int StructuredTable::column_width(unsigned column) const {
140 ASSERT_HOST(column < column_count());
141 return cell_x_[column + 1] - cell_x_[column];
142}
144 return space_above_;
145}
147 return space_below_;
149
150// At this point, we know that the lines are contained
151// by the box (by FindLinesBoundingBox).
152// So try to find the cell structure and make sure it works out.
153// The assumption is that all lines span the table. If this
154// assumption fails, the VerifyLinedTable method will
155// abort the lined table. The TableRecognizer will fall
156// back on FindWhitespacedStructure.
159
160 // Search for all of the lines in the current box.
161 // Update the cellular structure with the exact lines.
163 box_search.SetUniqueMode(true);
164 box_search.StartRectSearch(bounding_box_);
165 ColPartition *line = nullptr;
166
167 while ((line = box_search.NextRectSearch()) != nullptr) {
168 if (line->IsHorizontalLine()) {
169 cell_y_.push_back(line->MidY());
170 }
171 if (line->IsVerticalLine()) {
172 cell_x_.push_back(line->MidX());
173 }
174 }
175
176 // HasSignificantLines should guarantee cells.
177 // Because that code is a different class, just gracefully
178 // return false. This could be an assert.
179 if (cell_x_.size() < 3 || cell_y_.size() < 3) {
180 return false;
181 }
182
183 // Sort and remove duplicates that may have occurred due to split lines.
184 std::sort(cell_x_.begin(), cell_x_.end());
185 auto last_x = std::unique(cell_x_.begin(), cell_x_.end());
186 cell_x_.erase(last_x, cell_x_.end());
187 std::sort(cell_y_.begin(), cell_y_.end());
188 auto last_y = std::unique(cell_y_.begin(), cell_y_.end());
189 cell_y_.erase(last_y, cell_y_.end());
190
191 // The border should be the extents of line boxes, not middle.
193 cell_x_[cell_x_.size() - 1] = bounding_box_.right();
195 cell_y_[cell_y_.size() - 1] = bounding_box_.top();
196
197 // Remove duplicates that may have occurred due to moving the borders.
198 last_x = std::unique(cell_x_.begin(), cell_x_.end());
199 cell_x_.erase(last_x, cell_x_.end());
200 last_y = std::unique(cell_y_.begin(), cell_y_.end());
201 cell_y_.erase(last_y, cell_y_.end());
202
206 return is_lined_;
207}
208
209// Finds the cellular structure given a particular box.
214
215 if (!VerifyWhitespacedTable()) {
216 return false;
217 } else {
225 return true;
226 }
227}
228
229// Tests if a partition fits inside the table structure.
230// Partitions must fully span a grid line in order to intersect it.
231// This means that a partition does not intersect a line
232// that it "just" touches. This is mainly because the assumption
233// throughout the code is that "0" distance is a very very small space.
235 const TBOX &box = part.bounding_box();
236 for (int i : cell_x_) {
237 if (box.left() < i && i < box.right()) {
238 return false;
239 }
240 }
241 for (int i : cell_y_) {
242 if (box.bottom() < i && i < box.top()) {
243 return false;
244 }
245 }
246 return true;
247}
248
249// Checks if a sub-table has multiple data cells filled.
251 return CountFilledCells(0, row_count() - 1, 0, column_count() - 1);
252}
254 return CountFilledCells(row, row, 0, column_count() - 1);
255}
257 return CountFilledCells(0, row_count() - 1, column, column);
258}
259int StructuredTable::CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start,
260 unsigned column_end) {
261 ASSERT_HOST(row_start <= row_end && row_end < row_count());
262 ASSERT_HOST(column_start <= column_end && column_end < column_count());
263 int cell_count = 0;
264 TBOX cell_box;
265 for (unsigned row = row_start; row <= row_end; ++row) {
266 cell_box.set_bottom(cell_y_[row]);
267 cell_box.set_top(cell_y_[row + 1]);
268 for (unsigned col = column_start; col <= column_end; ++col) {
269 cell_box.set_left(cell_x_[col]);
270 cell_box.set_right(cell_x_[col + 1]);
271 if (CountPartitions(cell_box) > 0) {
272 ++cell_count;
273 }
274 }
275 }
276 return cell_count;
277}
278
279// Makes sure that at least one cell in a row has substantial area filled.
280// This can filter out large whitespace caused by growing tables too far
281// and page numbers.
283 for (unsigned i = 0; i < column_count(); ++i) {
284 auto area_filled = CalculateCellFilledPercentage(row, i);
285 if (area_filled >= kMinFilledArea) {
286 return true;
287 }
288 }
289 return false;
290}
291
292// Finds the filled area in a cell.
293// Assume ColPartitions do not overlap for simplicity (even though they do).
294double StructuredTable::CalculateCellFilledPercentage(unsigned row, unsigned column) {
295 ASSERT_HOST(row <= row_count());
296 ASSERT_HOST(column <= column_count());
297 const TBOX kCellBox(cell_x_[column], cell_y_[row], cell_x_[column + 1], cell_y_[row + 1]);
298 ASSERT_HOST(!kCellBox.null_box());
299
301 gsearch.SetUniqueMode(true);
302 gsearch.StartRectSearch(kCellBox);
303 double area_covered = 0;
304 ColPartition *text = nullptr;
305 while ((text = gsearch.NextRectSearch()) != nullptr) {
306 if (text->IsTextType()) {
307 area_covered += text->bounding_box().intersection(kCellBox).area();
308 }
309 }
310 const int32_t current_area = kCellBox.area();
311 if (current_area == 0) {
312 return 1.0;
313 }
314 return std::min(1.0, area_covered / current_area);
315}
316
317#ifndef GRAPHICS_DISABLED
318
320 window->Brush(ScrollView::NONE);
321 window->Pen(color);
324 for (int i : cell_x_) {
325 window->Line(i, bounding_box_.bottom(), i, bounding_box_.top());
326 }
327 for (int i : cell_y_) {
328 window->Line(bounding_box_.left(), i, bounding_box_.right(), i);
329 }
330 window->UpdateWindow();
331}
333#endif
334
335// Clear structure information.
337 cell_x_.clear();
338 cell_y_.clear();
339 is_lined_ = false;
340 space_above_ = 0;
341 space_below_ = 0;
342 space_left_ = 0;
343 space_right_ = 0;
346}
348// When a table has lines, the lines should not intersect any partitions.
349// The following function makes sure the previous assumption is met.
351 // Function only called when lines exist.
352 ASSERT_HOST(cell_y_.size() >= 2 && cell_x_.size() >= 2);
353 for (int i : cell_y_) {
355 return false;
356 }
357 }
358 for (int i : cell_x_) {
359 if (CountVerticalIntersections(i) > 0) {
360 return false;
361 }
362 }
363 return true;
364}
365
366// TODO(nbeato): Could be much better than this.
367// Examples:
368// - Calculate the percentage of filled cells.
369// - Calculate the average number of ColPartitions per cell.
370// - Calculate the number of cells per row with partitions.
371// - Check if ColPartitions in adjacent cells are similar.
372// - Check that all columns are at least a certain width.
373// - etc.
375 // criteria for a table, must be at least 2x3 or 3x2
376 return row_count() >= 2 && column_count() >= 2 && cell_count() >= 6;
377}
378
379// Finds vertical splits in the ColPartitions of text_grid_ by considering
380// all possible "good" guesses. A good guess is just the left/right sides of
381// the partitions, since these locations will uniquely define where the
382// extremal values where the splits can occur. The split happens
383// in the middle of the two nearest partitions.
385 // Set of the extents of all partitions on the page.
386 std::vector<int> left_sides;
387 std::vector<int> right_sides;
388
389 // Look at each text partition. We want to find the partitions
390 // that have extremal left/right sides. These will give us a basis
391 // for the table columns.
393 gsearch.SetUniqueMode(true);
395 ColPartition *text = nullptr;
396 while ((text = gsearch.NextRectSearch()) != nullptr) {
397 if (!text->IsTextType()) {
398 continue;
399 }
400
401 ASSERT_HOST(text->bounding_box().left() < text->bounding_box().right());
402 int spacing = static_cast<int>(text->median_width() * kHorizontalSpacing / 2.0 + 0.5);
403 left_sides.push_back(text->bounding_box().left() - spacing);
404 right_sides.push_back(text->bounding_box().right() + spacing);
405 }
406 // It causes disaster below, so avoid it!
407 if (left_sides.empty() || right_sides.empty()) {
408 return;
409 }
410
411 // Since data may be inserted in grid order, we sort the left/right sides.
412 std::sort(left_sides.begin(), left_sides.end());
413 std::sort(right_sides.begin(), right_sides.end());
414
415 // At this point, in the "merged list", we expect to have a left side,
416 // followed by either more left sides or a right side. The last number
417 // should be a right side. We find places where the splits occur by looking
418 // for "valleys". If we want to force gap sizes or allow overlap, change
419 // the spacing above. If you want to let lines "slice" partitions as long
420 // as it is infrequent, change the following function.
421 FindCellSplitLocations(left_sides, right_sides, kCellSplitColumnThreshold, &cell_x_);
422}
423
424// Finds horizontal splits in the ColPartitions of text_grid_ by considering
425// all possible "good" guesses. A good guess is just the bottom/top sides of
426// the partitions, since these locations will uniquely define where the
427// extremal values where the splits can occur. The split happens
428// in the middle of the two nearest partitions.
430 // Set of the extents of all partitions on the page.
431 std::vector<int> bottom_sides;
432 std::vector<int> top_sides;
433 // We will be "shrinking" partitions, so keep the min/max around to
434 // make sure the bottom/top lines do not intersect text.
435 int min_bottom = INT32_MAX;
436 int max_top = INT32_MIN;
437
438 // Look at each text partition. We want to find the partitions
439 // that have extremal bottom/top sides. These will give us a basis
440 // for the table rows. Because the textlines can be skewed and close due
441 // to warping, the height of the partitions is toned down a little bit.
443 gsearch.SetUniqueMode(true);
445 ColPartition *text = nullptr;
446 while ((text = gsearch.NextRectSearch()) != nullptr) {
447 if (!text->IsTextType()) {
448 continue;
449 }
450
451 ASSERT_HOST(text->bounding_box().bottom() < text->bounding_box().top());
452 min_bottom = std::min(min_bottom, static_cast<int>(text->bounding_box().bottom()));
453 max_top = std::max(max_top, static_cast<int>(text->bounding_box().top()));
454
455 // Ignore "tall" text partitions, as these are usually false positive
456 // vertical text or multiple lines pulled together.
457 if (text->bounding_box().height() > max_text_height_) {
458 continue;
459 }
460
461 int spacing = static_cast<int>(text->bounding_box().height() * kVerticalSpacing / 2.0 + 0.5);
462 int bottom = text->bounding_box().bottom() - spacing;
463 int top = text->bounding_box().top() + spacing;
464 // For horizontal text, the factor can be negative. This should
465 // probably cause a warning or failure. I haven't actually checked if
466 // it happens.
467 if (bottom >= top) {
468 continue;
469 }
470
471 bottom_sides.push_back(bottom);
472 top_sides.push_back(top);
473 }
474 // It causes disaster below, so avoid it!
475 if (bottom_sides.empty() || top_sides.empty()) {
476 return;
477 }
478
479 // Since data may be inserted in grid order, we sort the bottom/top sides.
480 std::sort(bottom_sides.begin(), bottom_sides.end());
481 std::sort(top_sides.begin(), top_sides.end());
482
483 // At this point, in the "merged list", we expect to have a bottom side,
484 // followed by either more bottom sides or a top side. The last number
485 // should be a top side. We find places where the splits occur by looking
486 // for "valleys". If we want to force gap sizes or allow overlap, change
487 // the spacing above. If you want to let lines "slice" partitions as long
488 // as it is infrequent, change the following function.
489 FindCellSplitLocations(bottom_sides, top_sides, kCellSplitRowThreshold, &cell_y_);
490
491 // Recover the min/max correctly since it was shifted.
492 cell_y_[0] = min_bottom;
493 cell_y_[cell_y_.size() - 1] = max_top;
494}
495
497 space_above_ = INT32_MAX;
498 space_below_ = INT32_MAX;
499 space_right_ = INT32_MAX;
500 space_left_ = INT32_MAX;
503}
504// Finds the nearest partition in grid to the table
505// boundaries and updates the margin.
507 int below = FindVerticalMargin(grid, bounding_box_.bottom(), true);
508 space_below_ = std::min(space_below_, below);
509 int above = FindVerticalMargin(grid, bounding_box_.top(), false);
510 space_above_ = std::min(space_above_, above);
511 int left = FindHorizontalMargin(grid, bounding_box_.left(), true);
512 space_left_ = std::min(space_left_, left);
513 int right = FindHorizontalMargin(grid, bounding_box_.right(), false);
514 space_right_ = std::min(space_right_, right);
515}
516int StructuredTable::FindVerticalMargin(ColPartitionGrid *grid, int border, bool decrease) const {
517 ColPartitionGridSearch gsearch(grid);
518 gsearch.SetUniqueMode(true);
520 ColPartition *part = nullptr;
521 while ((part = gsearch.NextVerticalSearch(decrease)) != nullptr) {
522 if (!part->IsTextType() && !part->IsHorizontalLine()) {
523 continue;
524 }
525 int distance =
526 decrease ? border - part->bounding_box().top() : part->bounding_box().bottom() - border;
527 if (distance >= 0) {
528 return distance;
529 }
530 }
531 return INT32_MAX;
532}
533int StructuredTable::FindHorizontalMargin(ColPartitionGrid *grid, int border, bool decrease) const {
534 ColPartitionGridSearch gsearch(grid);
535 gsearch.SetUniqueMode(true);
537 ColPartition *part = nullptr;
538 while ((part = gsearch.NextSideSearch(decrease)) != nullptr) {
539 if (!part->IsTextType() && !part->IsVerticalLine()) {
540 continue;
541 }
542 int distance =
543 decrease ? border - part->bounding_box().right() : part->bounding_box().left() - border;
544 if (distance >= 0) {
545 return distance;
546 }
547 }
548 return INT32_MAX;
549}
550
552 const int kMaxCellHeight = 1000;
553 const int kMaxCellWidth = 1000;
554 STATS height_stats(0, kMaxCellHeight);
555 STATS width_stats(0, kMaxCellWidth);
556
557 for (unsigned i = 0; i < row_count(); ++i) {
558 height_stats.add(row_height(i), column_count());
559 }
560 for (unsigned i = 0; i < column_count(); ++i) {
561 width_stats.add(column_width(i), row_count());
562 }
563
564 median_cell_height_ = static_cast<int>(height_stats.median() + 0.5);
565 median_cell_width_ = static_cast<int>(width_stats.median() + 0.5);
566}
567
568// Looks for grid lines near the current bounding box and
569// grows the bounding box to include them if no intersections
570// will occur as a result. This is necessary because the margins
571// are calculated relative to the closest line/text. If the
572// line isn't absorbed, the margin will be the distance to the line.
575 gsearch.SetUniqueMode(true);
576
577 // Is the closest line above good? Loop multiple times for tables with
578 // multi-line (sometimes 2) borders. Limit the number of lines by
579 // making sure they stay within a table cell or so.
580 ColPartition *line = nullptr;
582 while ((line = gsearch.NextVerticalSearch(false)) != nullptr) {
583 if (!line->IsHorizontalLine()) {
584 break;
585 }
587 line->MidY());
588 if (text_search.height() > median_cell_height_ * 2) {
589 break;
590 }
591 if (CountPartitions(text_search) > 0) {
592 break;
593 }
594 bounding_box_.set_top(line->MidY());
595 }
596 // As above, is the closest line below good?
597 line = nullptr;
599 while ((line = gsearch.NextVerticalSearch(true)) != nullptr) {
600 if (!line->IsHorizontalLine()) {
601 break;
602 }
603 TBOX text_search(bounding_box_.left(), line->MidY(), bounding_box_.right(),
604 bounding_box_.bottom() - 1);
605 if (text_search.height() > median_cell_height_ * 2) {
606 break;
607 }
608 if (CountPartitions(text_search) > 0) {
609 break;
610 }
612 }
613 // TODO(nbeato): vertical lines
614}
615
616// This function will find all "0 valleys" (of any length) given two
617// arrays. The arrays are the mins and maxes of partitions (either
618// left and right or bottom and top). Since the min/max lists are generated
619// with pairs of increasing integers, we can make some assumptions in
620// the function about ordering of the overall list, which are shown in the
621// asserts.
622// The algorithm works as follows:
623// While there are numbers to process, take the smallest number.
624// If it is from the min_list, increment the "hill" counter.
625// Otherwise, decrement the "hill" counter.
626// In the process of doing this, keep track of "crossing" the
627// desired height.
628// The first/last items are extremal values of the list and known.
629// NOTE: This function assumes the lists are sorted!
630void StructuredTable::FindCellSplitLocations(const std::vector<int> &min_list,
631 const std::vector<int> &max_list, int max_merged,
632 std::vector<int> *locations) {
633 locations->clear();
634 ASSERT_HOST(min_list.size() == max_list.size());
635 if (min_list.empty()) {
636 return;
637 }
638 ASSERT_HOST(min_list.at(0) < max_list.at(0));
639 ASSERT_HOST(min_list.at(min_list.size() - 1) < max_list.at(max_list.size() - 1));
640
641 locations->push_back(min_list.at(0));
642 unsigned min_index = 0;
643 unsigned max_index = 0;
644 int stacked_partitions = 0;
645 int last_cross_position = INT32_MAX;
646 // max_index will expire after min_index.
647 // However, we can't "increase" the hill size if min_index expired.
648 // So finish processing when min_index expires.
649 while (min_index < min_list.size()) {
650 // Increase the hill count.
651 if (min_list[min_index] < max_list[max_index]) {
652 ++stacked_partitions;
653 if (last_cross_position != INT32_MAX && stacked_partitions > max_merged) {
654 int mid = (last_cross_position + min_list[min_index]) / 2;
655 locations->push_back(mid);
656 last_cross_position = INT32_MAX;
657 }
658 ++min_index;
659 } else {
660 // Decrease the hill count.
661 --stacked_partitions;
662 if (last_cross_position == INT32_MAX && stacked_partitions <= max_merged) {
663 last_cross_position = max_list[max_index];
664 }
665 ++max_index;
666 }
667 }
668 locations->push_back(max_list.at(max_list.size() - 1));
669}
670
671// Counts the number of partitions in the table
672// box that intersection the given x value.
674 int count = 0;
675 // Make a small box to keep the search time down.
676 const int kGridSize = text_grid_->gridsize();
677 TBOX vertical_box = bounding_box_;
678 vertical_box.set_left(x - kGridSize);
679 vertical_box.set_right(x + kGridSize);
680
682 gsearch.SetUniqueMode(true);
683 gsearch.StartRectSearch(vertical_box);
684 ColPartition *text = nullptr;
685 while ((text = gsearch.NextRectSearch()) != nullptr) {
686 if (!text->IsTextType()) {
687 continue;
688 }
689 const TBOX &box = text->bounding_box();
690 if (box.left() < x && x < box.right()) {
691 ++count;
692 }
693 }
694 return count;
695}
696
697// Counts the number of partitions in the table
698// box that intersection the given y value.
700 int count = 0;
701 // Make a small box to keep the search time down.
702 const int kGridSize = text_grid_->gridsize();
703 TBOX horizontal_box = bounding_box_;
704 horizontal_box.set_bottom(y - kGridSize);
705 horizontal_box.set_top(y + kGridSize);
706
708 gsearch.SetUniqueMode(true);
709 gsearch.StartRectSearch(horizontal_box);
710 ColPartition *text = nullptr;
711 while ((text = gsearch.NextRectSearch()) != nullptr) {
712 if (!text->IsTextType()) {
713 continue;
714 }
715
716 const TBOX &box = text->bounding_box();
717 if (box.bottom() < y && y < box.top()) {
718 ++count;
719 }
720 }
721 return count;
722}
723
724// Counts how many text partitions are in this box.
725// This is used to count partitions in cells, as that can indicate
726// how "strong" a potential table row/column (or even full table) actually is.
729 gsearch.SetUniqueMode(true);
730 gsearch.StartRectSearch(box);
731 int count = 0;
732 ColPartition *text = nullptr;
733 while ((text = gsearch.NextRectSearch()) != nullptr) {
734 if (text->IsTextType()) {
735 ++count;
736 }
737 }
738 return count;
739}
740
744
746
748 text_grid_ = text_grid;
749}
751 line_grid_ = line_grid;
752}
754 min_height_ = height;
755}
757 min_width_ = width;
758}
760 max_text_height_ = height;
761}
762
764 auto *table = new StructuredTable();
765 table->Init();
766 table->set_text_grid(text_grid_);
767 table->set_line_grid(line_grid_);
768 table->set_max_text_height(max_text_height_);
769
770 // Try to solve this simple case, a table with *both*
771 // vertical and horizontal lines.
772 if (RecognizeLinedTable(guess, table)) {
773 return table;
774 }
775
776 // Fallback to whitespace if that failed.
777 // TODO(nbeato): Break this apart to take advantage of horizontal
778 // lines or vertical lines when present.
779 if (RecognizeWhitespacedTable(guess, table)) {
780 return table;
781 }
782
783 // No table found...
784 delete table;
785 return nullptr;
786}
787
789 if (!HasSignificantLines(guess_box)) {
790 return false;
791 }
792 TBOX line_bound = guess_box;
793 if (!FindLinesBoundingBox(&line_bound)) {
794 return false;
795 }
796 table->set_bounding_box(line_bound);
797 return table->FindLinedStructure();
798}
799
800// Quick implementation. Just count the number of lines in the box.
801// A better implementation would counter intersections and look for connected
802// components. It could even go as far as finding similar length lines.
803// To account for these possible issues, the VerifyLinedTableCells function
804// will reject lined tables that cause intersections with text on the page.
805// TODO(nbeato): look for "better" lines
808 box_search.SetUniqueMode(true);
809 box_search.StartRectSearch(guess);
810 ColPartition *line = nullptr;
811 int vertical_count = 0;
812 int horizontal_count = 0;
813
814 while ((line = box_search.NextRectSearch()) != nullptr) {
815 if (line->IsHorizontalLine()) {
816 ++horizontal_count;
817 }
818 if (line->IsVerticalLine()) {
819 ++vertical_count;
820 }
821 }
822
823 return vertical_count >= kLinedTableMinVerticalLines &&
824 horizontal_count >= kLinedTableMinHorizontalLines;
825}
826
827// Given a bounding box with a bunch of horizontal / vertical lines,
828// we just find the extents of all of these lines iteratively.
829// The box will be at least as large as guess. This
830// could possibly be a bad assumption.
831// It is guaranteed to halt in at least O(n * gridarea) where n
832// is the number of lines.
833// The assumption is that growing the box iteratively will add lines
834// several times, but eventually we'll find the extents.
835//
836// For tables, the approach is a bit aggressive, a single line (which could be
837// noise or a column ruling) can destroy the table inside.
838//
839// TODO(nbeato): This is a quick first implementation.
840// A better implementation would actually look for consistency
841// in extents of the lines and find the extents using lines
842// that clearly describe the table. This would allow the
843// lines to "vote" for height/width. An approach like
844// this would solve issues with page layout rulings.
845// I haven't looked for these issues yet, so I can't even
846// say they happen confidently.
848 // The first iteration will tell us if there are lines
849 // present and shrink the box to a minimal iterative size.
850 if (!FindLinesBoundingBoxIteration(bounding_box)) {
851 return false;
852 }
853
854 // Keep growing until the area of the table stabilizes.
855 // The box can only get bigger, increasing area.
856 bool changed = true;
857 while (changed) {
858 changed = false;
859 int old_area = bounding_box->area();
860 bool check = FindLinesBoundingBoxIteration(bounding_box);
861 // At this point, the function will return true.
862 ASSERT_HOST(check);
863 ASSERT_HOST(bounding_box->area() >= old_area);
864 changed = (bounding_box->area() > old_area);
865 }
866
867 return true;
868}
869
871 // Search for all of the lines in the current box, keeping track of extents.
873 box_search.SetUniqueMode(true);
874 box_search.StartRectSearch(*bounding_box);
875 ColPartition *line = nullptr;
876 bool first_line = true;
877
878 while ((line = box_search.NextRectSearch()) != nullptr) {
879 if (line->IsLineType()) {
880 if (first_line) {
881 // The first iteration can shrink the box.
882 *bounding_box = line->bounding_box();
883 first_line = false;
884 } else {
885 *bounding_box += line->bounding_box();
886 }
887 }
888 }
889 return !first_line;
890}
891
892// The goal of this function is to move the table boundaries around and find
893// a table that maximizes the whitespace around the table while maximizing
894// the cellular structure. As a result, it gets confused by headers, footers,
895// and merged columns (text that crosses columns). There is a tolerance
896// that allows a few partitions to count towards potential cell merges.
897// It's the max_merged parameter to FindPartitionLocations.
898// It can work, but it needs some false positive remove on boundaries.
899// For now, the grid structure must not intersect any partitions.
900// Also, small tolerance is added to the horizontal lines for tightly packed
901// tables. The tolerance is added by adjusting the bounding boxes of the
902// partitions (in FindHorizontalPartitions). The current implementation
903// only adjusts the vertical extents of the table.
904//
905// Also note. This was hacked at a lot. It could probably use some
906// more hacking at to find a good set of border conditions and then a
907// nice clean up.
909 TBOX best_box = guess_box; // Best borders known.
910 int best_below = 0; // Margin size above best table.
911 int best_above = 0; // Margin size below best table.
912 TBOX adjusted = guess_box; // The search box.
913
914 // We assume that the guess box is somewhat accurate, so we don't allow
915 // the adjusted border to pass half of the guessed area. This prevents
916 // "negative" tables from forming.
917 const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
918 // Keeps track of the most columns in an accepted table. The resulting table
919 // may be less than the max, but we don't want to stray too far.
920 unsigned best_cols = 0;
921 // Make sure we find a good border.
922 bool found_good_border = false;
923
924 // Find the bottom of the table by trying a few different locations. For
925 // each location, the top, left, and right are fixed. We start the search
926 // in a smaller table to favor best_cols getting a good estimate sooner.
927 int last_bottom = INT32_MAX;
928 int bottom =
929 NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY - min_height_ / 2, true);
930 int top =
931 NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
932 adjusted.set_top(top);
933
934 // Headers/footers can be spaced far from everything.
935 // Make sure that the space below is greater than the space above
936 // the lowest row.
937 int previous_below = 0;
938 const int kMaxChances = 10;
939 int chances = kMaxChances;
940 while (bottom != last_bottom) {
941 adjusted.set_bottom(bottom);
942
943 if (adjusted.height() >= min_height_) {
944 // Try to fit the grid on the current box. We give it a chance
945 // if the number of columns didn't significantly drop.
946 table->set_bounding_box(adjusted);
947 if (table->FindWhitespacedStructure() &&
948 table->column_count() >= best_cols * kRequiredColumns) {
949 if (false && IsWeakTableRow(table, 0)) {
950 // Currently buggy, but was looking promising so disabled.
951 --chances;
952 } else {
953 // We favor 2 things,
954 // 1- Adding rows that have partitioned data.
955 // 2- Better margins (to find header/footer).
956 // For better tables, we just look for multiple cells in the
957 // bottom row with data in them.
958 // For margins, the space below the last row should
959 // be better than a table with the last row removed.
960 chances = kMaxChances;
961 double max_row_height = kMaxRowSize * table->median_cell_height();
962 if ((table->space_below() * kMarginFactor >= best_below &&
963 table->space_below() >= previous_below) ||
964 (table->CountFilledCellsInRow(0) > 1 && table->row_height(0) < max_row_height)) {
965 best_box.set_bottom(bottom);
966 best_below = table->space_below();
967 best_cols = std::max(table->column_count(), best_cols);
968 found_good_border = true;
969 }
970 }
971 previous_below = table->space_below();
972 } else {
973 --chances;
974 }
975 }
976 if (chances <= 0) {
977 break;
978 }
979
980 last_bottom = bottom;
981 bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_bottom, true);
982 }
983 if (!found_good_border) {
984 return false;
985 }
986
987 // TODO(nbeato) comments: follow modified code above... put it in a function!
988 found_good_border = false;
989 int last_top = INT32_MIN;
990 top =
991 NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
992 int previous_above = 0;
993 chances = kMaxChances;
994
995 adjusted.set_bottom(best_box.bottom());
996 while (last_top != top) {
997 adjusted.set_top(top);
998 if (adjusted.height() >= min_height_) {
999 table->set_bounding_box(adjusted);
1000 if (table->FindWhitespacedStructure() &&
1001 table->column_count() >= best_cols * kRequiredColumns) {
1002 int last_row = table->row_count() - 1;
1003 if (false && IsWeakTableRow(table, last_row)) {
1004 // Currently buggy, but was looking promising so disabled.
1005 --chances;
1006 } else {
1007 chances = kMaxChances;
1008 double max_row_height = kMaxRowSize * table->median_cell_height();
1009 if ((table->space_above() * kMarginFactor >= best_above &&
1010 table->space_above() >= previous_above) ||
1011 (table->CountFilledCellsInRow(last_row) > 1 &&
1012 table->row_height(last_row) < max_row_height)) {
1013 best_box.set_top(top);
1014 best_above = table->space_above();
1015 best_cols = std::max(table->column_count(), best_cols);
1016 found_good_border = true;
1017 }
1018 }
1019 previous_above = table->space_above();
1020 } else {
1021 --chances;
1022 }
1023 }
1024 if (chances <= 0) {
1025 break;
1026 }
1027
1028 last_top = top;
1029 top = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_top, false);
1030 }
1031
1032 if (!found_good_border) {
1033 return false;
1034 }
1035
1036 // If we get here, this shouldn't happen. It can be an assert, but
1037 // I haven't tested it enough to make it crash things.
1038 if (best_box.null_box()) {
1039 return false;
1040 }
1041
1042 // Given the best locations, fit the box to those locations.
1043 table->set_bounding_box(best_box);
1044 return table->FindWhitespacedStructure();
1045}
1046
1047// Finds the closest value to y that can safely cause a horizontal
1048// split in the partitions.
1049// This function has been buggy and not as reliable as I would've
1050// liked. I suggest finding all of the splits using the
1051// FindPartitionLocations once and then just keeping the results
1052// of that function cached somewhere.
1053int TableRecognizer::NextHorizontalSplit(int left, int right, int y, bool top_to_bottom) {
1055 gsearch.SetUniqueMode(true);
1056 gsearch.StartVerticalSearch(left, right, y);
1057 ColPartition *text = nullptr;
1058 int last_y = y;
1059 while ((text = gsearch.NextVerticalSearch(top_to_bottom)) != nullptr) {
1060 if (!text->IsTextType() || !text->IsHorizontalType()) {
1061 continue;
1062 }
1063 if (text->bounding_box().height() > max_text_height_) {
1064 continue;
1065 }
1066
1067 const TBOX &text_box = text->bounding_box();
1068 if (top_to_bottom && (last_y >= y || last_y <= text_box.top())) {
1069 last_y = std::min(last_y, static_cast<int>(text_box.bottom()));
1070 continue;
1071 }
1072 if (!top_to_bottom && (last_y <= y || last_y >= text_box.bottom())) {
1073 last_y = std::max(last_y, static_cast<int>(text_box.top()));
1074 continue;
1075 }
1076
1077 return last_y;
1078 }
1079 // If none is found, we at least want to preserve the min/max,
1080 // which defines the overlap of y with the last partition in the grid.
1081 return last_y;
1082}
1083
1084} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:54
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first, const UnicodeText::const_iterator &last)
Definition: unicodetext.cc:44
const double y
int * count
const double kVerticalSpacing
Definition: tablerecog.cpp:37
const double kGoodRowNumberOfColumnsLarge
Definition: tablerecog.cpp:57
const int kLinedTableMinHorizontalLines
Definition: tablerecog.cpp:44
const int kCellSplitRowThreshold
Definition: tablerecog.cpp:40
const double kHorizontalSpacing
Definition: tablerecog.cpp:34
const int kCellSplitColumnThreshold
Definition: tablerecog.cpp:41
const double kRequiredColumns
Definition: tablerecog.cpp:47
const double kGoodRowNumberOfColumnsSmall[]
Definition: tablerecog.cpp:55
const double kMinFilledArea
Definition: tablerecog.cpp:60
const double kMaxRowSize
Definition: tablerecog.cpp:52
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:34
const double kMarginFactor
Definition: tablerecog.cpp:49
const int kLinedTableMinVerticalLines
Definition: tablerecog.cpp:43
TDimension left() const
Definition: rect.h:82
TDimension height() const
Definition: rect.h:118
void set_right(int x)
Definition: rect.h:92
void set_left(int x)
Definition: rect.h:85
TDimension top() const
Definition: rect.h:68
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:84
bool null_box() const
Definition: rect.h:60
void set_bottom(int y)
Definition: rect.h:78
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
int32_t area() const
Definition: rect.h:134
void set_top(int y)
Definition: rect.h:71
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
double median() const
Definition: statistc.cpp:241
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:837
void SetUniqueMode(bool mode)
Definition: bbgrid.h:249
BBC * NextSideSearch(bool right_to_left)
Definition: bbgrid.h:802
BBC * NextRectSearch()
Definition: bbgrid.h:896
void StartSideSearch(int x, int ymin, int ymax)
Definition: bbgrid.h:788
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:849
void StartRectSearch(const TBOX &rect)
Definition: bbgrid.h:884
int gridsize() const
Definition: bbgrid.h:63
bool IsHorizontalLine() const
Definition: colpartition.h:459
bool IsVerticalLine() const
Definition: colpartition.h:454
const TBOX & bounding_box() const
Definition: colpartition.h:108
bool IsHorizontalType() const
Definition: colpartition.h:445
static void FindCellSplitLocations(const std::vector< int > &min_list, const std::vector< int > &max_list, int max_merged, std::vector< int > *locations)
Definition: tablerecog.cpp:630
int row_height(unsigned row) const
Definition: tablerecog.cpp:135
std::vector< int > cell_y_
Definition: tablerecog.h:238
bool VerifyRowFilled(int row)
Definition: tablerecog.cpp:282
unsigned column_count() const
Definition: tablerecog.cpp:117
const TBOX & bounding_box() const
Definition: tablerecog.cpp:126
ColPartitionGrid * text_grid_
Definition: tablerecog.h:231
void set_max_text_height(int height)
Definition: tablerecog.cpp:108
std::vector< int > cell_x_
Definition: tablerecog.h:237
bool DoesPartitionFit(const ColPartition &part) const
Definition: tablerecog.cpp:234
ColPartitionGrid * line_grid_
Definition: tablerecog.h:232
int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const
Definition: tablerecog.cpp:516
unsigned cell_count() const
Definition: tablerecog.cpp:120
void set_bounding_box(const TBOX &box)
Definition: tablerecog.cpp:123
int CountFilledCellsInColumn(int column)
Definition: tablerecog.cpp:256
double CalculateCellFilledPercentage(unsigned row, unsigned column)
Definition: tablerecog.cpp:294
void Display(ScrollView *window, ScrollView::Color color)
Definition: tablerecog.cpp:319
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:105
int CountHorizontalIntersections(int y)
Definition: tablerecog.cpp:699
int CountFilledCellsInRow(int row)
Definition: tablerecog.cpp:253
int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const
Definition: tablerecog.cpp:533
int column_width(unsigned column) const
Definition: tablerecog.cpp:139
int CountPartitions(const TBOX &box)
Definition: tablerecog.cpp:727
int CountVerticalIntersections(int x)
Definition: tablerecog.cpp:673
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:102
void UpdateMargins(ColPartitionGrid *grid)
Definition: tablerecog.cpp:506
unsigned row_count() const
Definition: tablerecog.cpp:114
bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:788
bool FindLinesBoundingBoxIteration(TBOX *bounding_box)
Definition: tablerecog.cpp:870
bool FindLinesBoundingBox(TBOX *bounding_box)
Definition: tablerecog.cpp:847
ColPartitionGrid * text_grid_
Definition: tablerecog.h:356
void set_min_width(int width)
Definition: tablerecog.cpp:756
void set_max_text_height(int height)
Definition: tablerecog.cpp:759
int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom)
void set_line_grid(ColPartitionGrid *lines)
Definition: tablerecog.cpp:750
bool HasSignificantLines(const TBOX &guess)
Definition: tablerecog.cpp:806
void set_text_grid(ColPartitionGrid *text)
Definition: tablerecog.cpp:747
ColPartitionGrid * line_grid_
Definition: tablerecog.h:357
void set_min_height(int height)
Definition: tablerecog.cpp:753
bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:908
StructuredTable * RecognizeTable(const TBOX &guess_box)
Definition: tablerecog.cpp:763
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:498
void Pen(Color color)
Definition: scrollview.cpp:710
void Brush(Color color)
Definition: scrollview.cpp:716
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:576