tesseract v5.3.3.20231005
tesseract::TestableTableRecognizer Class Reference
Inheritance diagram for tesseract::TestableTableRecognizer:
tesseract::TableRecognizer

Public Member Functions

bool FindLinesBoundingBox (TBOX *bounding_box)
 
bool HasSignificantLines (const TBOX &guess)
 
bool RecognizeLinedTable (const TBOX &guess_box, StructuredTable *table)
 
StructuredTable * RecognizeTable (const TBOX &guess_box)
 
bool RecognizeWhitespacedTable (const TBOX &guess_box, StructuredTable *table)
 
- Public Member Functions inherited from tesseract::TableRecognizer
 TableRecognizer ()=default
 
 ~TableRecognizer ()=default
 
void Init ()
 
void set_text_grid (ColPartitionGrid *text)
 
void set_line_grid (ColPartitionGrid *lines)
 
void set_min_height (int height)
 
void set_min_width (int width)
 
void set_max_text_height (int height)
 
StructuredTable * RecognizeTable (const TBOX &guess_box)
 

Additional Inherited Members

- Protected Member Functions inherited from tesseract::TableRecognizer
bool RecognizeLinedTable (const TBOX &guess_box, StructuredTable *table)
 
bool HasSignificantLines (const TBOX &guess)
 
bool FindLinesBoundingBox (TBOX *bounding_box)
 
bool FindLinesBoundingBoxIteration (TBOX *bounding_box)
 
bool RecognizeWhitespacedTable (const TBOX &guess_box, StructuredTable *table)
 
int NextHorizontalSplit (int left, int right, int y, bool top_to_bottom)
 
- Protected Attributes inherited from tesseract::TableRecognizer
ColPartitionGrid * text_grid_ = nullptr
 
ColPartitionGrid * line_grid_ = nullptr
 
int min_height_ = 0
 
int min_width_ = 0
 
int max_text_height_ = INT32_MAX
 

Detailed Description

Definition at line 22 of file tablerecog_test.cc.

Member Function Documentation

◆ FindLinesBoundingBox()

bool tesseract::TableRecognizer::FindLinesBoundingBox ( TBOX * bounding_box)

Definition at line 847 of file tablerecog.cpp.

847 {
848 // The first iteration will tell us if there are lines
849 // present and shrink the box to a minimal iterative size.
850 if (!FindLinesBoundingBoxIteration(bounding_box)) {
851 return false;
852 }
853
854 // Keep growing until the area of the table stabilizes.
855 // The box can only get bigger, increasing area.
856 bool changed = true;
857 while (changed) {
858 changed = false;
859 int old_area = bounding_box->area();
860 bool check = FindLinesBoundingBoxIteration(bounding_box);
861 // At this point, the function will return true.
862 ASSERT_HOST(check);
863 ASSERT_HOST(bounding_box->area() >= old_area);
864 changed = (bounding_box->area() > old_area);
865 }
866
867 return true;
868}
#define ASSERT_HOST(x)
Definition: errcode.h:54
bool FindLinesBoundingBoxIteration(TBOX *bounding_box)
Definition: tablerecog.cpp:870

◆ HasSignificantLines()

bool tesseract::TableRecognizer::HasSignificantLines ( const TBOX & guess)

Definition at line 806 of file tablerecog.cpp.

806 {
807 ColPartitionGridSearch box_search(line_grid_);
808 box_search.SetUniqueMode(true);
809 box_search.StartRectSearch(guess);
810 ColPartition *line = nullptr;
811 int vertical_count = 0;
812 int horizontal_count = 0;
813
814 while ((line = box_search.NextRectSearch()) != nullptr) {
815 if (line->IsHorizontalLine()) {
816 ++horizontal_count;
817 }
818 if (line->IsVerticalLine()) {
819 ++vertical_count;
820 }
821 }
822
823 return vertical_count >= kLinedTableMinVerticalLines &&
824 horizontal_count >= kLinedTableMinHorizontalLines;
825}
const int kLinedTableMinHorizontalLines
Definition: tablerecog.cpp:44
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:919
const int kLinedTableMinVerticalLines
Definition: tablerecog.cpp:43
ColPartitionGrid * line_grid_
Definition: tablerecog.h:357

◆ RecognizeLinedTable()

bool tesseract::TableRecognizer::RecognizeLinedTable ( const TBOX & guess_box,
StructuredTable * table 
)

Definition at line 788 of file tablerecog.cpp.

788 {
789 if (!HasSignificantLines(guess_box)) {
790 return false;
791 }
792 TBOX line_bound = guess_box;
793 if (!FindLinesBoundingBox(&line_bound)) {
794 return false;
795 }
796 table->set_bounding_box(line_bound);
797 return table->FindLinedStructure();
798}
@ TBOX
bool FindLinesBoundingBox(TBOX *bounding_box)
Definition: tablerecog.cpp:847
bool HasSignificantLines(const TBOX &guess)
Definition: tablerecog.cpp:806

◆ RecognizeTable()

StructuredTable * tesseract::TableRecognizer::RecognizeTable ( const TBOX & guess_box)

Definition at line 763 of file tablerecog.cpp.

763 {
764 auto *table = new StructuredTable();
765 table->Init();
766 table->set_text_grid(text_grid_);
767 table->set_line_grid(line_grid_);
768 table->set_max_text_height(max_text_height_);
769
770 // Try to solve this simple case, a table with *both*
771 // vertical and horizontal lines.
772 if (RecognizeLinedTable(guess, table)) {
773 return table;
774 }
775
776 // Fallback to whitespace if that failed.
777 // TODO(nbeato): Break this apart to take advantage of horizontal
778 // lines or vertical lines when present.
779 if (RecognizeWhitespacedTable(guess, table)) {
780 return table;
781 }
782
783 // No table found...
784 delete table;
785 return nullptr;
786}
ColPartitionGrid * text_grid_
Definition: tablerecog.h:356
bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:788
bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table)
Definition: tablerecog.cpp:908

◆ RecognizeWhitespacedTable()

bool tesseract::TableRecognizer::RecognizeWhitespacedTable ( const TBOX & guess_box,
StructuredTable * table 
)

Definition at line 908 of file tablerecog.cpp.

908 {
909 TBOX best_box = guess_box; // Best borders known.
910 int best_below = 0; // Margin size below best table.
911 int best_above = 0; // Margin size above best table.
912 TBOX adjusted = guess_box; // The search box.
913
914 // We assume that the guess box is somewhat accurate, so we don't allow
915 // the adjusted border to pass half of the guessed area. This prevents
916 // "negative" tables from forming.
917 const int kMidGuessY = (guess_box.bottom() + guess_box.top()) / 2;
918 // Keeps track of the most columns in an accepted table. The resulting table
919 // may be less than the max, but we don't want to stray too far.
920 unsigned best_cols = 0;
921 // Make sure we find a good border.
922 bool found_good_border = false;
923
924 // Find the bottom of the table by trying a few different locations. For
925 // each location, the top, left, and right are fixed. We start the search
926 // in a smaller table to favor best_cols getting a good estimate sooner.
927 int last_bottom = INT32_MAX;
928 int bottom =
929 NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY - min_height_ / 2, true);
930 int top =
931 NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
932 adjusted.set_top(top);
933
934 // Headers/footers can be spaced far from everything.
935 // Make sure that the space below is greater than the space above
936 // the lowest row.
937 int previous_below = 0;
938 const int kMaxChances = 10;
939 int chances = kMaxChances;
940 while (bottom != last_bottom) {
941 adjusted.set_bottom(bottom);
942
943 if (adjusted.height() >= min_height_) {
944 // Try to fit the grid on the current box. We give it a chance
945 // if the number of columns didn't significantly drop.
946 table->set_bounding_box(adjusted);
947 if (table->FindWhitespacedStructure() &&
948 table->column_count() >= best_cols * kRequiredColumns) {
949 if (false && IsWeakTableRow(table, 0)) {
950 // Currently buggy, but was looking promising so disabled.
951 --chances;
952 } else {
953 // We favor 2 things,
954 // 1- Adding rows that have partitioned data.
955 // 2- Better margins (to find header/footer).
956 // For better tables, we just look for multiple cells in the
957 // bottom row with data in them.
958 // For margins, the space below the last row should
959 // be better than a table with the last row removed.
960 chances = kMaxChances;
961 double max_row_height = kMaxRowSize * table->median_cell_height();
962 if ((table->space_below() * kMarginFactor >= best_below &&
963 table->space_below() >= previous_below) ||
964 (table->CountFilledCellsInRow(0) > 1 && table->row_height(0) < max_row_height)) {
965 best_box.set_bottom(bottom);
966 best_below = table->space_below();
967 best_cols = std::max(table->column_count(), best_cols);
968 found_good_border = true;
969 }
970 }
971 previous_below = table->space_below();
972 } else {
973 --chances;
974 }
975 }
976 if (chances <= 0) {
977 break;
978 }
979
980 last_bottom = bottom;
981 bottom = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_bottom, true);
982 }
983 if (!found_good_border) {
984 return false;
985 }
986
987 // TODO(nbeato) comments: follow modified code above... put it in a function!
988 found_good_border = false;
989 int last_top = INT32_MIN;
990 top =
991 NextHorizontalSplit(guess_box.left(), guess_box.right(), kMidGuessY + min_height_ / 2, false);
992 int previous_above = 0;
993 chances = kMaxChances;
994
995 adjusted.set_bottom(best_box.bottom());
996 while (last_top != top) {
997 adjusted.set_top(top);
998 if (adjusted.height() >= min_height_) {
999 table->set_bounding_box(adjusted);
1000 if (table->FindWhitespacedStructure() &&
1001 table->column_count() >= best_cols * kRequiredColumns) {
1002 int last_row = table->row_count() - 1;
1003 if (false && IsWeakTableRow(table, last_row)) {
1004 // Currently buggy, but was looking promising so disabled.
1005 --chances;
1006 } else {
1007 chances = kMaxChances;
1008 double max_row_height = kMaxRowSize * table->median_cell_height();
1009 if ((table->space_above() * kMarginFactor >= best_above &&
1010 table->space_above() >= previous_above) ||
1011 (table->CountFilledCellsInRow(last_row) > 1 &&
1012 table->row_height(last_row) < max_row_height)) {
1013 best_box.set_top(top);
1014 best_above = table->space_above();
1015 best_cols = std::max(table->column_count(), best_cols);
1016 found_good_border = true;
1017 }
1018 }
1019 previous_above = table->space_above();
1020 } else {
1021 --chances;
1022 }
1023 }
1024 if (chances <= 0) {
1025 break;
1026 }
1027
1028 last_top = top;
1029 top = NextHorizontalSplit(guess_box.left(), guess_box.right(), last_top, false);
1030 }
1031
1032 if (!found_good_border) {
1033 return false;
1034 }
1035
1036 // If we get here, this shouldn't happen. It can be an assert, but
1037 // I haven't tested it enough to make it crash things.
1038 if (best_box.null_box()) {
1039 return false;
1040 }
1041
1042 // Given the best locations, fit the box to those locations.
1043 table->set_bounding_box(best_box);
1044 return table->FindWhitespacedStructure();
1045}
const double kRequiredColumns
Definition: tablerecog.cpp:47
const double kMaxRowSize
Definition: tablerecog.cpp:52
const double kMarginFactor
Definition: tablerecog.cpp:49
int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom)

The documentation for this class was generated from the following file: