tesseract v5.3.3.20231005
tablerecog.h
Go to the documentation of this file.
1
2// File: tablerecog.h
3// Description: Functions to detect structure of tables.
4// Author: Nicholas Beato
5//
6// (C) Copyright 2010, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifndef TABLERECOG_H_
20#define TABLERECOG_H_
21
22#include "colpartitiongrid.h"
23
24namespace tesseract {
25
26// There are 2 classes in this file. They have 2 different purposes.
27// - StructuredTable contains the methods to find the structure given
28// a specific bounding box and grow that structure.
29// - TableRecognizer contains the methods to adjust the possible positions
30// of a table without worrying about structure.
31//
32// To use these classes, the assumption is that the TableFinder will
33// have a guess of the location of a table (or possibly over/undersegmented
34// tables). The TableRecognizer is responsible for finding the table boundaries
35// at a high level. The StructuredTable class is responsible for determining
36// the structure of the table and trying to maximize its bounds while retaining
37// the structure.
38// (The latter part is not implemented yet, but that was the goal).
39//
40// While on the boundary discussion, keep in mind that this is a first pass.
41// There should eventually be some things like internal structure checks,
42// and, more importantly, surrounding text flow checks.
43//
44
45// Usage:
46// The StructuredTable class contains methods to query a potential table.
47// It has functions to find structure, count rows, find ColPartitions that
48// intersect gridlines, etc. It is not meant to blindly find a table. It
49// is meant to start with a known table location and enhance it.
50// Usage:
51// ColPartitionGrid text_grid, line_grid; // init
52// TBOX table_box; // known location of table location
53//
54// StructuredTable table;
55// table.Init(); // construction code
56// table.set_text_grid(/* text */); // These 2 grids can be the same!
57// table.set_line_grid(/* lines */);
58// table.set_min_text_height(10); // Filter vertical and tall text.
59// // IMPORTANT! The table needs to be told where it is!
60// table.set_bounding_box(table_box); // Set initial table location.
61// if (table.FindWhitespacedStructure()) {
62// // process table
63// table.column_count(); // number of columns
64// table.row_count(); // number of rows
65// table.cells_count(); // number of cells
66// table.bounding_box(); // updated bounding box
67// // etc.
68// }
69//
71public:
73 ~StructuredTable() = default;
74
75 // Initialization code. Must be called after the constructor.
76 void Init();
77
78 // Sets the grids used by the table. These can be changed between
79 // calls to Recognize. They are treated as read-only data.
80 void set_text_grid(ColPartitionGrid *text);
81 void set_line_grid(ColPartitionGrid *lines);
82 // Filters text partitions that are ridiculously tall to prevent
83 // merging rows.
84 void set_max_text_height(int height);
85
86 // Basic accessors. Some are treated as attributes despite having indirect
87 // representation.
88 bool is_lined() const;
89 unsigned row_count() const;
90 unsigned column_count() const;
91 unsigned cell_count() const;
92 void set_bounding_box(const TBOX &box);
93 const TBOX &bounding_box() const;
94 int median_cell_height();
95 int median_cell_width();
96 int row_height(unsigned row) const;
97 int column_width(unsigned column) const;
98 int space_above() const;
99 int space_below() const;
100
101 // Given enough horizontal and vertical lines in a region, create this table
102 // based on the structure given by the lines. Return true if it worked out.
103 // Code assumes the lines exist. It is the caller's responsibility to check
104 // for lines and find an appropriate bounding box.
105 bool FindLinedStructure();
106
107 // The main subroutine for finding generic table structure. The function
108 // finds the grid structure in the given box. Returns true if a good grid
109 // exists, implying that "this" table is valid.
110 bool FindWhitespacedStructure();
111
115
116 // Returns true if inserting part into the table does not cause any
117 // cell merges.
118 bool DoesPartitionFit(const ColPartition &part) const;
119 // Checks if a sub-table has multiple data cells filled.
120 int CountFilledCells();
121 int CountFilledCellsInRow(int row);
122 int CountFilledCellsInColumn(int column);
123 int CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start, unsigned column_end);
124
125 // Makes sure that at least one cell in a row has substantial area filled.
126 // This can filter out large whitespace caused by growing tables too far
127 // and page numbers.
128 // (currently bugged for some reason).
129 bool VerifyRowFilled(int row);
130 // Finds the filled area in a cell.
131 double CalculateCellFilledPercentage(unsigned row, unsigned column);
132
133 // Debug display, draws the table in the given color. If the table is not
134 // valid, the table and "best" grid lines are still drawn in the given color.
135 void Display(ScrollView *window, ScrollView::Color color);
136
137protected:
138 // Clear the structure information.
139 void ClearStructure();
140
144
145 // Verifies the lines do not intersect partitions. This happens when
146 // the lines are in column boundaries and extend the full page. As a result,
147 // the grid lines go through column text. The condition is detectable.
148 bool VerifyLinedTableCells();
149
153
154 // This is the function to change if you want to filter resulting tables
155 // better. Right now it just checks for a minimum cell count and such.
156 // You could add things like maximum number of ColPartitions per cell or
157 // similar.
158 bool VerifyWhitespacedTable();
159 // Find the columns of a table using whitespace.
160 void FindWhitespacedColumns();
161 // Find the rows of a table using whitespace.
162 void FindWhitespacedRows();
163
167
168 // Calculates the whitespace around the table using the table boundary and
169 // the supplied grids (set_text_grid and set_line_grid).
170 void CalculateMargins();
171 // Update the table margins with the supplied grid. This is
172 // only called by calculate margins to use multiple grid sources.
173 void UpdateMargins(ColPartitionGrid *grid);
174 int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const;
175 int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const;
176 // Calculates stats on the table, namely the median cell height and width.
177 void CalculateStats();
178
182
183 // Given a whitespaced table, this looks for bordering lines that might
184 // be page layout boxes around the table. It is necessary to get the margins
185 // correct on the table. If the lines are not joined, the margins will be
186 // the distance to the line, which is not right.
187 void AbsorbNearbyLines();
188
189 // Nice utility function for finding partition gaps. You feed it a sorted
190 // list of all of the mins/maxes of the partitions in the table, and it gives
191 // you the gaps (middle). This works for both vertical and horizontal
192 // gaps.
193 //
194 // If you want to allow slight overlap in the division and the partitions,
195 // just scale down the partitions before inserting them in the list.
196 // Likewise, you can force at least some space between partitions.
197 // This trick is how the horizontal partitions are done (since the page
198 // skew could make it hard to find splits in the text).
199 //
200 // As a result, "0 distance" between closest partitions causes a gap.
201 // This is not a programmatic assumption. It is intentional and simplifies
202 // things.
203 //
204 // "max_merged" indicates both the minimum number of stacked partitions
205 // to cause a cell (add 1 to it), and the maximum number of partitions that
206 // a grid line can intersect. For example, if max_merged is 0, then lines
207 // are inserted wherever space exists between partitions. If it is 2,
208 // lines may intersect 2 partitions at most, but you also need at least
209 // 2 partitions to generate a line.
210 static void FindCellSplitLocations(const std::vector<int> &min_list,
211 const std::vector<int> &max_list, int max_merged,
212 std::vector<int> *locations);
213
217
218 // Counts the number of ColPartitions that intersect vertical cell
219 // division at this x value. Used by VerifyLinedTable.
220 int CountVerticalIntersections(int x);
221 int CountHorizontalIntersections(int y);
222
223 // Counts how many text partitions are in this box.
224 int CountPartitions(const TBOX &box);
225
229
230 // Input data, used as read only data to make decisions.
231 ColPartitionGrid *text_grid_; // Text ColPartitions
232 ColPartitionGrid *line_grid_; // Line ColPartitions
233 // Table structure.
234 // bounding box is a convenient external representation.
235 // cell_x_ and cell_y_ indicate the grid lines.
236 TBOX bounding_box_; // Bounding box
237 std::vector<int> cell_x_; // Locations of vertical divisions (sorted)
238 std::vector<int> cell_y_; // Locations of horizontal divisions (sorted)
239 bool is_lined_; // Is the table backed up by a line structure
240 // Table margins, set via CalculateMargins
247 // Filters, used to prevent awkward partitions from destroying structure.
249};
250
252public:
253 TableRecognizer() = default;
254 ~TableRecognizer() = default;
255
256 // Initialization code. Must be called after the constructor.
257 void Init();
258
262
263 // Sets the grids used by the table. These can be changed between
264 // calls to Recognize. They are treated as read-only data.
265 void set_text_grid(ColPartitionGrid *text);
266 void set_line_grid(ColPartitionGrid *lines);
267 // Sets some additional constraints on the table.
268 void set_min_height(int height);
269 void set_min_width(int width);
270 // Filters text partitions that are ridiculously tall to prevent
271 // merging rows. Note that "filters" refers to allowing horizontal
272 // cells to slice through them on the premise that they were
273 // merged text rows during previous layout.
274 void set_max_text_height(int height);
275
276 // Given a guess location, the RecognizeTable function will try to find a
277 // structured grid in the area. On success, it will return a new
278 // StructuredTable (and assumes you will delete it). Otherwise,
279 // nullptr is returned.
280 //
281 // Keep in mind, this may "overgrow" or "undergrow" the size of guess.
282 // Ideally, there is a either a one-to-one correspondence between
283 // the guess and table or no table at all. This is not the best of
284 // assumptions right now, but was made to try to keep things simple in
285 // the first pass.
286 //
287 // If a line structure is available on the page in the given region,
288 // the table will use the linear structure as it is.
289 // Otherwise, it will try to maximize the whitespace around it while keeping
290 // a grid structure. This is somewhat working.
291 //
292 // Since the combination of adjustments can get high, effort was
293 // originally made to keep the number of adjustments linear in the number
294 // of partitions. The underlying structure finding code used to be
295 // much more complex. I don't know how necessary this constraint is anymore.
296 // The evaluation of a possible table is kept within O(nlogn) in the size of
297 // the table (where size is the number of partitions in the table).
298 // As a result, the algorithm is capable of O(n^2 log n). Depending
299 // on the grid search size, it may be higher.
300 //
301 // Last note: it is possible to just try all partition boundaries at a high
302 // level O(n^4) and do a verification scheme (at least O(nlogn)). If there
303 // area 200 partitions on a page, this could be too costly. Effort could go
304 // into pruning the search, but I opted for something quicker. I'm confident
305 // that the independent adjustments can get similar results and keep the
306 // complextiy down. However, the other approach could work without using
307 // TableFinder at all if it is fast enough. It comes down to properly
308 // deciding what is a table. The code currently relies on TableFinder's
309 // guess to the location of a table for that.
310 StructuredTable *RecognizeTable(const TBOX &guess_box);
311
312protected:
316
317 // Returns true if the given box has a lined table within it. The
318 // table argument will be updated with the table if the table exists.
319 bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table);
320 // Returns true if the given box has a large number of horizontal and
321 // vertical lines present. If so, we assume the extent of these lines
322 // uniquely defines a table and find that table via SolveLinedTable.
323 bool HasSignificantLines(const TBOX &guess);
324
325 // Given enough horizontal and vertical lines in a region, find a bounding
326 // box that encloses all of them (as well as newly introduced lines).
327 // The bounding box is the smallest box that encloses the lines in guess
328 // without having any lines sticking out of it.
329 // bounding_box is an in/out parameter.
330 // On input, it in the extents of the box to search.
331 // On output, it is the resulting bounding box.
332 bool FindLinesBoundingBox(TBOX *bounding_box);
333 // Iteration in above search.
334 // bounding_box is an in/out parameter.
335 // On input, it in the extents of the box to search.
336 // On output, it is the resulting bounding box.
337 bool FindLinesBoundingBoxIteration(TBOX *bounding_box);
338
342
343 // Returns true if the given box has a whitespaced table within it. The
344 // table argument will be updated if the table exists. Also note
345 // that this method will fail if the guess_box center is not
346 // mostly within the table.
347 bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table);
348
349 // Finds the location of a horizontal split relative to y.
350 // This function is mostly unused now. If the SolveWhitespacedTable
351 // changes much, it can be removed. Note, it isn't really as reliable
352 // as I thought. I went with alternatives for most of the other uses.
353 int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom);
354
355 // Input data, used as read only data to make decisions.
356 ColPartitionGrid *text_grid_ = nullptr; // Text ColPartitions
357 ColPartitionGrid *line_grid_ = nullptr; // Line ColPartitions
358 // Table constraints, a "good" table must satisfy these.
359 int min_height_ = 0;
360 int min_width_ = 0;
361 // Filters, used to prevent awkward partitions from destroying structure.
362 int max_text_height_ = INT32_MAX; // Horizontal lines may intersect taller text.
363};
364
365} // namespace tesseract
366
367#endif /* TABLERECOG_H_ */
const double y
std::vector< int > cell_y_
Definition: tablerecog.h:238
ColPartitionGrid * text_grid_
Definition: tablerecog.h:231
std::vector< int > cell_x_
Definition: tablerecog.h:237
ColPartitionGrid * line_grid_
Definition: tablerecog.h:232
#define TESS_API
Definition: export.h:32