tesseract v5.3.3.20231005
gap_map.cpp
Go to the documentation of this file.
1// Licensed under the Apache License, Version 2.0 (the "License");
2// you may not use this file except in compliance with the License.
3// You may obtain a copy of the License at
4// http://www.apache.org/licenses/LICENSE-2.0
5// Unless required by applicable law or agreed to in writing, software
6// distributed under the License is distributed on an "AS IS" BASIS,
7// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8// See the License for the specific language governing permissions and
9// limitations under the License.
10
11#include "gap_map.h"
12
13#include "statistc.h"
14
15namespace tesseract {
16
17BOOL_VAR(gapmap_debug, false, "Say which blocks have tables");
18BOOL_VAR(gapmap_use_ends, false, "Use large space at start and end of rows");
19BOOL_VAR(gapmap_no_isolated_quanta, false, "Ensure gaps not less than 2quanta wide");
20double_VAR(gapmap_big_gaps, 1.75, "xht multiplier");
21
22/*************************************************************************
23 * A block gap map is a quantised histogram of whitespace regions in the
24 * block. It is a vertical projection of wide gaps WITHIN lines
25 *
26 * The map is held as an array of counts of rows which have a wide gap
27 * covering that region of the row. Each bucket in the map represents a width
28 * of about half an xheight - (The median of the xhts in the rows is used.)
29 *
30 * The block is considered RECTANGULAR - delimited by the left and right
31 * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are
32 * counted.
33 *
34 *************************************************************************/
35
36GAPMAP::GAPMAP( // Constructor
37 TO_BLOCK *block // block
38) {
39 TO_ROW *row; // current row
40 BLOBNBOX_IT blob_it; // iterator
41 TBOX blob_box;
42 TBOX prev_blob_box;
43 int16_t gap_width;
44 int16_t start_of_row;
45 int16_t end_of_row;
46 STATS xht_stats(0, 127);
47 int16_t min_quantum;
48 int16_t max_quantum;
49 int16_t i;
50
51 /*
52 Find left and right extremes and bucket size
53*/
54 map = nullptr;
55 min_left = INT16_MAX;
56 max_right = -INT16_MAX;
57 total_rows = 0;
58 any_tabs = false;
59
60 // row iterator
61 TO_ROW_IT row_it(block->get_rows());
62 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
63 row = row_it.data();
64 if (!row->blob_list()->empty()) {
65 total_rows++;
66 xht_stats.add(static_cast<int16_t>(floor(row->xheight + 0.5)), 1);
67 blob_it.set_to_list(row->blob_list());
68 start_of_row = blob_it.data()->bounding_box().left();
69 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
70 if (min_left > start_of_row) {
71 min_left = start_of_row;
72 }
73 if (max_right < end_of_row) {
74 max_right = end_of_row;
75 }
76 }
77 }
78 if ((total_rows < 3) || (min_left >= max_right)) {
79 bucket_size = 0;
80 map_max = 0;
81 total_rows = 0;
82 min_left = max_right = 0;
83 return;
84 }
85 bucket_size = static_cast<int16_t>(floor(xht_stats.median() + 0.5)) / 2;
86 map_max = (max_right - min_left) / bucket_size;
87 map = new int16_t[map_max + 1];
88 for (i = 0; i <= map_max; i++) {
89 map[i] = 0;
90 }
91
92 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
93 row = row_it.data();
94 if (!row->blob_list()->empty()) {
95 blob_it.set_to_list(row->blob_list());
96 blob_it.mark_cycle_pt();
97 blob_box = box_next(&blob_it);
98 prev_blob_box = blob_box;
99 if (gapmap_use_ends) {
100 /* Leading space */
101 gap_width = blob_box.left() - min_left;
102 if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
103 max_quantum = (blob_box.left() - min_left) / bucket_size;
104 if (max_quantum > map_max) {
105 max_quantum = map_max;
106 }
107 for (i = 0; i <= max_quantum; i++) {
108 map[i]++;
109 }
110 }
111 }
112 while (!blob_it.cycled_list()) {
113 blob_box = box_next(&blob_it);
114 gap_width = blob_box.left() - prev_blob_box.right();
115 if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
116 min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
117 max_quantum = (blob_box.left() - min_left) / bucket_size;
118 if (max_quantum > map_max) {
119 max_quantum = map_max;
120 }
121 for (i = min_quantum; i <= max_quantum; i++) {
122 map[i]++;
123 }
124 }
125 prev_blob_box = blob_box;
126 }
127 if (gapmap_use_ends) {
128 /* Trailing space */
129 gap_width = max_right - prev_blob_box.right();
130 if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
131 min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
132 if (min_quantum < 0) {
133 min_quantum = 0;
134 }
135 for (i = min_quantum; i <= map_max; i++) {
136 map[i]++;
137 }
138 }
139 }
140 }
141 }
142 for (i = 0; i <= map_max; i++) {
143 if (map[i] > total_rows / 2) {
145 (((i == 0) && (map[i + 1] <= total_rows / 2)) ||
146 ((i == map_max) && (map[i - 1] <= total_rows / 2)) ||
147 ((i > 0) && (i < map_max) && (map[i - 1] <= total_rows / 2) &&
148 (map[i + 1] <= total_rows / 2)))) {
149 map[i] = 0; // prevent isolated quantum
150 } else {
151 any_tabs = true;
152 }
153 }
154 }
155 if (gapmap_debug && any_tabs) {
156 tprintf("Table found\n");
157 }
158}
159
160/*************************************************************************
161 * GAPMAP::table_gap()
162 * Is there a bucket in the specified range where more than half the rows in the
163 * block have a wide gap?
164 *************************************************************************/
165
166bool GAPMAP::table_gap( // Is gap a table?
167 int16_t left, // From here
168 int16_t right // To here
169) {
170 int16_t min_quantum;
171 int16_t max_quantum;
172 int16_t i;
173 bool tab_found = false;
174
175 if (!any_tabs) {
176 return false;
177 }
178
179 min_quantum = (left - min_left) / bucket_size;
180 max_quantum = (right - min_left) / bucket_size;
181 // Clip to the bounds of the array. In some circumstances (big blob followed
182 // by small blob) max_quantum can exceed the map_max bounds, but we clip
183 // here instead, as it provides better long-term safety.
184 if (min_quantum < 0) {
185 min_quantum = 0;
186 }
187 if (max_quantum > map_max) {
188 max_quantum = map_max;
189 }
190 for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++) {
191 if (map[i] > total_rows / 2) {
192 tab_found = true;
193 }
194 }
195 return tab_found;
196}
197
198} // namespace tesseract
#define BOOL_VAR(name, val, comment)
Definition: params.h:360
#define double_VAR(name, val, comment)
Definition: params.h:366
bool gapmap_no_isolated_quanta
Definition: gap_map.cpp:19
double gapmap_big_gaps
Definition: gap_map.cpp:20
bool gapmap_use_ends
Definition: gap_map.cpp:18
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
bool gapmap_debug
Definition: gap_map.cpp:17
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:638
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:608
TO_ROW_LIST * get_rows()
Definition: blobbox.h:709
TDimension left() const
Definition: rect.h:82
TDimension right() const
Definition: rect.h:89
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
double median() const
Definition: statistc.cpp:241
GAPMAP(TO_BLOCK *block)
Definition: gap_map.cpp:36
bool table_gap(int16_t left, int16_t right)
Definition: gap_map.cpp:166