tesseract v5.3.3.20231005
tospace.cpp
Go to the documentation of this file.
1// Licensed under the Apache License, Version 2.0 (the "License");
2// you may not use this file except in compliance with the License.
3// You may obtain a copy of the License at
4// http://www.apache.org/licenses/LICENSE-2.0
5// Unless required by applicable law or agreed to in writing, software
6// distributed under the License is distributed on an "AS IS" BASIS,
7// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8// See the License for the specific language governing permissions and
9// limitations under the License.
10/**********************************************************************
11 * tospace.cpp
12 *
13 * Compute fuzzy word spacing thresholds for each row.
14 * I.e. set : max_nonspace
15 * space_threshold
16 * min_space
17 * kern_size
18 * space_size
19 * for each row.
20 * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
21 *
22 * Note: functions in this file were originally not members of any
23 * class or enclosed by any namespace. Now they are all static members
24 * of the Textord class.
25 *
26 **********************************************************************/
27
28#include "drawtord.h"
29#include "statistc.h"
30#include "textord.h"
31#include "tovars.h"
32
33// Include automatically generated configuration file if running autoconf.
34#ifdef HAVE_CONFIG_H
35# include "config_auto.h"
36#endif
37
38#include <algorithm>
39#include <cmath>
40#include <memory>
41
42#define MAXSPACING 128 /*max expected spacing in pix */
43
44namespace tesseract {
45void Textord::to_spacing(ICOORD page_tr, // topright of page
46 TO_BLOCK_LIST *blocks // blocks on page
47) {
48 TO_BLOCK_IT block_it; // iterator
49 TO_BLOCK *block; // current block;
50 TO_ROW *row; // current row
51 int block_index; // block number
52 int row_index; // row number
53 // estimated width of real spaces for whole block
54 int16_t block_space_gap_width;
55 // estimated width of non space gaps for whole block
56 int16_t block_non_space_gap_width;
57 bool old_text_ord_proportional; // old fixed/prop result
58
59 block_it.set_to_list(blocks);
60 block_index = 1;
61 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
62 block = block_it.data();
63 std::unique_ptr<GAPMAP> gapmap(new GAPMAP(block)); // map of big vert gaps in blk
64 block_spacing_stats(block, gapmap.get(), old_text_ord_proportional, block_space_gap_width,
65 block_non_space_gap_width);
66 // Make sure relative values of block-level space and non-space gap
67 // widths are reasonable. The ratio of 1:3 is also used in
68 // block_spacing_stats, to correct the block_space_gap_width.
69 // Useful for arabic and hindi, when the non-space gap width is
70 // often over-estimated and should not be trusted. A similar ratio
71 // is found in block_spacing_stats.
72 if (tosp_old_to_method && tosp_old_to_constrain_sp_kn &&
73 block_non_space_gap_width > block_space_gap_width / 3) {
74 block_non_space_gap_width = block_space_gap_width / 3;
75 }
76 // row iterator
77 TO_ROW_IT row_it(block->get_rows());
78 row_index = 1;
79 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
80 row = row_it.data();
82 if ((tosp_debug_level > 0) && !old_text_ord_proportional) {
83 tprintf("Block %d Row %d: Now Proportional\n", block_index, row_index);
84 }
85 row_spacing_stats(row, gapmap.get(), block_index, row_index, block_space_gap_width,
86 block_non_space_gap_width);
87 } else {
88 if ((tosp_debug_level > 0) && old_text_ord_proportional) {
89 tprintf("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", block_index,
90 row_index, row->pitch_decision, row->fixed_pitch);
91 }
92 }
93#ifndef GRAPHICS_DISABLED
95 plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);
96 }
97#endif
98 row_index++;
99 }
100 block_index++;
101 }
102}
103
104/*************************************************************************
105 * block_spacing_stats()
106 *************************************************************************/
107
108void Textord::block_spacing_stats(TO_BLOCK *block, GAPMAP *gapmap, bool &old_text_ord_proportional,
109 int16_t &block_space_gap_width, // resulting estimate
110 int16_t &block_non_space_gap_width // resulting estimate
111) {
112 TO_ROW *row; // current row
113 BLOBNBOX_IT blob_it; // iterator
114
115 STATS centre_to_centre_stats(0, MAXSPACING - 1);
116 // DEBUG USE ONLY
117 STATS all_gap_stats(0, MAXSPACING - 1);
118 STATS space_gap_stats(0, MAXSPACING - 1);
119 int16_t minwidth = MAXSPACING; // narrowest blob
120 TBOX blob_box;
121 TBOX prev_blob_box;
122 int16_t centre_to_centre;
123 int16_t gap_width;
124 float real_space_threshold;
125 float iqr_centre_to_centre; // DEBUG USE ONLY
126 float iqr_all_gap_stats; // DEBUG USE ONLY
127 int32_t end_of_row;
128 int32_t row_length;
129
130 // row iterator
131 TO_ROW_IT row_it(block->get_rows());
132 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
133 row = row_it.data();
134 if (!row->blob_list()->empty() &&
135 (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||
136 (row->pitch_decision == PITCH_CORR_PROP))) {
137 blob_it.set_to_list(row->blob_list());
138 blob_it.mark_cycle_pt();
139 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
140 if (tosp_use_pre_chopping) {
141 blob_box = box_next_pre_chopped(&blob_it);
142 } else if (tosp_stats_use_xht_gaps) {
143 blob_box = reduced_box_next(row, &blob_it);
144 } else {
145 blob_box = box_next(&blob_it);
146 }
147 row_length = end_of_row - blob_box.left();
148 if (blob_box.width() < minwidth) {
149 minwidth = blob_box.width();
150 }
151 prev_blob_box = blob_box;
152 while (!blob_it.cycled_list()) {
153 if (tosp_use_pre_chopping) {
154 blob_box = box_next_pre_chopped(&blob_it);
155 } else if (tosp_stats_use_xht_gaps) {
156 blob_box = reduced_box_next(row, &blob_it);
157 } else {
158 blob_box = box_next(&blob_it);
159 }
160 if (blob_box.width() < minwidth) {
161 minwidth = blob_box.width();
162 }
163 int16_t left = prev_blob_box.right();
164 int16_t right = blob_box.left();
165 gap_width = right - left;
166 if (!ignore_big_gap(row, row_length, gapmap, left, right)) {
167 all_gap_stats.add(gap_width, 1);
168
169 centre_to_centre = (right + blob_box.right() - (prev_blob_box.left() + left)) / 2;
170 // DEBUG
171 centre_to_centre_stats.add(centre_to_centre, 1);
172 // DEBUG
173 }
174 prev_blob_box = blob_box;
175 }
176 }
177 }
178
179 // Inadequate samples
180 if (all_gap_stats.get_total() <= 1) {
181 block_non_space_gap_width = minwidth;
182 block_space_gap_width = -1; // No est. space width
183 // DEBUG
184 old_text_ord_proportional = true;
185 } else {
186 /* For debug only ..... */
187 iqr_centre_to_centre = centre_to_centre_stats.ile(0.75) - centre_to_centre_stats.ile(0.25);
188 iqr_all_gap_stats = all_gap_stats.ile(0.75) - all_gap_stats.ile(0.25);
189 old_text_ord_proportional = iqr_centre_to_centre * 2 > iqr_all_gap_stats;
190 /* .......For debug only */
191
192 /*
193The median of the gaps is used as an estimate of the NON-SPACE gap width.
194This RELIES on the assumption that there are more gaps WITHIN words than
195BETWEEN words in a block
196
197Now try to estimate the width of a real space for all real spaces in the
198block. Do this by using a crude threshold to ignore "narrow" gaps, then
199find the median of the "wide" gaps and use this.
200*/
201 block_non_space_gap_width = static_cast<int16_t>(floor(all_gap_stats.median()));
202 // median gap
203
204 row_it.set_to_list(block->get_rows());
205 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
206 row = row_it.data();
207 if (!row->blob_list()->empty() &&
208 (!tosp_only_use_prop_rows || (row->pitch_decision == PITCH_DEF_PROP) ||
209 (row->pitch_decision == PITCH_CORR_PROP))) {
210 real_space_threshold = std::max(tosp_init_guess_kn_mult * block_non_space_gap_width,
211 tosp_init_guess_xht_mult * row->xheight);
212 blob_it.set_to_list(row->blob_list());
213 blob_it.mark_cycle_pt();
214 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
215 if (tosp_use_pre_chopping) {
216 blob_box = box_next_pre_chopped(&blob_it);
217 } else if (tosp_stats_use_xht_gaps) {
218 blob_box = reduced_box_next(row, &blob_it);
219 } else {
220 blob_box = box_next(&blob_it);
221 }
222 row_length = blob_box.left() - end_of_row;
223 prev_blob_box = blob_box;
224 while (!blob_it.cycled_list()) {
225 if (tosp_use_pre_chopping) {
226 blob_box = box_next_pre_chopped(&blob_it);
227 } else if (tosp_stats_use_xht_gaps) {
228 blob_box = reduced_box_next(row, &blob_it);
229 } else {
230 blob_box = box_next(&blob_it);
231 }
232 int16_t left = prev_blob_box.right();
233 int16_t right = blob_box.left();
234 gap_width = right - left;
235 if ((gap_width > real_space_threshold) &&
236 !ignore_big_gap(row, row_length, gapmap, left, right)) {
237 /*
238If tosp_use_cert_spaces is enabled, the estimate of the space gap is
239restricted to obvious spaces - those wider than half the xht or
240those with wide blobs on both sides - i.e not things that are
241suspect 1's or punctuation that is sometimes widely spaced.
242*/
243 if (!tosp_block_use_cert_spaces ||
244 (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
245 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
246 (!tosp_narrow_blobs_not_cert ||
247 (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
248 (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
249 space_gap_stats.add(gap_width, 1);
250 }
251 }
252 prev_blob_box = blob_box;
253 }
254 }
255 }
256 // Inadequate samples
257 if (space_gap_stats.get_total() <= 2) {
258 block_space_gap_width = -1; // No est. space width
259 } else {
260 block_space_gap_width = std::max(static_cast<int16_t>(floor(space_gap_stats.median())),
261 static_cast<int16_t>(3 * block_non_space_gap_width));
262 }
263 }
264}
265
266/*************************************************************************
267 * row_spacing_stats()
268 * Set values for min_space, max_non_space based on row stats only
269 * If failure - return 0 values.
270 *************************************************************************/
271void Textord::row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx,
272 int16_t block_space_gap_width, // estimate for block
273 int16_t block_non_space_gap_width // estimate for block
274) {
275 // iterator
276 BLOBNBOX_IT blob_it = row->blob_list();
277 STATS all_gap_stats(0, MAXSPACING - 1);
278 STATS cert_space_gap_stats(0, MAXSPACING - 1);
279 STATS all_space_gap_stats(0, MAXSPACING - 1);
280 STATS small_gap_stats(0, MAXSPACING - 1);
281 TBOX blob_box;
282 TBOX prev_blob_box;
283 int16_t gap_width;
284 int16_t real_space_threshold = 0;
285 int16_t max = 0;
286 int16_t index;
287 int16_t large_gap_count = 0;
288 bool suspected_table;
289 int32_t max_max_nonspace; // upper bound
290 bool good_block_space_estimate = block_space_gap_width > 0;
291 int32_t end_of_row;
292 int32_t row_length = 0;
293 float sane_space;
294 int32_t sane_threshold;
295
296 /* Collect first pass stats for row */
297
298 if (!good_block_space_estimate) {
299 block_space_gap_width = int16_t(std::floor(row->xheight / 2));
300 }
301 if (!row->blob_list()->empty()) {
302 if (tosp_threshold_bias1 > 0) {
303 real_space_threshold =
304 block_non_space_gap_width +
305 int16_t(floor(0.5 + tosp_threshold_bias1 *
306 (block_space_gap_width - block_non_space_gap_width)));
307 } else {
308 real_space_threshold = // Old TO method
309 (block_space_gap_width + block_non_space_gap_width) / 2;
310 }
311 blob_it.set_to_list(row->blob_list());
312 blob_it.mark_cycle_pt();
313 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
314 if (tosp_use_pre_chopping) {
315 blob_box = box_next_pre_chopped(&blob_it);
316 } else if (tosp_stats_use_xht_gaps) {
317 blob_box = reduced_box_next(row, &blob_it);
318 } else {
319 blob_box = box_next(&blob_it);
320 }
321 row_length = end_of_row - blob_box.left();
322 prev_blob_box = blob_box;
323 while (!blob_it.cycled_list()) {
324 if (tosp_use_pre_chopping) {
325 blob_box = box_next_pre_chopped(&blob_it);
326 } else if (tosp_stats_use_xht_gaps) {
327 blob_box = reduced_box_next(row, &blob_it);
328 } else {
329 blob_box = box_next(&blob_it);
330 }
331 int16_t left = prev_blob_box.right();
332 int16_t right = blob_box.left();
333 gap_width = right - left;
334 if (ignore_big_gap(row, row_length, gapmap, left, right)) {
335 large_gap_count++;
336 } else {
337 if (gap_width >= real_space_threshold) {
338 if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
339 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
340 (!tosp_narrow_blobs_not_cert ||
341 (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
342 (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
343 cert_space_gap_stats.add(gap_width, 1);
344 }
345 all_space_gap_stats.add(gap_width, 1);
346 } else {
347 small_gap_stats.add(gap_width, 1);
348 }
349 all_gap_stats.add(gap_width, 1);
350 }
351 prev_blob_box = blob_box;
352 }
353 }
354 suspected_table = (large_gap_count > 1) ||
355 ((large_gap_count > 0) && (all_gap_stats.get_total() <= tosp_few_samples));
356
357 /* Now determine row kern size, space size and threshold */
358
359 if ((cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) ||
360 ((suspected_table || all_gap_stats.get_total() <= tosp_short_row) &&
361 cert_space_gap_stats.get_total() > 0)) {
362 old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats,
363 block_space_gap_width, block_non_space_gap_width);
364 } else {
365 if (!tosp_recovery_isolated_row_stats ||
366 !isolated_row_stats(row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) {
367 if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) {
368 tprintf("B:%d R:%d -- Inadequate certain spaces.\n", block_idx, row_idx);
369 }
370 if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
371 // Use block default
372 row->space_size = block_space_gap_width;
373 if (all_gap_stats.get_total() > tosp_redo_kern_limit) {
374 row->kern_size = all_gap_stats.median();
375 } else {
376 row->kern_size = block_non_space_gap_width;
377 }
378 row->space_threshold =
379 int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
380 } else {
381 old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats,
382 block_space_gap_width, block_non_space_gap_width);
383 }
384 }
385 }
386
387 if (tosp_improve_thresh && !suspected_table) {
388 improve_row_threshold(row, &all_gap_stats);
389 }
390
391 /* Now lets try to be careful not to do anything silly with tables when we
392are ignoring big gaps*/
393 if (tosp_sanity_method == 0) {
394 if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
395 if (tosp_debug_level > 5) {
396 tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, row_idx,
397 row->kern_size, row->space_threshold, row->space_size);
398 }
399 row->space_threshold = static_cast<int32_t>(tosp_table_kn_sp_ratio * row->kern_size);
400 row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
401 }
402 } else if (tosp_sanity_method == 1) {
403 sane_space = row->space_size;
404 /* NEVER let space size get too close to kern size */
405 if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
406 ((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) {
407 if (good_block_space_estimate &&
408 (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) {
409 sane_space = block_space_gap_width;
410 } else {
411 sane_space =
412 std::max(static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f),
413 row->xheight / 2.0f);
414 }
415 if (tosp_debug_level > 5) {
416 tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", block_idx, row_idx,
417 row->kern_size, row->space_threshold, row->space_size, sane_space);
418 }
419 row->space_size = sane_space;
420 row->space_threshold =
421 int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
422 }
423 /* NEVER let threshold get VERY far away from kern */
424 sane_threshold = int32_t(floor(tosp_max_sane_kn_thresh * std::max(row->kern_size, 2.5f)));
425 if (row->space_threshold > sane_threshold) {
426 if (tosp_debug_level > 5) {
427 tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", block_idx, row_idx,
428 row->kern_size, row->space_threshold, row->space_size, sane_threshold);
429 }
430 row->space_threshold = sane_threshold;
431 if (row->space_size <= sane_threshold) {
432 row->space_size = row->space_threshold + 1.0f;
433 }
434 }
435 /* Beware of tables - there may be NO spaces */
436 if (suspected_table) {
437 sane_space =
438 std::max(tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight);
439 sane_threshold = int32_t(std::floor((sane_space + row->kern_size) / 2));
440
441 if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) {
442 if (tosp_debug_level > 5) {
443 tprintf("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", block_idx, row_idx,
444 row->kern_size, row->space_threshold, row->space_size);
445 }
446 // the minimum sane value
447 row->space_threshold = static_cast<int32_t>(sane_space);
448 row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
449 }
450 }
451 }
452
453 /* Now lets try to put some error limits on the threshold */
454
455 if (tosp_old_to_method) {
456 /* Old textord made a space if gap >= threshold */
457 // NO FUZZY SPACES YET
458 row->max_nonspace = row->space_threshold;
459 // NO FUZZY SPACES YET
460 row->min_space = row->space_threshold + 1;
461 } else {
462 /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
463 row->min_space =
464 std::min(int32_t(ceil(tosp_fuzzy_space_factor * row->xheight)), int32_t(row->space_size));
465 if (row->min_space <= row->space_threshold) {
466 // Don't be silly
467 row->min_space = row->space_threshold + 1;
468 }
469 /*
470Lets try to guess the max certain kern gap by looking at the cluster of
471kerns for the row. The row is proportional so the kerns should cluster
472tightly at the bottom of the distribution. We also expect most gaps to be
473kerns. Find the maximum of the kern piles between 0 and twice the kern
474estimate. Piles before the first one with less than 1/10 the maximum
475number of samples can be taken as certain kerns.
476
477 Of course, there are some cases where the kern peak and space peaks merge,
478 so we will put an UPPER limit on the max certain kern gap of some fraction
479 below the threshold.
480*/
481
482 max_max_nonspace = int32_t((row->space_threshold + row->kern_size) / 2);
483
484 // default
485 row->max_nonspace = max_max_nonspace;
486 for (index = 0; index <= max_max_nonspace; index++) {
487 if (all_gap_stats.pile_count(index) > max) {
488 max = all_gap_stats.pile_count(index);
489 }
490 if ((index > row->kern_size) && (all_gap_stats.pile_count(index) < 0.1 * max)) {
491 row->max_nonspace = index;
492 break;
493 }
494 }
495 }
496
497 /* Yet another algorithm - simpler this time - just choose a fraction of the
498threshold to space range */
499
500 if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) {
501 row->min_space = std::max(
502 row->min_space, static_cast<int32_t>(ceil(row->space_threshold +
503 tosp_fuzzy_sp_fraction *
504 (row->space_size - row->space_threshold))));
505 }
506
507 /* Ensure that ANY space less than some multiplier times the kern size is
508fuzzy. In tables there is a risk of erroneously setting a small space size
509when there are no real spaces. Sometimes tables have text squashed into
510columns so that the kn->sp ratio is small anyway - this means that we can't
511use this to force a wider separation - hence we rely on context to join any
512dubious breaks. */
513
514 if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) {
515 row->min_space = std::max(
516 row->min_space, static_cast<int32_t>(ceil(tosp_table_fuzzy_kn_sp_ratio * row->kern_size)));
517 }
518
519 if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
520 row->max_nonspace = static_cast<int32_t>(floor(
521 0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size)));
522 }
523 if (row->max_nonspace > row->space_threshold) {
524 // Don't be silly
525 row->max_nonspace = row->space_threshold;
526 }
527
528 if (tosp_debug_level > 5) {
529 tprintf(
530 "B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) "
531 "Sp:%3.2f\n",
532 block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width,
533 real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold,
534 row->min_space, row->space_size);
535 }
536 if (tosp_debug_level > 10) {
537 tprintf(
538 "row->kern_size = %3.2f, row->space_size = %3.2f, "
539 "row->space_threshold = %d\n",
540 row->kern_size, row->space_size, row->space_threshold);
541 }
542}
543
544void Textord::old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats,
545 STATS *small_gap_stats,
546 int16_t block_space_gap_width, // estimate for block
547 int16_t block_non_space_gap_width // estimate for block
548) {
549 /* First, estimate row space size */
550 /* Old to condition was > 2 */
551 if (space_gap_stats->get_total() >= tosp_enough_space_samples_for_median) {
552 // Adequate samples
553 /* Set space size to median of spaces BUT limits it if it seems wildly out
554 */
555 row->space_size = space_gap_stats->median();
556 if (row->space_size > block_space_gap_width * 1.5) {
557 if (tosp_old_to_bug_fix) {
558 row->space_size = block_space_gap_width * 1.5;
559 } else {
560 // BUG??? should be *1.5
561 row->space_size = block_space_gap_width;
562 }
563 }
564 if (row->space_size < (block_non_space_gap_width * 2) + 1) {
565 row->space_size = (block_non_space_gap_width * 2) + 1;
566 }
567 }
568 // Only 1 or 2 samples
569 else if (space_gap_stats->get_total() >= 1) {
570 // hence mean not median
571 row->space_size = space_gap_stats->mean();
572 if (row->space_size > block_space_gap_width * 1.5) {
573 if (tosp_old_to_bug_fix) {
574 row->space_size = block_space_gap_width * 1.5;
575 } else {
576 // BUG??? should be *1.5
577 row->space_size = block_space_gap_width;
578 }
579 }
580 if (row->space_size < (block_non_space_gap_width * 3) + 1) {
581 row->space_size = (block_non_space_gap_width * 3) + 1;
582 }
583 } else {
584 // Use block default
585 row->space_size = block_space_gap_width;
586 }
587
588 /* Next, estimate row kern size */
589 if ((tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total() > tosp_redo_kern_limit)) {
590 row->kern_size = small_gap_stats->median();
591 } else if (all_gap_stats->get_total() > tosp_redo_kern_limit) {
592 row->kern_size = all_gap_stats->median();
593 } else { // old TO -SAME FOR ALL ROWS
594 row->kern_size = block_non_space_gap_width;
595 }
596
597 /* Finally, estimate row space threshold */
598 if (tosp_threshold_bias2 > 0) {
599 row->space_threshold = int32_t(
600 floor(0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size)));
601 } else {
602 /*
603 NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold
604and holds this in a float. The use is with a >= test
605NEW textord uses an integer threshold and a > test
606It comes to the same thing.
607 (Though there is a difference in that old textor has integer space_size
608 and kern_size.)
609*/
610 row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
611 }
612
613 // Apply the same logic and ratios as in row_spacing_stats to
614 // restrict relative values of the row's space_size, kern_size, and
615 // space_threshold
616 if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 &&
617 ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
618 ((row->space_size - row->kern_size) < tosp_silly_kn_sp_gap * row->xheight))) {
619 if (row->kern_size > 2.5) {
620 row->kern_size = row->space_size / tosp_min_sane_kn_sp;
621 }
622 row->space_threshold =
623 int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
624 }
625}
626
627/*************************************************************************
628 * isolated_row_stats()
629 * Set values for min_space, max_non_space based on row stats only
630 *************************************************************************/
631bool Textord::isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats,
632 bool suspected_table, int16_t block_idx, int16_t row_idx) {
633 float kern_estimate;
634 float crude_threshold_estimate;
635 int16_t small_gaps_count;
636 int16_t total;
637 // iterator
638 BLOBNBOX_IT blob_it = row->blob_list();
639 STATS cert_space_gap_stats(0, MAXSPACING - 1);
640 STATS all_space_gap_stats(0, MAXSPACING - 1);
641 STATS small_gap_stats(0, MAXSPACING - 1);
642 TBOX blob_box;
643 TBOX prev_blob_box;
644 int16_t gap_width;
645 int32_t end_of_row;
646 int32_t row_length;
647
648 kern_estimate = all_gap_stats->median();
649 crude_threshold_estimate =
650 std::max(tosp_init_guess_kn_mult * kern_estimate, tosp_init_guess_xht_mult * row->xheight);
651 small_gaps_count =
652 stats_count_under(all_gap_stats, static_cast<int16_t>(std::ceil(crude_threshold_estimate)));
653 total = all_gap_stats->get_total();
654
655 if ((total <= tosp_redo_kern_limit) ||
656 ((small_gaps_count / static_cast<float>(total)) < tosp_enough_small_gaps) ||
657 (total - small_gaps_count < 1)) {
658 if (tosp_debug_level > 5) {
659 tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx, row_idx);
660 }
661 return false;
662 }
663 blob_it.set_to_list(row->blob_list());
664 blob_it.mark_cycle_pt();
665 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
666 if (tosp_use_pre_chopping) {
667 blob_box = box_next_pre_chopped(&blob_it);
668 } else if (tosp_stats_use_xht_gaps) {
669 blob_box = reduced_box_next(row, &blob_it);
670 } else {
671 blob_box = box_next(&blob_it);
672 }
673 row_length = end_of_row - blob_box.left();
674 prev_blob_box = blob_box;
675 while (!blob_it.cycled_list()) {
676 if (tosp_use_pre_chopping) {
677 blob_box = box_next_pre_chopped(&blob_it);
678 } else if (tosp_stats_use_xht_gaps) {
679 blob_box = reduced_box_next(row, &blob_it);
680 } else {
681 blob_box = box_next(&blob_it);
682 }
683 int16_t left = prev_blob_box.right();
684 int16_t right = blob_box.left();
685 gap_width = right - left;
686 if (!ignore_big_gap(row, row_length, gapmap, left, right) &&
687 (gap_width > crude_threshold_estimate)) {
688 if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
689 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
690 (!tosp_narrow_blobs_not_cert ||
691 (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
692 (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
693 cert_space_gap_stats.add(gap_width, 1);
694 }
695 all_space_gap_stats.add(gap_width, 1);
696 }
697 if (gap_width < crude_threshold_estimate) {
698 small_gap_stats.add(gap_width, 1);
699 }
700
701 prev_blob_box = blob_box;
702 }
703 if (cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
704 // median
705 row->space_size = cert_space_gap_stats.median();
706 } else if (suspected_table && (cert_space_gap_stats.get_total() > 0)) {
707 // to avoid spaced
708 row->space_size = cert_space_gap_stats.mean();
709 // 1's in tables
710 } else if (all_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
711 // median
712 row->space_size = all_space_gap_stats.median();
713 } else {
714 row->space_size = all_space_gap_stats.mean();
715 }
716
717 if (tosp_only_small_gaps_for_kern) {
718 row->kern_size = small_gap_stats.median();
719 } else {
720 row->kern_size = all_gap_stats->median();
721 }
722 row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
723 /* Sanity check */
724 if ((row->kern_size >= row->space_threshold) || (row->space_threshold >= row->space_size) ||
725 (row->space_threshold <= 0)) {
726 if (tosp_debug_level > 5) {
727 tprintf("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", block_idx, row_idx,
728 row->kern_size, row->space_threshold, row->space_size);
729 }
730 row->kern_size = 0.0f;
731 row->space_threshold = 0;
732 row->space_size = 0.0f;
733 return false;
734 }
735
736 if (tosp_debug_level > 5) {
737 tprintf("B:%d R:%d -- Isolated row stats: %f %d %f\n", block_idx, row_idx, row->kern_size,
738 row->space_threshold, row->space_size);
739 }
740 return true;
741}
742
743int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) {
744 int16_t index;
745 int16_t total = 0;
746
747 for (index = 0; index < threshold; index++) {
748 total += stats->pile_count(index);
749 }
750 return total;
751}
752
753/*************************************************************************
754 * improve_row_threshold()
755 * Try to recognise a "normal line" -
756 * > 25 gaps
757 * && space > 3 * kn && space > 10
758 * (I.e. reasonably large space and kn:sp ratio)
759 * && > 3/4 # gaps < kn + (sp - kn)/3
760 * (I.e. most gaps are well away from space estimate)
761 * && a gap of max(3, (sp - kn) / 3) empty histogram positions is found
762 * somewhere in the histogram between kn and sp
763 * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
764 * NO!!!!! the bristol line has "11" with a gap of 12 between the
765 *1's!!! try moving the default threshold to within this band but leave the
766 * fuzzy limit calculation as at present.
767 *************************************************************************/
768void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
769 float sp = row->space_size;
770 float kn = row->kern_size;
771 int16_t reqd_zero_width = 0;
772 int16_t zero_width = 0;
773 int16_t zero_start = 0;
774 int16_t index = 0;
775
776 if (tosp_debug_level > 10) {
777 tprintf("Improve row threshold 0");
778 }
779 if ((all_gap_stats->get_total() <= 25) || (sp <= 10) || (sp <= 3 * kn) ||
780 (stats_count_under(all_gap_stats, static_cast<int16_t>(ceil(kn + (sp - kn) / 3 + 0.5))) <
781 (0.75 * all_gap_stats->get_total()))) {
782 return;
783 }
784 if (tosp_debug_level > 10) {
785 tprintf(" 1");
786 }
787 /*
788Look for the first region of all 0's in the histogram which is wider than
789max(3, (sp - kn) / 3) and starts between kn and sp. If found, and current
790threshold is not within it, move the threshold so that is just inside it.
791*/
792 reqd_zero_width = static_cast<int16_t>(floor((sp - kn) / 3 + 0.5));
793 if (reqd_zero_width < 3) {
794 reqd_zero_width = 3;
795 }
796
797 for (index = int16_t(std::ceil(kn)); index < int16_t(std::floor(sp)); index++) {
798 if (all_gap_stats->pile_count(index) == 0) {
799 if (zero_width == 0) {
800 zero_start = index;
801 }
802 zero_width++;
803 } else {
804 if (zero_width >= reqd_zero_width) {
805 break;
806 } else {
807 zero_width = 0;
808 }
809 }
810 }
811 index--;
812 if (tosp_debug_level > 10) {
813 tprintf(" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n", reqd_zero_width,
814 zero_width, zero_start, row->space_threshold);
815 }
816 if ((zero_width < reqd_zero_width) ||
817 ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) {
818 return;
819 }
820 if (tosp_debug_level > 10) {
821 tprintf(" 2");
822 }
823 if (row->space_threshold < zero_start) {
824 if (tosp_debug_level > 5) {
825 tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start,
826 index, row->space_threshold, zero_start);
827 }
828 row->space_threshold = zero_start;
829 }
830 if (row->space_threshold > index) {
831 if (tosp_debug_level > 5) {
832 tprintf("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start,
833 index, row->space_threshold, index);
834 }
835 row->space_threshold = index;
836 }
837}
838
839/**********************************************************************
840 * make_prop_words
841 *
842 * Convert a TO_ROW to a ROW.
843 **********************************************************************/
845 FCOORD rotation // for drawing
846) {
847 bool bol; // start of line
848 /* prev_ values are for start of word being built. non prev_ values are for
849the gap between the word being built and the next one. */
850 bool prev_fuzzy_sp; // probably space
851 bool prev_fuzzy_non; // probably not
852 uint8_t prev_blanks; // in front of word
853 bool fuzzy_sp = false; // probably space
854 bool fuzzy_non = false; // probably not
855 uint8_t blanks = 0; // in front of word
856 bool prev_gap_was_a_space = false;
857 bool break_at_next_gap = false;
858 ROW *real_row; // output row
859 C_OUTLINE_IT cout_it;
860 C_BLOB_LIST cblobs;
861 C_BLOB_IT cblob_it = &cblobs;
862 WERD_LIST words;
863 WERD *word; // new word
864 int32_t next_rep_char_word_right = INT32_MAX;
865 float repetition_spacing; // gap between repetitions
866 int32_t xstarts[2]; // row ends
867 int32_t prev_x; // end of prev blob
868 BLOBNBOX_IT box_it; // iterator
869 TBOX prev_blob_box;
870 TBOX next_blob_box;
871 int16_t prev_gap = INT16_MAX;
872 int16_t current_gap = INT16_MAX;
873 int16_t next_gap = INT16_MAX;
874 int16_t prev_within_xht_gap = INT16_MAX;
875 int16_t current_within_xht_gap = INT16_MAX;
876 int16_t next_within_xht_gap = INT16_MAX;
877 int16_t word_count = 0;
878
879 // repeated char words
880 WERD_IT rep_char_it(&(row->rep_words));
881 if (!rep_char_it.empty()) {
882 next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
883 }
884
885 prev_x = -INT16_MAX;
886 cblob_it.set_to_list(&cblobs);
887 box_it.set_to_list(row->blob_list());
888 // new words
889 WERD_IT word_it(&words);
890 bol = true;
891 prev_blanks = 0;
892 prev_fuzzy_sp = false;
893 prev_fuzzy_non = false;
894 if (!box_it.empty()) {
895 xstarts[0] = box_it.data()->bounding_box().left();
896 if (xstarts[0] > next_rep_char_word_right) {
897 /* We need to insert a repeated char word at the start of the row */
898 word = rep_char_it.extract();
899 word_it.add_after_then_move(word);
900 /* Set spaces before repeated char word */
901 word->set_flag(W_BOL, true);
902 bol = false;
903 word->set_blanks(0);
904 // NO uncertainty
905 word->set_flag(W_FUZZY_SP, false);
906 word->set_flag(W_FUZZY_NON, false);
907 xstarts[0] = word->bounding_box().left();
908 /* Set spaces after repeated char word (and leave current word set) */
909 repetition_spacing = find_mean_blob_spacing(word);
910 current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right;
911 current_within_xht_gap = current_gap;
912 if (current_gap > tosp_rep_space * repetition_spacing) {
913 prev_blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
914 if (prev_blanks < 1) {
915 prev_blanks = 1;
916 }
917 } else {
918 prev_blanks = 0;
919 }
920 if (tosp_debug_level > 5) {
921 tprintf("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
922 box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(),
923 repetition_spacing, current_gap);
924 }
925 prev_fuzzy_sp = false;
926 prev_fuzzy_non = false;
927 if (rep_char_it.empty()) {
928 next_rep_char_word_right = INT32_MAX;
929 } else {
930 rep_char_it.forward();
931 next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
932 }
933 }
934
935 peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
936 do {
937 auto bblob = box_it.data();
938 auto blob_box = bblob->bounding_box();
939 if (bblob->joined_to_prev()) {
940 auto cblob = bblob->remove_cblob();
941 if (cblob != nullptr) {
942 cout_it.set_to_list(cblob_it.data()->out_list());
943 cout_it.move_to_last();
944 cout_it.add_list_after(cblob->out_list());
945 delete cblob;
946 }
947 } else {
948 auto cblob = bblob->cblob();
949 if (cblob != nullptr) {
950 bblob->set_owns_cblob(false);
951 cblob_it.add_after_then_move(cblob);
952 }
953 prev_x = blob_box.right();
954 }
955 box_it.forward(); // next one
956 bblob = box_it.data();
957 blob_box = bblob->bounding_box();
958
959 if (!bblob->joined_to_prev() && bblob->cblob() != nullptr) {
960 /* Real Blob - not multiple outlines or pre-chopped */
961 prev_gap = current_gap;
962 prev_within_xht_gap = current_within_xht_gap;
963 prev_blob_box = next_blob_box;
964 current_gap = next_gap;
965 current_within_xht_gap = next_within_xht_gap;
966 peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
967
968 int16_t prev_gap_arg = prev_gap;
969 int16_t next_gap_arg = next_gap;
970 if (tosp_only_use_xht_gaps) {
971 prev_gap_arg = prev_within_xht_gap;
972 next_gap_arg = next_within_xht_gap;
973 }
974 // Decide if a word-break should be inserted
975 if (blob_box.left() > next_rep_char_word_right ||
976 make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap,
977 current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp,
978 fuzzy_non, prev_gap_was_a_space, break_at_next_gap) ||
979 box_it.at_first()) {
980 /* Form a new word out of the blobs collected */
981 word = new WERD(&cblobs, prev_blanks, nullptr);
982 word_count++;
983 word_it.add_after_then_move(word);
984 if (bol) {
985 word->set_flag(W_BOL, true);
986 bol = false;
987 }
988 if (prev_fuzzy_sp) {
989 // probably space
990 word->set_flag(W_FUZZY_SP, true);
991 } else if (prev_fuzzy_non) {
992 word->set_flag(W_FUZZY_NON, true);
993 }
994 // probably not
995
996 if (blob_box.left() > next_rep_char_word_right) {
997 /* We need to insert a repeated char word */
998 word = rep_char_it.extract();
999 word_it.add_after_then_move(word);
1000
1001 /* Set spaces before repeated char word */
1002 repetition_spacing = find_mean_blob_spacing(word);
1003 current_gap = word->bounding_box().left() - prev_x;
1004 current_within_xht_gap = current_gap;
1005 if (current_gap > tosp_rep_space * repetition_spacing) {
1006 blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
1007 if (blanks < 1) {
1008 blanks = 1;
1009 }
1010 } else {
1011 blanks = 0;
1012 }
1013 if (tosp_debug_level > 5) {
1014 tprintf("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
1015 word->bounding_box().left(), word->bounding_box().bottom(),
1016 repetition_spacing, current_gap, blanks);
1017 }
1018 word->set_blanks(blanks);
1019 // NO uncertainty
1020 word->set_flag(W_FUZZY_SP, false);
1021 word->set_flag(W_FUZZY_NON, false);
1022
1023 /* Set spaces after repeated char word (and leave current word set)
1024 */
1025 current_gap = blob_box.left() - next_rep_char_word_right;
1026 if (current_gap > tosp_rep_space * repetition_spacing) {
1027 blanks = static_cast<uint8_t>(current_gap / row->space_size);
1028 if (blanks < 1) {
1029 blanks = 1;
1030 }
1031 } else {
1032 blanks = 0;
1033 }
1034 if (tosp_debug_level > 5) {
1035 tprintf(" Rgap:%d (%d blanks)\n", current_gap, blanks);
1036 }
1037 fuzzy_sp = false;
1038 fuzzy_non = false;
1039
1040 if (rep_char_it.empty()) {
1041 next_rep_char_word_right = INT32_MAX;
1042 } else {
1043 rep_char_it.forward();
1044 next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
1045 }
1046 }
1047
1048 if (box_it.at_first() && rep_char_it.empty()) {
1049 // at end of line
1050 word->set_flag(W_EOL, true);
1051 xstarts[1] = prev_x;
1052 } else {
1053 prev_blanks = blanks;
1054 prev_fuzzy_sp = fuzzy_sp;
1055 prev_fuzzy_non = fuzzy_non;
1056 }
1057 }
1058 }
1059 } while (!box_it.at_first()); // until back at start
1060
1061 /* Insert any further repeated char words */
1062 while (!rep_char_it.empty()) {
1063 word = rep_char_it.extract();
1064 word_it.add_after_then_move(word);
1065
1066 /* Set spaces before repeated char word */
1067 repetition_spacing = find_mean_blob_spacing(word);
1068 current_gap = word->bounding_box().left() - prev_x;
1069 if (current_gap > tosp_rep_space * repetition_spacing) {
1070 blanks = static_cast<uint8_t>(std::floor(current_gap / row->space_size));
1071 if (blanks < 1) {
1072 blanks = 1;
1073 }
1074 } else {
1075 blanks = 0;
1076 }
1077 if (tosp_debug_level > 5) {
1078 tprintf("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
1079 word->bounding_box().left(), word->bounding_box().bottom(), repetition_spacing,
1080 current_gap, blanks);
1081 }
1082 word->set_blanks(blanks);
1083 // NO uncertainty
1084 word->set_flag(W_FUZZY_SP, false);
1085 word->set_flag(W_FUZZY_NON, false);
1086 prev_x = word->bounding_box().right();
1087 if (rep_char_it.empty()) {
1088 // at end of line
1089 word->set_flag(W_EOL, true);
1090 xstarts[1] = prev_x;
1091 } else {
1092 rep_char_it.forward();
1093 }
1094 }
1095 real_row =
1096 new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1097 word_it.set_to_list(real_row->word_list());
1098 // put words in row
1099 word_it.add_list_after(&words);
1100 real_row->recalc_bounding_box();
1101
1102 if (tosp_debug_level > 4) {
1103 tprintf("Row: Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
1104 real_row->bounding_box().left(), real_row->bounding_box().bottom(),
1105 real_row->bounding_box().right(), real_row->bounding_box().top());
1106 }
1107 return real_row;
1108 }
1109 return nullptr;
1110}
1111
1112/**********************************************************************
1113 * make_blob_words
1114 *
1115 * Converts words into blobs so that each blob is a single character.
1116 * Used for chopper test.
1117 **********************************************************************/
1119 FCOORD rotation // for drawing
1120) {
1121 bool bol; // start of line
1122 ROW *real_row; // output row
1123 C_OUTLINE_IT cout_it;
1124 C_BLOB_LIST cblobs;
1125 C_BLOB_IT cblob_it = &cblobs;
1126 WERD_LIST words;
1127 WERD *word; // new word
1128 BLOBNBOX_IT box_it; // iterator
1129 int16_t word_count = 0;
1130
1131 cblob_it.set_to_list(&cblobs);
1132 box_it.set_to_list(row->blob_list());
1133 // new words
1134 WERD_IT word_it(&words);
1135 bol = true;
1136 if (!box_it.empty()) {
1137 do {
1138 auto bblob = box_it.data();
1139 auto blob_box = bblob->bounding_box();
1140 if (bblob->joined_to_prev()) {
1141 auto cblob = bblob->remove_cblob();
1142 if (cblob != nullptr) {
1143 cout_it.set_to_list(cblob_it.data()->out_list());
1144 cout_it.move_to_last();
1145 cout_it.add_list_after(cblob->out_list());
1146 delete cblob;
1147 }
1148 } else {
1149 auto cblob = bblob->cblob();
1150 if (cblob != nullptr) {
1151 bblob->set_owns_cblob(false);
1152 cblob_it.add_after_then_move(cblob);
1153 }
1154 }
1155 box_it.forward(); // next one
1156 bblob = box_it.data();
1157 blob_box = bblob->bounding_box();
1158
1159 if (!bblob->joined_to_prev() && !cblobs.empty()) {
1160 word = new WERD(&cblobs, 1, nullptr);
1161 word_count++;
1162 word_it.add_after_then_move(word);
1163 if (bol) {
1164 word->set_flag(W_BOL, true);
1165 bol = false;
1166 }
1167 if (box_it.at_first()) { // at end of line
1168 word->set_flag(W_EOL, true);
1169 }
1170 }
1171 } while (!box_it.at_first()); // until back at start
1172 /* Setup the row with created words. */
1173 real_row =
1174 new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
1175 word_it.set_to_list(real_row->word_list());
1176 // put words in row
1177 word_it.add_list_after(&words);
1178 real_row->recalc_bounding_box();
1179 if (tosp_debug_level > 4) {
1180 tprintf("Row:Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
1181 real_row->bounding_box().left(), real_row->bounding_box().bottom(),
1182 real_row->bounding_box().right(), real_row->bounding_box().top());
1183 }
1184 return real_row;
1185 }
1186 return nullptr;
1187}
1188
1189bool Textord::make_a_word_break(TO_ROW *row, // row being made
1190 TBOX blob_box, // for next_blob // how many blanks?
1191 int16_t prev_gap, TBOX prev_blob_box, int16_t real_current_gap,
1192 int16_t within_xht_current_gap, TBOX next_blob_box,
1193 int16_t next_gap, uint8_t &blanks, bool &fuzzy_sp, bool &fuzzy_non,
1194 bool &prev_gap_was_a_space, bool &break_at_next_gap) {
1195 bool space;
1196 int16_t current_gap;
1197 float fuzzy_sp_to_kn_limit;
1198
1199 if (break_at_next_gap) {
1200 break_at_next_gap = false;
1201 return true;
1202 }
1203 /* Inhibit using the reduced gap if
1204 The kerning is large - chars are not kerned and reducing "f"s can cause
1205 erroneous blanks
1206OR The real gap is less than 0
1207OR The real gap is less than the kerning estimate
1208*/
1209 if ((row->kern_size > tosp_large_kerning * row->xheight) ||
1210 ((tosp_dont_fool_with_small_kerns >= 0) &&
1211 (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) {
1212 // Ignore the difference
1213 within_xht_current_gap = real_current_gap;
1214 }
1215
1216 if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) {
1217 current_gap = within_xht_current_gap;
1218 } else {
1219 current_gap = real_current_gap;
1220 }
1221
1222 if (tosp_old_to_method) {
1223 // Boring old method
1224 space = current_gap > row->max_nonspace;
1225 if (space && (current_gap < INT16_MAX)) {
1226 if (current_gap < row->min_space) {
1227 if (current_gap > row->space_threshold) {
1228 blanks = 1;
1229 fuzzy_sp = true;
1230 fuzzy_non = false;
1231 } else {
1232 blanks = 0;
1233 fuzzy_sp = false;
1234 fuzzy_non = true;
1235 }
1236 } else {
1237 if (row->space_size == 0.0f) {
1238 // Avoid FP division by 0.
1239 blanks = 1;
1240 } else {
1241 blanks = static_cast<uint8_t>(current_gap / row->space_size);
1242 if (blanks < 1) {
1243 blanks = 1;
1244 }
1245 }
1246 fuzzy_sp = false;
1247 fuzzy_non = false;
1248 }
1249 }
1250 return space;
1251 } else {
1252 /* New exciting heuristic method */
1253 if (prev_blob_box.null_box()) { // Beginning of row
1254 prev_gap_was_a_space = true;
1255 }
1256
1257 // Default as old TO
1258 space = current_gap > row->space_threshold;
1259
1260 /* Set defaults for the word break in case we find one. Currently there are
1261no fuzzy spaces. Depending on the reliability of the different heuristics
1262we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
1263be used if the function returns true - ie the word is to be broken.
1264*/
1265 int num_blanks = current_gap;
1266 if (row->space_size > 1.0f) {
1267 num_blanks = IntCastRounded(current_gap / row->space_size);
1268 }
1269 blanks = static_cast<uint8_t>(ClipToRange<int>(num_blanks, 1, UINT8_MAX));
1270 fuzzy_sp = false;
1271 fuzzy_non = false;
1272 /*
1273If xht measure causes gap to flip one of the 3 thresholds act accordingly -
1274despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
1275context.
1276*/
1277 if (tosp_use_xht_gaps && (real_current_gap <= row->max_nonspace) &&
1278 (within_xht_current_gap > row->max_nonspace)) {
1279 space = true;
1280 fuzzy_non = true;
1281#ifndef GRAPHICS_DISABLED
1282 mark_gap(blob_box, 20, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1283 next_gap);
1284#endif
1285 } else if (tosp_use_xht_gaps && (real_current_gap <= row->space_threshold) &&
1286 (within_xht_current_gap > row->space_threshold)) {
1287 space = true;
1288 if (tosp_flip_fuzz_kn_to_sp) {
1289 fuzzy_sp = true;
1290 } else {
1291 fuzzy_non = true;
1292 }
1293#ifndef GRAPHICS_DISABLED
1294 mark_gap(blob_box, 21, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1295 next_gap);
1296#endif
1297 } else if (tosp_use_xht_gaps && (real_current_gap < row->min_space) &&
1298 (within_xht_current_gap >= row->min_space)) {
1299 space = true;
1300#ifndef GRAPHICS_DISABLED
1301 mark_gap(blob_box, 22, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1302 next_gap);
1303#endif
1304 } else if (tosp_force_wordbreak_on_punct && !suspected_punct_blob(row, prev_blob_box) &&
1305 suspected_punct_blob(row, blob_box)) {
1306 break_at_next_gap = true;
1307 }
1308 /* Now continue with normal heuristics */
1309 else if ((current_gap < row->min_space) && (current_gap > row->space_threshold)) {
1310 /* Heuristics to turn dubious spaces to kerns */
1311 if (tosp_pass_wide_fuzz_sp_to_context > 0) {
1312 fuzzy_sp_to_kn_limit =
1313 row->kern_size + tosp_pass_wide_fuzz_sp_to_context * (row->space_size - row->kern_size);
1314 } else {
1315 fuzzy_sp_to_kn_limit = 99999.0f;
1316 }
1317
1318 /* If current gap is significantly smaller than the previous space the
1319other side of a narrow blob then this gap is a kern. */
1320 if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) && prev_gap_was_a_space &&
1321 (current_gap <= tosp_gap_factor * prev_gap)) {
1322 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1323 if (tosp_flip_fuzz_sp_to_kn) {
1324 fuzzy_non = true;
1325 } else {
1326 fuzzy_sp = true;
1327 }
1328 } else {
1329 space = false;
1330 }
1331#ifndef GRAPHICS_DISABLED
1332 mark_gap(blob_box, 1, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1333 next_gap);
1334#endif
1335 }
1336 /* If current gap not much bigger than the previous kern the other side of
1337a narrow blob then this gap is a kern as well */
1338 else if ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box) &&
1339 !prev_gap_was_a_space && (current_gap * tosp_gap_factor <= prev_gap)) {
1340 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1341 if (tosp_flip_fuzz_sp_to_kn) {
1342 fuzzy_non = true;
1343 } else {
1344 fuzzy_sp = true;
1345 }
1346 } else {
1347 space = false;
1348 }
1349#ifndef GRAPHICS_DISABLED
1350 mark_gap(blob_box, 2, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1351 next_gap);
1352#endif
1353 } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
1354 (next_gap > row->space_threshold) && (current_gap <= tosp_gap_factor * next_gap)) {
1355 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1356 if (tosp_flip_fuzz_sp_to_kn) {
1357 fuzzy_non = true;
1358 } else {
1359 fuzzy_sp = true;
1360 }
1361 } else {
1362 space = false;
1363 }
1364#ifndef GRAPHICS_DISABLED
1365 mark_gap(blob_box, 3, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1366 next_gap);
1367#endif
1368 } else if ((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box) &&
1369 (next_gap <= row->space_threshold) &&
1370 (current_gap * tosp_gap_factor <= next_gap)) {
1371 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1372 if (tosp_flip_fuzz_sp_to_kn) {
1373 fuzzy_non = true;
1374 } else {
1375 fuzzy_sp = true;
1376 }
1377 } else {
1378 space = false;
1379 }
1380#ifndef GRAPHICS_DISABLED
1381 mark_gap(blob_box, 4, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1382 next_gap);
1383#endif
1384 } else if ((((next_blob_box.width() > 0) && narrow_blob(row, next_blob_box)) ||
1385 ((prev_blob_box.width() > 0) && narrow_blob(row, prev_blob_box)))) {
1386 fuzzy_sp = true;
1387#ifndef GRAPHICS_DISABLED
1388 mark_gap(blob_box, 6, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1389 next_gap);
1390#endif
1391 }
1392 } else if ((current_gap > row->max_nonspace) && (current_gap <= row->space_threshold)) {
1393 /* Heuristics to turn dubious kerns to spaces */
1394 /* TRIED THIS BUT IT MADE THINGS WORSE
1395 if (prev_gap == INT16_MAX)
1396 prev_gap = 0; // start of row
1397 if (next_gap == INT16_MAX)
1398 next_gap = 0; // end of row
1399*/
1400 if ((prev_blob_box.width() > 0) && (next_blob_box.width() > 0) &&
1401 (current_gap >= tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) &&
1402 wide_blob(row, prev_blob_box) && wide_blob(row, next_blob_box)) {
1403 space = true;
1404 /*
1405tosp_flip_caution is an attempt to stop the default changing in cases
1406where there is a large difference between the kern and space estimates.
1407 See problem in 'chiefs' where "have" gets split in the quotation.
1408*/
1409 if ((tosp_flip_fuzz_kn_to_sp) &&
1410 ((tosp_flip_caution <= 0) || (tosp_flip_caution * row->kern_size > row->space_size))) {
1411 fuzzy_sp = true;
1412 } else {
1413 fuzzy_non = true;
1414 }
1415#ifndef GRAPHICS_DISABLED
1416 mark_gap(blob_box, 7, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1417 next_gap);
1418#endif
1419 } else if (prev_blob_box.width() > 0 && next_blob_box.width() > 0 &&
1420 current_gap > 5 && // Rule 9 handles small gap, big ratio.
1421 current_gap >= tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) &&
1422 !(narrow_blob(row, prev_blob_box) || suspected_punct_blob(row, prev_blob_box)) &&
1423 !(narrow_blob(row, next_blob_box) || suspected_punct_blob(row, next_blob_box))) {
1424 space = true;
1425 fuzzy_non = true;
1426#ifndef GRAPHICS_DISABLED
1427 mark_gap(blob_box, 8, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1428 next_gap);
1429#endif
1430 } else if ((tosp_kern_gap_factor3 > 0) && (prev_blob_box.width() > 0) &&
1431 (next_blob_box.width() > 0) &&
1432 (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) &&
1433 (!tosp_rule_9_test_punct || (!suspected_punct_blob(row, prev_blob_box) &&
1434 !suspected_punct_blob(row, next_blob_box)))) {
1435 space = true;
1436 fuzzy_non = true;
1437#ifndef GRAPHICS_DISABLED
1438 mark_gap(blob_box, 9, prev_gap, prev_blob_box.width(), current_gap, next_blob_box.width(),
1439 next_gap);
1440#endif
1441 }
1442 }
1443 if (tosp_debug_level > 10) {
1444 tprintf(
1445 "word break = %d current_gap = %d, prev_gap = %d, "
1446 "next_gap = %d\n",
1447 space ? 1 : 0, current_gap, prev_gap, next_gap);
1448 }
1449 prev_gap_was_a_space = space && !(fuzzy_non);
1450 return space;
1451 }
1452}
1453
1454bool Textord::narrow_blob(TO_ROW *row, TBOX blob_box) {
1455 bool result;
1456 result =
1457 ((blob_box.width() <= tosp_narrow_fraction * row->xheight) ||
1458 ((static_cast<float>(blob_box.width()) / blob_box.height()) <= tosp_narrow_aspect_ratio));
1459 return result;
1460}
1461
1462bool Textord::wide_blob(TO_ROW *row, TBOX blob_box) {
1463 bool result;
1464 if (tosp_wide_fraction > 0) {
1465 if (tosp_wide_aspect_ratio > 0) {
1466 result =
1467 ((blob_box.width() >= tosp_wide_fraction * row->xheight) &&
1468 ((static_cast<float>(blob_box.width()) / blob_box.height()) > tosp_wide_aspect_ratio));
1469 } else {
1470 result = (blob_box.width() >= tosp_wide_fraction * row->xheight);
1471 }
1472 } else {
1473 result = !narrow_blob(row, blob_box);
1474 }
1475 return result;
1476}
1477
1478bool Textord::suspected_punct_blob(TO_ROW *row, TBOX box) {
1479 bool result;
1480 float baseline;
1481 float blob_x_centre;
1482 /* Find baseline of centre of blob */
1483 blob_x_centre = (box.right() + box.left()) / 2.0;
1484 baseline = row->baseline.y(blob_x_centre);
1485
1486 result = (box.height() <= 0.66 * row->xheight) || (box.top() < baseline + row->xheight / 2.0) ||
1487 (box.bottom() > baseline + row->xheight / 2.0);
1488 return result;
1489}
1490
1491void Textord::peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it, TBOX &next_blob_box,
1492 int16_t &next_gap, int16_t &next_within_xht_gap) {
1493 TBOX next_reduced_blob_box;
1494 TBOX bit_beyond;
1495 BLOBNBOX_IT reduced_box_it = box_it;
1496
1497 next_blob_box = box_next(&box_it);
1498 next_reduced_blob_box = reduced_box_next(row, &reduced_box_it);
1499 if (box_it.at_first()) {
1500 next_gap = INT16_MAX;
1501 next_within_xht_gap = INT16_MAX;
1502 } else {
1503 bit_beyond = box_it.data()->bounding_box();
1504 next_gap = bit_beyond.left() - next_blob_box.right();
1505 bit_beyond = reduced_box_next(row, &reduced_box_it);
1506 next_within_xht_gap = bit_beyond.left() - next_reduced_blob_box.right();
1507 }
1508}
1509
1510#ifndef GRAPHICS_DISABLED
1511void Textord::mark_gap(TBOX blob, // blob following gap
1512 int16_t rule, // heuristic id
1513 int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap,
1514 int16_t next_blob_width, int16_t next_gap) {
1515 ScrollView::Color col; // of ellipse marking flipped gap
1516
1517 switch (rule) {
1518 case 1:
1519 col = ScrollView::RED;
1520 break;
1521 case 2:
1522 col = ScrollView::CYAN;
1523 break;
1524 case 3:
1525 col = ScrollView::GREEN;
1526 break;
1527 case 4:
1528 col = ScrollView::BLACK;
1529 break;
1530 case 5:
1531 col = ScrollView::MAGENTA;
1532 break;
1533 case 6:
1534 col = ScrollView::BLUE;
1535 break;
1536
1537 case 7:
1538 col = ScrollView::WHITE;
1539 break;
1540 case 8:
1541 col = ScrollView::YELLOW;
1542 break;
1543 case 9:
1544 col = ScrollView::BLACK;
1545 break;
1546
1547 case 20:
1548 col = ScrollView::CYAN;
1549 break;
1550 case 21:
1551 col = ScrollView::GREEN;
1552 break;
1553 case 22:
1554 col = ScrollView::MAGENTA;
1555 break;
1556 default:
1557 col = ScrollView::BLACK;
1558 }
1560 to_win->Pen(col);
1561 /* if (rule < 20)
1562 //interior_style(to_win, INT_SOLID, false);
1563 else
1564 //interior_style(to_win, INT_HOLLOW, true);*/
1565 // x radius
1566 to_win->Ellipse(current_gap / 2.0f,
1567 blob.height() / 2.0f, // y radius
1568 // x centre
1569 blob.left() - current_gap / 2.0f,
1570 // y centre
1571 blob.bottom() + blob.height() / 2.0f);
1572 }
1573 if (tosp_debug_level > 5) {
1574 tprintf(" (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", blob.left() - current_gap / 2,
1575 blob.bottom(), rule, prev_gap, prev_blob_width, current_gap, next_blob_width, next_gap);
1576 }
1577}
1578#endif
1579
1580float Textord::find_mean_blob_spacing(WERD *word) {
1581 C_BLOB_IT cblob_it;
1582 TBOX blob_box;
1583 int32_t gap_sum = 0;
1584 int16_t gap_count = 0;
1585 int16_t prev_right;
1586
1587 cblob_it.set_to_list(word->cblob_list());
1588 if (!cblob_it.empty()) {
1589 cblob_it.mark_cycle_pt();
1590 prev_right = cblob_it.data()->bounding_box().right();
1591 // first blob
1592 cblob_it.forward();
1593 for (; !cblob_it.cycled_list(); cblob_it.forward()) {
1594 blob_box = cblob_it.data()->bounding_box();
1595 gap_sum += blob_box.left() - prev_right;
1596 gap_count++;
1597 prev_right = blob_box.right();
1598 }
1599 }
1600 if (gap_count > 0) {
1601 return (gap_sum / static_cast<float>(gap_count));
1602 } else {
1603 return 0.0f;
1604 }
1605}
1606
1607bool Textord::ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left,
1608 int16_t right) {
1609 int16_t gap = right - left + 1;
1610
1611 if (tosp_ignore_big_gaps > 999) {
1612 return false; // Don't ignore
1613 }
1614 if (tosp_ignore_big_gaps > 0) {
1615 return (gap > tosp_ignore_big_gaps * row->xheight);
1616 }
1617 if (gap > tosp_ignore_very_big_gaps * row->xheight) {
1618 return true;
1619 }
1620 if (tosp_ignore_big_gaps == 0) {
1621 if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) {
1622 return true;
1623 }
1624 if ((gap > 1.75 * row->xheight) &&
1625 ((row_length > 35 * row->xheight) || gapmap->table_gap(left, right))) {
1626 return true;
1627 }
1628 } else {
1629 /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table
1630 */
1631 if ((gap > gapmap_big_gaps * row->xheight) && gapmap->table_gap(left, right)) {
1632 return true;
1633 }
1634 }
1635 return false;
1636}
1637
1638/**********************************************************************
1639 * reduced_box_next
1640 *
1641 * Compute the bounding box of this blob with merging of x overlaps
1642 * but no pre-chopping.
1643 * Then move the iterator on to the start of the next blob.
1644 * DON'T reduce the box for small things - eg punctuation.
1645 **********************************************************************/
1646TBOX Textord::reduced_box_next(TO_ROW *row, // current row
1647 BLOBNBOX_IT *it // iterator to blobds
1648) {
1649 BLOBNBOX *blob; // current blob
1650 BLOBNBOX *head_blob; // place to store box
1651 TBOX full_box; // full blob boundg box
1652 TBOX reduced_box; // box of significant part
1653 int16_t left_above_xht; // ABOVE xht left limit
1654 int16_t new_left_above_xht; // ABOVE xht left limit
1655
1656 blob = it->data();
1657 if (blob->red_box_set()) {
1658 reduced_box = blob->reduced_box();
1659 do {
1660 it->forward();
1661 blob = it->data();
1662 } while (blob->cblob() == nullptr || blob->joined_to_prev());
1663 return reduced_box;
1664 }
1665 head_blob = blob;
1666 full_box = blob->bounding_box();
1667 reduced_box = reduced_box_for_blob(blob, row, &left_above_xht);
1668 do {
1669 it->forward();
1670 blob = it->data();
1671 if (blob->cblob() == nullptr) {
1672 // was pre-chopped
1673 full_box += blob->bounding_box();
1674 } else if (blob->joined_to_prev()) {
1675 reduced_box += reduced_box_for_blob(blob, row, &new_left_above_xht);
1676 left_above_xht = std::min(left_above_xht, new_left_above_xht);
1677 }
1678 }
1679 // until next real blob
1680 while (blob->cblob() == nullptr || blob->joined_to_prev());
1681
1682 if ((reduced_box.width() > 0) &&
1683 ((reduced_box.left() + tosp_near_lh_edge * reduced_box.width()) < left_above_xht) &&
1684 (reduced_box.height() > 0.7 * row->xheight)) {
1685#ifndef GRAPHICS_DISABLED
1687 reduced_box.plot(to_win, ScrollView::YELLOW, ScrollView::YELLOW);
1688 }
1689#endif
1690 } else {
1691 reduced_box = full_box;
1692 }
1693 head_blob->set_reduced_box(reduced_box);
1694 return reduced_box;
1695}
1696
1697/*************************************************************************
1698 * reduced_box_for_blob()
1699 * Find box for blob which is the same height and y position as the whole blob,
1700 * but whose left limit is the left most position of the blob ABOVE the
1701 * baseline and whose right limit is the right most position of the blob BELOW
1702 * the xheight.
1703 *
1704 *
1705 * !!!!!!! WON'T WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
1706 * "home". Perhaps we need something which say if the width ABOVE the
1707 * xht alone includes the whole of the reduced width, then use the full
1708 * blob box - Might still fail on italic F
1709 *
1710 * Alternatively we could be a little less severe and only reduce the
1711 * left and right edges by half the difference between the full box and
1712 * the reduced box.
1713 *
1714 * NOTE that we need to rotate all the coordinates as
1715 * find_blob_limits finds the y min and max within a specified x band
1716 *************************************************************************/
1717TBOX Textord::reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht) {
1718 float baseline;
1719 float blob_x_centre;
1720 float left_limit;
1721 float right_limit;
1722 float junk;
1723 TBOX blob_box;
1724
1725 /* Find baseline of centre of blob */
1726
1727 blob_box = blob->bounding_box();
1728 blob_x_centre = (blob_box.left() + blob_box.right()) / 2.0;
1729 baseline = row->baseline.y(blob_x_centre);
1730
1731 /*
1732Find LH limit of blob ABOVE the xht. This is so that we can detect certain
1733caps ht chars which should NOT have their box reduced: T, Y, V, W etc
1734*/
1735 left_limit = static_cast<float>(INT32_MAX);
1736 junk = static_cast<float>(-INT32_MAX);
1737 find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight), static_cast<float>(INT16_MAX),
1738 left_limit, junk);
1739 if (left_limit > junk) {
1740 *left_above_xht = INT16_MAX; // No area above xht
1741 } else {
1742 *left_above_xht = static_cast<int16_t>(std::floor(left_limit));
1743 }
1744 /*
1745Find reduced LH limit of blob - the left extent of the region ABOVE the
1746baseline.
1747*/
1748 left_limit = static_cast<float>(INT32_MAX);
1749 junk = static_cast<float>(-INT32_MAX);
1750 find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(INT16_MAX), left_limit, junk);
1751
1752 if (left_limit > junk) {
1753 return TBOX(); // no area within xht so return empty box
1754 }
1755 /*
1756Find reduced RH limit of blob - the right extent of the region BELOW the xht.
1757*/
1758 junk = static_cast<float>(INT32_MAX);
1759 right_limit = static_cast<float>(-INT32_MAX);
1760 find_cblob_hlimits(blob->cblob(), static_cast<float>(-INT16_MAX), (baseline + row->xheight), junk,
1761 right_limit);
1762 if (junk > right_limit) {
1763 return TBOX(); // no area within xht so return empty box
1764 }
1765
1766 return TBOX(ICOORD(static_cast<int16_t>(std::floor(left_limit)), blob_box.bottom()),
1767 ICOORD(static_cast<int16_t>(std::ceil(right_limit)), blob_box.top()));
1768}
1769} // namespace tesseract
#define MAXSPACING
Definition: tospace.cpp:42
@ TBOX
@ W_BOL
start of line
Definition: werd.h:34
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41
@ W_EOL
end of line
Definition: werd.h:35
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
double gapmap_big_gaps
Definition: gap_map.cpp:20
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int IntCastRounded(double x)
Definition: helpers.h:170
@ baseline
Definition: mfoutline.h:53
ScrollView * to_win
Definition: drawtord.cpp:37
@ PITCH_DEF_PROP
Definition: blobbox.h:51
@ PITCH_CORR_PROP
Definition: blobbox.h:54
void find_cblob_hlimits(C_BLOB *blob, float bottomy, float topy, float &xmin, float &xmax)
Definition: blobbox.cpp:579
void plot_word_decisions(ScrollView *win, int16_t pitch, TO_ROW *row)
Definition: drawtord.cpp:238
bool textord_show_initial_words
Definition: tovars.cpp:25
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
Definition: blobbox.cpp:667
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:638
int32_t min_space
Definition: blobbox.h:669
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:608
WERD_LIST rep_words
Definition: blobbox.h:674
int32_t max_nonspace
Definition: blobbox.h:670
float space_size
Definition: blobbox.h:673
float fixed_pitch
Definition: blobbox.h:657
int32_t space_threshold
Definition: blobbox.h:671
PITCH_TYPE pitch_decision
Definition: blobbox.h:656
TO_ROW_LIST * get_rows()
Definition: blobbox.h:709
WERD_LIST * word_list()
Definition: ocrrow.h:57
void recalc_bounding_box()
Definition: ocrrow.cpp:100
TBOX bounding_box() const
Definition: ocrrow.h:90
integer coordinate
Definition: points.h:36
TDimension left() const
Definition: rect.h:82
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
bool null_box() const
Definition: rect.h:60
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131
TBOX bounding_box() const
Definition: werd.cpp:155
void set_blanks(uint8_t new_blanks)
Definition: werd.h:103
ROW * make_prop_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:844
void to_spacing(ICOORD page_tr, TO_BLOCK_LIST *blocks)
Definition: tospace.cpp:45
ROW * make_blob_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:1118
void Pen(Color color)
Definition: scrollview.cpp:710
void Ellipse(int x, int y, int width, int height)
Definition: scrollview.cpp:585