35# include "config_auto.h"
54 int16_t block_space_gap_width;
56 int16_t block_non_space_gap_width;
57 bool old_text_ord_proportional;
59 block_it.set_to_list(blocks);
61 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
62 block = block_it.data();
63 std::unique_ptr<GAPMAP> gapmap(
new GAPMAP(block));
64 block_spacing_stats(block, gapmap.get(), old_text_ord_proportional, block_space_gap_width,
65 block_non_space_gap_width);
72 if (tosp_old_to_method && tosp_old_to_constrain_sp_kn &&
73 block_non_space_gap_width > block_space_gap_width / 3) {
74 block_non_space_gap_width = block_space_gap_width / 3;
79 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
82 if ((tosp_debug_level > 0) && !old_text_ord_proportional) {
83 tprintf(
"Block %d Row %d: Now Proportional\n", block_index, row_index);
85 row_spacing_stats(row, gapmap.get(), block_index, row_index, block_space_gap_width,
86 block_non_space_gap_width);
88 if ((tosp_debug_level > 0) && old_text_ord_proportional) {
89 tprintf(
"Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n", block_index,
93#ifndef GRAPHICS_DISABLED
// Gathers gap statistics over the rows of `block` and reports three results
// through the reference parameters: old_text_ord_proportional,
// block_space_gap_width and block_non_space_gap_width.
//   block   - block whose rows (block->get_rows()) are scanned.
//   gapmap  - forwarded to ignore_big_gap() when deciding whether a gap counts.
// NOTE(review): this view of the function has elided lines (declarations,
// else-branches, closing braces); the comments below describe only the
// statements that are visible.
108void Textord::block_spacing_stats(
TO_BLOCK *block,
GAPMAP *gapmap,
bool &old_text_ord_proportional,
109 int16_t &block_space_gap_width,
110 int16_t &block_non_space_gap_width
122 int16_t centre_to_centre;
124 float real_space_threshold;
125 float iqr_centre_to_centre;
126 float iqr_all_gap_stats;
131 TO_ROW_IT row_it(block->
get_rows());
// First pass over the rows: accumulate every usable gap width and the
// centre-to-centre distance between consecutive blob boxes.
132 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
138 blob_it.mark_cycle_pt();
// Right edge of the element one position before the iterator's current one;
// presumably the last blob of the row - TODO confirm list is cyclic here.
139 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
140 if (tosp_use_pre_chopping) {
142 }
else if (tosp_stats_use_xht_gaps) {
143 blob_box = reduced_box_next(row, &blob_it);
147 row_length = end_of_row - blob_box.
left();
// Track the narrowest blob box seen so far.
148 if (blob_box.
width() < minwidth) {
149 minwidth = blob_box.
width();
151 prev_blob_box = blob_box;
152 while (!blob_it.cycled_list()) {
153 if (tosp_use_pre_chopping) {
155 }
else if (tosp_stats_use_xht_gaps) {
156 blob_box = reduced_box_next(row, &blob_it);
160 if (blob_box.
width() < minwidth) {
161 minwidth = blob_box.
width();
// The gap is measured from the previous box's right edge to this box's left edge.
163 int16_t left = prev_blob_box.
right();
164 int16_t right = blob_box.
left();
165 gap_width = right - left;
// Gaps rejected by ignore_big_gap() are kept out of the gap statistics.
166 if (!ignore_big_gap(row, row_length, gapmap, left, right)) {
167 all_gap_stats.add(gap_width, 1);
// Half the difference of (left + right) edge sums of the two boxes, i.e. the
// distance between their horizontal midpoints (integer arithmetic).
169 centre_to_centre = (right + blob_box.
right() - (prev_blob_box.
left() + left)) / 2;
171 centre_to_centre_stats.add(centre_to_centre, 1);
174 prev_blob_box = blob_box;
// With at most one gap collected there is nothing to take a median of: the
// narrowest blob width stands in for the non-space gap and -1 marks the
// space gap width as unknown.
180 if (all_gap_stats.get_total() <= 1) {
181 block_non_space_gap_width = minwidth;
182 block_space_gap_width = -1;
184 old_text_ord_proportional =
true;
// Compare interquartile ranges (75th minus 25th percentile) of the
// centre-to-centre distances and of the gap widths.
187 iqr_centre_to_centre = centre_to_centre_stats.ile(0.75) - centre_to_centre_stats.ile(0.25);
188 iqr_all_gap_stats = all_gap_stats.ile(0.75) - all_gap_stats.ile(0.25);
189 old_text_ord_proportional = iqr_centre_to_centre * 2 > iqr_all_gap_stats;
// Non-space gap estimate: median of all gaps, rounded down.
201 block_non_space_gap_width =
static_cast<int16_t
>(floor(all_gap_stats.median()));
// Second pass over the rows: collect only the gaps that look like spaces.
204 row_it.set_to_list(block->
get_rows());
205 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
// Per-row threshold: the larger of a multiple of the non-space gap estimate
// and a multiple of the row's x-height.
210 real_space_threshold = std::max(tosp_init_guess_kn_mult * block_non_space_gap_width,
211 tosp_init_guess_xht_mult * row->
xheight);
213 blob_it.mark_cycle_pt();
214 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
215 if (tosp_use_pre_chopping) {
217 }
else if (tosp_stats_use_xht_gaps) {
218 blob_box = reduced_box_next(row, &blob_it);
// NOTE(review): the operands are reversed relative to the first pass in this
// function (end_of_row - blob_box.left()), so row_length is non-positive
// whenever end_of_row >= blob_box.left(). It is passed to ignore_big_gap()
// below - confirm whether this is intentional.
222 row_length = blob_box.
left() - end_of_row;
223 prev_blob_box = blob_box;
224 while (!blob_it.cycled_list()) {
225 if (tosp_use_pre_chopping) {
227 }
else if (tosp_stats_use_xht_gaps) {
228 blob_box = reduced_box_next(row, &blob_it);
232 int16_t left = prev_blob_box.
right();
233 int16_t right = blob_box.
left();
234 gap_width = right - left;
// Candidate space: wider than the row threshold and not rejected as a big gap.
235 if ((gap_width > real_space_threshold) &&
236 !ignore_big_gap(row, row_length, gapmap, left, right)) {
// Accept the gap when certain-space filtering is disabled, or when it passes
// one of the width / neighbouring-blob-shape rules below.
243 if (!tosp_block_use_cert_spaces ||
244 (gap_width > tosp_fuzzy_space_factor2 * row->
xheight) ||
245 ((gap_width > tosp_fuzzy_space_factor1 * row->
xheight) &&
246 (!tosp_narrow_blobs_not_cert ||
247 (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
248 (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
249 space_gap_stats.add(gap_width, 1);
252 prev_blob_box = blob_box;
// Two or fewer accepted space gaps: report the space width as unknown (-1).
257 if (space_gap_stats.get_total() <= 2) {
258 block_space_gap_width = -1;
// Space gap estimate: median of accepted space gaps (rounded down), but never
// less than three times the non-space gap estimate.
260 block_space_gap_width = std::max(
static_cast<int16_t
>(floor(space_gap_stats.median())),
261 static_cast<int16_t
>(3 * block_non_space_gap_width));
// Derives the per-row spacing values stored on `row` (kern_size, space_size,
// space_threshold, min_space, max_nonspace) from the gaps between the row's
// blobs, using the block-level estimates as a starting point / fallback.
//   row                        - row to analyse and update.
//   gapmap                     - forwarded to ignore_big_gap() and isolated_row_stats().
//   block_idx, row_idx         - used in debug messages and forwarded to isolated_row_stats().
//   block_space_gap_width      - block-level space estimate; a value <= 0 is
//                                treated as "no good estimate".
//   block_non_space_gap_width  - block-level non-space (kern) estimate.
// NOTE(review): this view of the function has elided lines (declarations,
// else-branches, closing braces); the comments describe only visible statements.
271void Textord::row_spacing_stats(TO_ROW *row, GAPMAP *gapmap, int16_t block_idx, int16_t row_idx,
272 int16_t block_space_gap_width,
273 int16_t block_non_space_gap_width
276 BLOBNBOX_IT blob_it = row->blob_list();
278 STATS cert_space_gap_stats(0,
MAXSPACING - 1);
284 int16_t real_space_threshold = 0;
287 int16_t large_gap_count = 0;
288 bool suspected_table;
289 int32_t max_max_nonspace;
290 bool good_block_space_estimate = block_space_gap_width > 0;
292 int32_t row_length = 0;
294 int32_t sane_threshold;
// Without a usable block estimate, fall back to half the row's x-height.
298 if (!good_block_space_estimate) {
299 block_space_gap_width = int16_t(std::floor(row->xheight / 2));
301 if (!row->blob_list()->empty()) {
// Initial space/non-space threshold: either biased between the two block
// estimates by tosp_threshold_bias1 (rounded), or their plain midpoint.
302 if (tosp_threshold_bias1 > 0) {
303 real_space_threshold =
304 block_non_space_gap_width +
305 int16_t(floor(0.5 + tosp_threshold_bias1 *
306 (block_space_gap_width - block_non_space_gap_width)));
308 real_space_threshold =
309 (block_space_gap_width + block_non_space_gap_width) / 2;
311 blob_it.set_to_list(row->blob_list());
312 blob_it.mark_cycle_pt();
// Right edge of the element one position before the iterator's current one;
// presumably the row's last blob - TODO confirm.
313 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
314 if (tosp_use_pre_chopping) {
316 }
else if (tosp_stats_use_xht_gaps) {
317 blob_box = reduced_box_next(row, &blob_it);
321 row_length = end_of_row - blob_box.left();
322 prev_blob_box = blob_box;
// Walk the remaining blobs, classifying each gap between consecutive boxes.
323 while (!blob_it.cycled_list()) {
324 if (tosp_use_pre_chopping) {
326 }
else if (tosp_stats_use_xht_gaps) {
327 blob_box = reduced_box_next(row, &blob_it);
331 int16_t left = prev_blob_box.right();
332 int16_t right = blob_box.left();
333 gap_width = right - left;
334 if (ignore_big_gap(row, row_length, gapmap, left, right)) {
// Gaps at or above the initial threshold are treated as spaces; those passing
// the width / blob-shape rules are additionally recorded as "certain" spaces.
337 if (gap_width >= real_space_threshold) {
338 if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
339 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
340 (!tosp_narrow_blobs_not_cert ||
341 (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
342 (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
343 cert_space_gap_stats.add(gap_width, 1);
345 all_space_gap_stats.add(gap_width, 1);
347 small_gap_stats.add(gap_width, 1);
349 all_gap_stats.add(gap_width, 1);
351 prev_blob_box = blob_box;
// A row is suspected to be a table when it has more than one large gap, or
// at least one large gap and only a few gap samples overall.
354 suspected_table = (large_gap_count > 1) ||
355 ((large_gap_count > 0) && (all_gap_stats.get_total() <= tosp_few_samples));
// Enough certain spaces (or a short / table-like row with at least one):
// estimate from the certain-space statistics.
359 if ((cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) ||
360 ((suspected_table || all_gap_stats.get_total() <= tosp_short_row) &&
361 cert_space_gap_stats.get_total() > 0)) {
362 old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats,
363 block_space_gap_width, block_non_space_gap_width);
// Otherwise try isolated_row_stats() (if enabled); the statements below run
// when that recovery is disabled or returns false.
365 if (!tosp_recovery_isolated_row_stats ||
366 !isolated_row_stats(row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) {
367 if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) {
368 tprintf(
"B:%d R:%d -- Inadequate certain spaces.\n", block_idx, row_idx);
// Use the block-level space estimate directly; kern comes from this row's
// median gap when there are enough samples, else from the block estimate.
370 if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
372 row->space_size = block_space_gap_width;
373 if (all_gap_stats.get_total() > tosp_redo_kern_limit) {
374 row->kern_size = all_gap_stats.median();
376 row->kern_size = block_non_space_gap_width;
378 row->space_threshold =
379 int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
381 old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats,
382 block_space_gap_width, block_non_space_gap_width);
387 if (tosp_improve_thresh && !suspected_table) {
388 improve_row_threshold(row, &all_gap_stats);
// Sanity method 0: for suspected tables, distrust a space size that is too
// small relative to the kern size.
393 if (tosp_sanity_method == 0) {
394 if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
395 if (tosp_debug_level > 5) {
396 tprintf(
"B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, row_idx,
397 row->kern_size, row->space_threshold, row->space_size);
399 row->space_threshold =
static_cast<int32_t
>(tosp_table_kn_sp_ratio * row->kern_size);
400 row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
402 }
// Sanity method 1: distrust a space size that is too close to the kern size
// (kern is treated as at least 2.5) or too close in absolute terms.
else if (tosp_sanity_method == 1) {
403 sane_space = row->space_size;
405 if ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
406 ((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) {
407 if (good_block_space_estimate &&
408 (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) {
409 sane_space = block_space_gap_width;
412 std::max(
static_cast<float>(tosp_min_sane_kn_sp) * std::max(row->kern_size, 2.5f),
413 row->xheight / 2.0f);
415 if (tosp_debug_level > 5) {
416 tprintf(
"B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", block_idx, row_idx,
417 row->kern_size, row->space_threshold, row->space_size, sane_space);
419 row->space_size = sane_space;
420 row->space_threshold =
421 int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
// Cap the threshold at a multiple of the (floored-at-2.5) kern size.
424 sane_threshold = int32_t(floor(tosp_max_sane_kn_thresh * std::max(row->kern_size, 2.5f)));
425 if (row->space_threshold > sane_threshold) {
426 if (tosp_debug_level > 5) {
427 tprintf(
"B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", block_idx, row_idx,
428 row->kern_size, row->space_threshold, row->space_size, sane_threshold);
430 row->space_threshold = sane_threshold;
// Keep space_size strictly above the (capped) threshold.
431 if (row->space_size <= sane_threshold) {
432 row->space_size = row->space_threshold + 1.0f;
436 if (suspected_table) {
438 std::max(tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight);
439 sane_threshold = int32_t(std::floor((sane_space + row->kern_size) / 2));
441 if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) {
442 if (tosp_debug_level > 5) {
443 tprintf(
"B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", block_idx, row_idx,
444 row->kern_size, row->space_threshold, row->space_size);
447 row->space_threshold =
static_cast<int32_t
>(sane_space);
448 row->space_size = std::max(row->space_threshold + 1.0f, row->xheight);
// Fuzzy limits: gaps between max_nonspace and min_space are the ambiguous
// band around space_threshold (as used by make_a_word_break()).
455 if (tosp_old_to_method) {
458 row->max_nonspace = row->space_threshold;
460 row->min_space = row->space_threshold + 1;
464 std::min(int32_t(ceil(tosp_fuzzy_space_factor * row->xheight)), int32_t(row->space_size));
// min_space must stay strictly above the threshold.
465 if (row->min_space <= row->space_threshold) {
467 row->min_space = row->space_threshold + 1;
// Upper bound for max_nonspace: midway between the threshold and kern size.
482 max_max_nonspace = int32_t((row->space_threshold + row->kern_size) / 2);
485 row->max_nonspace = max_max_nonspace;
// Scan the gap histogram up to that bound, tracking the tallest bin; a bin
// beyond kern_size holding under 10% of that peak sets max_nonspace.
486 for (index = 0; index <= max_max_nonspace; index++) {
487 if (all_gap_stats.pile_count(index) > max) {
488 max = all_gap_stats.pile_count(index);
490 if ((index > row->kern_size) && (all_gap_stats.pile_count(index) < 0.1 * max)) {
491 row->max_nonspace = index;
// Optionally raise min_space to a fraction of the way from the threshold to
// the space size.
500 if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) {
501 row->min_space = std::max(
502 row->min_space,
static_cast<int32_t
>(ceil(row->space_threshold +
503 tosp_fuzzy_sp_fraction *
504 (row->space_size - row->space_threshold))));
// Optionally raise min_space to a multiple of the kern size (tables, or all
// rows when tosp_fuzzy_limit_all is set).
514 if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) {
515 row->min_space = std::max(
516 row->min_space,
static_cast<int32_t
>(ceil(tosp_table_fuzzy_kn_sp_ratio * row->kern_size)));
// Optionally place max_nonspace a fraction of the way from the kern size to
// the threshold (rounded to nearest).
519 if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
520 row->max_nonspace =
static_cast<int32_t
>(floor(
521 0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size)));
// max_nonspace may never exceed the threshold.
523 if (row->max_nonspace > row->space_threshold) {
525 row->max_nonspace = row->space_threshold;
528 if (tosp_debug_level > 5) {
530 "B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) "
532 block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width,
533 real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold,
534 row->min_space, row->space_size);
536 if (tosp_debug_level > 10) {
538 "row->kern_size = %3.2f, row->space_size = %3.2f, "
539 "row->space_threshold = %d\n",
540 row->kern_size, row->space_size, row->space_threshold);
// Sets row->space_size, row->kern_size and row->space_threshold from the
// supplied gap statistics, constrained by the block-level estimates.
//   all_gap_stats    - statistics of all gaps in the row (kern fallback).
//   space_gap_stats  - statistics of gaps believed to be spaces.
//   small_gap_stats  - statistics of small gaps (optional kern source).
//   block_space_gap_width / block_non_space_gap_width - block-level estimates.
// NOTE(review): this view of the function has elided lines (else-branches,
// closing braces); the comments describe only visible statements.
544void Textord::old_to_method(TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats,
545 STATS *small_gap_stats,
546 int16_t block_space_gap_width,
547 int16_t block_non_space_gap_width
// Enough space samples: use their median.
551 if (space_gap_stats->get_total() >= tosp_enough_space_samples_for_median) {
555 row->space_size = space_gap_stats->median();
// Limit a space size that exceeds 1.5x the block estimate; which of the two
// assignments below applies depends on tosp_old_to_bug_fix.
556 if (row->space_size > block_space_gap_width * 1.5) {
557 if (tosp_old_to_bug_fix) {
558 row->space_size = block_space_gap_width * 1.5;
561 row->space_size = block_space_gap_width;
// Lower bound: twice the block non-space gap, plus one.
564 if (row->space_size < (block_non_space_gap_width * 2) + 1) {
565 row->space_size = (block_non_space_gap_width * 2) + 1;
// At least one sample but not enough for a median: use the mean, with the
// same upper limit and a stricter lower bound (three times non-space, plus one).
569 else if (space_gap_stats->get_total() >= 1) {
571 row->space_size = space_gap_stats->mean();
572 if (row->space_size > block_space_gap_width * 1.5) {
573 if (tosp_old_to_bug_fix) {
574 row->space_size = block_space_gap_width * 1.5;
577 row->space_size = block_space_gap_width;
580 if (row->space_size < (block_non_space_gap_width * 3) + 1) {
581 row->space_size = (block_non_space_gap_width * 3) + 1;
585 row->space_size = block_space_gap_width;
// Kern size: median of small gaps (if enabled and enough samples), else median
// of all gaps (if enough samples), else the block non-space estimate.
589 if ((tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total() > tosp_redo_kern_limit)) {
590 row->kern_size = small_gap_stats->median();
591 }
else if (all_gap_stats->get_total() > tosp_redo_kern_limit) {
592 row->kern_size = all_gap_stats->median();
594 row->kern_size = block_non_space_gap_width;
// Threshold: biased between kern and space by tosp_threshold_bias2 (rounded
// to nearest), or the plain midpoint rounded down.
598 if (tosp_threshold_bias2 > 0) {
599 row->space_threshold = int32_t(
600 floor(0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size)));
610 row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
// When constraining is enabled under sanity method 1 and the space/kern pair
// looks implausible, shrink the kern size (only if above 2.5) and recompute
// the threshold.
616 if (tosp_old_to_constrain_sp_kn && tosp_sanity_method == 1 &&
617 ((row->space_size < tosp_min_sane_kn_sp * std::max(row->kern_size, 2.5f)) ||
618 ((row->space_size - row->kern_size) < tosp_silly_kn_sp_gap * row->xheight))) {
619 if (row->kern_size > 2.5) {
620 row->kern_size = row->space_size / tosp_min_sane_kn_sp;
622 row->space_threshold =
623 int32_t(floor((row->space_size + row->kern_size) / tosp_old_sp_kn_th_factor));
// Attempts to estimate row->kern_size, row->space_size and row->space_threshold
// from this row's own gaps, without the block-level estimates.
//   row              - row to analyse and update.
//   gapmap           - forwarded to ignore_big_gap().
//   all_gap_stats    - statistics of all gaps in the row (read only here).
//   suspected_table  - allows the certain-space mean to be used with few samples.
//   block_idx, row_idx - used in debug messages only.
// Returns bool; the return statements are elided in this view - callers use a
// false result to fall back to other estimates (see row_spacing_stats).
// NOTE(review): this view of the function has elided lines; the comments
// describe only visible statements.
631bool Textord::isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats,
632 bool suspected_table, int16_t block_idx, int16_t row_idx) {
634 float crude_threshold_estimate;
635 int16_t small_gaps_count;
638 BLOBNBOX_IT blob_it = row->blob_list();
639 STATS cert_space_gap_stats(0,
MAXSPACING - 1);
// Crude first guess: kern = median gap; threshold = larger of a multiple of
// that kern and a multiple of the x-height.
648 kern_estimate = all_gap_stats->median();
649 crude_threshold_estimate =
650 std::max(tosp_init_guess_kn_mult * kern_estimate, tosp_init_guess_xht_mult * row->xheight);
// Count of gaps below the (rounded-up) crude threshold; presumably assigned to
// small_gaps_count on an elided line - TODO confirm.
652 stats_count_under(all_gap_stats,
static_cast<int16_t
>(std::ceil(crude_threshold_estimate)));
653 total = all_gap_stats->get_total();
// Not enough data: too few gaps, too small a share of small gaps, or no gap
// at or above the crude threshold.
655 if ((total <= tosp_redo_kern_limit) ||
656 ((small_gaps_count /
static_cast<float>(total)) < tosp_enough_small_gaps) ||
657 (total - small_gaps_count < 1)) {
658 if (tosp_debug_level > 5) {
659 tprintf(
"B:%d R:%d -- Can't do isolated row stats.\n", block_idx, row_idx);
663 blob_it.set_to_list(row->blob_list());
664 blob_it.mark_cycle_pt();
665 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
666 if (tosp_use_pre_chopping) {
668 }
else if (tosp_stats_use_xht_gaps) {
669 blob_box = reduced_box_next(row, &blob_it);
673 row_length = end_of_row - blob_box.left();
674 prev_blob_box = blob_box;
// Re-walk the row, splitting gaps into space and small-gap statistics using
// the crude threshold.
675 while (!blob_it.cycled_list()) {
676 if (tosp_use_pre_chopping) {
678 }
else if (tosp_stats_use_xht_gaps) {
679 blob_box = reduced_box_next(row, &blob_it);
683 int16_t left = prev_blob_box.right();
684 int16_t right = blob_box.left();
685 gap_width = right - left;
686 if (!ignore_big_gap(row, row_length, gapmap, left, right) &&
687 (gap_width > crude_threshold_estimate)) {
// Gaps passing the width / blob-shape rules are additionally "certain" spaces.
688 if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
689 ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
690 (!tosp_narrow_blobs_not_cert ||
691 (!narrow_blob(row, prev_blob_box) && !narrow_blob(row, blob_box)))) ||
692 (wide_blob(row, prev_blob_box) && wide_blob(row, blob_box))) {
693 cert_space_gap_stats.add(gap_width, 1);
695 all_space_gap_stats.add(gap_width, 1);
697 if (gap_width < crude_threshold_estimate) {
698 small_gap_stats.add(gap_width, 1);
701 prev_blob_box = blob_box;
// Space size, in order of preference: median of certain spaces; mean of
// certain spaces (tables only); median of all spaces; mean of all spaces.
703 if (cert_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
705 row->space_size = cert_space_gap_stats.median();
706 }
else if (suspected_table && (cert_space_gap_stats.get_total() > 0)) {
708 row->space_size = cert_space_gap_stats.mean();
710 }
else if (all_space_gap_stats.get_total() >= tosp_enough_space_samples_for_median) {
712 row->space_size = all_space_gap_stats.median();
714 row->space_size = all_space_gap_stats.mean();
// Kern size: median of small gaps or of all gaps, depending on the setting.
717 if (tosp_only_small_gaps_for_kern) {
718 row->kern_size = small_gap_stats.median();
720 row->kern_size = all_gap_stats->median();
// Threshold: midpoint of space and kern sizes, rounded down.
722 row->space_threshold = int32_t(std::floor((row->space_size + row->kern_size) / 2));
// Sanity check: require kern < threshold < space and threshold > 0; on
// failure the three row values are zeroed.
724 if ((row->kern_size >= row->space_threshold) || (row->space_threshold >= row->space_size) ||
725 (row->space_threshold <= 0)) {
726 if (tosp_debug_level > 5) {
727 tprintf(
"B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", block_idx, row_idx,
728 row->kern_size, row->space_threshold, row->space_size);
730 row->kern_size = 0.0f;
731 row->space_threshold = 0;
732 row->space_size = 0.0f;
736 if (tosp_debug_level > 5) {
737 tprintf(
"B:%d R:%d -- Isolated row stats: %f %d %f\n", block_idx, row_idx, row->kern_size,
738 row->space_threshold, row->space_size);
// Accumulates stats->pile_count(i) for every i in [0, threshold), i.e. the
// number of samples strictly below `threshold`.
// NOTE(review): the declarations of `index`/`total` and the return statement
// are elided in this view; presumably `total` is returned - TODO confirm.
743int16_t Textord::stats_count_under(STATS *stats, int16_t threshold) {
747 for (index = 0; index < threshold; index++) {
748 total += stats->pile_count(index);
// Tries to move row->space_threshold into an empty stretch of the gap
// histogram lying between the row's kern size and space size.
//   row            - row whose space_threshold may be adjusted; space_size and
//                    kern_size are read as `sp` and `kn`.
//   all_gap_stats  - histogram of all gap widths in the row.
// A run of zero-count bins is searched for between ceil(kn) and floor(sp); it
// must be at least reqd_zero_width bins wide. If the current threshold lies
// below the run it is raised to zero_start; if it lies above `index` it is
// lowered to `index`.
// Fix: the level-10 debug message ended in "/n" (a literal slash and 'n')
// instead of the newline escape "\n", so the line was never terminated.
// NOTE(review): this view of the function has elided lines (early exits,
// else-branches, closing braces); the comments describe only visible statements.
768void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
769 float sp = row->space_size;
770 float kn = row->kern_size;
771 int16_t reqd_zero_width = 0;
772 int16_t zero_width = 0;
773 int16_t zero_start = 0;
776 if (tosp_debug_level > 10) {
777 tprintf(
"Improve row threshold 0");
// Preconditions for attempting an improvement: more than 25 gaps, a space size
// above 10 and above 3x kern, and at least 75% of the gaps lying below
// roughly a third of the way from kn to sp. The body of this test is elided;
// presumably it returns early - TODO confirm.
779 if ((all_gap_stats->get_total() <= 25) || (sp <= 10) || (sp <= 3 * kn) ||
780 (stats_count_under(all_gap_stats,
static_cast<int16_t
>(ceil(kn + (sp - kn) / 3 + 0.5))) <
781 (0.75 * all_gap_stats->get_total()))) {
784 if (tosp_debug_level > 10) {
// Required width of the empty run: a third of the kn..sp distance, rounded.
792 reqd_zero_width =
static_cast<int16_t
>(floor((sp - kn) / 3 + 0.5));
793 if (reqd_zero_width < 3) {
// Scan histogram bins from ceil(kn) up to (not including) floor(sp) looking
// for a sufficiently long run of empty bins.
797 for (index = int16_t(std::ceil(kn)); index < int16_t(std::floor(sp)); index++) {
798 if (all_gap_stats->pile_count(index) == 0) {
799 if (zero_width == 0) {
804 if (zero_width >= reqd_zero_width) {
812 if (tosp_debug_level > 10) {
813 tprintf(
" reqd_z_width: %d found %d 0's, starting %d; thresh: %d\n", reqd_zero_width,
814 zero_width, zero_start, row->space_threshold);
// Nothing to do when no wide-enough run was found, or the threshold already
// lies within [zero_start, index].
816 if ((zero_width < reqd_zero_width) ||
817 ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) {
820 if (tosp_debug_level > 10) {
// Threshold below the empty run: raise it to the start of the run.
823 if (row->space_threshold < zero_start) {
824 if (tosp_debug_level > 5) {
825 tprintf(
"Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start,
826 index, row->space_threshold, zero_start);
828 row->space_threshold = zero_start;
// Threshold above `index`: lower it to `index`.
830 if (row->space_threshold > index) {
831 if (tosp_debug_level > 5) {
832 tprintf(
"Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start,
833 index, row->space_threshold, index);
835 row->space_threshold = index;
853 bool fuzzy_sp =
false;
854 bool fuzzy_non =
false;
856 bool prev_gap_was_a_space =
false;
857 bool break_at_next_gap =
false;
859 C_OUTLINE_IT cout_it;
861 C_BLOB_IT cblob_it = &cblobs;
864 int32_t next_rep_char_word_right = INT32_MAX;
865 float repetition_spacing;
871 int16_t prev_gap = INT16_MAX;
872 int16_t current_gap = INT16_MAX;
873 int16_t next_gap = INT16_MAX;
874 int16_t prev_within_xht_gap = INT16_MAX;
875 int16_t current_within_xht_gap = INT16_MAX;
876 int16_t next_within_xht_gap = INT16_MAX;
877 int16_t word_count = 0;
881 if (!rep_char_it.empty()) {
882 next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
886 cblob_it.set_to_list(&cblobs);
889 WERD_IT word_it(&words);
892 prev_fuzzy_sp =
false;
893 prev_fuzzy_non =
false;
894 if (!box_it.empty()) {
895 xstarts[0] = box_it.data()->bounding_box().left();
896 if (xstarts[0] > next_rep_char_word_right) {
898 word = rep_char_it.extract();
899 word_it.add_after_then_move(word);
909 repetition_spacing = find_mean_blob_spacing(word);
910 current_gap = box_it.data()->bounding_box().left() - next_rep_char_word_right;
911 current_within_xht_gap = current_gap;
912 if (current_gap > tosp_rep_space * repetition_spacing) {
913 prev_blanks =
static_cast<uint8_t
>(std::floor(current_gap / row->
space_size));
914 if (prev_blanks < 1) {
920 if (tosp_debug_level > 5) {
921 tprintf(
"Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
922 box_it.data()->bounding_box().left(), box_it.data()->bounding_box().bottom(),
923 repetition_spacing, current_gap);
925 prev_fuzzy_sp =
false;
926 prev_fuzzy_non =
false;
927 if (rep_char_it.empty()) {
928 next_rep_char_word_right = INT32_MAX;
930 rep_char_it.forward();
931 next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
935 peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
937 auto bblob = box_it.data();
938 auto blob_box = bblob->bounding_box();
939 if (bblob->joined_to_prev()) {
940 auto cblob = bblob->remove_cblob();
941 if (cblob !=
nullptr) {
942 cout_it.set_to_list(cblob_it.data()->out_list());
943 cout_it.move_to_last();
944 cout_it.add_list_after(cblob->out_list());
948 auto cblob = bblob->cblob();
949 if (cblob !=
nullptr) {
950 bblob->set_owns_cblob(
false);
951 cblob_it.add_after_then_move(cblob);
953 prev_x = blob_box.right();
956 bblob = box_it.data();
957 blob_box = bblob->bounding_box();
959 if (!bblob->joined_to_prev() && bblob->cblob() !=
nullptr) {
961 prev_gap = current_gap;
962 prev_within_xht_gap = current_within_xht_gap;
963 prev_blob_box = next_blob_box;
964 current_gap = next_gap;
965 current_within_xht_gap = next_within_xht_gap;
966 peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap);
968 int16_t prev_gap_arg = prev_gap;
969 int16_t next_gap_arg = next_gap;
970 if (tosp_only_use_xht_gaps) {
971 prev_gap_arg = prev_within_xht_gap;
972 next_gap_arg = next_within_xht_gap;
975 if (blob_box.left() > next_rep_char_word_right ||
976 make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box, current_gap,
977 current_within_xht_gap, next_blob_box, next_gap_arg, blanks, fuzzy_sp,
978 fuzzy_non, prev_gap_was_a_space, break_at_next_gap) ||
981 word =
new WERD(&cblobs, prev_blanks,
nullptr);
983 word_it.add_after_then_move(word);
991 }
else if (prev_fuzzy_non) {
996 if (blob_box.left() > next_rep_char_word_right) {
998 word = rep_char_it.extract();
999 word_it.add_after_then_move(word);
1002 repetition_spacing = find_mean_blob_spacing(word);
1004 current_within_xht_gap = current_gap;
1005 if (current_gap > tosp_rep_space * repetition_spacing) {
1006 blanks =
static_cast<uint8_t
>(std::floor(current_gap / row->
space_size));
1013 if (tosp_debug_level > 5) {
1014 tprintf(
"Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
1016 repetition_spacing, current_gap, blanks);
1025 current_gap = blob_box.left() - next_rep_char_word_right;
1026 if (current_gap > tosp_rep_space * repetition_spacing) {
1027 blanks =
static_cast<uint8_t
>(current_gap / row->
space_size);
1034 if (tosp_debug_level > 5) {
1035 tprintf(
" Rgap:%d (%d blanks)\n", current_gap, blanks);
1040 if (rep_char_it.empty()) {
1041 next_rep_char_word_right = INT32_MAX;
1043 rep_char_it.forward();
1044 next_rep_char_word_right = rep_char_it.data()->bounding_box().right();
1048 if (box_it.at_first() && rep_char_it.empty()) {
1051 xstarts[1] = prev_x;
1053 prev_blanks = blanks;
1054 prev_fuzzy_sp = fuzzy_sp;
1055 prev_fuzzy_non = fuzzy_non;
1059 }
while (!box_it.at_first());
1062 while (!rep_char_it.empty()) {
1063 word = rep_char_it.extract();
1064 word_it.add_after_then_move(word);
1067 repetition_spacing = find_mean_blob_spacing(word);
1069 if (current_gap > tosp_rep_space * repetition_spacing) {
1070 blanks =
static_cast<uint8_t
>(std::floor(current_gap / row->
space_size));
1077 if (tosp_debug_level > 5) {
1078 tprintf(
"Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n",
1080 current_gap, blanks);
1087 if (rep_char_it.empty()) {
1090 xstarts[1] = prev_x;
1092 rep_char_it.forward();
1097 word_it.set_to_list(real_row->
word_list());
1099 word_it.add_list_after(&words);
1102 if (tosp_debug_level > 4) {
1103 tprintf(
"Row: Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
1123 C_OUTLINE_IT cout_it;
1125 C_BLOB_IT cblob_it = &cblobs;
1129 int16_t word_count = 0;
1131 cblob_it.set_to_list(&cblobs);
1134 WERD_IT word_it(&words);
1136 if (!box_it.empty()) {
1138 auto bblob = box_it.data();
1139 auto blob_box = bblob->bounding_box();
1140 if (bblob->joined_to_prev()) {
1141 auto cblob = bblob->remove_cblob();
1142 if (cblob !=
nullptr) {
1143 cout_it.set_to_list(cblob_it.data()->out_list());
1144 cout_it.move_to_last();
1145 cout_it.add_list_after(cblob->out_list());
1149 auto cblob = bblob->cblob();
1150 if (cblob !=
nullptr) {
1151 bblob->set_owns_cblob(
false);
1152 cblob_it.add_after_then_move(cblob);
1156 bblob = box_it.data();
1157 blob_box = bblob->bounding_box();
1159 if (!bblob->joined_to_prev() && !cblobs.empty()) {
1160 word =
new WERD(&cblobs, 1,
nullptr);
1162 word_it.add_after_then_move(word);
1167 if (box_it.at_first()) {
1171 }
while (!box_it.at_first());
1175 word_it.set_to_list(real_row->
word_list());
1177 word_it.add_list_after(&words);
1179 if (tosp_debug_level > 4) {
1180 tprintf(
"Row:Made %d words in row ((%d,%d)(%d,%d))\n", word_count,
// Decides whether the gap before the current blob is a word break, applying a
// series of context rules to gaps that fall in the ambiguous band around
// row->space_threshold (between row->max_nonspace and row->min_space).
//   prev_gap, prev_blob_box  - gap and box preceding the current gap.
//   real_current_gap         - full gap width before the current blob.
//   within_xht_current_gap   - gap width measured within the x-height band.
//   next_blob_box, next_gap  - box and gap following the current gap.
// Outputs (by reference): blanks, fuzzy_sp, fuzzy_non, prev_gap_was_a_space,
// break_at_next_gap.
// Each rule that fires reports itself via mark_gap() with a rule number
// (20, 21, 22, 1, 2, 3, 4, 6, 7, 8, 9 in the visible calls) when graphics are
// compiled in.
// NOTE(review): this view of the function has many elided lines (including
// the `blob_box` parameter, the `space` local and the return statement); the
// comments describe only visible statements.
1189bool Textord::make_a_word_break(
TO_ROW *row,
1191 int16_t prev_gap,
TBOX prev_blob_box, int16_t real_current_gap,
1192 int16_t within_xht_current_gap,
TBOX next_blob_box,
1193 int16_t next_gap, uint8_t &blanks,
bool &fuzzy_sp,
bool &fuzzy_non,
1194 bool &prev_gap_was_a_space,
bool &break_at_next_gap) {
1196 int16_t current_gap;
1197 float fuzzy_sp_to_kn_limit;
// A break requested by the previous call is consumed here: the flag is reset.
1199 if (break_at_next_gap) {
1200 break_at_next_gap =
false;
// For gaps small relative to the kern size, the within-x-height measurement is
// replaced by the real gap.
1210 ((tosp_dont_fool_with_small_kerns >= 0) &&
1211 (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size))) {
1213 within_xht_current_gap = real_current_gap;
// Choose which gap measurement drives the decision.
1216 if (tosp_use_xht_gaps && tosp_only_use_xht_gaps) {
1217 current_gap = within_xht_current_gap;
1219 current_gap = real_current_gap;
1222 if (tosp_old_to_method) {
1225 if (space && (current_gap < INT16_MAX)) {
1226 if (current_gap < row->min_space) {
// Blank count: gap width divided by the row's space size, truncated to uint8_t.
1241 blanks =
static_cast<uint8_t
>(current_gap / row->
space_size);
1254 prev_gap_was_a_space =
true;
1265 int num_blanks = current_gap;
// Clamp the blank count into [1, UINT8_MAX] before narrowing.
1269 blanks =
static_cast<uint8_t
>(ClipToRange<int>(num_blanks, 1, UINT8_MAX));
1277 if (tosp_use_xht_gaps && (real_current_gap <= row->max_nonspace) &&
1281#ifndef GRAPHICS_DISABLED
1282 mark_gap(blob_box, 20, prev_gap, prev_blob_box.
width(), current_gap, next_blob_box.
width(),
1285 }
else if (tosp_use_xht_gaps && (real_current_gap <= row->space_threshold) &&
1288 if (tosp_flip_fuzz_kn_to_sp) {
1293#ifndef GRAPHICS_DISABLED
1294 mark_gap(blob_box, 21, prev_gap, prev_blob_box.
width(), current_gap, next_blob_box.
width(),
1297 }
else if (tosp_use_xht_gaps && (real_current_gap < row->min_space) &&
1298 (within_xht_current_gap >= row->
min_space)) {
1300#ifndef GRAPHICS_DISABLED
1301 mark_gap(blob_box, 22, prev_gap, prev_blob_box.
width(), current_gap, next_blob_box.
width(),
1304 }
// Punctuation rule: a non-punctuation blob followed by a suspected
// punctuation blob requests a break at the following gap.
else if (tosp_force_wordbreak_on_punct && !suspected_punct_blob(row, prev_blob_box) &&
1305 suspected_punct_blob(row, blob_box)) {
1306 break_at_next_gap =
true;
// Fuzzy-space band: gap is above the threshold but below min_space.
1309 else if ((current_gap < row->min_space) && (current_gap > row->
space_threshold)) {
1311 if (tosp_pass_wide_fuzz_sp_to_context > 0) {
1312 fuzzy_sp_to_kn_limit =
// Large limit so the `current_gap > fuzzy_sp_to_kn_limit` tests below
// effectively never pass.
1315 fuzzy_sp_to_kn_limit = 99999.0f;
// Rules 1-4 and 6 (per the mark_gap calls): context tests involving a narrow
// neighbouring blob and the relative size of the adjacent gap.
1320 if ((prev_blob_box.
width() > 0) && narrow_blob(row, prev_blob_box) && prev_gap_was_a_space &&
1321 (current_gap <= tosp_gap_factor * prev_gap)) {
1322 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1323 if (tosp_flip_fuzz_sp_to_kn) {
1331#ifndef GRAPHICS_DISABLED
1332 mark_gap(blob_box, 1, prev_gap, prev_blob_box.
width(), current_gap, next_blob_box.
width(),
1338 else if ((prev_blob_box.
width() > 0) && narrow_blob(row, prev_blob_box) &&
1339 !prev_gap_was_a_space && (current_gap * tosp_gap_factor <= prev_gap)) {
1340 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1341 if (tosp_flip_fuzz_sp_to_kn) {
1349#ifndef GRAPHICS_DISABLED
1350 mark_gap(blob_box, 2, prev_gap, prev_blob_box.
width(), current_gap, next_blob_box.
width(),
1353 }
else if ((next_blob_box.
width() > 0) && narrow_blob(row, next_blob_box) &&
1354 (next_gap > row->
space_threshold) && (current_gap <= tosp_gap_factor * next_gap)) {
1355 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1356 if (tosp_flip_fuzz_sp_to_kn) {
1364#ifndef GRAPHICS_DISABLED
1365 mark_gap(blob_box, 3, prev_gap, prev_blob_box.
width(), current_gap, next_blob_box.
width(),
1368 }
else if ((next_blob_box.
width() > 0) && narrow_blob(row, next_blob_box) &&
1369 (next_gap <= row->space_threshold) &&
1370 (current_gap * tosp_gap_factor <= next_gap)) {
1371 if ((tosp_all_flips_fuzzy) || (current_gap > fuzzy_sp_to_kn_limit)) {
1372 if (tosp_flip_fuzz_sp_to_kn) {
1380#ifndef GRAPHICS_DISABLED
1381 mark_gap(blob_box, 4, prev_gap, prev_blob_box.
width(), current_gap, next_blob_box.
width(),
1384 }
else if ((((next_blob_box.
width() > 0) && narrow_blob(row, next_blob_box)) ||
1385 ((prev_blob_box.
width() > 0) && narrow_blob(row, prev_blob_box)))) {
1387#ifndef GRAPHICS_DISABLED
1388 mark_gap(blob_box, 6, prev_gap, prev_blob_box.
width(), current_gap, next_blob_box.
width(),
1392 }
// Fuzzy-kern band: gap is above max_nonspace but not above the threshold.
// Rules 7-9 compare it against the larger of the neighbouring gaps.
else if ((current_gap > row->
max_nonspace) && (current_gap <= row->space_threshold)) {
1400 if ((prev_blob_box.
width() > 0) && (next_blob_box.
width() > 0) &&
1401 (current_gap >= tosp_kern_gap_factor1 * std::max(prev_gap, next_gap)) &&
1402 wide_blob(row, prev_blob_box) && wide_blob(row, next_blob_box)) {
1409 if ((tosp_flip_fuzz_kn_to_sp) &&
1415#ifndef GRAPHICS_DISABLED
1416 mark_gap(blob_box, 7, prev_gap, prev_blob_box.
width(), current_gap, next_blob_box.
width(),
1419 }
else if (prev_blob_box.
width() > 0 && next_blob_box.
width() > 0 &&
1421 current_gap >= tosp_kern_gap_factor2 * std::max(prev_gap, next_gap) &&
1422 !(narrow_blob(row, prev_blob_box) || suspected_punct_blob(row, prev_blob_box)) &&
1423 !(narrow_blob(row, next_blob_box) || suspected_punct_blob(row, next_blob_box))) {
1426#ifndef GRAPHICS_DISABLED
1427 mark_gap(blob_box, 8, prev_gap, prev_blob_box.
width(), current_gap, next_blob_box.
width(),
1430 }
else if ((tosp_kern_gap_factor3 > 0) && (prev_blob_box.
width() > 0) &&
1431 (next_blob_box.
width() > 0) &&
1432 (current_gap >= tosp_kern_gap_factor3 * std::max(prev_gap, next_gap)) &&
1433 (!tosp_rule_9_test_punct || (!suspected_punct_blob(row, prev_blob_box) &&
1434 !suspected_punct_blob(row, next_blob_box)))) {
1437#ifndef GRAPHICS_DISABLED
1438 mark_gap(blob_box, 9, prev_gap, prev_blob_box.
width(), current_gap, next_blob_box.
width(),
1443 if (tosp_debug_level > 10) {
1445 "word break = %d current_gap = %d, prev_gap = %d, "
1447 space ? 1 : 0, current_gap, prev_gap, next_gap);
// Remember for the next call whether this gap was a definite (non-fuzzy) space.
1449 prev_gap_was_a_space = space && !(fuzzy_non);
// Tests whether `blob_box` counts as narrow for `row`: its width is at most
// tosp_narrow_fraction of the row's x-height, or its width/height ratio is at
// most tosp_narrow_aspect_ratio.
// NOTE(review): the statement head (presumably a declaration/return of the
// result) is elided in this view; only the condition is visible.
1454bool Textord::narrow_blob(TO_ROW *row,
TBOX blob_box) {
1457 ((blob_box.width() <= tosp_narrow_fraction * row->xheight) ||
// Cast to float so the aspect ratio is not computed with integer division.
1458 ((
static_cast<float>(blob_box.width()) / blob_box.height()) <= tosp_narrow_aspect_ratio));
// Wideness test for a blob's bounding box, relative to the row's xheight.
// Three alternative tests are visible:
//   - width >= tosp_wide_fraction * xheight AND
//     width / height > tosp_wide_aspect_ratio;
//   - width >= tosp_wide_fraction * xheight alone;
//   - the negation of narrow_blob(row, blob_box).
// NOTE(review): the else-branches that select between these tests, and the
// final return, are not visible here. Presumably the second test is used when
// tosp_wide_aspect_ratio is not > 0 and the third when tosp_wide_fraction is
// not > 0 - confirm against the full source.
1462bool Textord::wide_blob(TO_ROW *row,
TBOX blob_box) {
1464 if (tosp_wide_fraction > 0) {
1465 if (tosp_wide_aspect_ratio > 0) {
1467 ((blob_box.width() >= tosp_wide_fraction * row->xheight) &&
1468 ((
// The width is cast to float so the aspect ratio is computed in floating
// point rather than by integer division.
static_cast<float>(blob_box.width()) / blob_box.height()) > tosp_wide_aspect_ratio));
// Width-only test (no aspect-ratio constraint).
1470 result = (blob_box.width() >= tosp_wide_fraction * row->xheight);
// Fallback: "wide" is defined as "not narrow".
1473 result = !narrow_blob(row, blob_box);
// Flags a box as a suspected punctuation blob from its size and vertical
// position relative to the row.
// result is true when any of the following holds:
//   - box.height() <= 0.66 * row->xheight;
//   - box.top() is below (baseline + xheight / 2);
//   - box.bottom() is above (baseline + xheight / 2).
// NOTE(review): the declarations of result and baseline, and the return
// statement, are not visible here; presumably result is returned - confirm.
1478bool Textord::suspected_punct_blob(TO_ROW *row,
TBOX box) {
1481 float blob_x_centre;
// Evaluate the row's baseline at the horizontal centre of the box.
1483 blob_x_centre = (box.right() + box.left()) / 2.0;
1484 baseline = row->baseline.y(blob_x_centre);
1486 result = (box.height() <= 0.66 * row->xheight) || (box.top() <
baseline + row->xheight / 2.0) ||
1487 (box.bottom() >
baseline + row->xheight / 2.0);
// Looks ahead from box_it and reports the gap that follows the next blob.
// box_it is taken by value, so advancing it here does not move the caller's
// iterator. Results are written through the reference parameters:
//   next_gap            - bit_beyond.left() - next_blob_box.right()
//   next_within_xht_gap - the same measurement using boxes obtained from
//                         reduced_box_next()
// Both gaps are set to INT16_MAX when box_it.at_first() is true.
// NOTE(review): the statement that assigns next_blob_box, the declaration of
// bit_beyond, and the else separating the two cases are not visible here;
// presumably next_blob_box is filled by advancing box_it before the
// at_first() check - confirm.
1491void Textord::peek_at_next_gap(TO_ROW *row, BLOBNBOX_IT box_it,
TBOX &next_blob_box,
1492 int16_t &next_gap, int16_t &next_within_xht_gap) {
1493 TBOX next_reduced_blob_box;
// Second iterator, starting at the same position, used for the reduced boxes.
1495 BLOBNBOX_IT reduced_box_it = box_it;
1498 next_reduced_blob_box = reduced_box_next(row, &reduced_box_it);
// at_first(): presumably the iterator has wrapped round, so no blob follows.
1499 if (box_it.at_first()) {
1500 next_gap = INT16_MAX;
1501 next_within_xht_gap = INT16_MAX;
// Otherwise measure from the right edge of the next blob to the left edge of
// the blob beyond it, first with full boxes and then with reduced boxes.
1503 bit_beyond = box_it.data()->bounding_box();
1504 next_gap = bit_beyond.left() - next_blob_box.right();
1505 bit_beyond = reduced_box_next(row, &reduced_box_it);
1506 next_within_xht_gap = bit_beyond.left() - next_reduced_blob_box.right();
// Debug aid, compiled only when GRAPHICS_DISABLED is not defined.
// Visible behaviour: when tosp_debug_level > 5, prints one line containing a
// position, the rule number, and the gap/width values passed in.
// NOTE(review): the `rule` value used in the tprintf is not declared in the
// visible part of the parameter list; callers pass an integer as the second
// argument (e.g. 6, 7, 8, 9), so presumably it is a parameter between `blob`
// and `prev_gap` - confirm.
1510#ifndef GRAPHICS_DISABLED
1511void Textord::mark_gap(
TBOX blob,
1513 int16_t prev_gap, int16_t prev_blob_width, int16_t current_gap,
1514 int16_t next_blob_width, int16_t next_gap) {
// NOTE(review): the next three lines are trailing arguments of a call whose
// head is not visible here (presumably a drawing call on a debug window).
// They describe a point half a gap to the left of the blob, at the blob's
// vertical midpoint, computed in float.
1567 blob.height() / 2.0f,
1569 blob.left() - current_gap / 2.0f,
1571 blob.bottom() + blob.height() / 2.0f);
1573 if (tosp_debug_level > 5) {
// The x value printed here uses integer division (current_gap / 2), unlike
// the float expression above.
1574 tprintf(
" (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", blob.left() - current_gap / 2,
1575 blob.bottom(), rule, prev_gap, prev_blob_width, current_gap, next_blob_width, next_gap);
// Computes the mean horizontal gap between successive blobs of a word.
// Walks word->cblob_list(), accumulating (left edge of the current blob -
// right edge of the previous blob) into gap_sum, and returns
// gap_sum / gap_count as a float when gap_count > 0.
// NOTE(review): the increment of gap_count, any advance of the iterator before
// the loop, and the value returned when gap_count == 0 are not visible here -
// confirm against the full source.
1580float Textord::find_mean_blob_spacing(WERD *word) {
1583 int32_t gap_sum = 0;
1584 int16_t gap_count = 0;
1587 cblob_it.set_to_list(word->cblob_list());
// An empty blob list is skipped entirely.
1588 if (!cblob_it.empty()) {
1589 cblob_it.mark_cycle_pt();
// Seed prev_right with the right edge of the blob at the cycle point.
1590 prev_right = cblob_it.data()->bounding_box().right();
1593 for (; !cblob_it.cycled_list(); cblob_it.forward()) {
1594 blob_box = cblob_it.data()->bounding_box();
1595 gap_sum += blob_box.left() - prev_right;
1597 prev_right = blob_box.right();
// Guard against dividing by zero; the cast makes the division floating point.
1600 if (gap_count > 0) {
1601 return (gap_sum /
static_cast<float>(gap_count));
// Decides whether the gap spanning [left, right] in a row is treated as a
// "big gap" to be ignored. All thresholds are multiples of row->xheight.
// Visible checks, in order:
//   - tosp_ignore_big_gaps > 999: special case (outcome not visible here);
//   - tosp_ignore_big_gaps > 0: returns gap > tosp_ignore_big_gaps * xheight;
//   - gap > tosp_ignore_very_big_gaps * xheight;
//   - when tosp_ignore_big_gaps == 0:
//       gap > 2.1 * xheight and row_length > 20 * xheight;
//       gap > 1.75 * xheight and (row_length > 35 * xheight or
//         gapmap->table_gap(left, right));
//   - gap > gapmap_big_gaps * xheight and gapmap->table_gap(left, right).
// NOTE(review): the end of the parameter list (which must declare `right`)
// and the values returned by every branch except the second are not visible
// here - confirm against the full source.
1607bool Textord::ignore_big_gap(TO_ROW *row, int32_t row_length, GAPMAP *gapmap, int16_t left,
1609 int16_t gap = right - left + 1;
1611 if (tosp_ignore_big_gaps > 999) {
1614 if (tosp_ignore_big_gaps > 0) {
1615 return (gap > tosp_ignore_big_gaps * row->xheight);
1617 if (gap > tosp_ignore_very_big_gaps * row->xheight) {
1620 if (tosp_ignore_big_gaps == 0) {
1621 if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight)) {
1624 if ((gap > 1.75 * row->xheight) &&
1625 ((row_length > 35 * row->xheight) || gapmap->table_gap(left, right))) {
1631 if ((gap >
gapmap_big_gaps * row->xheight) && gapmap->table_gap(left, right)) {
// Produces a "reduced" bounding box for the next blob group in a row, built
// from reduced_box_for_blob(), and stores it on head_blob via
// set_reduced_box().
// NOTE(review): the rest of the parameter list, the `do` keywords that open
// both do/while loops, the iterator advances, the assignments of blob and
// head_blob, and the return statement are not visible here. Treat the notes
// below as descriptions of the visible lines only.
1646TBOX Textord::reduced_box_next(TO_ROW *row,
1650 BLOBNBOX *head_blob;
1653 int16_t left_above_xht;
1654 int16_t new_left_above_xht;
// Use the stored reduced box when the blob already has one.
1657 if (blob->red_box_set()) {
1658 reduced_box = blob->reduced_box();
// Loop while the blob has no cblob or is joined to its predecessor.
1662 }
while (blob->cblob() ==
nullptr || blob->joined_to_prev());
// No stored box: start from this blob's full box and its reduced box.
1666 full_box = blob->bounding_box();
1667 reduced_box = reduced_box_for_blob(blob, row, &left_above_xht);
// Blobs without a cblob only extend full_box; blobs joined to the previous
// one also extend reduced_box and may lower left_above_xht.
1671 if (blob->cblob() ==
nullptr) {
1673 full_box += blob->bounding_box();
1674 }
else if (blob->joined_to_prev()) {
1675 reduced_box += reduced_box_for_blob(blob, row, &new_left_above_xht);
1676 left_above_xht = std::min(left_above_xht, new_left_above_xht);
1680 while (blob->cblob() ==
nullptr || blob->joined_to_prev());
// If the reduced box is non-empty, taller than 0.7 * xheight, and the point
// tosp_near_lh_edge of the way across it lies left of left_above_xht, the
// full box replaces the reduced one (line 1691 below).
1682 if ((reduced_box.width() > 0) &&
1683 ((reduced_box.left() + tosp_near_lh_edge * reduced_box.width()) < left_above_xht) &&
1684 (reduced_box.height() > 0.7 * row->xheight)) {
1685#ifndef GRAPHICS_DISABLED
1691 reduced_box = full_box;
// Store the result on the head blob; the red_box_set() check above reads it.
1693 head_blob->set_reduced_box(reduced_box);
// Computes a reduced bounding box for a single blob and reports, through
// *left_above_xht, a left limit for the blob.
// The returned TBOX keeps blob_box's bottom and top, and uses
// floor(left_limit) and ceil(right_limit) as its horizontal extent.
// NOTE(review): the calls that update left_limit, right_limit and junk between
// each sentinel initialisation and the following comparison are not visible
// here (presumably outline-limit searches over vertical bands around the
// baseline), nor are the bodies of two of the three comparisons - confirm.
1717TBOX Textord::reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, int16_t *left_above_xht) {
1719 float blob_x_centre;
// Evaluate the row's baseline at the horizontal centre of the blob.
1727 blob_box = blob->bounding_box();
1728 blob_x_centre = (blob_box.left() + blob_box.right()) / 2.0;
1729 baseline = row->baseline.y(blob_x_centre);
// Sentinels: left_limit starts at the maximum and junk at the minimum, so
// left_limit > junk still holds afterwards if neither value was changed.
1735 left_limit =
static_cast<float>(INT32_MAX);
1736 junk =
static_cast<float>(-INT32_MAX);
// Presumably "nothing found": report INT16_MAX as the left limit.
1739 if (left_limit > junk) {
1740 *left_above_xht = INT16_MAX;
// Otherwise report the limit rounded down to an integer coordinate.
1742 *left_above_xht =
static_cast<int16_t
>(std::floor(left_limit));
// Same sentinel pattern for a second left-limit search.
1748 left_limit =
static_cast<float>(INT32_MAX);
1749 junk =
static_cast<float>(-INT32_MAX);
1752 if (left_limit > junk) {
// Mirrored sentinels for the right-limit search.
1758 junk =
static_cast<float>(INT32_MAX);
1759 right_limit =
static_cast<float>(-INT32_MAX);
1762 if (junk > right_limit) {
// Round outwards (floor on the left, ceil on the right).
1766 return TBOX(ICOORD(
static_cast<int16_t
>(std::floor(left_limit)), blob_box.bottom()),
1767 ICOORD(
static_cast<int16_t
>(std::ceil(right_limit)), blob_box.top()));
@ W_FUZZY_NON
fuzzy nonspace
void tprintf(const char *format,...)
int IntCastRounded(double x)
void find_cblob_hlimits(C_BLOB *blob, float bottomy, float topy, float &xmin, float &xmax)
void plot_word_decisions(ScrollView *win, int16_t pitch, TO_ROW *row)
bool textord_show_initial_words
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
TBOX box_next(BLOBNBOX_IT *it)
BLOBNBOX_LIST * blob_list()
PITCH_TYPE pitch_decision
void recalc_bounding_box()
TBOX bounding_box() const
TDimension bottom() const
void set_flag(WERD_FLAGS mask, bool value)
TBOX bounding_box() const
void set_blanks(uint8_t new_blanks)
ROW * make_prop_words(TO_ROW *row, FCOORD rotation)
void to_spacing(ICOORD page_tr, TO_BLOCK_LIST *blocks)
ROW * make_blob_words(TO_ROW *row, FCOORD rotation)
void Ellipse(int x, int y, int width, int height)