21# include "config_auto.h"
// File-local tunables declared through the BOOL_VAR / INT_VAR / double_VAR
// macros. Each macro takes (name, val, comment); the trailing string is the
// parameter's help text.

// Boolean, val = true. Guards the branch of the skew-gathering code that
// stores a row's line_m() into the gradients list repeatedly (once per
// remaining blob_count after division by row_err) instead of once per row.
// NOTE(review): presumably this weights longer rows more heavily, as the help
// text suggests -- confirm against the full function.
60static BOOL_VAR(textord_biased_skewcalc,
true,
"Bias skew estimates with line length");
// Boolean, val = true. One of the conditions (together with
// last_x - left_x > block->line_size * 2) under which block_skew is scaled by
// the blob's left edge position relative to the last_x - left_x span.
61static BOOL_VAR(textord_interpolating_skew,
true,
"Interpolate across gaps");
// Integer, val = 4. Added to row_count * textord_skew_lag in the denominator
// of one of the two visible smooth_factor computations made after a new
// TO_ROW is created.
62static INT_VAR(textord_skewsmooth_offset, 4,
"For smooth factor");
// Integer, val = 1. Same role as textord_skewsmooth_offset, but used in the
// other smooth_factor computation (the branch that adds the new row with
// add_after_then_move only).
63static INT_VAR(textord_skewsmooth_offset2, 1,
"For smooth factor");
// Integer, val = 4. Appears as the right-hand side of the condition guarding
// the code that moves a blob to the large-blob list (debug message
// "Large blob overlaps %d blobs at:").
// NOTE(review): the left-hand side of that comparison is not visible in this
// excerpt; presumably a CountOverlaps() result -- confirm.
69static INT_VAR(textord_max_blob_overlaps, 4,
"Max number of blobs a big blob can overlap");
// Floating point, val = 1.0. Multiplied with block->line_size in the
// row-expansion code.
77static double_VAR(textord_expansion_factor, 1.0,
"Factor to expand rows by in expand_rows");
// Floating point, val = 0.375. Scales rowsize to form the threshold that the
// part of a blob not covered by its best row (top - bottom - bestover) is
// compared against.
78static double_VAR(textord_overlap_x, 0.375,
"Fraction of linespace for good overlap");
85 "Min blob height/top to include blob top into xheight stats");
// Floating point, val = 0.08. Per its help text, the minimum pile height
// required to make a descender height. Its point of use is not visible in this
// excerpt.
88static double_VAR(textord_descheight_mode_fraction, 0.08,
"Min pile height to make descheight");
// Compile-time constant with value 12.
// NOTE(review): presumably the capacity of the modes array handed to
// compute_height_modes() (its maxmodes argument) -- the defining use is not
// visible here; confirm.
98#define MAX_HEIGHT_MODES 12
107static int row_y_order(
111 const TO_ROW *row1 = *
reinterpret_cast<const TO_ROW *
const *
>(item1);
113 const TO_ROW *row2 = *
reinterpret_cast<const TO_ROW *
const *
>(item2);
115 if (row1->parallel_c() > row2->parallel_c()) {
117 }
else if (row1->parallel_c() < row2->parallel_c()) {
129static int row_spacing_order(
131 const TO_ROW *row2) {
132 return row1->spacing < row2->spacing;
137static float MakeRowFromBlobs(
float line_size, BLOBNBOX_IT *blob_it, TO_ROW_IT *row_it) {
139 blob_it->move_to_first();
140 TO_ROW *row =
nullptr;
141 float total_size = 0.0f;
144 for (; !blob_it->empty(); blob_it->forward()) {
145 BLOBNBOX *blob = blob_it->extract();
146 int top = blob->bounding_box().top();
147 int bottom = blob->bounding_box().bottom();
148 if (row ==
nullptr) {
149 row =
new TO_ROW(blob, top, bottom, line_size);
150 row_it->add_before_then_move(row);
152 row->add_blob(blob, top, bottom, line_size);
154 total_size += top - bottom;
157 return blob_count > 0 ? total_size / blob_count : total_size;
162static float MakeRowFromSubBlobs(TO_BLOCK *block, C_BLOB *blob, TO_ROW_IT *row_it) {
164 BLOBNBOX_IT bb_it(&block->small_blobs);
165 C_OUTLINE_IT ol_it(blob->out_list());
167 ol_it.set_to_list(ol_it.data()->child());
171 for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) {
175 blob->CheckInverseFlagAndDirection();
176 auto *bbox =
new BLOBNBOX(blob);
177 bb_it.add_after_then_move(bbox);
180 return MakeRowFromBlobs(block->line_size, &bb_it, row_it);
191 TO_BLOCK_LIST *blocks) {
192 BLOBNBOX_IT blob_it = &block->
blobs;
193 TO_ROW_IT row_it = block->
get_rows();
199 if (block->
blobs.singleton() && allow_sub_blobs) {
200 blob_it.move_to_first();
201 float size = MakeRowFromSubBlobs(block, blob_it.data()->cblob(), &row_it);
205 }
else if (block->
blobs.empty()) {
210 blob_it.add_after_then_move(bblob);
212 MakeRowFromBlobs(block->
line_size, &blob_it, &row_it);
214 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
232 TO_BLOCK_IT block_it;
234 block_it.set_to_list(port_blocks);
235 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
240 block_it.set_to_list(port_blocks);
241 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
243 block_it.data()->block->pdblk.bounding_box().left(),
260 TO_ROW_IT row_it = block->
get_rows();
262#ifndef GRAPHICS_DISABLED
273 row_it.move_to_first();
274 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
277#ifndef GRAPHICS_DISABLED
280 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
301 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
302 const TBOX &box = blob_it.data()->bounding_box();
305 double error = lms.
Fit(&m, &c);
316 TO_BLOCK_LIST *blocks,
325 TO_BLOCK_IT block_it = blocks;
329 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
330 POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block();
331 if (pb !=
nullptr && !pb->
IsText()) {
334 row_count += block_it.data()->get_rows()->length();
336 TO_ROW_IT row_it(block_it.data()->get_rows());
337 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
338 blob_count += row_it.data()->blob_list()->length();
341 if (row_count == 0) {
347 std::vector<float> gradients(blob_count);
349 std::vector<float> errors(blob_count);
352 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
353 POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block();
354 if (pb !=
nullptr && !pb->
IsText()) {
357 TO_ROW_IT row_it(block_it.data()->get_rows());
358 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
361 row_err =
static_cast<int32_t
>(std::ceil(row->
line_error()));
365 if (textord_biased_skewcalc) {
366 blob_count /= row_err;
367 for (blob_count /= row_err; blob_count > 0; blob_count--) {
368 gradients[row_index] = row->
line_m();
374 gradients[row_index] = row->
line_m();
380 if (row_index == 0) {
382 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
383 POLY_BLOCK *pb = block_it.data()->block->pdblk.poly_block();
384 if (pb !=
nullptr && !pb->
IsText()) {
387 TO_ROW_IT row_it(block_it.data()->get_rows());
388 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
390 gradients[row_index] = row->
line_m();
396 row_count = row_index;
398 gradients.resize(row_count);
399 std::nth_element(gradients.begin(), gradients.begin() + row_index, gradients.end());
400 page_m = gradients[row_index];
402 errors.resize(row_count);
403 std::nth_element(errors.begin(), errors.begin() + row_index, errors.end());
404 page_err = errors[row_index];
415 const TBOX &ibox =
i->bounding_box();
419 int overlap = std::min(dotbox.
right(), ibox.
right()) - std::max(dotbox.
left(), ibox.
left());
421 (overlap * 2 < ibox.
width() && overlap < dotbox.
width())) {
433 const double kHeightFraction = 0.6;
434 double target_height = std::min(dotbox.
bottom(), ibox.
top());
436 target_height *= kHeightFraction;
437 int left_min = dotbox.
left() - dotbox.
width();
438 int middle = (dotbox.
left() + dotbox.
right()) / 2;
439 int right_max = dotbox.
right() + dotbox.
width();
444 bool found_left =
false;
445 bool found_right =
false;
446 bool in_left =
false;
447 bool in_right =
false;
449 C_OUTLINE_IT o_it = blob->
out_list();
450 for (o_it.mark_cycle_pt(); !o_it.cycled_list(); o_it.forward()) {
454 for (
int step = 0; step < length; pos += outline->
step(step++)) {
457 if (
x >= left_min &&
x < middle && !found_left) {
467 left_maxy = left_miny =
y;
470 }
else if (in_left) {
472 if (left_maxy - left_miny > target_height) {
480 if (x <= right_max && x > middle && !found_right) {
483 if (
y > right_maxy) {
486 if (
y < right_miny) {
490 right_maxy = right_miny =
y;
493 }
else if (in_right) {
495 if (right_maxy - right_miny > target_height) {
509 TO_ROW_IT row_it = block->
get_rows();
510 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
511 TO_ROW *row = row_it.data();
515 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
521 STATS hstats(0, max_height);
522 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
529 float xheight = hstats.
median();
532 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
537 if (prev !=
nullptr) {
538 if (dot_of_i(blob, prev, row)) {
542 if (!b_it.at_last()) {
544 if (dot_of_i(blob,
next, row)) {
550 delete b_it.extract();
572 BLOBNBOX_IT blob_it = &block->
blobs;
573 TO_ROW_IT row_it = block->
get_rows();
575#ifndef GRAPHICS_DISABLED
587 expand_rows(page_tr, block, gradient, rotation, block_edge, testing_on);
588 blob_it.set_to_list(&block->
blobs);
589 row_it.set_to_list(block->
get_rows());
590 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
591 blob_it.add_list_after(row_it.data()->blob_list());
596 blob_it.set_to_list(&block->
blobs);
600 blob_it.set_to_list(&block->
blobs);
628 TO_ROW_IT row_it = block->
get_rows();
629 BLOBNBOX_IT blob_it = &block->
blobs;
631 if (row_it.empty()) {
637 min_y = block_box.
bottom() - 1;
638 max_y = block_box.
top() + 1;
639 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
640 line_index =
static_cast<int32_t
>(std::floor(row_it.data()->intercept()));
641 if (line_index <= min_y) {
642 min_y = line_index - 1;
644 if (line_index >= max_y) {
645 max_y = line_index + 1;
648 line_count = max_y - min_y + 1;
649 if (line_count <= 0) {
653 std::vector<int32_t> deltas(line_count);
655 std::vector<int32_t> occupation(line_count);
663 max_y - min_y + 1, &occupation[0], &deltas[0]);
664#ifndef GRAPHICS_DISABLED
666 draw_occupation(xleft, ybottom, min_y, max_y, &occupation[0], &deltas[0]);
670 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
672 line_index =
static_cast<int32_t
>(std::floor(row->
intercept()));
673 distance = deltas[line_index - min_y];
676#ifndef GRAPHICS_DISABLED
681 blob_it.add_list_after(row_it.data()->blob_list());
682 delete row_it.extract();
685 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
686 blob_it.add_list_after(row_it.data()->blob_list());
720 if (abs_dist > dist_limit) {
722 tprintf(
" too far - deleting\n");
726 if ((distance < 0 && !row_it->at_last()) || (
distance >= 0 && !row_it->at_first())) {
727 row_offset = row_inc;
729 next_row = row_it->data_relative(row_offset);
730 next_index =
static_cast<int32_t
>(std::floor(next_row->
intercept()));
731 if ((
distance < 0 && next_index < line_index &&
733 (
distance >= 0 && next_index > line_index &&
736 tprintf(
" nearer neighbour (%d) at %g\n", line_index +
distance - next_index,
740 }
else if (next_index == line_index || next_index == line_index +
distance +
distance) {
743 tprintf(
" equal but more believable at %g (%g/%g)\n", next_row->
intercept(),
749 row_offset += row_inc;
750 }
while ((next_index == line_index || next_index == line_index +
distance +
distance) &&
751 row_offset < row_it->length());
773 TO_ROW_IT row_it = block->
get_rows();
778 length = std::sqrt(gradient * gradient + 1);
779 rotation =
FCOORD(1 / length, -gradient / length);
780 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
783 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
784 blob = blob_it.data();
786 blob_box.
rotate(rotation);
811 TO_ROW_IT row_it = block->
get_rows();
818 line_count = max_y - min_y + 1;
819 length = std::sqrt(gradient * gradient + 1);
820 rotation =
FCOORD(1 / length, -gradient / length);
821 for (line_index = 0; line_index < line_count; line_index++) {
822 deltas[line_index] = 0;
824 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
827 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
828 blob = blob_it.data();
830 blob_box.
rotate(rotation);
831 int32_t width = blob_box.
right() - blob_box.
left();
832 index = blob_box.
bottom() - min_y;
835 deltas[index] += width;
836 index = blob_box.
top() - min_y;
838 deltas[index] -= width;
841 occupation[0] = deltas[0];
842 for (line_index = 1; line_index < line_count; line_index++) {
843 occupation[line_index] = occupation[line_index - 1] + deltas[line_index];
869 if (low_window + high_window < line_count) {
870 for (sum = 0, high_index = 0; high_index < low_window; high_index++) {
871 sum += occupation[high_index];
873 for (low_index = 0; low_index < high_window; low_index++, high_index++) {
874 sum += occupation[high_index];
876 min_occ = occupation[0];
878 for (test_index = 1; test_index < high_index; test_index++) {
879 if (occupation[test_index] <= min_occ) {
880 min_occ = occupation[test_index];
881 min_index = test_index;
884 for (line_index = 0; line_index < low_window; line_index++) {
885 thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
888 for (low_index = 0; high_index < line_count; low_index++, high_index++) {
889 sum -= occupation[low_index];
890 sum += occupation[high_index];
891 if (occupation[high_index] <= min_occ) {
893 min_occ = occupation[high_index];
894 min_index = high_index;
897 if (min_index <= low_index) {
898 min_occ = occupation[low_index + 1];
899 min_index = low_index + 1;
900 for (test_index = low_index + 2; test_index <= high_index; test_index++) {
901 if (occupation[test_index] <= min_occ) {
902 min_occ = occupation[test_index];
904 min_index = test_index;
908 thresholds[line_index++] = (sum - min_occ) / divisor + min_occ;
911 min_occ = occupation[0];
913 for (sum = 0, low_index = 0; low_index < line_count; low_index++) {
914 if (occupation[low_index] < min_occ) {
915 min_occ = occupation[low_index];
916 min_index = low_index;
918 sum += occupation[low_index];
922 for (; line_index < line_count; line_index++) {
923 thresholds[line_index] = (sum - min_occ) / divisor + min_occ;
942 int32_t prev_threshold;
949 prev_threshold = thresholds[line_index];
953 }
while (line_index < line_count && (occupation[line_index] < thresholds[line_index] ||
954 occupation[line_index - 1] >= prev_threshold));
955 if (line_index < line_count) {
956 back_index = line_index - 1;
958 while (next_dist < -distance && back_index >= 0) {
959 thresholds[back_index] = next_dist;
966 }
while (line_index < line_count);
986 float y_bottom, y_top;
990 BLOBNBOX_IT blob_it = &block->
blobs;
991 TO_ROW_IT row_it = block->
get_rows();
993#ifndef GRAPHICS_DISABLED
1018 row_it.move_to_last();
1020 row = row_it.data();
1021 y_max = row->
max_y();
1022 y_min = row->
min_y();
1026 block->
line_size * textord_expansion_factor *
1028 if (y_min > y_bottom) {
1030 tprintf(
"Expanding bottom of row at %f from %f to %f\n", row->
intercept(), y_min, y_bottom);
1033 swallowed_row =
true;
1034 while (swallowed_row && !row_it.at_last()) {
1035 swallowed_row =
false;
1037 test_row = row_it.data_relative(1);
1039 if (test_row->
max_y() > y_bottom) {
1040 if (test_row->
min_y() > y_bottom) {
1045#ifndef GRAPHICS_DISABLED
1051 blob_it.add_list_after(test_row->
blob_list());
1053 delete row_it.extract();
1055 swallowed_row =
true;
1056 }
else if (test_row->
max_y() < y_min) {
1058 y_bottom = test_row->
max_y();
1060 tprintf(
"Truncating limit to %f due to touching row at %f\n", y_bottom,
1066 tprintf(
"Not expanding limit beyond %f due to touching row at %f\n", y_bottom,
1074 if (y_max < y_top) {
1076 tprintf(
"Expanding top of row at %f from %f to %f\n", row->
intercept(), y_max, y_top);
1078 swallowed_row =
true;
1079 while (swallowed_row && !row_it.at_first()) {
1080 swallowed_row =
false;
1082 test_row = row_it.data_relative(-1);
1083 if (test_row->
min_y() < y_top) {
1084 if (test_row->
max_y() < y_top) {
1090#ifndef GRAPHICS_DISABLED
1095 blob_it.add_list_after(test_row->
blob_list());
1097 delete row_it.extract();
1099 swallowed_row =
true;
1100 }
else if (test_row->
min_y() < y_max) {
1102 y_top = test_row->
min_y();
1104 tprintf(
"Truncating limit to %f due to touching row at %f\n", y_top,
1110 tprintf(
"Not expanding limit beyond %f due to touching row at %f\n", y_top,
1121 }
while (!row_it.at_last());
1136 TO_ROW_IT row_it = block->
get_rows();
1142 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1143 row = row_it.data();
1147 row->
max_y(), size);
1171 TO_ROW_IT row_it = block->
get_rows();
1173 int16_t rowcount = row_it.length();
1175 std::vector<TO_ROW *> rows(rowcount);
1178 row_it.move_to_last();
1180 row = row_it.data();
1181 if (prev_row !=
nullptr) {
1182 rows[rowcount++] = prev_row;
1195 }
while (!row_it.at_last());
1203 rows.resize(rowcount);
1204 row_index = rowcount * 3 / 4;
1205 std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order);
1206 iqr = rows[row_index]->spacing;
1207 row_index = rowcount / 4;
1208 std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order);
1209 iqr -= rows[row_index]->spacing;
1210 row_index = rowcount / 2;
1211 std::nth_element(rows.begin(), rows.begin() + row_index, rows.end(), row_spacing_order);
1212 block->
key_row = rows[row_index];
1214 tprintf(
" row based=%g(%g)", rows[row_index]->spacing, iqr);
1218 if (rows[row_index]->spacing < block->line_spacing &&
1219 rows[row_index]->spacing > block->
line_size) {
1221 block->
line_size = rows[row_index]->spacing;
1223 }
else if (rows[row_index]->spacing > block->
line_spacing) {
1228 if (rows[row_index]->spacing < block->line_spacing) {
1229 block->
line_size = rows[row_index]->spacing;
1244 tprintf(
"\nEstimate line size=%g, spacing=%g, offset=%g\n", block->
line_size,
1282 int32_t min_height, max_height;
1283 TO_ROW_IT row_it = block->
get_rows();
1284 if (row_it.empty()) {
1291 STATS row_asc_xheights(min_height, max_height);
1292 STATS row_asc_ascrise(
static_cast<int>(min_height * asc_frac_xheight),
1293 static_cast<int>(max_height * asc_frac_xheight));
1294 int min_desc_height =
static_cast<int>(min_height * desc_frac_xheight);
1295 int max_desc_height =
static_cast<int>(max_height * desc_frac_xheight);
1296 STATS row_asc_descdrop(min_desc_height, max_desc_height);
1297 STATS row_desc_xheights(min_height, max_height);
1298 STATS row_desc_descdrop(min_desc_height, max_desc_height);
1299 STATS row_cap_xheights(min_height, max_height);
1300 STATS row_cap_floating_xheights(min_height, max_height);
1301 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1302 row = row_it.data();
1316 fill_heights(row, gradient, min_height, max_height, &row_cap_xheights,
1317 &row_cap_floating_xheights);
1321 float xheight = 0.0;
1322 float ascrise = 0.0;
1323 float descdrop = 0.0;
1327 xheight = row_asc_xheights.
median();
1328 ascrise = row_asc_ascrise.
median();
1329 descdrop = -row_asc_descdrop.
median();
1330 }
else if (row_desc_xheights.
get_total() > 0) {
1332 xheight = row_desc_xheights.
median();
1333 descdrop = -row_desc_descdrop.
median();
1334 }
else if (row_cap_xheights.
get_total() > 0) {
1343 &row_cap_xheights, &row_cap_floating_xheights,
1345 max_height, &(xheight), &(ascrise));
1353 bool corrected_xheight =
false;
1356 corrected_xheight =
true;
1358 if (corrected_xheight || ascrise <= 0) {
1359 ascrise = xheight * asc_frac_xheight;
1361 if (corrected_xheight || descdrop >= 0) {
1362 descdrop = -(xheight * desc_frac_xheight);
1367 tprintf(
"Block average xheight=%.4f, ascrise=%.4f, descdrop=%.4f\n", xheight, ascrise,
1371 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1387 int block_line_size) {
1395 int min_height, max_height;
1397 STATS heights(min_height, max_height);
1398 STATS floating_heights(min_height, max_height);
1399 fill_heights(row, gradient, min_height, max_height, &heights, &floating_heights);
1403 &heights, &floating_heights, textord_single_height_mode && rotation.
y() == 0.0, min_height,
1419 STATS *floating_heights) {
1426 if (blob_it.empty()) {
1431 blob = blob_it.data();
1439 top -= gradient * xcentre + row->
parallel_c();
1441 if (top >= min_height && top <= max_height) {
1442 heights->
add(
static_cast<int32_t
>(floor(top + 0.5)), 1);
1444 floating_heights->
add(
static_cast<int32_t
>(floor(top + 0.5)), 1);
1452 while (!blob_it.at_first() && blob_it.data()->repeated_set() == repeated_set) {
1455 tprintf(
"Skipping repeated char when computing xheight\n");
1461 }
while (!blob_it.at_first());
1481 int min_height,
int max_height,
float *xheight,
float *ascrise) {
1482 int blob_index = heights->
mode();
1483 int blob_count = heights->
pile_count(blob_index);
1485 tprintf(
"min_height=%d, max_height=%d, mode=%d, count=%d, total=%d\n", min_height, max_height,
1486 blob_index, blob_count, heights->
get_total());
1488 floating_heights->
print();
1490 if (blob_count == 0) {
1494 bool in_best_pile =
false;
1495 int prev_size = -INT32_MAX;
1498 if (cap_only && mode_count > 1) {
1503 tprintf(
"found %d modes: ", mode_count);
1504 for (
x = 0;
x < mode_count;
x++) {
1510 for (
x = 0;
x < mode_count - 1;
x++) {
1511 if (modes[
x] != prev_size + 1) {
1512 in_best_pile =
false;
1516 (in_best_pile || modes_x_count > best_count)) {
1517 for (
int asc =
x + 1; asc < mode_count; asc++) {
1518 float ratio =
static_cast<float>(modes[asc]) /
static_cast<float>(modes[
x]);
1521 if (modes_x_count > best_count) {
1522 in_best_pile =
true;
1523 best_count = modes_x_count;
1526 tprintf(
"X=%d, asc=%d, count=%d, ratio=%g\n", modes[
x], modes[asc] - modes[
x],
1527 modes_x_count, ratio);
1529 prev_size = modes[
x];
1530 *xheight =
static_cast<float>(modes[
x]);
1531 *ascrise =
static_cast<float>(modes[asc] - modes[
x]);
1536 if (*xheight == 0) {
1543 if (floating_heights->
get_total() > 0) {
1544 for (
x = min_height;
x < max_height; ++
x) {
1547 blob_index = heights->
mode();
1548 for (
x = min_height;
x < max_height; ++
x) {
1552 *xheight =
static_cast<float>(blob_index);
1554 best_count = heights->
pile_count(blob_index);
1556 tprintf(
"Single mode xheight set to %g\n", *xheight);
1559 tprintf(
"Multi-mode xheight set to %g, asc=%g\n", *xheight, *ascrise);
1577 STATS *asc_heights) {
1587 int num_potential_asc = 0;
1588 for (
int i = i_min;
i <= i_max; ++
i) {
1597 STATS heights(min_height, max_height);
1598 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1599 blob = blob_it.data();
1603 if (height >= min_height && height <= max_height) {
1604 heights.
add(
static_cast<int>(floor(height + 0.5)), 1);
1608 int blob_index = heights.
mode();
1609 int blob_count = heights.
pile_count(blob_index);
1611 if (
static_cast<float>(blob_count + num_potential_asc) < xheight_blob_count * total_fraction) {
1614 int descdrop = blob_count > 0 ? -blob_index : 0;
1616 tprintf(
"Descdrop: %d (potential ascenders %d, descenders %d)\n", descdrop, num_potential_asc,
1637 int32_t least_count;
1638 int32_t least_index;
1641 src_count = max_height + 1 - min_height;
1643 least_count = INT32_MAX;
1645 for (src_index = 0; src_index < src_count; src_index++) {
1646 pile_count = heights->
pile_count(min_height + src_index);
1647 if (pile_count > 0) {
1648 if (dest_count < maxmodes) {
1649 if (pile_count < least_count) {
1651 least_count = pile_count;
1652 least_index = dest_count;
1654 modes[dest_count++] = min_height + src_index;
1655 }
else if (pile_count >= least_count) {
1656 while (least_index < maxmodes - 1) {
1657 modes[least_index] = modes[least_index + 1];
1662 modes[maxmodes - 1] = min_height + src_index;
1663 if (pile_count == least_count) {
1665 least_index = maxmodes - 1;
1669 for (dest_count = 1; dest_count < maxmodes; dest_count++) {
1670 pile_count = heights->
pile_count(modes[dest_count]);
1671 if (pile_count < least_count) {
1673 least_count = pile_count;
1674 least_index = dest_count;
1694 "correcting row xheight: row->xheight %.4f"
1695 ", row->acrise %.4f row->descdrop %.4f\n",
1713 (row_category ==
ROW_UNKNOWN && normal_xheight)) {
1715 tprintf(
"using average xheight\n");
1726 tprintf(
"lowercase, corrected ascrise\n");
1743 if (row->
xheight < xheight + ascrise && row->
xheight > xheight) {
1746 tprintf(
"all caps with irregular xheight\n");
1756 "corrected row->xheight = %.4f, row->acrise = %.4f, row->descdrop"
1762static int CountOverlaps(
const TBOX &box,
int min_height, BLOBNBOX_LIST *blobs) {
1764 BLOBNBOX_IT blob_it(blobs);
1765 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1766 BLOBNBOX *blob = blob_it.data();
1767 const TBOX &blob_box = blob->bounding_box();
1768 if (blob_box.height() >= min_height && box.major_overlap(blob_box)) {
1792 BLOBNBOX_IT blob_it;
1796 TO_ROW_IT row_it = block->
get_rows();
1800 length = std::sqrt(1 + gradient * gradient);
1801 g_vec =
FCOORD(1 / length, -gradient / length);
1802 blob_rotation =
FCOORD(rotation.
x(), -rotation.
y());
1803 blob_rotation.
rotate(g_vec);
1804 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1805 row = row_it.data();
1808 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1809 blob = blob_it.data();
1819 under_it.add_after_then_move(blob_it.extract());
1821 tprintf(
"Underlined blob at:");
1827 textord_max_blob_overlaps) {
1828 large_it.add_after_then_move(blob_it.extract());
1830 tprintf(
"Large blob overlaps %d blobs at:",
1831 CountOverlaps(blob_box, min_blob_height, row->
blob_list()));
1835 delete rotated_blob;
1852#ifndef GRAPHICS_DISABLED
1859 BLOBNBOX_IT blob_it;
1860 BLOBNBOX_IT start_it;
1861 TO_ROW_IT row_it = block->
get_rows();
1863#ifndef GRAPHICS_DISABLED
1867 blob_rotation =
FCOORD(rotation.
x(), -rotation.
y());
1868 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1870 blob_it.set_to_list(row_it.data()->blob_list());
1871 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1872 blob = blob_it.data();
1886 if (!blob_it.at_last()) {
1887 nextblob = blob_it.data_relative(1);
1890 blob->
merge(nextblob);
1896 blob->
chop(&start_it, &blob_it, blob_rotation,
1900#ifndef GRAPHICS_DISABLED
1906 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1907 blob = blob_it.data();
1909 blob_box.
rotate(rotation);
1935#ifndef GRAPHICS_DISABLED
1938 TO_ROW_IT row_it = block->
get_rows();
1940 row_it.move_to_first();
1941 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1942 if (row_it.data()->blob_list()->empty()) {
1943 delete row_it.extract();
1948#ifndef GRAPHICS_DISABLED
1951 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
1960 row_it.sort(row_y_order);
1977 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1978 if (!blob_it.data()->joined_to_prev()) {
1979 const TBOX &box = blob_it.data()->bounding_box();
1987 error = lms.
Fit(&gradient, &c);
2001#ifndef GRAPHICS_DISABLED
2003 if (testing_on &&
to_win ==
nullptr) {
2007 TO_ROW_IT row_it = block->
get_rows();
2009 row_it.move_to_first();
2010 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2011 if (row_it.data()->blob_list()->empty()) {
2012 delete row_it.extract();
2018#ifndef GRAPHICS_DISABLED
2021 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2022 row_it.data()->baseline.plot(
to_win, colour);
2030 make_old_baselines(block, testing_on, gradient);
2032#ifndef GRAPHICS_DISABLED
2035 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2036 row_it.data()->baseline.plot(
to_win, colour);
2059 auto *xstarts =
new int32_t[row->
blob_list()->length() + 1];
2064 xstarts[1] = xstarts[segments];
2066 coeffs =
new double[3];
2068 coeffs[1] = row->
line_m();
2069 coeffs[2] = row->
line_c();
2100 BLOBNBOX_IT new_it = blob_it;
2103 needs_curve =
false;
2105 xstarts[0] = box.
left();
2109 tprintf(
"Segmenting baseline of %d blobs at (%d,%d)\n", blobcount, box.
left(), box.
bottom());
2112 blob_it.move_to_last();
2113 box = blob_it.data()->bounding_box();
2114 xstarts[1] = box.
right();
2118 new_it.mark_cycle_pt();
2121 middle = (new_box.
left() + new_box.
right()) / 2.0;
2124 yshifts.
add(yshift, blobindex);
2125 if (new_it.cycled_list()) {
2126 xstarts[1] = new_box.
right();
2150 xstarts[segments++] = box.
left();
2156 middle = (new_box.
left() + new_box.
right()) / 2.0;
2158 yshifts.
add(yshift, blobindex);
2161 }
while (!new_it.cycled_list());
2163 xstarts[segments] = new_box.
right();
2165 xstarts[--segments] = new_box.
right();
2168 tprintf(
"Made %d segments on row at (%d,%d)\n", segments, box.
right(), box.
bottom());
2189 int blobs_per_segment;
2194 BLOBNBOX_IT new_it = blob_it;
2200 xstarts[0] = box.
left();
2202 while (!blob_it.at_first()) {
2210 blobs_per_segment = blobcount / segments;
2212 auto *coeffs =
new double[segments * 3];
2215 "Linear splining baseline of %d blobs at (%d,%d), into %d segments of "
2217 blobcount, box.
left(), box.
bottom(), segments, blobs_per_segment);
2220 for (index2 = 0; index2 < blobs_per_segment / 2; index2++) {
2226 blobindex += blobs_per_segment;
2228 while (index1 < blobindex || (segment == segments && index1 < blobcount)) {
2230 int middle = (box.
left() + box.
right()) / 2;
2233 if (index1 == blobindex - blobs_per_segment / 2 || index1 == blobcount - 1) {
2234 xstarts[segment] = box.
left();
2238 coeffs[segment * 3 - 3] = 0;
2239 coeffs[segment * 3 - 2] = b;
2240 coeffs[segment * 3 - 1] = c;
2242 if (segment > segments) {
2246 blobindex += blobs_per_segment;
2248 while (index2 < blobindex || (segment == segments && index2 < blobcount)) {
2250 int middle = (new_box.
left() + new_box.
right()) / 2;
2253 if (index2 == blobindex - blobs_per_segment / 2 || index2 == blobcount - 1) {
2254 xstarts[segment] = new_box.
left();
2258 coeffs[segment * 3 - 3] = 0;
2259 coeffs[segment * 3 - 2] = b;
2260 coeffs[segment * 3 - 1] = c;
2262 }
while (segment <= segments);
2283 float g_length = 1.0f;
2288 float smooth_factor;
2293 TO_ROW *dest_row =
nullptr;
2295 BLOBNBOX_IT blob_it = &block->
blobs;
2296 TO_ROW_IT row_it = block->
get_rows();
2301 if (gradient !=
nullptr) {
2302 g_length = std::sqrt(1 + *gradient * *gradient);
2304#ifndef GRAPHICS_DISABLED
2311 smooth_factor = 1.0;
2313 row_count = row_it.length();
2314 if (!blob_it.empty()) {
2315 left_x = blob_it.data()->bounding_box().left();
2320 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
2321 blob = blob_it.data();
2322 if (gradient !=
nullptr) {
2326 last_x - left_x > block->
line_size * 2 && textord_interpolating_skew) {
2328 block_skew *=
static_cast<float>(blob->
bounding_box().
left() - left_x) / (last_x - left_x);
2334#ifndef GRAPHICS_DISABLED
2339 if (!row_it.empty()) {
2340 for (row_it.move_to_first(); !row_it.at_last() && row_it.data()->min_y() > top;
2343 row = row_it.data();
2344 if (row->
min_y() <= top && row->
max_y() >= bottom) {
2349 if (overlap_result ==
NEW_ROW && !reject_misses) {
2354 if (!make_new_rows) {
2355 near_dist = row_it.data_relative(-1)->min_y() - top;
2357 if (bottom < row->min_y()) {
2364 }
else if (near_dist > 0 && near_dist < bottom - row->max_y()) {
2366 dest_row = row_it.data();
2373 if (top - row->
max_y() <=
2383 if (overlap_result ==
ASSIGN) {
2386 if (overlap_result ==
NEW_ROW) {
2387 if (make_new_rows && top - bottom < block->max_blob_size) {
2388 dest_row =
new TO_ROW(blob_it.extract(), top, bottom, block->
line_size);
2390 if (bottom > row_it.data()->min_y()) {
2391 row_it.add_before_then_move(dest_row);
2394 row_it.add_after_then_move(dest_row);
2396 smooth_factor = 1.0 / (row_count *
textord_skew_lag + textord_skewsmooth_offset);
2401 }
else if (make_new_rows && top - bottom < block->max_blob_size) {
2403 dest_row =
new TO_ROW(blob_it.extract(), top, bottom, block->
line_size);
2405 row_it.add_after_then_move(dest_row);
2406 smooth_factor = 1.0 / (row_count *
textord_skew_lag + textord_skewsmooth_offset2);
2411 if (overlap_result !=
REJECT) {
2412 tprintf(
"Test blob assigned to row at (%g,%g) on pass %d\n", dest_row->
min_y(),
2413 dest_row->
max_y(), pass);
2415 tprintf(
"Test blob assigned to no row on pass %d\n", pass);
2418 if (overlap_result !=
REJECT) {
2419 while (!row_it.at_first() && row_it.data()->min_y() > row_it.data_relative(-1)->min_y()) {
2420 row = row_it.extract();
2422 row_it.add_before_then_move(row);
2424 while (!row_it.at_last() && row_it.data()->min_y() < row_it.data_relative(1)->min_y()) {
2425 row = row_it.extract();
2428 row_it.add_after_then_move(row);
2430 BLOBNBOX_IT added_blob_it(dest_row->
blob_list());
2431 added_blob_it.move_to_last();
2432 TBOX prev_box = added_blob_it.data_relative(-1)->bounding_box();
2434 block_skew = (1 - smooth_factor) * block_skew +
2439 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
2440 if (row_it.data()->blob_list()->empty()) {
2441 delete row_it.extract();
2462 float merge_top, merge_bottom;
2466 BLOBNBOX_IT blob_it;
2469 row = row_it->data();
2470 bestover = top - bottom;
2471 if (top > row->
max_y()) {
2472 bestover -= top - row->
max_y();
2474 if (bottom < row->min_y()) {
2476 bestover -= row->
min_y() - bottom;
2479 tprintf(
"Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f\n", bottom, top, row->
min_y(),
2480 row->
max_y(), rowsize, bestover);
2484 if (!row_it->at_last()) {
2486 test_row = row_it->data();
2487 if (test_row->
min_y() <= top && test_row->
max_y() >= bottom) {
2490 if (merge_top - merge_bottom <= rowsize) {
2495 test_row->
set_limits(merge_bottom, merge_top);
2496 blob_it.set_to_list(test_row->
blob_list());
2497 blob_it.add_list_after(row->
blob_list());
2500 delete row_it->extract();
2504 overlap = top - bottom;
2505 if (top > test_row->
max_y()) {
2506 overlap -= top - test_row->
max_y();
2508 if (bottom < test_row->min_y()) {
2509 overlap -= test_row->
min_y() - bottom;
2511 if (bestover >= rowsize - 1 && overlap >= rowsize - 1) {
2514 if (overlap > bestover) {
2519 tprintf(
"Test blob y=(%g,%g), row=(%f,%f), size=%g, overlap=%f->%f\n", bottom, top,
2520 test_row->
min_y(), test_row->
max_y(), rowsize, overlap, bestover);
2524 }
while (!row_it->at_last() && test_row->
min_y() <= top && test_row->
max_y() >= bottom);
2525 while (row_it->data() != row) {
2529 if (top - bottom - bestover > rowsize * textord_overlap_x &&
2544 const void *item2) {
2567 int num_repeated_sets = 0;
2568 if (!box_it.empty()) {
2571 int repeat_length = 1;
2573 BLOBNBOX_IT test_it(box_it);
2574 for (test_it.forward(); !test_it.at_first();) {
2575 bblob = test_it.data();
2580 bblob = test_it.data();
2589 num_repeated_sets++;
2590 for (; repeat_length > 0; box_it.forward(), --repeat_length) {
2591 bblob = box_it.data();
2598 }
while (!box_it.at_first());
#define BOOL_VAR(name, val, comment)
#define INT_VAR(name, val, comment)
#define double_VAR(name, val, comment)
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first, const UnicodeText::const_iterator &last)
bool textord_old_baselines
C_BLOB * crotate_cblob(C_BLOB *blob, FCOORD rotation)
void expand_rows(ICOORD page_tr, TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
bool textord_show_final_rows
void delete_non_dropout_rows(TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
void get_min_max_xheight(int block_linesize, int *min_height, int *max_height)
double textord_xheight_error_margin
void make_baseline_spline(TO_ROW *row, TO_BLOCK *block)
void fit_parallel_lms(float gradient, TO_ROW *row)
int textord_spline_minblobs
void pre_associate_blobs(ICOORD page_tr, TO_BLOCK *block, FCOORD rotation, bool testing_on)
void cleanup_rows_making(ICOORD page_tr, TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
void compute_row_stats(TO_BLOCK *block, bool testing_on)
int32_t compute_height_modes(STATS *heights, int32_t min_height, int32_t max_height, int32_t *modes, int32_t maxmodes)
double textord_chop_width
bool textord_show_initial_rows
bool textord_show_expanded_rows
double textord_ascheight_mode_fraction
double textord_spline_shift_fraction
double textord_descx_ratio_min
void tprintf(const char *format,...)
bool textord_debug_xheights
void adjust_row_limits(TO_BLOCK *block)
int textord_spline_medianwin
double textord_excess_blobsize
void mark_repeated_chars(TO_ROW *row)
bool test_underline(bool testing_on, C_BLOB *blob, int16_t baseline, int16_t xheight)
void assign_blobs_to_rows(TO_BLOCK *block, float *gradient, int pass, bool reject_misses, bool make_new_rows, bool drawing_skew)
void fill_heights(TO_ROW *row, float gradient, int min_height, int max_height, STATS *heights, STATS *floating_heights)
bool textord_parallel_baselines
void make_initial_textrows(ICOORD page_tr, TO_BLOCK *block, FCOORD rotation, bool testing_on)
void fit_lms_line(TO_ROW *row)
double textord_min_blob_height_fraction
bool textord_show_parallel_rows
bool textord_fix_makerow_bug
void vigorous_noise_removal(TO_BLOCK *block)
void compute_page_skew(TO_BLOCK_LIST *blocks, float &page_m, float &page_err)
double textord_occupancy_threshold
double textord_underline_width
double * linear_spline_baseline(TO_ROW *row, TO_BLOCK *block, int32_t &segments, int32_t xstarts[])
int textord_lms_line_trials
double textord_ascx_ratio_max
bool within_error_margin(float test, float num, float margin)
double textord_ascx_ratio_min
TBOX deskew_block_coords(TO_BLOCK *block, float gradient)
void plot_to_row(TO_ROW *row, ScrollView::Color colour, FCOORD rotation)
float make_rows(ICOORD page_tr, TO_BLOCK_LIST *port_blocks)
void plot_parallel_row(TO_ROW *row, float gradient, int32_t left, ScrollView::Color colour, FCOORD rotation)
const int kMinLeaderCount
bool textord_test_landscape
bool textord_new_initial_xheight
void separate_underlines(TO_BLOCK *block, float gradient, FCOORD rotation, bool testing_on)
void fit_parallel_rows(TO_BLOCK *block, float gradient, FCOORD rotation, int32_t block_edge, bool testing_on)
double textord_linespace_iqrlimit
int compute_xheight_from_modes(STATS *heights, STATS *floating_heights, bool cap_only, int min_height, int max_height, float *xheight, float *ascrise)
double textord_min_linesize
ScrollView * create_to_win(ICOORD page_tr)
void correct_row_xheight(TO_ROW *row, float xheight, float ascrise, float descdrop)
int textord_min_blobs_in_row
OVERLAP_STATE most_overlapping_row(TO_ROW_IT *row_it, TO_ROW *&best_row, float top, float bottom, float rowsize, bool testing_blob)
double textord_width_limit
int32_t compute_row_descdrop(TO_ROW *row, float gradient, int xheight_blob_count, STATS *asc_heights)
double textord_xheight_mode_fraction
bool textord_fix_xheight_bug
ROW_CATEGORY get_row_category(const TO_ROW *row)
int blob_x_order(const void *item1, const void *item2)
void compute_occupation_threshold(int32_t low_window, int32_t high_window, int32_t line_count, int32_t *occupation, int32_t *thresholds)
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
bool textord_show_final_blobs
float make_single_row(ICOORD page_tr, bool allow_sub_blobs, TO_BLOCK *block, TO_BLOCK_LIST *blocks)
void compute_dropout_distances(int32_t *occupation, int32_t *thresholds, int32_t line_count)
bool textord_straight_baselines
void compute_line_occupation(TO_BLOCK *block, float gradient, int32_t min_y, int32_t max_y, int32_t *occupation, int32_t *deltas)
bool segment_baseline(TO_ROW *row, TO_BLOCK *block, int32_t &segments, int32_t *xstarts)
double textord_descx_ratio_max
bool find_best_dropout_row(TO_ROW *row, int32_t distance, float dist_limit, int32_t line_index, TO_ROW_IT *row_it, bool testing_on)
void draw_occupation(int32_t xleft, int32_t ybottom, int32_t min_y, int32_t max_y, int32_t occupation[], int32_t thresholds[])
const TBOX & bounding_box() const
void set_repeated_set(int set_id)
void merge(BLOBNBOX *nextblob)
BlobTextFlowType flow() const
void chop(BLOBNBOX_IT *start_it, BLOBNBOX_IT *blob_it, FCOORD rotation, float xheight)
bool joined_to_prev() const
bool rep_chars_marked() const
void add_blob(BLOBNBOX *blob, float top, float bottom, float row_size)
float believability() const
void set_num_repeated_sets(int num_sets)
float initial_min_y() const
BLOBNBOX_LIST * blob_list()
void set_line(float new_m, float new_c, float new_error)
int num_repeated_sets() const
void set_limits(float new_min, float new_max)
void set_parallel_line(float gradient, float new_c, float new_error)
BLOBNBOX_LIST small_blobs
BLOBNBOX_LIST large_blobs
BLOBNBOX_LIST noise_blobs
static const double kXHeightCapRatio
static const double kXHeightFraction
static const double kDescenderFraction
static const double kAscenderFraction
int32_t pathlength() const
ICOORD step(int index) const
static C_OUTLINE * deep_copy(const C_OUTLINE *src)
const ICOORD & start_pos() const
void Add(const ICOORD &pt)
double ConstrainedFit(const FCOORD &direction, double min_dist, double max_dist, bool debug, ICOORD *line_pt)
double Fit(ICOORD *pt1, ICOORD *pt2)
FCOORD classify_rotation() const
PDBLK pdblk
Page Description Block.
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
TDimension y() const
access_function
TDimension x() const
access function
void rotate(const FCOORD vec)
TDimension height() const
void rotate(const FCOORD &vec)
bool major_x_overlap(const TBOX &box) const
TDimension bottom() const
bool contains(const FCOORD pt) const
void add(int32_t value, int32_t count)
int32_t pile_count(int32_t value) const
int32_t get_total() const
int32_t min_bucket() const
int32_t max_bucket() const
static C_BLOB * FakeBlob(const TBOX &box)
C_OUTLINE_LIST * out_list()
TBOX bounding_box() const
void add(float value, int32_t key)
void compute_row_xheight(TO_ROW *row, const FCOORD &rotation, float gradient, int block_line_size)
void make_spline_rows(TO_BLOCK *block, float gradient, bool testing_on)
void compute_block_xheight(TO_BLOCK *block, float gradient)
void Rectangle(int x1, int y1, int x2, int y2)
void SetCursor(int x, int y)
void DrawTo(int x, int y)