tesseract v5.3.3.20231005
tordmain.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: tordmain.cpp (Formerly textordp.c)
3 * Description: C++ top level textord code.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#define _USE_MATH_DEFINES // for M_PI
20
21#ifdef HAVE_CONFIG_H
22# include "config_auto.h"
23#endif
24
25#include "tordmain.h"
26
27#include "arrayaccess.h" // for GET_DATA_BYTE
28#include "blobbox.h" // for BLOBNBOX_IT, BLOBNBOX, TO_BLOCK, TO_B...
29#include "ccstruct.h" // for CCStruct, CCStruct::kXHeightFraction
30#include "clst.h" // for CLISTIZE
31#include "coutln.h" // for C_OUTLINE_IT, C_OUTLINE_LIST, C_OUTLINE
32#include "drawtord.h" // for plot_box_list, to_win, create_to_win
33#include "edgblob.h" // for extract_edges
34#include "errcode.h" // for ASSERT_HOST, ...
35#include "makerow.h" // for textord_test_x, textord_test_y, texto...
36#include "ocrblock.h" // for BLOCK_IT, BLOCK, BLOCK_LIST (ptr only)
37#include "ocrrow.h" // for ROW, ROW_IT, ROW_LIST, tweak_row_base...
38#include "params.h" // for DoubleParam, BoolParam, IntParam
39#include "pdblock.h" // for PDBLK
40#include "points.h" // for FCOORD, ICOORD
41#include "polyblk.h" // for POLY_BLOCK
42#include "quadratc.h" // for QUAD_COEFFS
43#include "quspline.h" // for QSPLINE, tweak_row_baseline
44#include "rect.h" // for TBOX
45#include "scrollview.h" // for ScrollView, ScrollView::WHITE
46#include "statistc.h" // for STATS
47#include "stepblob.h" // for C_BLOB_IT, C_BLOB, C_BLOB_LIST
48#include "textord.h" // for Textord, WordWithBox, WordGrid, WordS...
49#include "tprintf.h" // for tprintf
50#include "werd.h" // for WERD_IT, WERD, WERD_LIST, W_DONT_CHOP
51
52#include <allheaders.h> // for pixDestroy, pixGetHeight, boxCreate
53
54#include <cfloat> // for FLT_MAX
55#include <cmath> // for ceil, floor, M_PI
56#include <cstdint> // for INT16_MAX, uint32_t, int32_t, int16_t
57#include <memory>
58
59namespace tesseract {
60
61#define MAX_NEAREST_DIST 600 // for block skew stats
62
63/**********************************************************************
64 * SetBlobStrokeWidth
65 *
66 * Set the horizontal and vertical stroke widths in the blob.
67 **********************************************************************/
// NOTE(review): the function's signature line is absent from this listing
// (the embedded numbering jumps from 67 to 69). `pix` and `blob` used below
// are presumably its parameters -- confirm against the original file.
//
// Outline of the visible body:
//   1. Clip the blob's bounding rectangle out of `pix` and run
//      pixDistanceFunction on the clipped image.
//   2. Walk every row, then every column, of the distance image looking for
//      local maxima and histogram the stroke width each maximum implies.
//   3. Store the median (ile(0.5f)) of each histogram in the blob, or zero
//      where there are too few samples.
69 // Cut the blob rectangle into a Pix.
70 int pix_height = pixGetHeight(pix);
71 const TBOX &box = blob->bounding_box();
72 int width = box.width();
73 int height = box.height();
// The y argument is pix_height - box.top(); presumably this converts the
// blob's coordinates to the image's row numbering -- TODO confirm.
74 Box *blob_pix_box = boxCreate(box.left(), pix_height - box.top(), width, height);
75 Image pix_blob = pixClipRectangle(pix, blob_pix_box, nullptr);
76 boxDestroy(&blob_pix_box);
// NOTE(review): neither pix_blob nor dist_pix is checked before use; confirm
// whether the Leptonica calls can fail for the inputs reaching this point.
77 Image dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
78 pix_blob.destroy();
79 // Compute the stroke widths.
// The distance image is read one byte per pixel (GET_DATA_BYTE) with a row
// pitch of wpl 32-bit words.
80 uint32_t *data = pixGetData(dist_pix);
81 int wpl = pixGetWpl(dist_pix);
82 // Horizontal width of stroke.
83 STATS h_stats(0, width);
84 for (int y = 0; y < height; ++y) {
85 uint32_t *pixels = data + y * wpl;
// Sliding window over the row: `pixel` is the value at column x - 1, with
// prev_pixel / next_pixel its left / right neighbours. prev_pixel starts at 0
// for the first column.
86 int prev_pixel = 0;
87 int pixel = GET_DATA_BYTE(pixels, 0);
88 for (int x = 1; x < width; ++x) {
89 int next_pixel = GET_DATA_BYTE(pixels, x);
90 // We are looking for a pixel that is equal to its vertical neighbours,
91 // yet greater than its left neighbour.
// The vertical-neighbour tests are skipped on the first and last rows, where
// the row above / below does not exist.
92 if (prev_pixel < pixel && (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
93 (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
94 if (pixel > next_pixel) {
95 // Single local max, so an odd width.
96 h_stats.add(pixel * 2 - 1, 1);
97 } else if (pixel == next_pixel && x + 1 < width && pixel > GET_DATA_BYTE(pixels, x + 1)) {
98 // Double local max, so an even width.
99 h_stats.add(pixel * 2, 1);
100 }
101 }
102 prev_pixel = pixel;
103 pixel = next_pixel;
104 }
105 }
106 // Vertical width of stroke.
107 STATS v_stats(0, height);
108 for (int x = 0; x < width; ++x) {
// Same sliding window as above, but down a column: `pixel` is the value at
// row y - 1 of column x.
109 int prev_pixel = 0;
110 int pixel = GET_DATA_BYTE(data, x);
111 for (int y = 1; y < height; ++y) {
112 uint32_t *pixels = data + y * wpl;
113 int next_pixel = GET_DATA_BYTE(pixels, x);
114 // We are looking for a pixel that is equal to its horizontal neighbours,
115 // yet greater than its upper neighbour.
// `pixels` points at row y, so `pixels - wpl` is row y - 1, i.e. the row that
// holds `pixel`; the x - 1 / x + 1 reads are therefore its left and right
// neighbours.
116 if (prev_pixel < pixel && (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
117 (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
118 if (pixel > next_pixel) {
119 // Single local max, so an odd width.
120 v_stats.add(pixel * 2 - 1, 1);
121 } else if (pixel == next_pixel && y + 1 < height &&
122 pixel > GET_DATA_BYTE(pixels + wpl, x)) {
123 // Double local max, so an even width.
124 v_stats.add(pixel * 2, 1);
125 }
126 }
127 prev_pixel = pixel;
128 pixel = next_pixel;
129 }
130 }
131 dist_pix.destroy();
132 // Store the horizontal and vertical width in the blob, keeping both
133 // widths if there is enough information, otherwise only the one with
134 // the most samples.
135 // If there are insufficient samples, store zero, rather than using
136 // 2*area/perimeter, as the numbers that gives do not match the numbers
137 // from the distance method.
// "Enough" samples means at least (width + height) / 4 entries in a histogram.
138 if (h_stats.get_total() >= (width + height) / 4) {
139 blob->set_horz_stroke_width(h_stats.ile(0.5f));
140 if (v_stats.get_total() >= (width + height) / 4) {
141 blob->set_vert_stroke_width(v_stats.ile(0.5f));
142 } else {
143 blob->set_vert_stroke_width(0.0f);
144 }
145 } else {
// Too few horizontal samples: use the vertical width alone when it has enough
// samples or simply more samples than the horizontal histogram.
146 if (v_stats.get_total() >= (width + height) / 4 || v_stats.get_total() > h_stats.get_total()) {
147 blob->set_horz_stroke_width(0.0f);
148 blob->set_vert_stroke_width(v_stats.ile(0.5f));
149 } else {
// Otherwise fall back to the horizontal median, but only with more than two
// samples; the vertical width is zero.
150 blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f) : 0.0f);
151 blob->set_vert_stroke_width(0.0f);
152 }
153 }
154}
155
156/**********************************************************************
157 * assign_blobs_to_blocks2
158 *
159 * Make a list of TO_BLOCKs for portrait and landscape orientation.
160 **********************************************************************/
161
// NOTE(review): the first line of the signature is absent from this listing
// (the embedded numbering skips 162). `pix`, which is passed to
// SetBlobStrokeWidth below, is presumably declared there -- confirm against
// the original file.
163 BLOCK_LIST *blocks, // blocks to process
164 TO_BLOCK_LIST *port_blocks) { // output list
// For every BLOCK in `blocks` a new TO_BLOCK is allocated, filled with
// BLOBNBOX wrappers around the block's blobs, and appended to `port_blocks`.
165 BLOCK_IT block_it = blocks;
166 C_BLOB_IT blob_it; // iterator
167 BLOBNBOX_IT port_box_it; // iterator
168 // destination iterator
169 TO_BLOCK_IT port_block_it = port_blocks;
170
171 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
172 auto block = block_it.data();
173 auto port_block = new TO_BLOCK(block);
174
175 // Convert the good outlines to block->blob_list
176 port_box_it.set_to_list(&port_block->blobs);
177 blob_it.set_to_list(block->blob_list());
178 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
// extract() takes the C_BLOB out of the block's list; the new BLOBNBOX is
// flagged as owning it.
179 auto blob = blob_it.extract();
180 auto newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
181 newblob->set_owns_cblob(true);
182 SetBlobStrokeWidth(pix, newblob);
183 port_box_it.add_after_then_move(newblob);
184 }
185
186 // Put the rejected outlines in block->noise_blobs, which allows them to
187 // be reconsidered and sorted back into rows and recover outlines mistakenly
188 // rejected.
// Same conversion as above, but from block->reject_blobs() into
// port_block->noise_blobs.
189 port_box_it.set_to_list(&port_block->noise_blobs);
190 blob_it.set_to_list(block->reject_blobs());
191 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
192 auto blob = blob_it.extract();
193 auto newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
194 newblob->set_owns_cblob(true);
195 SetBlobStrokeWidth(pix, newblob);
196 port_box_it.add_after_then_move(newblob);
197 }
198
// The filled TO_BLOCK is handed to the output list.
199 port_block_it.add_after_then_move(port_block);
200 }
201}
202
203/**********************************************************************
204 * find_components
205 *
206 * Find the C_OUTLINEs of the connected components in each block, put them
207 * in C_BLOBs, and filter them by size, putting the different size
208 * grades on different lists in the matching TO_BLOCK in to_blocks.
209 **********************************************************************/
210
211void Textord::find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks) {
212 int width = pixGetWidth(pix);
213 int height = pixGetHeight(pix);
214 if (width > INT16_MAX || height > INT16_MAX) {
215 tprintf("Input image too large! (%d, %d)\n", width, height);
216 return; // Can't handle it.
217 }
218
219 BLOCK_IT block_it(blocks); // iterator
220 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
221 BLOCK *block = block_it.data();
222 if (block->pdblk.poly_block() == nullptr || block->pdblk.poly_block()->IsText()) {
223 extract_edges(pix, block);
224 }
225 }
226
227 assign_blobs_to_blocks2(pix, blocks, to_blocks);
228 ICOORD page_tr(width, height);
229 filter_blobs(page_tr, to_blocks, !textord_test_landscape);
230}
231
232/**********************************************************************
233 * filter_blobs
234 *
235 * Sort the blobs into sizes in all the blocks for later work.
236 **********************************************************************/
237
238void Textord::filter_blobs(ICOORD page_tr, // top right
239 TO_BLOCK_LIST *blocks, // output list
240 bool testing_on) { // for plotting
// For each TO_BLOCK in `blocks`: split its blobs into the noise / small /
// large lists via filter_noise_blobs, store the returned size estimate as
// line_size (forced to 1 when it comes back as 0), then derive line_spacing.
// `page_tr` is only used to create the debug window; `testing_on` gates the
// optional plotting.
241 TO_BLOCK_IT block_it = blocks; // destination iterator
242 TO_BLOCK *block; // created block
243
244#ifndef GRAPHICS_DISABLED
// Start from an empty debug window if one already exists.
245 if (to_win != nullptr) {
246 to_win->Clear();
247 }
248#endif // !GRAPHICS_DISABLED
249
250 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
251 block = block_it.data();
252 block->line_size = filter_noise_blobs(&block->blobs, &block->noise_blobs, &block->small_blobs,
253 &block->large_blobs);
// A zero line size is replaced with 1 before it is used in the product below.
254 if (block->line_size == 0) {
255 block->line_size = 1;
256 }
257 block->line_spacing =
258 block->line_size *
// NOTE(review): the remainder of this expression and the statements that
// follow it are absent from this listing (the embedded numbering skips
// 259-263) -- restore them from the original file.
264
265#ifndef GRAPHICS_DISABLED
266 if (textord_show_blobs && testing_on) {
// The debug window is created on first use.
267 if (to_win == nullptr) {
268 create_to_win(page_tr);
269 }
// NOTE(review): embedded line 270 (presumably the plotting call) is absent
// from this listing.
271 }
272 if (textord_show_boxes && testing_on) {
273 if (to_win == nullptr) {
274 create_to_win(page_tr);
275 }
// NOTE(review): embedded lines 276-279 (presumably the plotting calls) are
// absent from this listing.
280 }
281#endif // !GRAPHICS_DISABLED
282 }
283}
284
285/**********************************************************************
286 * filter_noise_blobs
287 *
288 * Move small blobs to a separate list.
289 **********************************************************************/
290
291float Textord::filter_noise_blobs(BLOBNBOX_LIST *src_list, // original list
292 BLOBNBOX_LIST *noise_list, // noise list
293 BLOBNBOX_LIST *small_list, // small blobs
294 BLOBNBOX_LIST *large_list) { // large blobs
// Redistributes the blobs of src_list over the four lists by size and returns
// a size estimate for the blobs that stay in src_list (the larger of the two
// percentiles computed below).
295 int16_t height; // height of blob
296 int16_t width; // of blob
297 BLOBNBOX *blob; // current blob
298 float initial_x; // first guess
299 BLOBNBOX_IT src_it = src_list; // iterators
300 BLOBNBOX_IT noise_it = noise_list;
301 BLOBNBOX_IT small_it = small_list;
302 BLOBNBOX_IT large_it = large_list;
303 STATS size_stats(0, MAX_NEAREST_DIST - 1);
304 // blob heights
305 float min_y; // size limits
306 float max_y;
307 float max_x;
308 float max_height; // of good blobs
309
// Pass 1: blobs shorter than textord_max_noise_size go to the noise list;
// blobs whose enclosed area is at least textord_noise_area_ratio times their
// bounding-box area go to the small list. Everything else stays in src_list.
310 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
311 blob = src_it.data();
312 if (blob->bounding_box().height() < textord_max_noise_size) {
313 noise_it.add_after_then_move(src_it.extract());
314 } else if (blob->enclosed_area() >= blob->bounding_box().height() *
315 blob->bounding_box().width() *
316 textord_noise_area_ratio) {
317 small_it.add_after_then_move(src_it.extract());
318 }
319 }
// Histogram the heights of what is left and take the textord_initialx_ile
// percentile as the first size guess.
320 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
321 size_stats.add(src_it.data()->bounding_box().height(), 1);
322 }
323 initial_x = size_stats.ile(textord_initialx_ile);
324 max_y = ceil(initial_x *
// NOTE(review): the rest of the max_y expression is absent from this listing
// (the embedded numbering skips 325-327) -- restore it from the original file.
328 min_y = std::floor(initial_x / 2);
329 max_x = ceil(initial_x * textord_width_limit);
// Pass 2: re-examine the small list with the limits just computed. Blobs
// taller than max_y move to the large list, blobs at least min_y tall go back
// to src_list, the rest stay small.
330 small_it.move_to_first();
331 for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
332 height = small_it.data()->bounding_box().height();
333 if (height > max_y) {
334 large_it.add_after_then_move(small_it.extract());
335 } else if (height >= min_y) {
336 src_it.add_after_then_move(small_it.extract());
337 }
338 }
// Pass 3: apply the same limits to src_list and histogram the heights of the
// blobs that survive.
339 size_stats.clear();
340 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
341 height = src_it.data()->bounding_box().height();
342 width = src_it.data()->bounding_box().width();
343 if (height < min_y) {
344 small_it.add_after_then_move(src_it.extract());
345 } else if (height > max_y || width > max_x) {
346 large_it.add_after_then_move(src_it.extract());
347 } else {
348 size_stats.add(height, 1);
349 }
350 }
351 max_height = size_stats.ile(textord_initialasc_ile);
352 // tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
353 // max_y,min_y,initial_x,max_height);
// NOTE(review): embedded line 354 is absent from this listing; it may adjust
// max_height before the comparison below -- confirm against the original.
355 if (max_height > initial_x) {
356 initial_x = max_height;
357 }
358 // tprintf(" ret=%g\n",initial_x);
359 return initial_x;
360}
361
362// Fixes the block so it obeys all the rules:
363// Must have at least one ROW.
364// Must have at least one WERD.
365// WERDs contain a fake blob.
366void Textord::cleanup_nontext_block(BLOCK *block) {
367 // Non-text blocks must contain at least one row.
368 ROW_IT row_it(block->row_list());
369 if (row_it.empty()) {
370 const TBOX &box = block->pdblk.bounding_box();
371 float height = box.height();
372 int32_t xstarts[2] = {box.left(), box.right()};
373 double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
374 ROW *row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f, height / 4.0f, 0, 1);
375 row_it.add_after_then_move(row);
376 }
377 // Each row must contain at least one word.
378 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
379 ROW *row = row_it.data();
380 WERD_IT w_it(row->word_list());
381 if (w_it.empty()) {
382 // Make a fake blob to put in the word.
383 TBOX box = block->row_list()->singleton() ? block->pdblk.bounding_box() : row->bounding_box();
384 C_BLOB *blob = C_BLOB::FakeBlob(box);
385 C_BLOB_LIST blobs;
386 C_BLOB_IT blob_it(&blobs);
387 blob_it.add_after_then_move(blob);
388 WERD *word = new WERD(&blobs, 0, nullptr);
389 w_it.add_after_then_move(word);
390 }
391 // Each word must contain a fake blob.
392 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
393 WERD *word = w_it.data();
394 // Just assert that this is true, as it would be useful to find
395 // out why it isn't.
396 ASSERT_HOST(!word->cblob_list()->empty());
397 }
398 row->recalc_bounding_box();
399 }
400}
401
402/**********************************************************************
403 * cleanup_blocks
404 *
405 * Delete empty blocks, rows from the page.
406 **********************************************************************/
407
408void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) {
409 BLOCK_IT block_it = blocks; // iterator
410 ROW_IT row_it; // row iterator
411
412 int num_rows = 0;
413 int num_rows_all = 0;
414 int num_blocks = 0;
415 int num_blocks_all = 0;
416 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
417 BLOCK *block = block_it.data();
418 if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
419 cleanup_nontext_block(block);
420 continue;
421 }
422 num_rows = 0;
423 num_rows_all = 0;
424 if (clean_noise) {
425 row_it.set_to_list(block->row_list());
426 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
427 ROW *row = row_it.data();
428 ++num_rows_all;
429 clean_small_noise_from_words(row);
430 if ((textord_noise_rejrows && !row->word_list()->empty() && clean_noise_from_row(row)) ||
431 row->word_list()->empty()) {
432 delete row_it.extract(); // lose empty row.
433 } else {
434 if (textord_noise_rejwords) {
435 clean_noise_from_words(row_it.data());
436 }
437 if (textord_blshift_maxshift >= 0) {
438 tweak_row_baseline(row, textord_blshift_maxshift, textord_blshift_xfraction);
439 }
440 ++num_rows;
441 }
442 }
443 }
444 if (block->row_list()->empty()) {
445 delete block_it.extract(); // Lose empty text blocks.
446 } else {
447 ++num_blocks;
448 }
449 ++num_blocks_all;
450 if (textord_noise_debug) {
451 tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all);
452 }
453 }
454 if (textord_noise_debug) {
455 tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all);
456 }
457}
458
459/**********************************************************************
460 * clean_noise_from_row
461 *
462 * Move blobs of words from rows of garbage into the reject blobs list.
463 **********************************************************************/
464
465bool Textord::clean_noise_from_row( // remove empties
466 ROW *row // row to clean
467) {
468 bool testing_on;
469 TBOX blob_box; // bounding box
470 C_BLOB *blob; // current blob
471 C_OUTLINE *outline; // current outline
472 WERD *word; // current word
473 int32_t blob_size; // biggest size
474 int32_t trans_count = 0; // no of transitions
475 int32_t trans_threshold; // noise tolerance
476 int32_t dot_count; // small objects
477 int32_t norm_count; // normal objects
478 int32_t super_norm_count; // real char-like
479 // words of row
480 WERD_IT word_it = row->word_list();
481 C_BLOB_IT blob_it; // blob iterator
482 C_OUTLINE_IT out_it; // outline iterator
483
484 testing_on = textord_test_y > row->base_line(textord_test_x) && textord_show_blobs &&
485 textord_test_y < row->base_line(textord_test_x) + row->x_height();
486 dot_count = 0;
487 norm_count = 0;
488 super_norm_count = 0;
489 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
490 word = word_it.data(); // current word
491 // blobs in word
492 blob_it.set_to_list(word->cblob_list());
493 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
494 blob = blob_it.data();
495 if (!word->flag(W_DONT_CHOP)) {
496 // get outlines
497 out_it.set_to_list(blob->out_list());
498 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
499 outline = out_it.data();
500 blob_box = outline->bounding_box();
501 blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height();
502 if (blob_size < textord_noise_sizelimit * row->x_height()) {
503 dot_count++; // count small outlines
504 }
505 if (!outline->child()->empty() &&
506 blob_box.height() < (1 + textord_noise_syfract) * row->x_height() &&
507 blob_box.height() > (1 - textord_noise_syfract) * row->x_height() &&
508 blob_box.width() < (1 + textord_noise_sxfract) * row->x_height() &&
509 blob_box.width() > (1 - textord_noise_sxfract) * row->x_height()) {
510 super_norm_count++; // count small outlines
511 }
512 }
513 } else {
514 super_norm_count++;
515 }
516 blob_box = blob->bounding_box();
517 blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height();
518 if (blob_size >= textord_noise_sizelimit * row->x_height() &&
519 blob_size < row->x_height() * 2) {
520 trans_threshold = blob_size / textord_noise_sizefraction;
521 trans_count = blob->count_transitions(trans_threshold);
522 if (trans_count < textord_noise_translimit) {
523 norm_count++;
524 }
525 } else if (blob_box.height() > row->x_height() * 2 &&
526 (!word_it.at_first() || !blob_it.at_first())) {
527 dot_count += 2;
528 }
529 if (testing_on) {
530 tprintf("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", blob_box.left(),
531 blob_box.bottom(), blob_box.right(), blob_box.top(), blob->out_list()->length(),
532 trans_count, blob_box.bottom() - row->base_line(blob_box.left()));
533 }
534 }
535 }
536 if (textord_noise_debug) {
537 tprintf("Row ending at (%d,%g):", blob_box.right(), row->base_line(blob_box.right()));
538 tprintf(" R=%g, dc=%d, nc=%d, %s\n",
539 norm_count > 0 ? static_cast<float>(dot_count) / norm_count : 9999, dot_count,
540 norm_count,
541 dot_count > norm_count * textord_noise_normratio && dot_count > 2 ? "REJECTED"
542 : "ACCEPTED");
543 }
544 return super_norm_count < textord_noise_sncount &&
545 dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
546}
547
548/**********************************************************************
549 * clean_noise_from_words
550 *
551 * Move blobs of words from rows of garbage into the reject blobs list.
552 **********************************************************************/
553
554void Textord::clean_noise_from_words( // remove empties
555 ROW *row // row to clean
556) {
557 TBOX blob_box; // bounding box
558 C_BLOB *blob; // current blob
559 C_OUTLINE *outline; // current outline
560 WERD *word; // current word
561 int32_t blob_size; // biggest size
562 int32_t trans_count; // no of transitions
563 int32_t trans_threshold; // noise tolerance
564 int32_t dot_count; // small objects
565 int32_t norm_count; // normal objects
566 int32_t dud_words; // number discarded
567 int32_t ok_words; // number remaining
568 int32_t word_index; // current word
569 // words of row
570 WERD_IT word_it = row->word_list();
571 C_BLOB_IT blob_it; // blob iterator
572 C_OUTLINE_IT out_it; // outline iterator
573
574 ok_words = word_it.length();
575 if (ok_words == 0 || textord_no_rejects) {
576 return;
577 }
578 // was it chucked
579 std::vector<int8_t> word_dud(ok_words);
580 dud_words = 0;
581 ok_words = 0;
582 word_index = 0;
583 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
584 word = word_it.data(); // current word
585 dot_count = 0;
586 norm_count = 0;
587 // blobs in word
588 blob_it.set_to_list(word->cblob_list());
589 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
590 blob = blob_it.data();
591 if (!word->flag(W_DONT_CHOP)) {
592 // get outlines
593 out_it.set_to_list(blob->out_list());
594 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
595 outline = out_it.data();
596 blob_box = outline->bounding_box();
597 blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height();
598 if (blob_size < textord_noise_sizelimit * row->x_height()) {
599 dot_count++; // count small outlines
600 }
601 if (!outline->child()->empty() &&
602 blob_box.height() < (1 + textord_noise_syfract) * row->x_height() &&
603 blob_box.height() > (1 - textord_noise_syfract) * row->x_height() &&
604 blob_box.width() < (1 + textord_noise_sxfract) * row->x_height() &&
605 blob_box.width() > (1 - textord_noise_sxfract) * row->x_height()) {
606 norm_count++; // count small outlines
607 }
608 }
609 } else {
610 norm_count++;
611 }
612 blob_box = blob->bounding_box();
613 blob_size = blob_box.width() > blob_box.height() ? blob_box.width() : blob_box.height();
614 if (blob_size >= textord_noise_sizelimit * row->x_height() &&
615 blob_size < row->x_height() * 2) {
616 trans_threshold = blob_size / textord_noise_sizefraction;
617 trans_count = blob->count_transitions(trans_threshold);
618 if (trans_count < textord_noise_translimit) {
619 norm_count++;
620 }
621 } else if (blob_box.height() > row->x_height() * 2 &&
622 (!word_it.at_first() || !blob_it.at_first())) {
623 dot_count += 2;
624 }
625 }
626 if (dot_count > 2 && !word->flag(W_REP_CHAR)) {
627 if (dot_count > norm_count * textord_noise_normratio * 2) {
628 word_dud[word_index] = 2;
629 } else if (dot_count > norm_count * textord_noise_normratio) {
630 word_dud[word_index] = 1;
631 } else {
632 word_dud[word_index] = 0;
633 }
634 } else {
635 word_dud[word_index] = 0;
636 }
637 if (word_dud[word_index] == 2) {
638 dud_words++;
639 } else {
640 ok_words++;
641 }
642 word_index++;
643 }
644
645 word_index = 0;
646 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
647 if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) {
648 word = word_it.data(); // Current word.
649 // Previously we threw away the entire word.
650 // Now just aggressively throw all small blobs into the reject list, where
651 // the classifier can decide whether they are actually needed.
652 word->CleanNoise(textord_noise_sizelimit * row->x_height());
653 }
654 word_index++;
655 }
656}
657
658// Remove outlines that are a tiny fraction in either width or height
659// of the word height.
660void Textord::clean_small_noise_from_words(ROW *row) {
661 WERD_IT word_it(row->word_list());
662 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
663 WERD *word = word_it.data();
664 int min_size = static_cast<int>(textord_noise_hfract * word->bounding_box().height() + 0.5);
665 C_BLOB_IT blob_it(word->cblob_list());
666 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
667 C_BLOB *blob = blob_it.data();
668 C_OUTLINE_IT out_it(blob->out_list());
669 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
670 C_OUTLINE *outline = out_it.data();
671 outline->RemoveSmallRecursive(min_size, &out_it);
672 }
673 if (blob->out_list()->empty()) {
674 delete blob_it.extract();
675 }
676 }
677 if (word->cblob_list()->empty()) {
678 if (!word_it.at_last()) {
679 // The next word is no longer a fuzzy non space if it was before,
680 // since the word before is about to be deleted.
681 WERD *next_word = word_it.data_relative(1);
682 if (next_word->flag(W_FUZZY_NON)) {
683 next_word->set_flag(W_FUZZY_NON, false);
684 }
685 }
686 delete word_it.extract();
687 }
688 }
689}
690
691// Local struct to hold a group of blocks.
// NOTE(review): the struct header line and the declarations of bounding_box,
// rotation and min_xheight are absent from this listing (the embedded
// numbering skips 692, 702, 704 and 708). Those members are initialized by the
// constructors below -- restore the declarations from the original file.
// The default constructor sets rotation to (1.0f, 0.0f), angle to 0 and
// min_xheight to 1.
693 BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {}
// Starts a group from a single block: the group takes the block's bounding
// box, re_rotation() (and its angle) and x_height(), and records the block.
694 explicit BlockGroup(BLOCK *block)
695 : bounding_box(block->pdblk.bounding_box())
696 , rotation(block->re_rotation())
697 , angle(block->re_rotation().angle())
698 , min_xheight(block->x_height()) {
699 blocks.push_back(block);
700 }
701 // Union of block bounding boxes.
703 // Common rotation of the blocks.
705 // Angle of rotation.
706 float angle;
707 // Min xheight of the blocks.
709 // Collection of borrowed pointers to the blocks in the group.
710 std::vector<BLOCK *> blocks;
711};
712
713// Groups blocks by rotation, then, for each group, makes a WordGrid and calls
714// TransferDiacriticsToWords to copy the diacritic blobs to the most
715// appropriate words in the group of blocks. Source blobs are not touched.
716void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks) {
717 // Angle difference larger than this is too much to consider equal.
718 // They should only be in multiples of M_PI/2 anyway.
719 const double kMaxAngleDiff = 0.01; // About 0.6 degrees.
720 std::vector<std::unique_ptr<BlockGroup>> groups;
721 BLOCK_IT bk_it(blocks);
722 for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) {
723 BLOCK *block = bk_it.data();
724 if (block->pdblk.poly_block() != nullptr && !block->pdblk.poly_block()->IsText()) {
725 continue;
726 }
727 // Linear search of the groups to find a matching rotation.
728 float block_angle = block->re_rotation().angle();
729 int best_g = 0;
730 float best_angle_diff = FLT_MAX;
731 for (const auto &group : groups) {
732 double angle_diff = std::fabs(block_angle - group->angle);
733 if (angle_diff > M_PI) {
734 angle_diff = fabs(angle_diff - 2.0 * M_PI);
735 }
736 if (angle_diff < best_angle_diff) {
737 best_angle_diff = angle_diff;
738 best_g = &group - &groups[0];
739 }
740 }
741 if (best_angle_diff > kMaxAngleDiff) {
742 groups.push_back(std::make_unique<BlockGroup>(block));
743 } else {
744 groups[best_g]->blocks.push_back(block);
745 groups[best_g]->bounding_box += block->pdblk.bounding_box();
746 float x_height = block->x_height();
747 if (x_height < groups[best_g]->min_xheight) {
748 groups[best_g]->min_xheight = x_height;
749 }
750 }
751 }
752 // Now process each group of blocks.
753 std::vector<std::unique_ptr<WordWithBox>> word_ptrs;
754 for (const auto &group : groups) {
755 if (group->bounding_box.null_box()) {
756 continue;
757 }
758 WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(),
759 group->bounding_box.topright());
760 for (auto b : group->blocks) {
761 ROW_IT row_it(b->row_list());
762 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
763 ROW *row = row_it.data();
764 // Put the words of the row into the grid.
765 WERD_IT w_it(row->word_list());
766 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
767 WERD *word = w_it.data();
768 auto box_word = std::make_unique<WordWithBox>(word);
769 word_grid.InsertBBox(true, true, box_word.get());
770 // Save the pointer where it will be auto-deleted.
771 word_ptrs.emplace_back(std::move(box_word));
772 }
773 }
774 }
775 FCOORD rotation = group->rotation;
776 // Make it a forward rotation that will transform blob coords to block.
777 rotation.set_y(-rotation.y());
778 TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid);
779 }
780}
781
782// Places a copy of blobs that are near a word (after applying rotation to the
783// blob) in the most appropriate word, unless there is doubt, in which case a
784// blob can end up in two words. Source blobs are not touched.
785void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs, const FCOORD &rotation,
786 WordGrid *word_grid) {
787 WordSearch ws(word_grid);
788 BLOBNBOX_IT b_it(diacritic_blobs);
789 // Apply rotation to each blob before finding the nearest words. The rotation
790 // allows us to only consider above/below placement and not left/right on
791 // vertical text, because all text is horizontal here.
792 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
793 BLOBNBOX *blobnbox = b_it.data();
794 TBOX blob_box = blobnbox->bounding_box();
795 blob_box.rotate(rotation);
796 ws.StartRectSearch(blob_box);
797 // Above/below refer to word position relative to diacritic. Since some
798 // scripts eg Kannada/Telugu habitually put diacritics below words, and
799 // others eg Thai/Vietnamese/Latin put most diacritics above words, try
800 // for both if there isn't much in it.
801 WordWithBox *best_above_word = nullptr;
802 WordWithBox *best_below_word = nullptr;
803 int best_above_distance = 0;
804 int best_below_distance = 0;
805 for (WordWithBox *word = ws.NextRectSearch(); word != nullptr; word = ws.NextRectSearch()) {
806 if (word->word()->flag(W_REP_CHAR)) {
807 continue;
808 }
809 TBOX word_box = word->true_bounding_box();
810 int x_distance = blob_box.x_gap(word_box);
811 int y_distance = blob_box.y_gap(word_box);
812 if (x_distance > 0) {
813 // Arbitrarily divide x-distance by 2 if there is a major y overlap,
814 // and the word is to the left of the diacritic. If the
815 // diacritic is a dropped broken character between two words, this will
816 // help send all the pieces to a single word, instead of splitting them
817 // over the 2 words.
818 if (word_box.major_y_overlap(blob_box) && blob_box.left() > word_box.right()) {
819 x_distance /= 2;
820 }
821 y_distance += x_distance;
822 }
823 if (word_box.y_middle() > blob_box.y_middle() &&
824 (best_above_word == nullptr || y_distance < best_above_distance)) {
825 best_above_word = word;
826 best_above_distance = y_distance;
827 }
828 if (word_box.y_middle() <= blob_box.y_middle() &&
829 (best_below_word == nullptr || y_distance < best_below_distance)) {
830 best_below_word = word;
831 best_below_distance = y_distance;
832 }
833 }
834 bool above_good = best_above_word != nullptr &&
835 (best_below_word == nullptr ||
836 best_above_distance < best_below_distance + blob_box.height());
837 bool below_good = best_below_word != nullptr && best_below_word != best_above_word &&
838 (best_above_word == nullptr ||
839 best_below_distance < best_above_distance + blob_box.height());
840 if (below_good) {
841 C_BLOB *copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
842 copied_blob->rotate(rotation);
843 // Put the blob into the word's reject blobs list.
844 C_BLOB_IT blob_it(best_below_word->RejBlobs());
845 blob_it.add_to_end(copied_blob);
846 }
847 if (above_good) {
848 C_BLOB *copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
849 copied_blob->rotate(rotation);
850 // Put the blob into the word's reject blobs list.
851 C_BLOB_IT blob_it(best_above_word->RejBlobs());
852 blob_it.add_to_end(copied_blob);
853 }
854 }
855}
856
857/**********************************************************************
858 * tweak_row_baseline
859 *
860 * Shift baseline to fit the blobs more accurately where they are
861 * close enough.
862 **********************************************************************/
863
864void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction) {
865 TBOX blob_box; // bounding box
866 C_BLOB *blob; // current blob
867 WERD *word; // current word
868 int32_t blob_count; // no of blobs
869 int32_t src_index; // source segment
870 int32_t dest_index; // destination segment
871 float ydiff; // baseline error
872 float x_centre; // centre of blob
873 // words of row
874 WERD_IT word_it = row->word_list();
875 C_BLOB_IT blob_it; // blob iterator
876
877 blob_count = 0;
878 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
879 word = word_it.data(); // current word
880 // get total blobs
881 blob_count += word->cblob_list()->length();
882 }
883 if (blob_count == 0) {
884 return;
885 }
886 // spline segments
887 std::vector<int32_t> xstarts(blob_count + row->baseline.segments + 1);
888 // spline coeffs
889 std::vector<double> coeffs((blob_count + row->baseline.segments) * 3);
890
891 src_index = 0;
892 dest_index = 0;
893 xstarts[0] = row->baseline.xcoords[0];
894 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
895 word = word_it.data(); // current word
896 // blobs in word
897 blob_it.set_to_list(word->cblob_list());
898 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
899 blob = blob_it.data();
900 blob_box = blob->bounding_box();
901 x_centre = (blob_box.left() + blob_box.right()) / 2.0;
902 ydiff = blob_box.bottom() - row->base_line(x_centre);
903 if (ydiff < 0) {
904 ydiff = -ydiff / row->x_height();
905 } else {
906 ydiff = ydiff / row->x_height();
907 }
908 if (ydiff < blshift_maxshift && blob_box.height() / row->x_height() > blshift_xfraction) {
909 if (xstarts[dest_index] >= x_centre) {
910 xstarts[dest_index] = blob_box.left();
911 }
912 coeffs[dest_index * 3] = 0;
913 coeffs[dest_index * 3 + 1] = 0;
914 coeffs[dest_index * 3 + 2] = blob_box.bottom();
915 // shift it
916 dest_index++;
917 xstarts[dest_index] = blob_box.right() + 1;
918 } else {
919 if (xstarts[dest_index] <= x_centre) {
920 while (row->baseline.xcoords[src_index + 1] <= x_centre &&
921 src_index < row->baseline.segments - 1) {
922 if (row->baseline.xcoords[src_index + 1] > xstarts[dest_index]) {
923 coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
924 coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
925 coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
926 dest_index++;
927 xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
928 }
929 src_index++;
930 }
931 coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
932 coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
933 coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
934 dest_index++;
935 xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
936 }
937 }
938 }
939 }
940 while (src_index < row->baseline.segments &&
941 row->baseline.xcoords[src_index + 1] <= xstarts[dest_index]) {
942 src_index++;
943 }
944 while (src_index < row->baseline.segments) {
945 coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
946 coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
947 coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
948 dest_index++;
949 src_index++;
950 xstarts[dest_index] = row->baseline.xcoords[src_index];
951 }
952 // turn to spline
953 row->baseline = QSPLINE(dest_index, &xstarts[0], &coeffs[0]);
954}
955
956} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:54
#define MAX_NEAREST_DIST
Definition: tordmain.cpp:61
@ TBOX
const double y
@ W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:39
@ W_REP_CHAR
repeated character
Definition: werd.h:40
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
BBGrid< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > WordGrid
Definition: textord.h:73
int textord_test_y
Definition: makerow.cpp:65
int textord_test_x
Definition: makerow.cpp:64
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
double textord_excess_blobsize
Definition: makerow.cpp:81
@ baseline
Definition: mfoutline.h:53
ScrollView * to_win
Definition: drawtord.cpp:37
void plot_box_list(ScrollView *win, BLOBNBOX_LIST *list, ScrollView::Color body_colour)
Definition: drawtord.cpp:69
void SetBlobStrokeWidth(Image pix, BLOBNBOX *blob)
Definition: tordmain.cpp:68
bool textord_test_landscape
Definition: makerow.cpp:52
double textord_min_linesize
Definition: makerow.cpp:80
ScrollView * create_to_win(ICOORD page_tr)
Definition: drawtord.cpp:47
double textord_width_limit
Definition: makerow.cpp:75
void assign_blobs_to_blocks2(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks)
Definition: tordmain.cpp:162
void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction)
Definition: tordmain.cpp:864
void extract_edges(Image pix, BLOCK *block)
Definition: edgblob.cpp:347
GridSearch< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > WordSearch
Definition: textord.h:74
group
Definition: upload.py:412
const TBOX & bounding_box() const
Definition: blobbox.h:239
int32_t enclosed_area() const
Definition: blobbox.h:262
void set_horz_stroke_width(float width)
Definition: blobbox.h:355
void set_vert_stroke_width(float width)
Definition: blobbox.h:361
BLOBNBOX_LIST blobs
Definition: blobbox.h:776
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:779
void plot_graded_blobs(ScrollView *to_win)
Definition: blobbox.cpp:1058
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:780
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:778
static const double kXHeightCapRatio
Definition: ccstruct.h:35
static const double kXHeightFraction
Definition: ccstruct.h:32
static const double kDescenderFraction
Definition: ccstruct.h:31
static const double kAscenderFraction
Definition: ccstruct.h:33
void destroy()
Definition: image.cpp:32
FCOORD re_rotation() const
Definition: ocrblock.h:129
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
int32_t x_height() const
return xheight
Definition: ocrblock.h:101
WERD_LIST * word_list()
Definition: ocrrow.h:57
float x_height() const
Definition: ocrrow.h:66
float base_line(float xpos) const
Definition: ocrrow.h:61
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:67
integer coordinate
Definition: points.h:36
float angle() const
find angle
Definition: points.h:246
bool IsText() const
Definition: polyblk.h:52
TDimension left() const
Definition: rect.h:82
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
int32_t get_total() const
Definition: statistc.h:85
double ile(double frac) const
Definition: statistc.cpp:172
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:238
TBOX bounding_box() const
Definition: stepblob.cpp:250
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:118
C_BLOB_LIST * cblob_list()
Definition: werd.h:96
void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, bool testing_on)
Definition: tordmain.cpp:238
void find_components(Image pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:211
BlockGroup(BLOCK *block)
Definition: tordmain.cpp:694
std::vector< BLOCK * > blocks
Definition: tordmain.cpp:710