tesseract v5.3.3.20231005
wordseg.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: wordseg.cpp (Formerly wspace.c)
3 * Description: Code to segment the blobs into words.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1992, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19// Include automatically generated configuration file if running autoconf.
20#ifdef HAVE_CONFIG_H
21# include "config_auto.h"
22#endif
23
24#include "wordseg.h"
25
26#include <cmath>
27
28#include "blobbox.h"
29#include "cjkpitch.h"
30#include "drawtord.h"
31#include "fpchop.h"
32#include "makerow.h"
33#include "pitsync1.h"
34#include "statistc.h"
35#include "textord.h"
36#include "topitch.h"
37#include "tovars.h"
38
39namespace tesseract {
40
41BOOL_VAR(textord_force_make_prop_words, false, "Force proportional word segmentation on all rows");
42BOOL_VAR(textord_chopper_test, false, "Chopper is being tested.");
43
44#define BLOCK_STATS_CLUSTERS 10
45
53void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows) {
54 TO_ROW_IT to_row_it(rows);
55 ROW_IT row_it(real_rows);
56 for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list(); to_row_it.forward()) {
57 TO_ROW *row = to_row_it.data();
58 // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
59 // to create the word.
60 C_BLOB_LIST cblobs;
61 C_BLOB_IT cblob_it(&cblobs);
62 BLOBNBOX_IT box_it(row->blob_list());
63 for (; !box_it.empty(); box_it.forward()) {
64 BLOBNBOX *bblob = box_it.extract();
65 if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
66 auto cblob = bblob->remove_cblob();
67 if (cblob != nullptr) {
68 C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
69 cout_it.move_to_last();
70 cout_it.add_list_after(cblob->out_list());
71 delete cblob;
72 }
73 } else {
74 auto cblob = bblob->remove_cblob();
75 if (cblob != nullptr) {
76 cblob_it.add_after_then_move(cblob);
77 }
78 }
79 delete bblob;
80 }
81 // Convert the TO_ROW to a ROW.
82 ROW *real_row =
83 new ROW(row, static_cast<int16_t>(row->kern_size), static_cast<int16_t>(row->space_size));
84 WERD_IT word_it(real_row->word_list());
85 WERD *word = new WERD(&cblobs, 0, nullptr);
86 word->set_flag(W_BOL, true);
87 word->set_flag(W_EOL, true);
88 word->set_flag(W_DONT_CHOP, one_blob);
89 word_it.add_after_then_move(word);
90 row_it.add_after_then_move(real_row);
91 }
92}
93
100 ICOORD page_tr, // top right
101 float gradient, // page skew
102 BLOCK_LIST *blocks, // block list
103 TO_BLOCK_LIST *port_blocks) { // output list
104 TO_BLOCK_IT block_it; // iterator
105 TO_BLOCK *block; // current block
106
107 if (textord->use_cjk_fp_model()) {
108 compute_fixed_pitch_cjk(page_tr, port_blocks);
109 } else {
110 compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
112 }
113 textord->to_spacing(page_tr, port_blocks);
114 block_it.set_to_list(port_blocks);
115 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
116 block = block_it.data();
117 make_real_words(textord, block, FCOORD(1.0f, 0.0f));
118 }
119}
120
128void set_row_spaces( // find space sizes
129 TO_BLOCK *block, // block to do
130 FCOORD rotation, // for drawing
131 bool testing_on // correct orientation
132) {
133 TO_ROW *row; // current row
134 TO_ROW_IT row_it = block->get_rows();
135
136 if (row_it.empty()) {
137 return; // empty block
138 }
139 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
140 row = row_it.data();
141 if (row->fixed_pitch == 0) {
142 row->min_space = static_cast<int32_t>(
143 ceil(row->pr_space - (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
144 row->max_nonspace = static_cast<int32_t>(
145 floor(row->pr_nonsp + (row->pr_space - row->pr_nonsp) * textord_words_definite_spread));
146 if (testing_on && textord_show_initial_words) {
147 tprintf("Assigning defaults %d non, %d space to row at %g\n", row->max_nonspace,
148 row->min_space, row->intercept());
149 }
150 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
151 row->space_size = row->pr_space;
152 row->kern_size = row->pr_nonsp;
153 }
154#ifndef GRAPHICS_DISABLED
155 if (textord_show_initial_words && testing_on) {
156 plot_word_decisions(to_win, static_cast<int16_t>(row->fixed_pitch), row);
157 }
158#endif
159 }
160}
161
168int32_t row_words( // compute space size
169 TO_BLOCK *block, // block it came from
170 TO_ROW *row, // row to operate on
171 int32_t maxwidth, // max expected space size
172 FCOORD rotation, // for drawing
173 bool testing_on // for debug
174) {
175 bool testing_row; // contains testpt
176 bool prev_valid; // if decent size
177 int32_t prev_x; // end of prev blob
178 int32_t cluster_count; // no of clusters
179 int32_t gap_index; // which cluster
180 int32_t smooth_factor; // for smoothing stats
181 BLOBNBOX *blob; // current blob
182 float lower, upper; // clustering parameters
183 float gaps[3]; // gap clusers
184 ICOORD testpt;
185 TBOX blob_box; // bounding box
186 // iterator
187 BLOBNBOX_IT blob_it = row->blob_list();
188 STATS gap_stats(0, maxwidth - 1);
189 STATS cluster_stats[4]; // clusters
190
192 smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
193 // if (testing_on)
194 // tprintf("Row smooth factor=%d\n",smooth_factor);
195 prev_valid = false;
196 prev_x = -INT32_MAX;
197 testing_row = false;
198 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
199 blob = blob_it.data();
200 blob_box = blob->bounding_box();
201 if (blob_box.contains(testpt)) {
202 testing_row = true;
203 }
204 gap_stats.add(blob_box.width(), 1);
205 }
206 gap_stats.clear();
207 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
208 blob = blob_it.data();
209 if (!blob->joined_to_prev()) {
210 blob_box = blob->bounding_box();
211 if (prev_valid && blob_box.left() - prev_x < maxwidth) {
212 gap_stats.add(blob_box.left() - prev_x, 1);
213 }
214 prev_valid = true;
215 prev_x = blob_box.right();
216 }
217 }
218 if (gap_stats.get_total() == 0) {
219 row->min_space = 0; // no evidence
220 row->max_nonspace = 0;
221 return 0;
222 }
223 gap_stats.smooth(smooth_factor);
226 cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
227 while (cluster_count < 2 && std::ceil(lower) < std::floor(upper)) {
228 // shrink gap
229 upper = (upper * 3 + lower) / 4;
230 lower = (lower * 3 + upper) / 4;
231 cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop, 3, cluster_stats);
232 }
233 if (cluster_count < 2) {
234 row->min_space = 0; // no evidence
235 row->max_nonspace = 0;
236 return 0;
237 }
238 for (gap_index = 0; gap_index < cluster_count; gap_index++) {
239 gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
240 }
241 // get medians
242 if (cluster_count > 2) {
243 if (testing_on && textord_show_initial_words) {
244 tprintf("Row at %g has 3 sizes of gap:%g,%g,%g\n", row->intercept(),
245 cluster_stats[1].ile(0.5), cluster_stats[2].ile(0.5), cluster_stats[3].ile(0.5));
246 }
247 lower = gaps[0];
248 if (gaps[1] > lower) {
249 upper = gaps[1]; // prefer most frequent
250 if (upper < block->xheight * textord_words_min_minspace && gaps[2] > gaps[1]) {
251 upper = gaps[2];
252 }
253 } else if (gaps[2] > lower && gaps[2] >= block->xheight * textord_words_min_minspace) {
254 upper = gaps[2];
255 } else if (lower >= block->xheight * textord_words_min_minspace) {
256 upper = lower; // not nice
257 lower = gaps[1];
258 if (testing_on && textord_show_initial_words) {
259 tprintf("Had to switch most common from lower to upper!!\n");
260 gap_stats.print();
261 }
262 } else {
263 row->min_space = 0; // no evidence
264 row->max_nonspace = 0;
265 return 0;
266 }
267 } else {
268 if (gaps[1] < gaps[0]) {
269 if (testing_on && textord_show_initial_words) {
270 tprintf("Had to switch most common from lower to upper!!\n");
271 gap_stats.print();
272 }
273 lower = gaps[1];
274 upper = gaps[0];
275 } else {
276 upper = gaps[1];
277 lower = gaps[0];
278 }
279 }
280 if (upper < block->xheight * textord_words_min_minspace) {
281 row->min_space = 0; // no evidence
282 row->max_nonspace = 0;
283 return 0;
284 }
285 if (upper * 3 < block->min_space * 2 + block->max_nonspace ||
286 lower * 3 > block->min_space * 2 + block->max_nonspace) {
287 if (testing_on && textord_show_initial_words) {
288 tprintf("Disagreement between block and row at %g!!\n", row->intercept());
289 tprintf("Lower=%g, upper=%g, Stats:\n", lower, upper);
290 gap_stats.print();
291 }
292 }
293 row->min_space =
294 static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
295 row->max_nonspace =
296 static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
297 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
298 row->space_size = upper;
299 row->kern_size = lower;
300 if (testing_on && textord_show_initial_words) {
301 if (testing_row) {
302 tprintf("GAP STATS\n");
303 gap_stats.print();
304 tprintf("SPACE stats\n");
305 cluster_stats[2].print_summary();
306 tprintf("NONSPACE stats\n");
307 cluster_stats[1].print_summary();
308 }
309 tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
310 upper, row->max_nonspace, lower);
311 }
312 return cluster_stats[2].get_total();
313}
314
321int32_t row_words2( // compute space size
322 TO_BLOCK *block, // block it came from
323 TO_ROW *row, // row to operate on
324 int32_t maxwidth, // max expected space size
325 FCOORD rotation, // for drawing
326 bool testing_on // for debug
327) {
328 bool prev_valid; // if decent size
329 bool this_valid; // current blob big enough
330 int32_t prev_x; // end of prev blob
331 int32_t min_width; // min interesting width
332 int32_t valid_count; // good gaps
333 int32_t total_count; // total gaps
334 int32_t cluster_count; // no of clusters
335 int32_t prev_count; // previous cluster_count
336 int32_t gap_index; // which cluster
337 int32_t smooth_factor; // for smoothing stats
338 BLOBNBOX *blob; // current blob
339 float lower, upper; // clustering parameters
340 ICOORD testpt;
341 TBOX blob_box; // bounding box
342 // iterator
343 BLOBNBOX_IT blob_it = row->blob_list();
344 STATS gap_stats(0, maxwidth - 1);
345 // gap sizes
346 float gaps[BLOCK_STATS_CLUSTERS];
347 STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
348 // clusters
349
351 smooth_factor = static_cast<int32_t>(block->xheight * textord_wordstats_smooth_factor + 1.5);
352 // if (testing_on)
353 // tprintf("Row smooth factor=%d\n",smooth_factor);
354 prev_valid = false;
355 prev_x = -INT16_MAX;
356 const bool testing_row = false;
357 // min blob size
358 min_width = static_cast<int32_t>(block->pr_space);
359 total_count = 0;
360 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
361 blob = blob_it.data();
362 if (!blob->joined_to_prev()) {
363 blob_box = blob->bounding_box();
364 this_valid = blob_box.width() >= min_width;
365 if (this_valid && prev_valid && blob_box.left() - prev_x < maxwidth) {
366 gap_stats.add(blob_box.left() - prev_x, 1);
367 }
368 total_count++; // count possibles
369 prev_x = blob_box.right();
370 prev_valid = this_valid;
371 }
372 }
373 valid_count = gap_stats.get_total();
374 if (valid_count < total_count * textord_words_minlarge) {
375 gap_stats.clear();
376 prev_x = -INT16_MAX;
377 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
378 blob = blob_it.data();
379 if (!blob->joined_to_prev()) {
380 blob_box = blob->bounding_box();
381 if (blob_box.left() - prev_x < maxwidth) {
382 gap_stats.add(blob_box.left() - prev_x, 1);
383 }
384 prev_x = blob_box.right();
385 }
386 }
387 }
388 if (gap_stats.get_total() == 0) {
389 row->min_space = 0; // no evidence
390 row->max_nonspace = 0;
391 return 0;
392 }
393
394 cluster_count = 0;
395 lower = block->xheight * words_initial_lower;
396 upper = block->xheight * words_initial_upper;
397 gap_stats.smooth(smooth_factor);
398 do {
399 prev_count = cluster_count;
400 cluster_count = gap_stats.cluster(lower, upper, textord_spacesize_ratioprop,
401 BLOCK_STATS_CLUSTERS, cluster_stats);
402 } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
403 if (cluster_count < 1) {
404 row->min_space = 0;
405 row->max_nonspace = 0;
406 return 0;
407 }
408 for (gap_index = 0; gap_index < cluster_count; gap_index++) {
409 gaps[gap_index] = cluster_stats[gap_index + 1].ile(0.5);
410 }
411 // get medians
412 if (testing_on) {
413 tprintf("cluster_count=%d:", cluster_count);
414 for (gap_index = 0; gap_index < cluster_count; gap_index++) {
415 tprintf(" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total());
416 }
417 tprintf("\n");
418 }
419
420 // Try to find proportional non-space and space for row.
421 for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace;
422 gap_index++) {
423 ;
424 }
425 if (gap_index < cluster_count) {
426 lower = gaps[gap_index]; // most frequent below
427 } else {
428 if (testing_on) {
429 tprintf("No cluster below block threshold!, using default=%g\n", block->pr_nonsp);
430 }
431 lower = block->pr_nonsp;
432 }
433 for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace;
434 gap_index++) {
435 ;
436 }
437 if (gap_index < cluster_count) {
438 upper = gaps[gap_index]; // most frequent above
439 } else {
440 if (testing_on) {
441 tprintf("No cluster above block threshold!, using default=%g\n", block->pr_space);
442 }
443 upper = block->pr_space;
444 }
445 row->min_space =
446 static_cast<int32_t>(ceil(upper - (upper - lower) * textord_words_definite_spread));
447 row->max_nonspace =
448 static_cast<int32_t>(floor(lower + (upper - lower) * textord_words_definite_spread));
449 row->space_threshold = (row->max_nonspace + row->min_space) / 2;
450 row->space_size = upper;
451 row->kern_size = lower;
452 if (testing_on) {
453 if (testing_row) {
454 tprintf("GAP STATS\n");
455 gap_stats.print();
456 tprintf("SPACE stats\n");
457 cluster_stats[2].print_summary();
458 tprintf("NONSPACE stats\n");
459 cluster_stats[1].print_summary();
460 }
461 tprintf("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept(), row->min_space,
462 upper, row->max_nonspace, lower);
463 }
464 return 1;
465}
466
474 TO_BLOCK *block, // block to do
475 FCOORD rotation // for drawing
476) {
477 TO_ROW *row; // current row
478 TO_ROW_IT row_it = block->get_rows();
479 ROW *real_row = nullptr; // output row
480 ROW_IT real_row_it = block->block->row_list();
481
482 if (row_it.empty()) {
483 return; // empty block
484 }
485 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
486 row = row_it.data();
487 if (row->blob_list()->empty() && !row->rep_words.empty()) {
488 real_row = make_rep_words(row, block);
489 } else if (!row->blob_list()->empty()) {
490 // In a fixed pitch document, some lines may be detected as fixed pitch
491 // while others don't, and will go through different path.
492 // For non-space delimited language like CJK, fixed pitch chop always
493 // leave the entire line as one word. We can force consistent chopping
494 // with force_make_prop_words flag.
495 POLY_BLOCK *pb = block->block->pdblk.poly_block();
497 real_row = textord->make_blob_words(row, rotation);
498 } else if (textord_force_make_prop_words || (pb != nullptr && !pb->IsText()) ||
500 real_row = textord->make_prop_words(row, rotation);
501 } else if (row->pitch_decision == PITCH_DEF_FIXED ||
503 real_row = fixed_pitch_words(row, rotation);
504 } else {
505 ASSERT_HOST(false);
506 }
507 }
508 if (real_row != nullptr) {
509 // put row in block
510 real_row_it.add_after_then_move(real_row);
511 }
512 }
513 block->block->set_stats(block->fixed_pitch == 0, static_cast<int16_t>(block->kern_size),
514 static_cast<int16_t>(block->space_size),
515 static_cast<int16_t>(block->fixed_pitch));
516 block->block->check_pitch();
517}
518
526ROW *make_rep_words( // make a row
527 TO_ROW *row, // row to convert
528 TO_BLOCK *block // block it lives in
529) {
530 ROW *real_row; // output row
531 TBOX word_box; // bounding box
532 // iterator
533 WERD_IT word_it = &row->rep_words;
534
535 if (word_it.empty()) {
536 return nullptr;
537 }
538 word_box = word_it.data()->bounding_box();
539 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
540 word_box += word_it.data()->bounding_box();
541 }
542 row->xheight = block->xheight;
543 real_row =
544 new ROW(row, static_cast<int16_t>(block->kern_size), static_cast<int16_t>(block->space_size));
545 word_it.set_to_list(real_row->word_list());
546 // put words in row
547 word_it.add_list_after(&row->rep_words);
548 real_row->recalc_bounding_box();
549 return real_row;
550}
551
559WERD *make_real_word(BLOBNBOX_IT *box_it, // iterator
560 int32_t blobcount, // no of blobs to use
561 bool bol, // start of line
562 uint8_t blanks // no of blanks
563) {
564 C_OUTLINE_IT cout_it;
565 C_BLOB_LIST cblobs;
566 C_BLOB_IT cblob_it = &cblobs;
567
568 for (int blobindex = 0; blobindex < blobcount; blobindex++) {
569 auto bblob = box_it->extract();
570 if (bblob->joined_to_prev()) {
571 auto cblob = bblob->remove_cblob();
572 if (cblob != nullptr) {
573 cout_it.set_to_list(cblob_it.data()->out_list());
574 cout_it.move_to_last();
575 cout_it.add_list_after(cblob->out_list());
576 delete cblob;
577 }
578 } else {
579 auto cblob = bblob->remove_cblob();
580 if (cblob != nullptr) {
581 cblob_it.add_after_then_move(cblob);
582 }
583 }
584 delete bblob;
585 box_it->forward(); // next one
586 }
587
588 if (blanks < 1) {
589 blanks = 1;
590 }
591
592 auto word = new WERD(&cblobs, blanks, nullptr);
593
594 if (bol) {
595 word->set_flag(W_BOL, true);
596 }
597 if (box_it->at_first()) {
598 word->set_flag(W_EOL, true); // at end of line
599 }
600
601 return word;
602}
603
604} // namespace tesseract
#define BOOL_VAR(name, val, comment)
Definition: params.h:360
#define ASSERT_HOST(x)
Definition: errcode.h:54
#define BLOCK_STATS_CLUSTERS
Definition: wordseg.cpp:44
@ W_BOL
start of line
Definition: werd.h:34
@ W_EOL
end of line
Definition: werd.h:35
@ W_DONT_CHOP
fixed pitch chopped
Definition: werd.h:39
void compute_fixed_pitch(ICOORD page_tr, TO_BLOCK_LIST *port_blocks, float gradient, FCOORD rotation, bool testing_on)
Definition: topitch.cpp:75
double words_initial_upper
Definition: tovars.cpp:47
double textord_wordstats_smooth_factor
Definition: tovars.cpp:31
double words_initial_lower
Definition: tovars.cpp:46
int textord_test_y
Definition: makerow.cpp:65
int textord_test_x
Definition: makerow.cpp:64
bool textord_force_make_prop_words
Definition: wordseg.cpp:41
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
WERD * make_real_word(BLOBNBOX_IT *box_it, int32_t blobcount, bool bol, uint8_t blanks)
Definition: wordseg.cpp:559
ScrollView * to_win
Definition: drawtord.cpp:37
ROW * fixed_pitch_words(TO_ROW *row, FCOORD rotation)
Definition: fpchop.cpp:65
@ PITCH_DEF_FIXED
Definition: blobbox.h:49
@ PITCH_DEF_PROP
Definition: blobbox.h:51
@ PITCH_CORR_FIXED
Definition: blobbox.h:53
@ PITCH_CORR_PROP
Definition: blobbox.h:54
void set_row_spaces(TO_BLOCK *block, FCOORD rotation, bool testing_on)
Definition: wordseg.cpp:128
double textord_words_min_minspace
Definition: tovars.cpp:35
double textord_words_initial_upper
Definition: tovars.cpp:38
void make_real_words(tesseract::Textord *textord, TO_BLOCK *block, FCOORD rotation)
Definition: wordseg.cpp:473
void make_words(tesseract::Textord *textord, ICOORD page_tr, float gradient, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks)
Definition: wordseg.cpp:99
bool textord_chopper_test
Definition: wordseg.cpp:42
bool textord_test_landscape
Definition: makerow.cpp:52
void plot_word_decisions(ScrollView *win, int16_t pitch, TO_ROW *row)
Definition: drawtord.cpp:238
double textord_words_initial_lower
Definition: tovars.cpp:37
double textord_words_definite_spread
Definition: tovars.cpp:51
void compute_fixed_pitch_cjk(ICOORD page_tr, TO_BLOCK_LIST *port_blocks)
Definition: cjkpitch.cpp:1103
void make_single_word(bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows)
Definition: wordseg.cpp:53
bool textord_show_initial_words
Definition: tovars.cpp:25
int32_t row_words2(TO_BLOCK *block, TO_ROW *row, int32_t maxwidth, FCOORD rotation, bool testing_on)
Definition: wordseg.cpp:321
ROW * make_rep_words(TO_ROW *row, TO_BLOCK *block)
Definition: wordseg.cpp:526
double textord_spacesize_ratioprop
Definition: tovars.cpp:52
int32_t row_words(TO_BLOCK *block, TO_ROW *row, int32_t maxwidth, FCOORD rotation, bool testing_on)
Definition: wordseg.cpp:168
double textord_words_minlarge
Definition: tovars.cpp:39
const TBOX & bounding_box() const
Definition: blobbox.h:239
C_BLOB * remove_cblob()
Definition: blobbox.h:280
bool joined_to_prev() const
Definition: blobbox.h:265
int32_t min_space
Definition: blobbox.h:669
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:608
WERD_LIST rep_words
Definition: blobbox.h:674
int32_t max_nonspace
Definition: blobbox.h:670
float space_size
Definition: blobbox.h:673
float fixed_pitch
Definition: blobbox.h:657
int32_t space_threshold
Definition: blobbox.h:671
float intercept() const
Definition: blobbox.h:598
PITCH_TYPE pitch_decision
Definition: blobbox.h:656
TO_ROW_LIST * get_rows()
Definition: blobbox.h:709
int32_t min_space
Definition: blobbox.h:796
int32_t max_nonspace
Definition: blobbox.h:797
void check_pitch()
check proportional
Definition: ocrblock.cpp:164
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:111
void set_stats(bool prop, int16_t kern, int16_t space, int16_t ch_pitch)
Definition: ocrblock.h:56
WERD_LIST * word_list()
Definition: ocrrow.h:57
void recalc_bounding_box()
Definition: ocrrow.cpp:100
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
integer coordinate
Definition: points.h:36
bool IsText() const
Definition: polyblk.h:52
TDimension left() const
Definition: rect.h:82
TDimension width() const
Definition: rect.h:126
TDimension right() const
Definition: rect.h:89
bool contains(const FCOORD pt) const
Definition: rect.h:344
void print_summary() const
Definition: statistc.cpp:572
void print() const
Definition: statistc.cpp:547
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
int32_t get_total() const
Definition: statistc.h:85
int32_t cluster(float lower, float upper, float multiple, int32_t max_clusters, STATS *clusters)
Definition: statistc.cpp:334
void smooth(int32_t factor)
Definition: statistc.cpp:301
double ile(double frac) const
Definition: statistc.cpp:172
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131
bool use_cjk_fp_model() const
Definition: textord.h:98
ROW * make_prop_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:844
void to_spacing(ICOORD page_tr, TO_BLOCK_LIST *blocks)
Definition: tospace.cpp:45
ROW * make_blob_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:1118