All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tospace.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * tospace.cpp
3  *
4  * Compute fuzzy word spacing thresholds for each row.
5  * I.e. set : max_nonspace
6  * space_threshold
7  * min_space
8  * kern_size
9  * space_size
10  * for each row.
11  * ONLY FOR PROPORTIONAL BLOCKS - FIXED PITCH IS ASSUMED ALREADY DONE
12  *
13  * Note: functions in this file were originally not members of any
14  * class or enclosed by any namespace. Now they are all static members
15  * of the Textord class.
16  *
17  **********************************************************************/
18 
19 #include "drawtord.h"
20 #include "ndminx.h"
21 #include "statistc.h"
22 #include "textord.h"
23 #include "tovars.h"
24 
25 // Include automatically generated configuration file if running autoconf.
26 #ifdef HAVE_CONFIG_H
27 #include "config_auto.h"
28 #endif
29 
30 #define MAXSPACING 128 /*max expected spacing in pix */
31 
32 namespace tesseract {
34  ICOORD page_tr, //topright of page
35  TO_BLOCK_LIST *blocks //blocks on page
36  ) {
37  TO_BLOCK_IT block_it; //iterator
38  TO_BLOCK *block; //current block;
39  TO_ROW_IT row_it; //row iterator
40  TO_ROW *row; //current row
41  int block_index; //block number
42  int row_index; //row number
43  //estimated width of real spaces for whole block
44  inT16 block_space_gap_width;
45  //estimated width of non space gaps for whole block
46  inT16 block_non_space_gap_width;
47  BOOL8 old_text_ord_proportional;//old fixed/prop result
48  GAPMAP *gapmap = NULL; //map of big vert gaps in blk
49 
50  block_it.set_to_list (blocks);
51  block_index = 1;
52  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
53  block_it.forward ()) {
54  block = block_it.data ();
55  gapmap = new GAPMAP (block);
56  block_spacing_stats(block,
57  gapmap,
58  old_text_ord_proportional,
59  block_space_gap_width,
60  block_non_space_gap_width);
61  // Make sure relative values of block-level space and non-space gap
62  // widths are reasonable. The ratio of 1:3 is also used in
63  // block_spacing_stats, to correct the block_space_gap_width.
64  // Useful for arabic and hindi, when the non-space gap width is
65  // often over-estimated and should not be trusted. A similar ratio
66  // is found in block_spacing_stats.
68  (float) block_space_gap_width / block_non_space_gap_width < 3.0) {
69  block_non_space_gap_width = (inT16) floor (block_space_gap_width / 3.0);
70  }
71  row_it.set_to_list (block->get_rows ());
72  row_index = 1;
73  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
74  row = row_it.data ();
75  if ((row->pitch_decision == PITCH_DEF_PROP) ||
76  (row->pitch_decision == PITCH_CORR_PROP)) {
77  if ((tosp_debug_level > 0) && !old_text_ord_proportional)
78  tprintf ("Block %d Row %d: Now Proportional\n",
79  block_index, row_index);
80  row_spacing_stats(row,
81  gapmap,
82  block_index,
83  row_index,
84  block_space_gap_width,
85  block_non_space_gap_width);
86  }
87  else {
88  if ((tosp_debug_level > 0) && old_text_ord_proportional)
89  tprintf
90  ("Block %d Row %d: Now Fixed Pitch Decision:%d fp flag:%f\n",
91  block_index, row_index, row->pitch_decision,
92  row->fixed_pitch);
93  }
94 #ifndef GRAPHICS_DISABLED
97 #endif
98  row_index++;
99  }
100  delete gapmap;
101  block_index++;
102  }
103 }
104 
105 
106 /*************************************************************************
107  * block_spacing_stats()
108  *************************************************************************/
109 
110 void Textord::block_spacing_stats(
111  TO_BLOCK *block,
112  GAPMAP *gapmap,
113  BOOL8 &old_text_ord_proportional,
114  inT16 &block_space_gap_width, // resulting estimate
115  inT16 &block_non_space_gap_width // resulting estimate
116  ) {
117  TO_ROW_IT row_it; // row iterator
118  TO_ROW *row; // current row
119  BLOBNBOX_IT blob_it; // iterator
120 
121  STATS centre_to_centre_stats (0, MAXSPACING);
122  // DEBUG USE ONLY
123  STATS all_gap_stats (0, MAXSPACING);
124  STATS space_gap_stats (0, MAXSPACING);
125  inT16 minwidth = MAXSPACING; // narrowest blob
126  TBOX blob_box;
127  TBOX prev_blob_box;
128  inT16 centre_to_centre;
129  inT16 gap_width;
130  float real_space_threshold;
131  float iqr_centre_to_centre; // DEBUG USE ONLY
132  float iqr_all_gap_stats; // DEBUG USE ONLY
133  inT32 end_of_row;
134  inT32 row_length;
135 
136  row_it.set_to_list (block->get_rows ());
137  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
138  row = row_it.data ();
139  if (!row->blob_list ()->empty () &&
141  (row->pitch_decision == PITCH_DEF_PROP) ||
142  (row->pitch_decision == PITCH_CORR_PROP))) {
143  blob_it.set_to_list (row->blob_list ());
144  blob_it.mark_cycle_pt ();
145  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
147  blob_box = box_next_pre_chopped (&blob_it);
148  else if (tosp_stats_use_xht_gaps)
149  blob_box = reduced_box_next (row, &blob_it);
150  else
151  blob_box = box_next (&blob_it);
152  row_length = end_of_row - blob_box.left ();
153  if (blob_box.width () < minwidth)
154  minwidth = blob_box.width ();
155  prev_blob_box = blob_box;
156  while (!blob_it.cycled_list ()) {
158  blob_box = box_next_pre_chopped (&blob_it);
159  else if (tosp_stats_use_xht_gaps)
160  blob_box = reduced_box_next (row, &blob_it);
161  else
162  blob_box = box_next (&blob_it);
163  if (blob_box.width () < minwidth)
164  minwidth = blob_box.width ();
165  gap_width = blob_box.left () - prev_blob_box.right ();
166  if (!ignore_big_gap (row, row_length, gapmap,
167  prev_blob_box.right (), blob_box.left ())) {
168  all_gap_stats.add (gap_width, 1);
169 
170  centre_to_centre = (blob_box.left () + blob_box.right () -
171  (prev_blob_box.left () +
172  prev_blob_box.right ())) / 2;
173  //DEBUG
174  centre_to_centre_stats.add (centre_to_centre, 1);
175  // DEBUG
176  }
177  prev_blob_box = blob_box;
178  }
179  }
180  }
181 
182  //Inadequate samples
183  if (all_gap_stats.get_total () <= 1) {
184  block_non_space_gap_width = minwidth;
185  block_space_gap_width = -1; //No est. space width
186  //DEBUG
187  old_text_ord_proportional = TRUE;
188  }
189  else {
190  /* For debug only ..... */
191  iqr_centre_to_centre = centre_to_centre_stats.ile (0.75) -
192  centre_to_centre_stats.ile (0.25);
193  iqr_all_gap_stats = all_gap_stats.ile (0.75) - all_gap_stats.ile (0.25);
194  old_text_ord_proportional =
195  iqr_centre_to_centre * 2 > iqr_all_gap_stats;
196  /* .......For debug only */
197 
198  /*
199  The median of the gaps is used as an estimate of the NON-SPACE gap width.
200  This RELIES on the assumption that there are more gaps WITHIN words than
201  BETWEEN words in a block
202 
203  Now try to estimate the width of a real space for all real spaces in the
204  block. Do this by using a crude threshold to ignore "narrow" gaps, then
205  find the median of the "wide" gaps and use this.
206  */
207  block_non_space_gap_width = (inT16) floor (all_gap_stats.median ());
208  // median gap
209 
210  row_it.set_to_list (block->get_rows ());
211  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
212  row = row_it.data ();
213  if (!row->blob_list ()->empty () &&
215  (row->pitch_decision == PITCH_DEF_PROP) ||
216  (row->pitch_decision == PITCH_CORR_PROP))) {
217  real_space_threshold =
218  MAX (tosp_init_guess_kn_mult * block_non_space_gap_width,
220  blob_it.set_to_list (row->blob_list ());
221  blob_it.mark_cycle_pt ();
222  end_of_row =
223  blob_it.data_relative (-1)->bounding_box ().right ();
225  blob_box = box_next_pre_chopped (&blob_it);
226  else if (tosp_stats_use_xht_gaps)
227  blob_box = reduced_box_next (row, &blob_it);
228  else
229  blob_box = box_next (&blob_it);
230  row_length = blob_box.left () - end_of_row;
231  prev_blob_box = blob_box;
232  while (!blob_it.cycled_list ()) {
234  blob_box = box_next_pre_chopped (&blob_it);
235  else if (tosp_stats_use_xht_gaps)
236  blob_box = reduced_box_next (row, &blob_it);
237  else
238  blob_box = box_next (&blob_it);
239  gap_width = blob_box.left () - prev_blob_box.right ();
240  if ((gap_width > real_space_threshold) &&
241  !ignore_big_gap (row, row_length, gapmap,
242  prev_blob_box.right (),
243  blob_box.left ())) {
244  /*
245  If tosp_use_cert_spaces is enabled, the estimate of the space gap is
246  restricted to obvious spaces - those wider than half the xht or those
247  with wide blobs on both sides - i.e not things that are suspect 1's or
248  punctuation that is sometimes widely spaced.
249  */
251  (gap_width >
253  ||
254  ((gap_width >
257  || (!narrow_blob (row, prev_blob_box)
258  && !narrow_blob (row, blob_box))))
259  || (wide_blob (row, prev_blob_box)
260  && wide_blob (row, blob_box)))
261  space_gap_stats.add (gap_width, 1);
262  }
263  prev_blob_box = blob_box;
264  }
265  }
266  }
267  //Inadequate samples
268  if (space_gap_stats.get_total () <= 2)
269  block_space_gap_width = -1;//No est. space width
270  else
271  block_space_gap_width =
272  MAX ((inT16) floor (space_gap_stats.median ()),
273  3 * block_non_space_gap_width);
274  }
275 }
276 
277 
278 /*************************************************************************
279  * row_spacing_stats()
280  * Set values for min_space, max_non_space based on row stats only
281  * If failure - return 0 values.
282  *************************************************************************/
283 void Textord::row_spacing_stats(
284  TO_ROW *row,
285  GAPMAP *gapmap,
286  inT16 block_idx,
287  inT16 row_idx,
288  inT16 block_space_gap_width, //estimate for block
289  inT16 block_non_space_gap_width //estimate for block
290  ) {
291  //iterator
292  BLOBNBOX_IT blob_it = row->blob_list ();
293  STATS all_gap_stats (0, MAXSPACING);
294  STATS cert_space_gap_stats (0, MAXSPACING);
295  STATS all_space_gap_stats (0, MAXSPACING);
296  STATS small_gap_stats (0, MAXSPACING);
297  TBOX blob_box;
298  TBOX prev_blob_box;
299  inT16 gap_width;
300  inT16 real_space_threshold = 0;
301  inT16 max = 0;
302  inT16 index;
303  inT16 large_gap_count = 0;
304  BOOL8 suspected_table;
305  inT32 max_max_nonspace; //upper bound
306  BOOL8 good_block_space_estimate = block_space_gap_width > 0;
307  inT32 end_of_row;
308  inT32 row_length = 0;
309  float sane_space;
310  inT32 sane_threshold;
311 
312  /* Collect first pass stats for row */
313 
314  if (!good_block_space_estimate)
315  block_space_gap_width = inT16 (floor (row->xheight / 2));
316  if (!row->blob_list ()->empty ()) {
317  if (tosp_threshold_bias1 > 0)
318  real_space_threshold =
319  block_non_space_gap_width +
320  inT16 (floor (0.5 +
321  tosp_threshold_bias1 * (block_space_gap_width -
322  block_non_space_gap_width)));
323  else
324  real_space_threshold = //Old TO method
325  (block_space_gap_width + block_non_space_gap_width) / 2;
326  blob_it.set_to_list (row->blob_list ());
327  blob_it.mark_cycle_pt ();
328  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
330  blob_box = box_next_pre_chopped (&blob_it);
331  else if (tosp_stats_use_xht_gaps)
332  blob_box = reduced_box_next (row, &blob_it);
333  else
334  blob_box = box_next (&blob_it);
335  row_length = end_of_row - blob_box.left ();
336  prev_blob_box = blob_box;
337  while (!blob_it.cycled_list ()) {
339  blob_box = box_next_pre_chopped (&blob_it);
340  else if (tosp_stats_use_xht_gaps)
341  blob_box = reduced_box_next (row, &blob_it);
342  else
343  blob_box = box_next (&blob_it);
344  gap_width = blob_box.left () - prev_blob_box.right ();
345  if (ignore_big_gap (row, row_length, gapmap,
346  prev_blob_box.right (), blob_box.left ()))
347  large_gap_count++;
348  else {
349  if (gap_width >= real_space_threshold) {
351  (gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
352  ((gap_width > tosp_fuzzy_space_factor1 * row->xheight)
354  || (!narrow_blob (row, prev_blob_box)
355  && !narrow_blob (row, blob_box))))
356  || (wide_blob (row, prev_blob_box)
357  && wide_blob (row, blob_box)))
358  cert_space_gap_stats.add (gap_width, 1);
359  all_space_gap_stats.add (gap_width, 1);
360  }
361  else
362  small_gap_stats.add (gap_width, 1);
363  all_gap_stats.add (gap_width, 1);
364  }
365  prev_blob_box = blob_box;
366  }
367  }
368  suspected_table = (large_gap_count > 1) ||
369  ((large_gap_count > 0) &&
370  (all_gap_stats.get_total () <= tosp_few_samples));
371 
372  /* Now determine row kern size, space size and threshold */
373 
374  if ((cert_space_gap_stats.get_total () >=
376  ((suspected_table ||
377  all_gap_stats.get_total () <= tosp_short_row) &&
378  cert_space_gap_stats.get_total () > 0)) {
379  old_to_method(row,
380  &all_gap_stats,
381  &cert_space_gap_stats,
382  &small_gap_stats,
383  block_space_gap_width,
384  block_non_space_gap_width);
385  } else {
387  !isolated_row_stats (row, gapmap, &all_gap_stats, suspected_table,
388  block_idx, row_idx)) {
390  tprintf ("B:%d R:%d -- Inadequate certain spaces.\n",
391  block_idx, row_idx);
392  if (tosp_row_use_cert_spaces1 && good_block_space_estimate) {
393  //Use block default
394  row->space_size = block_space_gap_width;
395  if (all_gap_stats.get_total () > tosp_redo_kern_limit)
396  row->kern_size = all_gap_stats.median ();
397  else
398  row->kern_size = block_non_space_gap_width;
399  row->space_threshold =
400  inT32 (floor ((row->space_size + row->kern_size) /
402  }
403  else
404  old_to_method(row,
405  &all_gap_stats,
406  &all_space_gap_stats,
407  &small_gap_stats,
408  block_space_gap_width,
409  block_non_space_gap_width);
410  }
411  }
412 
413  if (tosp_improve_thresh && !suspected_table)
414  improve_row_threshold(row, &all_gap_stats);
415 
416  /* Now lets try to be careful not to do anything silly with tables when we
417  are ignoring big gaps*/
418  if (tosp_sanity_method == 0) {
419  if (suspected_table &&
420  (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) {
421  if (tosp_debug_level > 5)
422  tprintf ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f.\n",
423  block_idx, row_idx,
424  row->kern_size, row->space_threshold, row->space_size);
425  row->space_threshold =
427  row->space_size = MAX (row->space_threshold + 1, row->xheight);
428  }
429  }
430  else if (tosp_sanity_method == 1) {
431  sane_space = row->space_size;
432  /* NEVER let space size get too close to kern size */
433  if ((row->space_size < tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5))
434  || ((row->space_size - row->kern_size) <
435  (tosp_silly_kn_sp_gap * row->xheight))) {
436  if (good_block_space_estimate &&
437  (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size))
438  sane_space = block_space_gap_width;
439  else
440  sane_space =
441  MAX (tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5),
442  row->xheight / 2);
443  if (tosp_debug_level > 5)
444  tprintf
445  ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n",
446  block_idx, row_idx, row->kern_size, row->space_threshold,
447  row->space_size, sane_space);
448  row->space_size = sane_space;
449  row->space_threshold =
450  inT32 (floor ((row->space_size + row->kern_size) /
452  }
453  /* NEVER let threshold get VERY far away from kern */
454  sane_threshold = inT32 (floor (tosp_max_sane_kn_thresh *
455  MAX (row->kern_size, 2.5)));
456  if (row->space_threshold > sane_threshold) {
457  if (tosp_debug_level > 5)
458  tprintf ("B:%d R:%d -- DONT BELIEVE THRESH %3.2f %d %3.2f->%d.\n",
459  block_idx, row_idx,
460  row->kern_size,
461  row->space_threshold, row->space_size, sane_threshold);
462  row->space_threshold = sane_threshold;
463  if (row->space_size <= sane_threshold)
464  row->space_size = row->space_threshold + 1.0f;
465  }
466  /* Beware of tables - there may be NO spaces */
467  if (suspected_table) {
468  sane_space = MAX (tosp_table_kn_sp_ratio * row->kern_size,
470  sane_threshold = inT32 (floor ((sane_space + row->kern_size) / 2));
471 
472  if ((row->space_size < sane_space) ||
473  (row->space_threshold < sane_threshold)) {
474  if (tosp_debug_level > 5)
475  tprintf ("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n",
476  block_idx, row_idx,
477  row->kern_size,
478  row->space_threshold, row->space_size);
479  //the minimum sane value
480  row->space_threshold = (inT32) sane_space;
481  row->space_size = MAX (row->space_threshold + 1, row->xheight);
482  }
483  }
484  }
485 
486  /* Now lets try to put some error limits on the threshold */
487 
488  if (tosp_old_to_method) {
489  /* Old textord made a space if gap >= threshold */
490  //NO FUZZY SPACES YET
491  row->max_nonspace = row->space_threshold;
492  //NO FUZZY SPACES YET
493  row->min_space = row->space_threshold + 1;
494  }
495  else {
496  /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */
497  row->min_space =
498  MIN (inT32 (ceil (tosp_fuzzy_space_factor * row->xheight)),
499  inT32 (row->space_size));
500  if (row->min_space <= row->space_threshold)
501  //Dont be silly
502  row->min_space = row->space_threshold + 1;
503  /*
504  Lets try to guess the max certain kern gap by looking at the cluster of
505  kerns for the row. The row is proportional so the kerns should cluster
506  tightly at the bottom of the distribution. We also expect most gaps to be
507  kerns. Find the maximum of the kern piles between 0 and twice the kern
508  estimate. Piles before the first one with less than 1/10 the maximum
509  number of samples can be taken as certain kerns.
510 
511  Of course, there are some cases where the kern peak and space peaks merge,
512  so we will put an UPPER limit on the max certain kern gap of some fraction
513  below the threshold.
514  */
515 
516  max_max_nonspace = inT32 ((row->space_threshold + row->kern_size) / 2);
517 
518  //default
519  row->max_nonspace = max_max_nonspace;
520  for (index = 0; index <= max_max_nonspace; index++) {
521  if (all_gap_stats.pile_count (index) > max)
522  max = all_gap_stats.pile_count (index);
523  if ((index > row->kern_size) &&
524  (all_gap_stats.pile_count (index) < 0.1 * max)) {
525  row->max_nonspace = index;
526  break;
527  }
528  }
529  }
530 
531  /* Yet another algorithm - simpler this time - just choose a fraction of the
532  threshold to space range */
533 
534  if ((tosp_fuzzy_sp_fraction > 0) &&
535  (row->space_size > row->space_threshold))
536  row->min_space = MAX (row->min_space,
537  (inT32) ceil (row->space_threshold +
539  (row->space_size -
540  row->space_threshold)));
541 
542  /* Ensure that ANY space less than some multiplier times the kern size is
543  fuzzy. In tables there is a risk of erroneously setting a small space size
544  when there are no real spaces. Sometimes tables have text squashed into
545  columns so that the kn->sp ratio is small anyway - this means that we cant
546  use this to force a wider separation - hence we rely on context to join any
547  dubious breaks. */
548 
549  if ((tosp_table_fuzzy_kn_sp_ratio > 0) &&
550  (suspected_table || tosp_fuzzy_limit_all))
551  row->min_space = MAX (row->min_space,
553  row->kern_size));
554 
555  if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) {
556  row->max_nonspace = (inT32) floor (0.5 + row->kern_size +
558  (row->space_threshold -
559  row->kern_size));
560  }
561  if (row->max_nonspace > row->space_threshold) {
562  //Dont be silly
563  row->max_nonspace = row->space_threshold;
564  }
565 
566  if (tosp_debug_level > 5)
567  tprintf
568  ("B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) Sp:%3.2f\n",
569  block_idx, row_idx, row_length, block_non_space_gap_width,
570  block_space_gap_width, real_space_threshold, row->kern_size,
571  row->max_nonspace, row->space_threshold, row->min_space,
572  row->space_size);
573  if (tosp_debug_level > 10)
574  tprintf("row->kern_size = %3.2f, row->space_size = %3.2f, "
575  "row->space_threshold = %d\n",
576  row->kern_size, row->space_size, row->space_threshold);
577 }
578 
579 void Textord::old_to_method(
580  TO_ROW *row,
581  STATS *all_gap_stats,
582  STATS *space_gap_stats,
583  STATS *small_gap_stats,
584  inT16 block_space_gap_width, //estimate for block
585  inT16 block_non_space_gap_width //estimate for block
586  ) {
587  /* First, estimate row space size */
588  /* Old to condition was > 2 */
589  if (space_gap_stats->get_total () >= tosp_enough_space_samples_for_median) {
590  //Adequate samples
591  /* Set space size to median of spaces BUT limits it if it seems wildly out */
592  row->space_size = space_gap_stats->median ();
593  if (row->space_size > block_space_gap_width * 1.5) {
595  row->space_size = block_space_gap_width * 1.5;
596  else
597  //BUG??? should be *1.5
598  row->space_size = block_space_gap_width;
599  }
600  if (row->space_size < (block_non_space_gap_width * 2) + 1)
601  row->space_size = (block_non_space_gap_width * 2) + 1;
602  }
603  //Only 1 or 2 samples
604  else if (space_gap_stats->get_total () >= 1) {
605  //hence mean not median
606  row->space_size = space_gap_stats->mean ();
607  if (row->space_size > block_space_gap_width * 1.5) {
609  row->space_size = block_space_gap_width * 1.5;
610  else
611  //BUG??? should be *1.5
612  row->space_size = block_space_gap_width;
613  }
614  if (row->space_size < (block_non_space_gap_width * 3) + 1)
615  row->space_size = (block_non_space_gap_width * 3) + 1;
616  }
617  else {
618  //Use block default
619  row->space_size = block_space_gap_width;
620  }
621 
622  /* Next, estimate row kern size */
624  (small_gap_stats->get_total () > tosp_redo_kern_limit))
625  row->kern_size = small_gap_stats->median ();
626  else if (all_gap_stats->get_total () > tosp_redo_kern_limit)
627  row->kern_size = all_gap_stats->median ();
628  else //old TO -SAME FOR ALL ROWS
629  row->kern_size = block_non_space_gap_width;
630 
631  /* Finally, estimate row space threshold */
632  if (tosp_threshold_bias2 > 0) {
633  row->space_threshold =
634  inT32 (floor (0.5 + row->kern_size +
636  row->kern_size)));
637  } else {
638  /*
639  NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold
640  and holds this in a float. The use is with a >= test
641  NEW textord uses an integer threshold and a > test
642  It comes to the same thing.
643  (Though there is a difference in that old textord has integer space_size
644  and kern_size.)
645  */
646  row->space_threshold =
647  inT32 (floor ((row->space_size + row->kern_size) / 2));
648  }
649 
650  // Apply the same logic and ratios as in row_spacing_stats to
651  // restrict relative values of the row's space_size, kern_size, and
652  // space_threshold
654  ((row->space_size <
655  tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5)) ||
656  ((row->space_size - row->kern_size) <
657  tosp_silly_kn_sp_gap * row->xheight))) {
658  if (row->kern_size > 2.5)
660  row->space_threshold = inT32 (floor ((row->space_size + row->kern_size) /
662  }
663 }
664 
665 
666 /*************************************************************************
667  * isolated_row_stats()
668  * Set values for min_space, max_non_space based on row stats only
669  *************************************************************************/
670 BOOL8 Textord::isolated_row_stats(TO_ROW *row,
671  GAPMAP *gapmap,
672  STATS *all_gap_stats,
673  BOOL8 suspected_table,
674  inT16 block_idx,
675  inT16 row_idx) {
676  float kern_estimate;
677  float crude_threshold_estimate;
678  inT16 small_gaps_count;
679  inT16 total;
680  //iterator
681  BLOBNBOX_IT blob_it = row->blob_list ();
682  STATS cert_space_gap_stats (0, MAXSPACING);
683  STATS all_space_gap_stats (0, MAXSPACING);
684  STATS small_gap_stats (0, MAXSPACING);
685  TBOX blob_box;
686  TBOX prev_blob_box;
687  inT16 gap_width;
688  inT32 end_of_row;
689  inT32 row_length;
690 
691  kern_estimate = all_gap_stats->median ();
692  crude_threshold_estimate = MAX (tosp_init_guess_kn_mult * kern_estimate,
694  small_gaps_count = stats_count_under (all_gap_stats,
695  (inT16)
696  ceil (crude_threshold_estimate));
697  total = all_gap_stats->get_total ();
698 
699  if ((total <= tosp_redo_kern_limit) ||
700  ((small_gaps_count / (float) total) < tosp_enough_small_gaps) ||
701  (total - small_gaps_count < 1)) {
702  if (tosp_debug_level > 5)
703  tprintf ("B:%d R:%d -- Cant do isolated row stats.\n",
704  block_idx, row_idx);
705  return FALSE;
706  }
707  blob_it.set_to_list (row->blob_list ());
708  blob_it.mark_cycle_pt ();
709  end_of_row = blob_it.data_relative (-1)->bounding_box ().right ();
711  blob_box = box_next_pre_chopped (&blob_it);
712  else if (tosp_stats_use_xht_gaps)
713  blob_box = reduced_box_next (row, &blob_it);
714  else
715  blob_box = box_next (&blob_it);
716  row_length = end_of_row - blob_box.left ();
717  prev_blob_box = blob_box;
718  while (!blob_it.cycled_list ()) {
720  blob_box = box_next_pre_chopped (&blob_it);
721  else if (tosp_stats_use_xht_gaps)
722  blob_box = reduced_box_next (row, &blob_it);
723  else
724  blob_box = box_next (&blob_it);
725  gap_width = blob_box.left () - prev_blob_box.right ();
726  if (!ignore_big_gap (row, row_length, gapmap,
727  prev_blob_box.right (), blob_box.left ()) &&
728  (gap_width > crude_threshold_estimate)) {
729  if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) ||
730  ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) &&
732  (!narrow_blob (row, prev_blob_box) &&
733  !narrow_blob (row, blob_box)))) ||
734  (wide_blob (row, prev_blob_box) && wide_blob (row, blob_box)))
735  cert_space_gap_stats.add (gap_width, 1);
736  all_space_gap_stats.add (gap_width, 1);
737  }
738  if (gap_width < crude_threshold_estimate)
739  small_gap_stats.add (gap_width, 1);
740 
741  prev_blob_box = blob_box;
742  }
743  if (cert_space_gap_stats.get_total () >=
745  //median
746  row->space_size = cert_space_gap_stats.median ();
747  else if (suspected_table && (cert_space_gap_stats.get_total () > 0))
748  //to avoid spaced
749  row->space_size = cert_space_gap_stats.mean ();
750  // 1's in tables
751  else if (all_space_gap_stats.get_total () >=
753  //median
754  row->space_size = all_space_gap_stats.median ();
755  else
756  row->space_size = all_space_gap_stats.mean ();
757 
759  row->kern_size = small_gap_stats.median ();
760  else
761  row->kern_size = all_gap_stats->median ();
762  row->space_threshold =
763  inT32 (floor ((row->space_size + row->kern_size) / 2));
764  /* Sanity check */
765  if ((row->kern_size >= row->space_threshold) ||
766  (row->space_threshold >= row->space_size) ||
767  (row->space_threshold <= 0)) {
768  if (tosp_debug_level > 5)
769  tprintf ("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n",
770  block_idx, row_idx,
771  row->kern_size, row->space_threshold, row->space_size);
772  row->kern_size = 0.0f;
773  row->space_threshold = 0;
774  row->space_size = 0.0f;
775  return FALSE;
776  }
777 
778  if (tosp_debug_level > 5)
779  tprintf ("B:%d R:%d -- Isolated row stats: %f %d %f\n",
780  block_idx, row_idx,
781  row->kern_size, row->space_threshold, row->space_size);
782  return TRUE;
783 }
784 
785 inT16 Textord::stats_count_under(STATS *stats, inT16 threshold) {
786  inT16 index;
787  inT16 total = 0;
788 
789  for (index = 0; index < threshold; index++)
790  total += stats->pile_count (index);
791  return total;
792 }
793 
794 
795 /*************************************************************************
796  * improve_row_threshold()
797  * Try to recognise a "normal line" -
798  * > 25 gaps
799  * && space > 3 * kn && space > 10
800  * (I.e. reasonably large space and kn:sp ratio)
801  * && > 3/4 # gaps < kn + (sp - kn)/3
802  * (I.e. most gaps are well away from space estimate)
803  * && a gap of max( 3, (sp - kn)/3 ) empty histogram positions is found
804  * somewhere in the histogram between kn and sp
805  * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies
806  * NO!!!!! the bristol line has "11" with a gap of 12 between the 1's!!!
807  * try moving the default threshold to within this band but leave the
808  * fuzzy limit calculation as at present.
809  *************************************************************************/
810 void Textord::improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) {
811  float sp = row->space_size;
812  float kn = row->kern_size;
813  inT16 reqd_zero_width = 0;
814  inT16 zero_width = 0;
815  inT16 zero_start = 0;
816  inT16 index = 0;
817 
818  if (tosp_debug_level > 10)
819  tprintf ("Improve row threshold 0");
820  if ((all_gap_stats->get_total () <= 25) ||
821  (sp <= 10) ||
822  (sp <= 3 * kn) ||
823  (stats_count_under (all_gap_stats,
824  (inT16) ceil (kn + (sp - kn) / 3 + 0.5)) <
825  (0.75 * all_gap_stats->get_total ())))
826  return;
827  if (tosp_debug_level > 10)
828  tprintf (" 1");
829  /*
830  Look for the first region of all 0's in the histogram which is wider than
831  max( 3, (sp - kn)/3 ) and starts between kn and sp. If found, and current
832  threshold is not within it, move the threshold so that is is just inside it.
833  */
834  reqd_zero_width = (inT16) floor ((sp - kn) / 3 + 0.5);
835  if (reqd_zero_width < 3)
836  reqd_zero_width = 3;
837 
838  for (index = inT16 (ceil (kn)); index < inT16 (floor (sp)); index++) {
839  if (all_gap_stats->pile_count (index) == 0) {
840  if (zero_width == 0)
841  zero_start = index;
842  zero_width++;
843  }
844  else {
845  if (zero_width >= reqd_zero_width)
846  break;
847  else {
848  zero_width = 0;
849  }
850  }
851  }
852  index--;
853  if (tosp_debug_level > 10)
854  tprintf (" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n",
855  reqd_zero_width, zero_width, zero_start, row->space_threshold);
856  if ((zero_width < reqd_zero_width) ||
857  ((row->space_threshold >= zero_start) &&
858  (row->space_threshold <= index)))
859  return;
860  if (tosp_debug_level > 10)
861  tprintf (" 2");
862  if (row->space_threshold < zero_start) {
863  if (tosp_debug_level > 5)
864  tprintf
865  ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
866  kn, sp, zero_start, index, row->space_threshold, zero_start);
867  row->space_threshold = zero_start;
868  }
869  if (row->space_threshold > index) {
870  if (tosp_debug_level > 5)
871  tprintf
872  ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n",
873  kn, sp, zero_start, index, row->space_threshold, index);
874  row->space_threshold = index;
875  }
876 }
877 
878 
879 /**********************************************************************
880  * make_prop_words
881  *
882  * Convert a TO_ROW to a ROW.
883  **********************************************************************/
885  TO_ROW *row, // row to make
886  FCOORD rotation // for drawing
887  ) {
888  BOOL8 bol; // start of line
889  /* prev_ values are for start of word being built. non prev_ values are for
890  the gap between the word being built and the next one. */
891  BOOL8 prev_fuzzy_sp; // probably space
892  BOOL8 prev_fuzzy_non; // probably not
893  uinT8 prev_blanks; // in front of word
894  BOOL8 fuzzy_sp = false; // probably space
895  BOOL8 fuzzy_non = false; // probably not
896  uinT8 blanks = 0; // in front of word
897  BOOL8 prev_gap_was_a_space = FALSE;
898  BOOL8 break_at_next_gap = FALSE;
899  ROW *real_row; // output row
900  C_OUTLINE_IT cout_it;
901  C_BLOB_LIST cblobs;
902  C_BLOB_IT cblob_it = &cblobs;
903  WERD_LIST words;
904  WERD_IT word_it; // new words
905  WERD *word; // new word
906  WERD_IT rep_char_it; // repeated char words
907  inT32 next_rep_char_word_right = MAX_INT32;
908  float repetition_spacing; // gap between repetitions
909  inT32 xstarts[2]; // row ends
910  inT32 prev_x; // end of prev blob
911  BLOBNBOX *bblob; // current blob
912  TBOX blob_box; // bounding box
913  BLOBNBOX_IT box_it; // iterator
914  TBOX prev_blob_box;
915  TBOX next_blob_box;
916  inT16 prev_gap = MAX_INT16;
917  inT16 current_gap = MAX_INT16;
918  inT16 next_gap = MAX_INT16;
919  inT16 prev_within_xht_gap = MAX_INT16;
920  inT16 current_within_xht_gap = MAX_INT16;
921  inT16 next_within_xht_gap = MAX_INT16;
922  inT16 word_count = 0;
923 
924  rep_char_it.set_to_list (&(row->rep_words));
925  if (!rep_char_it.empty ()) {
926  next_rep_char_word_right =
927  rep_char_it.data ()->bounding_box ().right ();
928  }
929 
930  prev_x = -MAX_INT16;
931  cblob_it.set_to_list (&cblobs);
932  box_it.set_to_list (row->blob_list ());
933  word_it.set_to_list (&words);
934  bol = TRUE;
935  prev_blanks = 0;
936  prev_fuzzy_sp = FALSE;
937  prev_fuzzy_non = FALSE;
938  if (!box_it.empty ()) {
939  xstarts[0] = box_it.data ()->bounding_box ().left ();
940  if (xstarts[0] > next_rep_char_word_right) {
941  /* We need to insert a repeated char word at the start of the row */
942  word = rep_char_it.extract ();
943  word_it.add_after_then_move (word);
944  /* Set spaces before repeated char word */
945  word->set_flag (W_BOL, TRUE);
946  bol = FALSE;
947  word->set_blanks (0);
948  //NO uncertainty
949  word->set_flag (W_FUZZY_SP, FALSE);
950  word->set_flag (W_FUZZY_NON, FALSE);
951  xstarts[0] = word->bounding_box ().left ();
952  /* Set spaces after repeated char word (and leave current word set) */
953  repetition_spacing = find_mean_blob_spacing (word);
954  current_gap = box_it.data ()->bounding_box ().left () -
955  next_rep_char_word_right;
956  current_within_xht_gap = current_gap;
957  if (current_gap > tosp_rep_space * repetition_spacing) {
958  prev_blanks = (uinT8) floor (current_gap / row->space_size);
959  if (prev_blanks < 1)
960  prev_blanks = 1;
961  }
962  else
963  prev_blanks = 0;
964  if (tosp_debug_level > 5)
965  tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ",
966  box_it.data ()->bounding_box ().left (),
967  box_it.data ()->bounding_box ().bottom (),
968  repetition_spacing, current_gap);
969  prev_fuzzy_sp = FALSE;
970  prev_fuzzy_non = FALSE;
971  if (rep_char_it.empty ()) {
972  next_rep_char_word_right = MAX_INT32;
973  }
974  else {
975  rep_char_it.forward ();
976  next_rep_char_word_right =
977  rep_char_it.data ()->bounding_box ().right ();
978  }
979  }
980 
981  peek_at_next_gap(row,
982  box_it,
983  next_blob_box,
984  next_gap,
985  next_within_xht_gap);
986  do {
987  bblob = box_it.data ();
988  blob_box = bblob->bounding_box ();
989  if (bblob->joined_to_prev ()) {
990  if (bblob->cblob () != NULL) {
991  cout_it.set_to_list (cblob_it.data ()->out_list ());
992  cout_it.move_to_last ();
993  cout_it.add_list_after (bblob->cblob ()->out_list ());
994  delete bblob->cblob ();
995  }
996  } else {
997  if (bblob->cblob() != NULL)
998  cblob_it.add_after_then_move (bblob->cblob ());
999  prev_x = blob_box.right ();
1000  }
1001  box_it.forward (); //next one
1002  bblob = box_it.data ();
1003  blob_box = bblob->bounding_box ();
1004 
1005  if (!bblob->joined_to_prev() && bblob->cblob() != NULL) {
1006  /* Real Blob - not multiple outlines or pre-chopped */
1007  prev_gap = current_gap;
1008  prev_within_xht_gap = current_within_xht_gap;
1009  prev_blob_box = next_blob_box;
1010  current_gap = next_gap;
1011  current_within_xht_gap = next_within_xht_gap;
1012  peek_at_next_gap(row,
1013  box_it,
1014  next_blob_box,
1015  next_gap,
1016  next_within_xht_gap);
1017 
1018  inT16 prev_gap_arg = prev_gap;
1019  inT16 next_gap_arg = next_gap;
1020  if (tosp_only_use_xht_gaps) {
1021  prev_gap_arg = prev_within_xht_gap;
1022  next_gap_arg = next_within_xht_gap;
1023  }
1024  // Decide if a word-break should be inserted
1025  if (blob_box.left () > next_rep_char_word_right ||
1026  make_a_word_break(row, blob_box, prev_gap_arg, prev_blob_box,
1027  current_gap, current_within_xht_gap,
1028  next_blob_box, next_gap_arg,
1029  blanks, fuzzy_sp, fuzzy_non,
1030  prev_gap_was_a_space,
1031  break_at_next_gap) ||
1032  box_it.at_first()) {
1033  /* Form a new word out of the blobs collected */
1034  word = new WERD (&cblobs, prev_blanks, NULL);
1035  word_count++;
1036  word_it.add_after_then_move (word);
1037  if (bol) {
1038  word->set_flag (W_BOL, TRUE);
1039  bol = FALSE;
1040  }
1041  if (prev_fuzzy_sp)
1042  //probably space
1043  word->set_flag (W_FUZZY_SP, TRUE);
1044  else if (prev_fuzzy_non)
1045  word->set_flag (W_FUZZY_NON, TRUE);
1046  //probably not
1047 
1048  if (blob_box.left () > next_rep_char_word_right) {
1049  /* We need to insert a repeated char word */
1050  word = rep_char_it.extract ();
1051  word_it.add_after_then_move (word);
1052 
1053  /* Set spaces before repeated char word */
1054  repetition_spacing = find_mean_blob_spacing (word);
1055  current_gap = word->bounding_box ().left () - prev_x;
1056  current_within_xht_gap = current_gap;
1057  if (current_gap > tosp_rep_space * repetition_spacing) {
1058  blanks =
1059  (uinT8) floor (current_gap / row->space_size);
1060  if (blanks < 1)
1061  blanks = 1;
1062  }
1063  else
1064  blanks = 0;
1065  if (tosp_debug_level > 5)
1066  tprintf
1067  ("Repch wd (%d,%d) rep gap %5.2f; Lgap:%d (%d blanks);",
1068  word->bounding_box ().left (),
1069  word->bounding_box ().bottom (),
1070  repetition_spacing, current_gap, blanks);
1071  word->set_blanks (blanks);
1072  //NO uncertainty
1073  word->set_flag (W_FUZZY_SP, FALSE);
1074  word->set_flag (W_FUZZY_NON, FALSE);
1075 
1076  /* Set spaces after repeated char word (and leave current word set) */
1077  current_gap =
1078  blob_box.left () - next_rep_char_word_right;
1079  if (current_gap > tosp_rep_space * repetition_spacing) {
1080  blanks = (uinT8) (current_gap / row->space_size);
1081  if (blanks < 1)
1082  blanks = 1;
1083  }
1084  else
1085  blanks = 0;
1086  if (tosp_debug_level > 5)
1087  tprintf (" Rgap:%d (%d blanks)\n",
1088  current_gap, blanks);
1089  fuzzy_sp = FALSE;
1090  fuzzy_non = FALSE;
1091 
1092  if (rep_char_it.empty ()) {
1093  next_rep_char_word_right = MAX_INT32;
1094  }
1095  else {
1096  rep_char_it.forward ();
1097  next_rep_char_word_right =
1098  rep_char_it.data ()->bounding_box ().right ();
1099  }
1100  }
1101 
1102  if (box_it.at_first () && rep_char_it.empty ()) {
1103  //at end of line
1104  word->set_flag (W_EOL, TRUE);
1105  xstarts[1] = prev_x;
1106  }
1107  else {
1108  prev_blanks = blanks;
1109  prev_fuzzy_sp = fuzzy_sp;
1110  prev_fuzzy_non = fuzzy_non;
1111  }
1112  }
1113  }
1114  }
1115  while (!box_it.at_first ()); //until back at start
1116 
1117  /* Insert any further repeated char words */
1118  while (!rep_char_it.empty ()) {
1119  word = rep_char_it.extract ();
1120  word_it.add_after_then_move (word);
1121 
1122  /* Set spaces before repeated char word */
1123  repetition_spacing = find_mean_blob_spacing (word);
1124  current_gap = word->bounding_box ().left () - prev_x;
1125  if (current_gap > tosp_rep_space * repetition_spacing) {
1126  blanks = (uinT8) floor (current_gap / row->space_size);
1127  if (blanks < 1)
1128  blanks = 1;
1129  }
1130  else
1131  blanks = 0;
1132  if (tosp_debug_level > 5)
1133  tprintf
1134  ("Repch wd at EOL (%d,%d). rep spacing %d; Lgap:%d (%d blanks)\n",
1135  word->bounding_box ().left (), word->bounding_box ().bottom (),
1136  repetition_spacing, current_gap, blanks);
1137  word->set_blanks (blanks);
1138  //NO uncertainty
1139  word->set_flag (W_FUZZY_SP, FALSE);
1140  word->set_flag (W_FUZZY_NON, FALSE);
1141  prev_x = word->bounding_box ().right ();
1142  if (rep_char_it.empty ()) {
1143  //at end of line
1144  word->set_flag (W_EOL, TRUE);
1145  xstarts[1] = prev_x;
1146  }
1147  else {
1148  rep_char_it.forward ();
1149  }
1150  }
1151  real_row = new ROW (row,
1152  (inT16) row->kern_size, (inT16) row->space_size);
1153  word_it.set_to_list (real_row->word_list ());
1154  //put words in row
1155  word_it.add_list_after (&words);
1156  real_row->recalc_bounding_box ();
1157 
1158  if (tosp_debug_level > 4) {
1159  tprintf ("Row: Made %d words in row ((%d,%d)(%d,%d))\n",
1160  word_count,
1161  real_row->bounding_box ().left (),
1162  real_row->bounding_box ().bottom (),
1163  real_row->bounding_box ().right (),
1164  real_row->bounding_box ().top ());
1165  }
1166  return real_row;
1167  }
1168  return NULL;
1169 }
1170 
1171 /**********************************************************************
1172  * make_blob_words
1173  *
1174  * Converts words into blobs so that each blob is a single character.
1175  * Used for chopper test.
 *
 * As written, the row's blob list is walked once and a separate WERD
 * (created with 1 blank) is closed every time the next blob is not
 * joined_to_prev and the pending C_BLOB list is non-empty. Blobs flagged
 * joined_to_prev have their outlines appended to the C_BLOB currently
 * being built, after which their own C_BLOB is deleted.
 *
 * Returns a newly allocated ROW built from `row`, or NULL when the row
 * has no blobs. `rotation` is not referenced in the body.
 *
 * NOTE(review): the signature line (listing line 1177) is absent from this
 * listing; the cross-reference index gives it as
 * "ROW * make_blob_words(TO_ROW *row, FCOORD rotation)".
1176  **********************************************************************/
1178  TO_ROW *row, // row to make
1179  FCOORD rotation // for drawing
1180  ) {
1181  bool bol; // start of line
1182  ROW *real_row; // output row
1183  C_OUTLINE_IT cout_it;
1184  C_BLOB_LIST cblobs;
1185  C_BLOB_IT cblob_it = &cblobs;
1186  WERD_LIST words;
1187  WERD_IT word_it; // new words
1188  WERD *word; // new word
1189  BLOBNBOX *bblob; // current blob
1190  TBOX blob_box; // bounding box
1191  BLOBNBOX_IT box_it; // iterator
1192  inT16 word_count = 0;
1193 
1194  cblob_it.set_to_list(&cblobs);
1195  box_it.set_to_list(row->blob_list());
1196  word_it.set_to_list(&words);
1197  bol = TRUE;
1198  if (!box_it.empty()) {
1199 
1200  do {
1201  bblob = box_it.data();
1202  blob_box = bblob->bounding_box();
1203  if (bblob->joined_to_prev()) {
1204  if (bblob->cblob() != NULL) {
 // Append this blob's outlines to the C_BLOB at the current position of
 // cblob_it, then dispose of the donor C_BLOB.
1205  cout_it.set_to_list(cblob_it.data()->out_list());
1206  cout_it.move_to_last();
1207  cout_it.add_list_after(bblob->cblob()->out_list());
1208  delete bblob->cblob();
1209  }
1210  } else {
1211  if (bblob->cblob() != NULL)
1212  cblob_it.add_after_then_move(bblob->cblob());
1213  }
1214  box_it.forward(); // next one
1215  bblob = box_it.data();
1216  blob_box = bblob->bounding_box();
1217 
 // The blob now under the iterator starts something new, so whatever has
 // been collected so far becomes one word.
1218  if (!bblob->joined_to_prev() && !cblobs.empty()) {
1219  word = new WERD(&cblobs, 1, NULL);
1220  word_count++;
1221  word_it.add_after_then_move(word);
1222  if (bol) {
1223  word->set_flag(W_BOL, TRUE);
1224  bol = FALSE;
1225  }
1226  if (box_it.at_first()) { // at end of line
1227  word->set_flag(W_EOL, TRUE);
1228  }
1229  }
1230  }
1231  while (!box_it.at_first()); // until back at start
1232  /* Setup the row with created words. */
1233  real_row = new ROW(row, (inT16) row->kern_size, (inT16) row->space_size);
1234  word_it.set_to_list(real_row->word_list());
1235  //put words in row
1236  word_it.add_list_after(&words);
1237  real_row->recalc_bounding_box();
1238  if (tosp_debug_level > 4) {
1239  tprintf ("Row:Made %d words in row ((%d,%d)(%d,%d))\n",
1240  word_count,
1241  real_row->bounding_box().left(),
1242  real_row->bounding_box().bottom(),
1243  real_row->bounding_box().right(),
1244  real_row->bounding_box().top());
1245  }
1246  return real_row;
1247  }
 // Row had no blobs at all.
1248  return NULL;
1249 }
1250 
// make_a_word_break
//
// Decide whether the gap in front of blob_box should end the word being
// built. Returns non-zero when a break is wanted.
//
// Inputs: the row, the boxes of the previous/current/next blobs and the
// previous/current/next gap widths (the current gap is supplied both as the
// real gap and as the "within xheight" gap).
// Outputs (by reference): blanks, fuzzy_sp and fuzzy_non describe the break;
// per the comment in the body they are only meaningful when the function
// returns TRUE. prev_gap_was_a_space and break_at_next_gap carry state
// between successive calls.
//
// If break_at_next_gap is already set on entry it is cleared and TRUE is
// returned immediately, without touching the other outputs.
//
// NOTE(review): this listing is missing listing lines 1280, 1285, 1354,
// 1383, 1385, 1398, 1419, 1438, 1457 and 1537, so several conditions below
// are incomplete as shown. Recover them from the real source before
// changing any logic here.
1251 BOOL8 Textord::make_a_word_break(
1252  TO_ROW *row, // row being made
1253  TBOX blob_box, // for next_blob // how many blanks?
1254  inT16 prev_gap,
1255  TBOX prev_blob_box,
1256  inT16 real_current_gap,
1257  inT16 within_xht_current_gap,
1258  TBOX next_blob_box,
1259  inT16 next_gap,
1260  uinT8 &blanks,
1261  BOOL8 &fuzzy_sp,
1262  BOOL8 &fuzzy_non,
1263  BOOL8& prev_gap_was_a_space,
1264  BOOL8& break_at_next_gap) {
1265  BOOL8 space;
1266  inT16 current_gap;
1267  float fuzzy_sp_to_kn_limit;
1268 
 // A break was deferred from the previous call (see the punctuation rule
 // below): honour it now.
1269  if (break_at_next_gap) {
1270  break_at_next_gap = FALSE;
1271  return TRUE;
1272  }
1273  /* Inhibit using the reduced gap if
1274  The kerning is large - chars are not kerned and reducing "f"s can cause
1275  erroneous blanks
1276  OR The real gap is less than 0
1277  OR The real gap is less than the kerning estimate
1278  */
 // NOTE(review): listing line 1280 is missing between the next two lines.
1279  if ((row->kern_size > tosp_large_kerning * row->xheight) ||
1281  (real_current_gap < tosp_dont_fool_with_small_kerns * row->kern_size)))
1282  //Ignore the difference
1283  within_xht_current_gap = real_current_gap;
1284 
 // NOTE(review): listing line 1285 (the condition selecting the
 // within-xheight gap) is missing here.
1286  current_gap = within_xht_current_gap;
1287  else
1288  current_gap = real_current_gap;
1289 
1290  if (tosp_old_to_method) {
1291  //Boring old method
1292  space = current_gap > row->max_nonspace;
1293  if (space && (current_gap < MAX_INT16)) {
1294  if (current_gap < row->min_space) {
1295  if (current_gap > row->space_threshold) {
1296  blanks = 1;
1297  fuzzy_sp = TRUE;
1298  fuzzy_non = FALSE;
1299  }
1300  else {
1301  blanks = 0;
1302  fuzzy_sp = FALSE;
1303  fuzzy_non = TRUE;
1304  }
1305  }
1306  else {
1307  blanks = (uinT8) (current_gap / row->space_size);
1308  if (blanks < 1)
1309  blanks = 1;
1310  fuzzy_sp = FALSE;
1311  fuzzy_non = FALSE;
1312  }
1313  }
1314  return space;
1315  }
1316  else {
1317  /* New exciting heuristic method */
1318  if (prev_blob_box.null_box ()) // Beginning of row
1319  prev_gap_was_a_space = TRUE;
1320 
1321  //Default as old TO
1322  space = current_gap > row->space_threshold;
1323 
1324  /* Set defaults for the word break incase we find one. Currently there are
1325  no fuzzy spaces. Depending on the reliability of the different heuristics
1326  we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY
1327  be used if the function returns TRUE - ie the word is to be broken.
1328  */
1329  blanks = (uinT8) (current_gap / row->space_size);
1330  if (blanks < 1)
1331  blanks = 1;
1332  fuzzy_sp = FALSE;
1333  fuzzy_non = FALSE;
1334  /*
1335  If xht measure causes gap to flip one of the 3 thresholds act accordingly -
1336  despite any other heuristics - the MINIMUM action is to pass a fuzzy kern to
1337  context.
1338  */
1339  if (tosp_use_xht_gaps &&
1340  (real_current_gap <= row->max_nonspace) &&
1341  (within_xht_current_gap > row->max_nonspace)) {
1342  space = TRUE;
1343  fuzzy_non = TRUE;
1344 #ifndef GRAPHICS_DISABLED
1345  mark_gap (blob_box, 20,
1346  prev_gap, prev_blob_box.width (),
1347  current_gap, next_blob_box.width (), next_gap);
1348 #endif
1349  }
1350  else if (tosp_use_xht_gaps &&
1351  (real_current_gap <= row->space_threshold) &&
1352  (within_xht_current_gap > row->space_threshold)) {
1353  space = TRUE;
 // NOTE(review): listing line 1354 (the `if` guarding the next statement)
 // is missing here.
1355  fuzzy_sp = TRUE;
1356  else
1357  fuzzy_non = TRUE;
1358 #ifndef GRAPHICS_DISABLED
1359  mark_gap (blob_box, 21,
1360  prev_gap, prev_blob_box.width (),
1361  current_gap, next_blob_box.width (), next_gap);
1362 #endif
1363  }
1364  else if (tosp_use_xht_gaps &&
1365  (real_current_gap < row->min_space) &&
1366  (within_xht_current_gap >= row->min_space)) {
1367  space = TRUE;
1368 #ifndef GRAPHICS_DISABLED
1369  mark_gap (blob_box, 22,
1370  prev_gap, prev_blob_box.width (),
1371  current_gap, next_blob_box.width (), next_gap);
1372 #endif
1373  }
 // Previous blob does not look like punctuation but this one does: request
 // a break at the gap handled by the next call.
1374  else if (tosp_force_wordbreak_on_punct &&
1375  !suspected_punct_blob(row, prev_blob_box) &&
1376  suspected_punct_blob(row, blob_box)) {
1377  break_at_next_gap = TRUE;
1378  }
1379  /* Now continue with normal heuristics */
1380  else if ((current_gap < row->min_space) &&
1381  (current_gap > row->space_threshold)) {
1382  /* Heuristics to turn dubious spaces to kerns */
 // NOTE(review): listing lines 1383 and 1385 are missing from the
 // computation of fuzzy_sp_to_kn_limit below.
1384  fuzzy_sp_to_kn_limit = row->kern_size +
1386  (row->space_size - row->kern_size);
1387  else
1388  fuzzy_sp_to_kn_limit = 99999.0f;
1389 
1390  /* If current gap is significantly smaller than the previous space the other
1391  side of a narrow blob then this gap is a kern. */
1392  if ((prev_blob_box.width () > 0) &&
1393  narrow_blob (row, prev_blob_box) &&
1394  prev_gap_was_a_space &&
1395  (current_gap <= tosp_gap_factor * prev_gap)) {
1396  if ((tosp_all_flips_fuzzy) ||
1397  (current_gap > fuzzy_sp_to_kn_limit)) {
 // NOTE(review): listing line 1398 is missing here.
1399  fuzzy_non = TRUE;
1400  else
1401  fuzzy_sp = TRUE;
1402  }
1403  else
1404  space = FALSE;
1405 #ifndef GRAPHICS_DISABLED
1406  mark_gap (blob_box, 1,
1407  prev_gap, prev_blob_box.width (),
1408  current_gap, next_blob_box.width (), next_gap);
1409 #endif
1410  }
1411  /* If current gap not much bigger than the previous kern the other side of a
1412  narrow blob then this gap is a kern as well */
1413  else if ((prev_blob_box.width () > 0) &&
1414  narrow_blob (row, prev_blob_box) &&
1415  !prev_gap_was_a_space &&
1416  (current_gap * tosp_gap_factor <= prev_gap)) {
1417  if ((tosp_all_flips_fuzzy) ||
1418  (current_gap > fuzzy_sp_to_kn_limit)) {
 // NOTE(review): listing line 1419 is missing here.
1420  fuzzy_non = TRUE;
1421  else
1422  fuzzy_sp = TRUE;
1423  }
1424  else
1425  space = FALSE;
1426 #ifndef GRAPHICS_DISABLED
1427  mark_gap (blob_box, 2,
1428  prev_gap, prev_blob_box.width (),
1429  current_gap, next_blob_box.width (), next_gap);
1430 #endif
1431  }
1432  else if ((next_blob_box.width () > 0) &&
1433  narrow_blob (row, next_blob_box) &&
1434  (next_gap > row->space_threshold) &&
1435  (current_gap <= tosp_gap_factor * next_gap)) {
1436  if ((tosp_all_flips_fuzzy) ||
1437  (current_gap > fuzzy_sp_to_kn_limit)) {
 // NOTE(review): listing line 1438 is missing here.
1439  fuzzy_non = TRUE;
1440  else
1441  fuzzy_sp = TRUE;
1442  }
1443  else
1444  space = FALSE;
1445 #ifndef GRAPHICS_DISABLED
1446  mark_gap (blob_box, 3,
1447  prev_gap, prev_blob_box.width (),
1448  current_gap, next_blob_box.width (), next_gap);
1449 #endif
1450  }
1451  else if ((next_blob_box.width () > 0) &&
1452  narrow_blob (row, next_blob_box) &&
1453  (next_gap <= row->space_threshold) &&
1454  (current_gap * tosp_gap_factor <= next_gap)) {
1455  if ((tosp_all_flips_fuzzy) ||
1456  (current_gap > fuzzy_sp_to_kn_limit)) {
 // NOTE(review): listing line 1457 is missing here.
1458  fuzzy_non = TRUE;
1459  else
1460  fuzzy_sp = TRUE;
1461  }
1462  else
1463  space = FALSE;
1464 #ifndef GRAPHICS_DISABLED
1465  mark_gap (blob_box, 4,
1466  prev_gap, prev_blob_box.width (),
1467  current_gap, next_blob_box.width (), next_gap);
1468 #endif
1469  }
1470  else if ((((next_blob_box.width () > 0) &&
1471  narrow_blob (row, next_blob_box)) ||
1472  ((prev_blob_box.width () > 0) &&
1473  narrow_blob (row, prev_blob_box)))) {
1474  fuzzy_sp = TRUE;
1475 #ifndef GRAPHICS_DISABLED
1476  mark_gap (blob_box, 6,
1477  prev_gap, prev_blob_box.width (),
1478  current_gap, next_blob_box.width (), next_gap);
1479 #endif
1480  }
1481  }
1482  else if ((current_gap > row->max_nonspace) &&
1483  (current_gap <= row->space_threshold)) {
1484 
1485  /* Heuristics to turn dubious kerns to spaces */
1486  /* TRIED THIS BUT IT MADE THINGS WORSE
1487  if ( prev_gap == MAX_INT16 )
1488  prev_gap = 0; // start of row
1489  if ( next_gap == MAX_INT16 )
1490  next_gap = 0; // end of row
1491  */
1492  if ((prev_blob_box.width () > 0) &&
1493  (next_blob_box.width () > 0) &&
1494  (current_gap >=
1495  tosp_kern_gap_factor1 * MAX (prev_gap, next_gap)) &&
1496  wide_blob (row, prev_blob_box) &&
1497  wide_blob (row, next_blob_box)) {
1498 
1499  space = TRUE;
1500  /*
1501  tosp_flip_caution is an attempt to stop the default changing in cases
1502  where there is a large difference between the kern and space estimates.
1503  See problem in 'chiefs' where "have" gets split in the quotation.
1504  */
1505  if ((tosp_flip_fuzz_kn_to_sp) &&
1506  ((tosp_flip_caution <= 0) ||
1507  (tosp_flip_caution * row->kern_size > row->space_size)))
1508  fuzzy_sp = TRUE;
1509  else
1510  fuzzy_non = TRUE;
1511 #ifndef GRAPHICS_DISABLED
1512  mark_gap (blob_box, 7,
1513  prev_gap, prev_blob_box.width (),
1514  current_gap, next_blob_box.width (), next_gap);
1515 #endif
1516  } else if (prev_blob_box.width() > 0 &&
1517  next_blob_box.width() > 0 &&
1518  current_gap > 5 && // Rule 9 handles small gap, big ratio.
1519  current_gap >=
1520  tosp_kern_gap_factor2 * MAX(prev_gap, next_gap) &&
1521  !(narrow_blob(row, prev_blob_box) ||
1522  suspected_punct_blob(row, prev_blob_box)) &&
1523  !(narrow_blob(row, next_blob_box) ||
1524  suspected_punct_blob(row, next_blob_box))) {
1525  space = TRUE;
1526  fuzzy_non = TRUE;
1527 #ifndef GRAPHICS_DISABLED
1528  mark_gap (blob_box, 8,
1529  prev_gap, prev_blob_box.width (),
1530  current_gap, next_blob_box.width (), next_gap);
1531 #endif
1532  }
1533  else if ((tosp_kern_gap_factor3 > 0) &&
1534  (prev_blob_box.width () > 0) &&
1535  (next_blob_box.width () > 0) &&
1536  (current_gap >= tosp_kern_gap_factor3 * MAX (prev_gap, next_gap)) &&
 // NOTE(review): listing line 1537 is missing here.
1538  (!suspected_punct_blob (row, prev_blob_box) &&
1539  !suspected_punct_blob (row, next_blob_box)))) {
1540  space = TRUE;
1541  fuzzy_non = TRUE;
1542 #ifndef GRAPHICS_DISABLED
1543  mark_gap (blob_box, 9,
1544  prev_gap, prev_blob_box.width (),
1545  current_gap, next_blob_box.width (), next_gap);
1546 #endif
1547  }
1548  }
1549  if (tosp_debug_level > 10)
1550  tprintf("word break = %d current_gap = %d, prev_gap = %d, "
1551  "next_gap = %d\n", space ? 1 : 0, current_gap,
1552  prev_gap, next_gap);
 // Remember for the next call whether this gap counted as a definite space
 // (a break that is flagged fuzzy_non does not count).
1553  prev_gap_was_a_space = space && !(fuzzy_non);
1554  return space;
1555  }
1556 }
1557 
// narrow_blob
//
// Returns non-zero when blob_box is considered narrow: either its width is
// at most tosp_narrow_fraction * row->xheight, or its width/height ratio is
// at most a second threshold.
//
// NOTE(review): listing line 1562, which holds the right-hand operand of
// the aspect-ratio comparison and the closing parentheses, is missing from
// this listing; the expression is incomplete as shown.
1558 BOOL8 Textord::narrow_blob(TO_ROW *row, TBOX blob_box) {
1559  BOOL8 result;
1560  result = ((blob_box.width () <= tosp_narrow_fraction * row->xheight) ||
1561  (((float) blob_box.width () / blob_box.height ()) <=
1563  return result;
1564 }
1565 
// wide_blob
//
// Returns non-zero when blob_box is considered wide.
//  - tosp_wide_fraction > 0: the width must be at least
//    tosp_wide_fraction * row->xheight; when tosp_wide_aspect_ratio > 0 the
//    width/height ratio must additionally exceed a threshold.
//  - otherwise: wide simply means "not narrow_blob()".
//
// NOTE(review): listing line 1572, which holds the right-hand operand of
// the aspect-ratio comparison and the closing parentheses, is missing from
// this listing; that expression is incomplete as shown.
1566 BOOL8 Textord::wide_blob(TO_ROW *row, TBOX blob_box) {
1567  BOOL8 result;
1568  if (tosp_wide_fraction > 0) {
1569  if (tosp_wide_aspect_ratio > 0)
1570  result = ((blob_box.width () >= tosp_wide_fraction * row->xheight) &&
1571  (((float) blob_box.width () / blob_box.height ()) >
1573  else
1574  result = (blob_box.width () >= tosp_wide_fraction * row->xheight);
1575  }
1576  else
1577  result = !narrow_blob (row, blob_box);
1578  return result;
1579 }
1580 
1581 BOOL8 Textord::suspected_punct_blob(TO_ROW *row, TBOX box) {
1582  BOOL8 result;
1583  float baseline;
1584  float blob_x_centre;
1585  /* Find baseline of centre of blob */
1586  blob_x_centre = (box.right () + box.left ()) / 2.0;
1587  baseline = row->baseline.y (blob_x_centre);
1588 
1589  result = (box.height () <= 0.66 * row->xheight) ||
1590  (box.top () < baseline + row->xheight / 2.0) ||
1591  (box.bottom () > baseline + row->xheight / 2.0);
1592  return result;
1593 }
1594 
1595 
1596 void Textord::peek_at_next_gap(TO_ROW *row,
1597  BLOBNBOX_IT box_it,
1598  TBOX &next_blob_box,
1599  inT16 &next_gap,
1600  inT16 &next_within_xht_gap) {
1601  TBOX next_reduced_blob_box;
1602  TBOX bit_beyond;
1603  BLOBNBOX_IT reduced_box_it = box_it;
1604 
1605  next_blob_box = box_next (&box_it);
1606  next_reduced_blob_box = reduced_box_next (row, &reduced_box_it);
1607  if (box_it.at_first ()) {
1608  next_gap = MAX_INT16;
1609  next_within_xht_gap = MAX_INT16;
1610  }
1611  else {
1612  bit_beyond = box_it.data ()->bounding_box ();
1613  next_gap = bit_beyond.left () - next_blob_box.right ();
1614  bit_beyond = reduced_box_next (row, &reduced_box_it);
1615  next_within_xht_gap =
1616  bit_beyond.left () - next_reduced_blob_box.right ();
1617  }
1618 }
1619 
1620 
1621 #ifndef GRAPHICS_DISABLED
// mark_gap
//
// Debug aid for the space/kern heuristics: picks a ScrollView colour from
// the rule number, draws an ellipse of width current_gap just to the left
// of `blob` on to_win, and (when tosp_debug_level > 5) prints the rule and
// the gap/width values that triggered it.
// Rules 4 and 9 share BLACK with the default case; 2/20, 3/21 and 5/22 also
// share colours.
//
// NOTE(review): listing line 1674 is missing; it evidently opens the block
// that is closed at listing line 1687, so the drawing code is presumably
// conditional - confirm against the real source.
1622 void Textord::mark_gap(
1623  TBOX blob, // blob following gap
1624  inT16 rule, // heuristic id
1625  inT16 prev_gap,
1626  inT16 prev_blob_width,
1627  inT16 current_gap,
1628  inT16 next_blob_width,
1629  inT16 next_gap) {
1630  ScrollView::Color col; //of ellipse marking flipped gap
1631 
1632  switch (rule) {
1633  case 1:
1634  col = ScrollView::RED;
1635  break;
1636  case 2:
1637  col = ScrollView::CYAN;
1638  break;
1639  case 3:
1640  col = ScrollView::GREEN;
1641  break;
1642  case 4:
1643  col = ScrollView::BLACK;
1644  break;
1645  case 5:
1646  col = ScrollView::MAGENTA;
1647  break;
1648  case 6:
1649  col = ScrollView::BLUE;
1650  break;
1651 
1652  case 7:
1653  col = ScrollView::WHITE;
1654  break;
1655  case 8:
1656  col = ScrollView::YELLOW;
1657  break;
1658  case 9:
1659  col = ScrollView::BLACK;
1660  break;
1661 
1662  case 20:
1663  col = ScrollView::CYAN;
1664  break;
1665  case 21:
1666  col = ScrollView::GREEN;
1667  break;
1668  case 22:
1669  col = ScrollView::MAGENTA;
1670  break;
1671  default:
1672  col = ScrollView::BLACK;
1673  }
1675  to_win->Pen(col);
1676  /* if (rule < 20)
1677  //interior_style(to_win, INT_SOLID, FALSE);
1678  else
1679  //interior_style(to_win, INT_HOLLOW, TRUE);*/
1680  //x radius
1681  to_win->Ellipse (current_gap / 2.0f,
1682  blob.height () / 2.0f, //y radius
1683  //x centre
1684  blob.left () - current_gap / 2.0f,
1685  //y centre
1686  blob.bottom () + blob.height () / 2.0f);
1687  }
1688  if (tosp_debug_level > 5)
1689  tprintf (" (%d,%d) Sp<->Kn Rule %d %d %d %d %d\n",
1690  blob.left () - current_gap / 2, blob.bottom (), rule,
1691  prev_gap, prev_blob_width, current_gap,
1692  next_blob_width, next_gap);
1693 }
1694 #endif
1695 
1696 float Textord::find_mean_blob_spacing(WERD *word) {
1697  C_BLOB_IT cblob_it;
1698  TBOX blob_box;
1699  inT32 gap_sum = 0;
1700  inT16 gap_count = 0;
1701  inT16 prev_right;
1702 
1703  cblob_it.set_to_list (word->cblob_list ());
1704  if (!cblob_it.empty ()) {
1705  cblob_it.mark_cycle_pt ();
1706  prev_right = cblob_it.data ()->bounding_box ().right ();
1707  //first blob
1708  cblob_it.forward ();
1709  for (; !cblob_it.cycled_list (); cblob_it.forward ()) {
1710  blob_box = cblob_it.data ()->bounding_box ();
1711  gap_sum += blob_box.left () - prev_right;
1712  gap_count++;
1713  prev_right = blob_box.right ();
1714  }
1715  }
1716  if (gap_count > 0)
1717  return (gap_sum / (float) gap_count);
1718  else
1719  return 0.0f;
1720 }
1721 
1722 
1723 BOOL8 Textord::ignore_big_gap(TO_ROW *row,
1724  inT32 row_length,
1725  GAPMAP *gapmap,
1726  inT16 left,
1727  inT16 right) {
1728  inT16 gap = right - left + 1;
1729 
1730  if (tosp_ignore_big_gaps > 999)
1731  return FALSE; //Dont ignore
1732  if (tosp_ignore_big_gaps > 0)
1733  return (gap > tosp_ignore_big_gaps * row->xheight);
1734  if (gap > tosp_ignore_very_big_gaps * row->xheight)
1735  return TRUE;
1736  if (tosp_ignore_big_gaps == 0) {
1737  if ((gap > 2.1 * row->xheight) && (row_length > 20 * row->xheight))
1738  return TRUE;
1739  if ((gap > 1.75 * row->xheight) &&
1740  ((row_length > 35 * row->xheight) ||
1741  gapmap->table_gap (left, right)))
1742  return TRUE;
1743  }
1744  else {
1745  /* ONLY time gaps < 3.0 * xht are ignored is when they are part of a table */
1746  if ((gap > gapmap_big_gaps * row->xheight) &&
1747  gapmap->table_gap (left, right))
1748  return TRUE;
1749  }
1750  return FALSE;
1751 }
1752 
1753 
1754 /**********************************************************************
1755  * reduced_box_next
1756  *
1757  * Compute the bounding box of this blob with merging of x overlaps
1758  * but no pre-chopping.
1759  * Then move the iterator on to the start of the next blob.
1760  * DONT reduce the box for small things - eg punctuation.
1761  **********************************************************************/
// Returns the (possibly reduced) box for the blob at *it and leaves *it on
// the next blob that has a cblob and is not joined_to_prev.
// If the blob already has a stored reduced box (red_box_set), that box is
// returned after advancing the iterator. Otherwise the box is computed from
// reduced_box_for_blob() over the blob and its joined followers, falls back
// to the full bounding box when the acceptance test below fails, and is
// stored on the head blob via set_reduced_box().
1762 TBOX Textord::reduced_box_next(
1763  TO_ROW *row, // current row
1764  BLOBNBOX_IT *it // iterator to blobs
1765  ) {
1766  BLOBNBOX *blob; //current blob
1767  BLOBNBOX *head_blob; //place to store box
1768  TBOX full_box; //full blob boundg box
1769  TBOX reduced_box; //box of significant part
1770  inT16 left_above_xht; //ABOVE xht left limit
1771  inT16 new_left_above_xht; //ABOVE xht left limit
1772 
1773  blob = it->data ();
1774  if (blob->red_box_set ()) {
 // Already computed on an earlier visit: just skip to the next real blob.
1775  reduced_box = blob->reduced_box ();
1776  do {
1777  it->forward();
1778  blob = it->data();
1779  }
1780  while (blob->cblob() == NULL || blob->joined_to_prev());
1781  return reduced_box;
1782  }
1783  head_blob = blob;
1784  full_box = blob->bounding_box ();
1785  reduced_box = reduced_box_for_blob (blob, row, &left_above_xht);
1786  do {
1787  it->forward ();
1788  blob = it->data ();
1789  if (blob->cblob() == NULL)
1790  //was pre-chopped
1791  full_box += blob->bounding_box ();
1792  else if (blob->joined_to_prev ()) {
1793  reduced_box +=
1794  reduced_box_for_blob(blob, row, &new_left_above_xht);
1795  left_above_xht = MIN (left_above_xht, new_left_above_xht);
1796  }
1797  }
1798  //until next real blob
1799  while (blob->cblob() == NULL || blob->joined_to_prev());
1800 
 // Keep the reduced box only if it is non-empty, taller than 0.7 * xheight,
 // and the part of the blob above the xheight starts to the right of
 // reduced_box.left() + tosp_near_lh_edge * reduced_box.width().
1801  if ((reduced_box.width () > 0) &&
1802  ((reduced_box.left () + tosp_near_lh_edge * reduced_box.width ())
1803  < left_above_xht) && (reduced_box.height () > 0.7 * row->xheight)) {
1804 #ifndef GRAPHICS_DISABLED
 // NOTE(review): listing lines 1805-1806 are missing from this listing, so
 // the graphics-only body of this guard is not shown.
1807 #endif
1808  }
1809  else
1810  reduced_box = full_box;
1811  head_blob->set_reduced_box (reduced_box);
1812  return reduced_box;
1813 }
1814 
1815 
1816 /*************************************************************************
1817  * reduced_box_for_blob()
1818  * Find box for blob which is the same height and y position as the whole blob,
1819  * but whose left limit is the left most position of the blob ABOVE the
1820  * baseline and whose right limit is the right most position of the blob BELOW
1821  * the xheight.
1822  *
1823  *
1824  * !!!!!!! WONT WORK WITH LARGE UPPER CASE CHARS - T F V W - look at examples on
1825  * "home". Perhaps we need something which say if the width ABOVE the
1826  * xht alone includes the whole of the reduced width, then use the full
1827  * blob box - Might still fail on italic F
1828  *
1829  * Alternatively we could be a little less severe and only reduce the
1830  * left and right edges by half the difference between the full box and
1831  * the reduced box.
1832  *
1833  * NOTE that we need to rotate all the coordinates as
1834  * find_blob_limits finds the y min and max within a specified x band
1835  *************************************************************************/
1836 TBOX Textord::reduced_box_for_blob(
1837  BLOBNBOX *blob,
1838  TO_ROW *row,
1839  inT16 *left_above_xht) {
1840  float baseline;
1841  float blob_x_centre;
1842  float left_limit;
1843  float right_limit;
1844  float junk;
1845  TBOX blob_box;
1846 
1847  /* Find baseline of centre of blob */
1848 
1849  blob_box = blob->bounding_box ();
1850  blob_x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
1851  baseline = row->baseline.y (blob_x_centre);
1852 
1853  /*
1854  Find LH limit of blob ABOVE the xht. This is so that we can detect certain
1855  caps ht chars which should NOT have their box reduced: T, Y, V, W etc
1856  */
1857  left_limit = (float) MAX_INT32;
1858  junk = (float) -MAX_INT32;
1859  find_cblob_hlimits(blob->cblob(), (baseline + 1.1 * row->xheight),
1860  static_cast<float>(MAX_INT16), left_limit, junk);
1861  if (left_limit > junk)
1862  *left_above_xht = MAX_INT16; //No area above xht
1863  else
1864  *left_above_xht = (inT16) floor (left_limit);
1865  /*
1866  Find reduced LH limit of blob - the left extent of the region ABOVE the
1867  baseline.
1868  */
1869  left_limit = (float) MAX_INT32;
1870  junk = (float) -MAX_INT32;
1871  find_cblob_hlimits(blob->cblob(), baseline, static_cast<float>(MAX_INT16),
1872  left_limit, junk);
1873 
1874  if (left_limit > junk)
1875  return TBOX (); //no area within xht so return empty box
1876  /*
1877  Find reduced RH limit of blob - the right extent of the region BELOW the xht.
1878  */
1879  junk = (float) MAX_INT32;
1880  right_limit = (float) -MAX_INT32;
1881  find_cblob_hlimits(blob->cblob(), static_cast<float>(-MAX_INT16),
1882  (baseline + row->xheight), junk, right_limit);
1883  if (junk > right_limit)
1884  return TBOX (); //no area within xht so return empty box
1885 
1886  return TBOX (ICOORD ((inT16) floor (left_limit), blob_box.bottom ()),
1887  ICOORD ((inT16) ceil (right_limit), blob_box.top ()));
1888 }
1889 } // namespace tesseract
bool tosp_use_xht_gaps
Definition: textord.h:293
inT32 get_total() const
Definition: statistc.h:86
float kern_size
Definition: blobbox.h:662
void Pen(Color color)
Definition: scrollview.cpp:726
ROW * make_blob_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:1177
#define MAX(x, y)
Definition: ndminx.h:24
void set_reduced_box(TBOX new_box)
Definition: blobbox.h:234
double tosp_old_sp_kn_th_factor
Definition: textord.h:314
double tosp_fuzzy_space_factor1
Definition: textord.h:329
bool tosp_fuzzy_limit_all
Definition: textord.h:289
bool tosp_improve_thresh
Definition: textord.h:301
double tosp_near_lh_edge
Definition: textord.h:367
double tosp_min_sane_kn_sp
Definition: textord.h:353
const TBOX & reduced_box() const
Definition: blobbox.h:231
bool tosp_stats_use_xht_gaps
Definition: textord.h:291
bool tosp_only_use_xht_gaps
Definition: textord.h:295
bool joined_to_prev() const
Definition: blobbox.h:241
double tosp_pass_wide_fuzz_sp_to_context
Definition: textord.h:371
inT32 min_space
Definition: blobbox.h:659
#define tprintf(...)
Definition: tprintf.h:31
#define MIN(x, y)
Definition: ndminx.h:28
int tosp_sanity_method
Definition: textord.h:311
bool tosp_use_pre_chopping
Definition: textord.h:273
bool tosp_old_to_method
Definition: textord.h:263
Definition: statistc.h:33
bool tosp_old_to_constrain_sp_kn
Definition: textord.h:266
int tosp_enough_space_samples_for_median
Definition: textord.h:304
bool tosp_row_use_cert_spaces1
Definition: textord.h:283
void add(inT32 value, inT32 count)
Definition: statistc.cpp:104
bool tosp_flip_fuzz_sp_to_kn
Definition: textord.h:299
double tosp_init_guess_kn_mult
Definition: textord.h:355
unsigned char BOOL8
Definition: host.h:113
bool tosp_only_small_gaps_for_kern
Definition: textord.h:286
TBOX bounding_box() const
Definition: werd.cpp:160
bool tosp_block_use_cert_spaces
Definition: textord.h:277
double tosp_gap_factor
Definition: textord.h:332
QSPLINE baseline
Definition: blobbox.h:666
double tosp_ignore_big_gaps
Definition: textord.h:339
BOOL8 table_gap(inT16 left, inT16 right)
Definition: gap_map.cpp:150
double tosp_table_kn_sp_ratio
Definition: textord.h:345
bool tosp_only_use_prop_rows
Definition: textord.h:268
inT16 right() const
Definition: rect.h:75
bool null_box() const
Definition: rect.h:46
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:595
EXTERN double gapmap_big_gaps
Definition: gap_map.cpp:10
Definition: ocrrow.h:32
float fixed_pitch
Definition: blobbox.h:647
double tosp_kern_gap_factor3
Definition: textord.h:338
double mean() const
Definition: statistc.cpp:138
double tosp_ignore_very_big_gaps
Definition: textord.h:340
double tosp_enough_small_gaps
Definition: textord.h:343
Definition: werd.h:35
void find_cblob_hlimits(C_BLOB *blob, float bottomy, float topy, float &xmin, float &xmax)
Definition: blobbox.cpp:569
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
double tosp_threshold_bias1
Definition: textord.h:316
double median() const
Definition: statistc.cpp:243
double tosp_silly_kn_sp_gap
Definition: textord.h:369
bool tosp_old_to_bug_fix
Definition: textord.h:275
EXTERN ScrollView * to_win
Definition: drawtord.cpp:38
Definition: werd.h:36
#define MAXSPACING
Definition: tospace.cpp:30
double tosp_fuzzy_sp_fraction
Definition: textord.h:351
inT16 left() const
Definition: rect.h:68
double tosp_wide_fraction
Definition: textord.h:323
bool tosp_narrow_blobs_not_cert
Definition: textord.h:281
int tosp_redo_kern_limit
Definition: textord.h:306
bool tosp_all_flips_fuzzy
Definition: textord.h:287
C_BLOB * cblob() const
Definition: blobbox.h:253
double tosp_rep_space
Definition: textord.h:341
TBOX bounding_box() const
Definition: ocrrow.h:85
void Ellipse(int x, int y, int width, int height)
Definition: scrollview.cpp:615
WERD_LIST rep_words
Definition: blobbox.h:664
#define MAX_INT32
Definition: host.h:120
Definition: werd.h:60
bool tosp_flip_fuzz_kn_to_sp
Definition: textord.h:298
double y(double x) const
Definition: quspline.cpp:217
integer coordinate
Definition: points.h:30
inT16 bottom() const
Definition: rect.h:61
inT32 max_nonspace
Definition: blobbox.h:660
Definition: gap_map.h:6
EXTERN bool textord_show_initial_words
Definition: tovars.cpp:25
double tosp_fuzzy_kn_fraction
Definition: textord.h:350
inT16 height() const
Definition: rect.h:104
TBOX box_next(BLOBNBOX_IT *it)
Definition: blobbox.cpp:629
bool tosp_force_wordbreak_on_punct
Definition: textord.h:271
double tosp_table_fuzzy_kn_sp_ratio
Definition: textord.h:349
bool tosp_row_use_cert_spaces
Definition: textord.h:279
double tosp_init_guess_xht_mult
Definition: textord.h:357
TO_ROW_LIST * get_rows()
Definition: blobbox.h:700
float space_size
Definition: blobbox.h:663
inT16 width() const
Definition: rect.h:111
TBOX box_next_pre_chopped(BLOBNBOX_IT *it)
Definition: blobbox.cpp:658
void plot_word_decisions(ScrollView *win, inT16 pitch, TO_ROW *row)
Definition: drawtord.cpp:250
PITCH_TYPE pitch_decision
Definition: blobbox.h:646
#define FALSE
Definition: capi.h:29
double tosp_fuzzy_space_factor
Definition: textord.h:327
void to_spacing(ICOORD page_tr, TO_BLOCK_LIST *blocks)
Definition: tospace.cpp:33
Definition: rect.h:30
double tosp_large_kerning
Definition: textord.h:363
ROW * make_prop_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:884
double tosp_kern_gap_factor2
Definition: textord.h:336
#define TRUE
Definition: capi.h:28
#define MAX_INT16
Definition: host.h:119
inT32 pile_count(inT32 value) const
Definition: statistc.h:78
double tosp_table_xht_sp_ratio
Definition: textord.h:347
double tosp_fuzzy_space_factor2
Definition: textord.h:331
bool tosp_rule_9_test_punct
Definition: textord.h:297
#define NULL
Definition: host.h:144
const TBOX & bounding_box() const
Definition: blobbox.h:215
double tosp_threshold_bias2
Definition: textord.h:318
double tosp_wide_aspect_ratio
Definition: textord.h:325
double tosp_narrow_fraction
Definition: textord.h:320
float xheight
Definition: blobbox.h:653
inT16 top() const
Definition: rect.h:54
double tosp_kern_gap_factor1
Definition: textord.h:334
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
double tosp_max_sane_kn_thresh
Definition: textord.h:359
double tosp_flip_caution
Definition: textord.h:361
bool tosp_recovery_isolated_row_stats
Definition: textord.h:285
bool red_box_set() const
Definition: blobbox.h:244
Definition: points.h:189
double tosp_narrow_aspect_ratio
Definition: textord.h:322
void set_blanks(uinT8 new_blanks)
Definition: werd.h:107
double tosp_dont_fool_with_small_kerns
Definition: textord.h:365
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD_LIST * word_list()
Definition: ocrrow.h:52
void recalc_bounding_box()
Definition: ocrrow.cpp:101
inT32 space_threshold
Definition: blobbox.h:661
short inT16
Definition: host.h:100
int inT32
Definition: host.h:102
void plot(ScrollView *fd) const
Definition: rect.h:278
unsigned char uinT8
Definition: host.h:99