All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
wordseg.cpp File Reference
#include "stderr.h"
#include "blobbox.h"
#include "statistc.h"
#include "drawtord.h"
#include "makerow.h"
#include "pitsync1.h"
#include "tovars.h"
#include "topitch.h"
#include "cjkpitch.h"
#include "textord.h"
#include "fpchop.h"
#include "wordseg.h"

Go to the source code of this file.

Macros

#define EXTERN
 
#define FIXED_WIDTH_MULTIPLE   5
 
#define BLOCK_STATS_CLUSTERS   10
 

Functions

make_single_word

For each row, arrange the blobs into one word. There is no fixed pitch detection.

void make_single_word (bool one_blob, TO_ROW_LIST *rows, ROW_LIST *real_rows)
 
void make_words (tesseract::Textord *textord, ICOORD page_tr, float gradient, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks)
 
set_row_spaces

Set the min_space and max_nonspace members of the row so that the blobs can be arranged into words.

void set_row_spaces (TO_BLOCK *block, FCOORD rotation, BOOL8 testing_on)
 
row_words

Compute the max nonspace and min space for the row.

inT32 row_words (TO_BLOCK *block, TO_ROW *row, inT32 maxwidth, FCOORD rotation, BOOL8 testing_on)
 
row_words2

Compute the max nonspace and min space for the row.

inT32 row_words2 (TO_BLOCK *block, TO_ROW *row, inT32 maxwidth, FCOORD rotation, BOOL8 testing_on)
 
make_real_words

Convert a TO_BLOCK to a BLOCK.

void make_real_words (tesseract::Textord *textord, TO_BLOCK *block, FCOORD rotation)
 
make_rep_words

Fabricate a real row from only the repeated blob words. Get the xheight from the block as it may be more meaningful.

ROW * make_rep_words (TO_ROW *row, TO_BLOCK *block)
 
make_real_word

Construct a WERD from a given number of adjacent entries in a list of BLOBNBOXs.

WERD * make_real_word (BLOBNBOX_IT *box_it, inT32 blobcount, BOOL8 bol, uinT8 blanks)
 

Variables

EXTERN bool textord_fp_chopping = TRUE
 
EXTERN bool textord_force_make_prop_words = FALSE
 
EXTERN bool textord_chopper_test = FALSE
 

Macro Definition Documentation

#define BLOCK_STATS_CLUSTERS   10

Definition at line 50 of file wordseg.cpp.

#define EXTERN

Definition at line 41 of file wordseg.cpp.

#define FIXED_WIDTH_MULTIPLE   5

Definition at line 49 of file wordseg.cpp.

Function Documentation

WERD* make_real_word ( BLOBNBOX_IT *  box_it,
inT32  blobcount,
BOOL8  bol,
uinT8  blanks 
)

Definition at line 594 of file wordseg.cpp.

598  {
599  C_OUTLINE_IT cout_it;
600  C_BLOB_LIST cblobs;
601  C_BLOB_IT cblob_it = &cblobs;
602  WERD *word; // new word
603  BLOBNBOX *bblob; // current blob
604  inT32 blobindex; // in row
605 
606  for (blobindex = 0; blobindex < blobcount; blobindex++) {
607  bblob = box_it->extract();
608  if (bblob->joined_to_prev()) {
609  if (bblob->cblob() != NULL) {
610  cout_it.set_to_list(cblob_it.data()->out_list());
611  cout_it.move_to_last();
612  cout_it.add_list_after(bblob->cblob()->out_list());
613  delete bblob->cblob();
614  }
615  }
616  else {
617  if (bblob->cblob() != NULL)
618  cblob_it.add_after_then_move(bblob->cblob());
619  }
620  delete bblob;
621  box_it->forward(); // next one
622  }
623 
624  if (blanks < 1)
625  blanks = 1;
626 
627  word = new WERD(&cblobs, blanks, NULL);
628 
629  if (bol)
630  word->set_flag(W_BOL, TRUE);
631  if (box_it->at_first())
632  word->set_flag(W_EOL, TRUE); // at end of line
633 
634  return word;
635 }
bool joined_to_prev() const
Definition: blobbox.h:241
Definition: werd.h:35
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
Definition: werd.h:36
C_BLOB * cblob() const
Definition: blobbox.h:253
Definition: werd.h:60
#define TRUE
Definition: capi.h:28
#define NULL
Definition: host.h:144
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
int inT32
Definition: host.h:102
void make_real_words ( tesseract::Textord *  textord,
TO_BLOCK *  block,
FCOORD  rotation 
)

Definition at line 506 of file wordseg.cpp.

510  {
511  TO_ROW *row; //current row
512  TO_ROW_IT row_it = block->get_rows ();
513  ROW *real_row = NULL; //output row
514  ROW_IT real_row_it = block->block->row_list ();
515 
516  if (row_it.empty ())
517  return; //empty block
518  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
519  row = row_it.data ();
520  if (row->blob_list ()->empty () && !row->rep_words.empty ()) {
521  real_row = make_rep_words (row, block);
522  } else if (!row->blob_list()->empty()) {
523  // In a fixed pitch document, some lines may be detected as fixed pitch
524  // while others don't, and will go through different path.
525  // For non-space delimited language like CJK, fixed pitch chop always
526  // leave the entire line as one word. We can force consistent chopping
527  // with force_make_prop_words flag.
528  POLY_BLOCK* pb = block->block->poly_block();
529  if (textord_chopper_test) {
530  real_row = textord->make_blob_words (row, rotation);
531  } else if (textord_force_make_prop_words ||
532  (pb != NULL && !pb->IsText()) ||
533  row->pitch_decision == PITCH_DEF_PROP ||
535  real_row = textord->make_prop_words (row, rotation);
536  } else if (row->pitch_decision == PITCH_DEF_FIXED ||
538  real_row = fixed_pitch_words (row, rotation);
539  } else {
541  }
542  }
543  if (real_row != NULL) {
544  //put row in block
545  real_row_it.add_after_then_move (real_row);
546  }
547  }
548  block->block->set_stats (block->fixed_pitch == 0, (inT16) block->kern_size,
549  (inT16) block->space_size,
550  (inT16) block->fixed_pitch);
551  block->block->check_pitch ();
552 }
ROW * make_blob_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:1177
float space_size
Definition: blobbox.h:787
float fixed_pitch
Definition: blobbox.h:785
EXTERN bool textord_chopper_test
Definition: wordseg.cpp:47
bool IsText() const
Definition: polyblk.h:52
void set_stats(BOOL8 prop, inT16 kern, inT16 space, inT16 ch_pitch)
Definition: ocrblock.h:62
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:595
ROW * make_rep_words(TO_ROW *row, TO_BLOCK *block)
Definition: wordseg.cpp:562
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: ocrrow.h:32
float kern_size
Definition: blobbox.h:786
WERD_LIST rep_words
Definition: blobbox.h:664
void check_pitch()
check proportional
Definition: ocrblock.cpp:170
TO_ROW_LIST * get_rows()
Definition: blobbox.h:700
PITCH_TYPE pitch_decision
Definition: blobbox.h:646
#define FALSE
Definition: capi.h:29
ROW * make_prop_words(TO_ROW *row, FCOORD rotation)
Definition: tospace.cpp:884
#define NULL
Definition: host.h:144
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
EXTERN bool textord_force_make_prop_words
Definition: wordseg.cpp:45
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
ROW * fixed_pitch_words(TO_ROW *row, FCOORD rotation)
Definition: fpchop.cpp:51
BLOCK * block
Definition: blobbox.h:773
short inT16
Definition: host.h:100
ROW* make_rep_words ( TO_ROW *  row,
TO_BLOCK *  block 
)

Definition at line 562 of file wordseg.cpp.

565  {
566  ROW *real_row; //output row
567  TBOX word_box; //bounding box
568  //iterator
569  WERD_IT word_it = &row->rep_words;
570 
571  if (word_it.empty ())
572  return NULL;
573  word_box = word_it.data ()->bounding_box ();
574  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ())
575  word_box += word_it.data ()->bounding_box ();
576  row->xheight = block->xheight;
577  real_row = new ROW(row,
578  (inT16) block->kern_size, (inT16) block->space_size);
579  word_it.set_to_list (real_row->word_list ());
580  //put words in row
581  word_it.add_list_after (&row->rep_words);
582  real_row->recalc_bounding_box ();
583  return real_row;
584 }
float space_size
Definition: blobbox.h:787
Definition: ocrrow.h:32
float xheight
Definition: blobbox.h:784
float kern_size
Definition: blobbox.h:786
WERD_LIST rep_words
Definition: blobbox.h:664
Definition: rect.h:30
#define NULL
Definition: host.h:144
float xheight
Definition: blobbox.h:653
WERD_LIST * word_list()
Definition: ocrrow.h:52
void recalc_bounding_box()
Definition: ocrrow.cpp:101
short inT16
Definition: host.h:100
void make_single_word ( bool  one_blob,
TO_ROW_LIST *  rows,
ROW_LIST *  real_rows 
)

Definition at line 60 of file wordseg.cpp.

60  {
61  TO_ROW_IT to_row_it(rows);
62  ROW_IT row_it(real_rows);
63  for (to_row_it.mark_cycle_pt(); !to_row_it.cycled_list();
64  to_row_it.forward()) {
65  TO_ROW* row = to_row_it.data();
66  // The blobs have to come out of the BLOBNBOX into the C_BLOB_LIST ready
67  // to create the word.
68  C_BLOB_LIST cblobs;
69  C_BLOB_IT cblob_it(&cblobs);
70  BLOBNBOX_IT box_it(row->blob_list());
71  for (;!box_it.empty(); box_it.forward()) {
72  BLOBNBOX* bblob= box_it.extract();
73  if (bblob->joined_to_prev() || (one_blob && !cblob_it.empty())) {
74  if (bblob->cblob() != NULL) {
75  C_OUTLINE_IT cout_it(cblob_it.data()->out_list());
76  cout_it.move_to_last();
77  cout_it.add_list_after(bblob->cblob()->out_list());
78  delete bblob->cblob();
79  }
80  } else {
81  if (bblob->cblob() != NULL)
82  cblob_it.add_after_then_move(bblob->cblob());
83  }
84  delete bblob;
85  }
86  // Convert the TO_ROW to a ROW.
87  ROW* real_row = new ROW(row, static_cast<inT16>(row->kern_size),
88  static_cast<inT16>(row->space_size));
89  WERD_IT word_it(real_row->word_list());
90  WERD* word = new WERD(&cblobs, 0, NULL);
91  word->set_flag(W_BOL, TRUE);
92  word->set_flag(W_EOL, TRUE);
93  word->set_flag(W_DONT_CHOP, one_blob);
94  word_it.add_after_then_move(word);
95  row_it.add_after_then_move(real_row);
96  }
97 }
float kern_size
Definition: blobbox.h:662
bool joined_to_prev() const
Definition: blobbox.h:241
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:595
Definition: ocrrow.h:32
Definition: werd.h:35
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
Definition: werd.h:36
C_BLOB * cblob() const
Definition: blobbox.h:253
Definition: werd.h:60
float space_size
Definition: blobbox.h:663
#define TRUE
Definition: capi.h:28
#define NULL
Definition: host.h:144
WERD_LIST * word_list()
Definition: ocrrow.h:52
void make_words ( tesseract::Textord *  textord,
ICOORD  page_tr,
float  gradient,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  port_blocks 
)

make_words

Arrange the blobs into words.

Definition at line 104 of file wordseg.cpp.

108  { // output list
109  TO_BLOCK_IT block_it; // iterator
110  TO_BLOCK *block; // current block
111 
112  if (textord->use_cjk_fp_model()) {
113  compute_fixed_pitch_cjk(page_tr, port_blocks);
114  } else {
115  compute_fixed_pitch(page_tr, port_blocks, gradient, FCOORD(0.0f, -1.0f),
117  }
118  textord->to_spacing(page_tr, port_blocks);
119  block_it.set_to_list(port_blocks);
120  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
121  block = block_it.data();
122  make_real_words(textord, block, FCOORD(1.0f, 0.0f));
123  }
124 }
void compute_fixed_pitch_cjk(ICOORD page_tr, TO_BLOCK_LIST *port_blocks)
Definition: cjkpitch.cpp:1057
void make_real_words(tesseract::Textord *textord, TO_BLOCK *block, FCOORD rotation)
Definition: wordseg.cpp:506
unsigned char BOOL8
Definition: host.h:113
bool use_cjk_fp_model() const
Definition: textord.h:92
bool textord_test_landscape
Definition: makerow.cpp:50
void compute_fixed_pitch(ICOORD page_tr, TO_BLOCK_LIST *port_blocks, float gradient, FCOORD rotation, BOOL8 testing_on)
Definition: topitch.cpp:73
void to_spacing(ICOORD page_tr, TO_BLOCK_LIST *blocks)
Definition: tospace.cpp:33
Definition: points.h:189
inT32 row_words ( TO_BLOCK *  block,
TO_ROW *  row,
inT32  maxwidth,
FCOORD  rotation,
BOOL8  testing_on 
)

Definition at line 178 of file wordseg.cpp.

184  {
185  BOOL8 testing_row; //contains testpt
186  BOOL8 prev_valid; //if decent size
187  BOOL8 this_valid; //current blob big enough
188  inT32 prev_x; //end of prev blob
189  inT32 min_gap; //min interesting gap
190  inT32 cluster_count; //no of clusters
191  inT32 gap_index; //which cluster
192  inT32 smooth_factor; //for smoothing stats
193  BLOBNBOX *blob; //current blob
194  float lower, upper; //clustering parameters
195  float gaps[3]; //gap clusters
196  ICOORD testpt;
197  TBOX blob_box; //bounding box
198  //iterator
199  BLOBNBOX_IT blob_it = row->blob_list ();
200  STATS gap_stats (0, maxwidth);
201  STATS cluster_stats[4]; //clusters
202 
204  smooth_factor =
205  (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
206  // if (testing_on)
207  // tprintf("Row smooth factor=%d\n",smooth_factor);
208  prev_valid = FALSE;
209  prev_x = -MAX_INT32;
210  testing_row = FALSE;
211  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
212  blob = blob_it.data ();
213  blob_box = blob->bounding_box ();
214  if (blob_box.contains (testpt))
215  testing_row = TRUE;
216  gap_stats.add (blob_box.width (), 1);
217  }
218  min_gap = (inT32) floor (gap_stats.ile (textord_words_width_ile));
219  gap_stats.clear ();
220  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
221  blob = blob_it.data ();
222  if (!blob->joined_to_prev ()) {
223  blob_box = blob->bounding_box ();
224  // this_valid=blob_box.width()>=min_gap;
225  this_valid = TRUE;
226  if (this_valid && prev_valid
227  && blob_box.left () - prev_x < maxwidth) {
228  gap_stats.add (blob_box.left () - prev_x, 1);
229  }
230  prev_x = blob_box.right ();
231  prev_valid = this_valid;
232  }
233  }
234  if (gap_stats.get_total () == 0) {
235  row->min_space = 0; //no evidence
236  row->max_nonspace = 0;
237  return 0;
238  }
239  gap_stats.smooth (smooth_factor);
240  lower = row->xheight * textord_words_initial_lower;
241  upper = row->xheight * textord_words_initial_upper;
242  cluster_count = gap_stats.cluster (lower, upper,
243  textord_spacesize_ratioprop, 3,
244  cluster_stats);
245  while (cluster_count < 2 && ceil (lower) < floor (upper)) {
246  //shrink gap
247  upper = (upper * 3 + lower) / 4;
248  lower = (lower * 3 + upper) / 4;
249  cluster_count = gap_stats.cluster (lower, upper,
250  textord_spacesize_ratioprop, 3,
251  cluster_stats);
252  }
253  if (cluster_count < 2) {
254  row->min_space = 0; //no evidence
255  row->max_nonspace = 0;
256  return 0;
257  }
258  for (gap_index = 0; gap_index < cluster_count; gap_index++)
259  gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
260  //get medians
261  if (cluster_count > 2) {
262  if (testing_on && textord_show_initial_words) {
263  tprintf ("Row at %g has 3 sizes of gap:%g,%g,%g\n",
264  row->intercept (),
265  cluster_stats[1].ile (0.5),
266  cluster_stats[2].ile (0.5), cluster_stats[3].ile (0.5));
267  }
268  lower = gaps[0];
269  if (gaps[1] > lower) {
270  upper = gaps[1]; //prefer most frequent
271  if (upper < block->xheight * textord_words_min_minspace
272  && gaps[2] > gaps[1]) {
273  upper = gaps[2];
274  }
275  }
276  else if (gaps[2] > lower
277  && gaps[2] >= block->xheight * textord_words_min_minspace)
278  upper = gaps[2];
279  else if (lower >= block->xheight * textord_words_min_minspace) {
280  upper = lower; //not nice
281  lower = gaps[1];
282  if (testing_on && textord_show_initial_words) {
283  tprintf ("Had to switch most common from lower to upper!!\n");
284  gap_stats.print();
285  }
286  }
287  else {
288  row->min_space = 0; //no evidence
289  row->max_nonspace = 0;
290  return 0;
291  }
292  }
293  else {
294  if (gaps[1] < gaps[0]) {
295  if (testing_on && textord_show_initial_words) {
296  tprintf ("Had to switch most common from lower to upper!!\n");
297  gap_stats.print();
298  }
299  lower = gaps[1];
300  upper = gaps[0];
301  }
302  else {
303  upper = gaps[1];
304  lower = gaps[0];
305  }
306  }
307  if (upper < block->xheight * textord_words_min_minspace) {
308  row->min_space = 0; //no evidence
309  row->max_nonspace = 0;
310  return 0;
311  }
312  if (upper * 3 < block->min_space * 2 + block->max_nonspace
313  || lower * 3 > block->min_space * 2 + block->max_nonspace) {
314  if (testing_on && textord_show_initial_words) {
315  tprintf ("Disagreement between block and row at %g!!\n",
316  row->intercept ());
317  tprintf ("Lower=%g, upper=%g, Stats:\n", lower, upper);
318  gap_stats.print();
319  }
320  }
321  row->min_space =
322  (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
323  row->max_nonspace =
324  (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
325  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
326  row->space_size = upper;
327  row->kern_size = lower;
328  if (testing_on && textord_show_initial_words) {
329  if (testing_row) {
330  tprintf ("GAP STATS\n");
331  gap_stats.print();
332  tprintf ("SPACE stats\n");
333  cluster_stats[2].print_summary();
334  tprintf ("NONSPACE stats\n");
335  cluster_stats[1].print_summary();
336  }
337  tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
338  row->intercept (), row->min_space, upper,
339  row->max_nonspace, lower);
340  }
341  return cluster_stats[2].get_total ();
342 }
inT32 get_total() const
Definition: statistc.h:86
EXTERN double textord_words_definite_spread
Definition: tovars.cpp:76
float kern_size
Definition: blobbox.h:662
EXTERN double textord_wordstats_smooth_factor
Definition: tovars.cpp:39
void print_summary() const
Definition: statistc.cpp:564
bool joined_to_prev() const
Definition: blobbox.h:241
inT32 min_space
Definition: blobbox.h:659
#define tprintf(...)
Definition: tprintf.h:31
Definition: statistc.h:33
unsigned char BOOL8
Definition: host.h:113
float intercept() const
Definition: blobbox.h:584
inT16 right() const
Definition: rect.h:75
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:595
float xheight
Definition: blobbox.h:784
double ile(double frac) const
Definition: statistc.cpp:177
EXTERN double textord_spacesize_ratioprop
Definition: tovars.cpp:80
inT16 left() const
Definition: rect.h:68
int textord_test_x
Definition: makerow.cpp:62
EXTERN double textord_words_initial_upper
Definition: tovars.cpp:55
EXTERN double textord_words_min_minspace
Definition: tovars.cpp:49
#define MAX_INT32
Definition: host.h:120
integer coordinate
Definition: points.h:30
inT32 max_nonspace
Definition: blobbox.h:660
EXTERN bool textord_show_initial_words
Definition: tovars.cpp:25
float space_size
Definition: blobbox.h:663
inT16 width() const
Definition: rect.h:111
inT32 min_space
Definition: blobbox.h:788
#define FALSE
Definition: capi.h:29
Definition: rect.h:30
#define TRUE
Definition: capi.h:28
bool contains(const FCOORD pt) const
Definition: rect.h:323
const TBOX & bounding_box() const
Definition: blobbox.h:215
float xheight
Definition: blobbox.h:653
EXTERN double textord_words_width_ile
Definition: tovars.cpp:43
inT32 max_nonspace
Definition: blobbox.h:789
int textord_test_y
Definition: makerow.cpp:63
EXTERN double textord_words_initial_lower
Definition: tovars.cpp:53
inT32 space_threshold
Definition: blobbox.h:661
int inT32
Definition: host.h:102
inT32 row_words2 ( TO_BLOCK *  block,
TO_ROW *  row,
inT32  maxwidth,
FCOORD  rotation,
BOOL8  testing_on 
)

Definition at line 351 of file wordseg.cpp.

357  {
358  BOOL8 testing_row; //contains testpt
359  BOOL8 prev_valid; //if decent size
360  BOOL8 this_valid; //current blob big enough
361  inT32 prev_x; //end of prev blob
362  inT32 min_width; //min interesting width
363  inT32 valid_count; //good gaps
364  inT32 total_count; //total gaps
365  inT32 cluster_count; //no of clusters
366  inT32 prev_count; //previous cluster_count
367  inT32 gap_index; //which cluster
368  inT32 smooth_factor; //for smoothing stats
369  BLOBNBOX *blob; //current blob
370  float lower, upper; //clustering parameters
371  ICOORD testpt;
372  TBOX blob_box; //bounding box
373  //iterator
374  BLOBNBOX_IT blob_it = row->blob_list ();
375  STATS gap_stats (0, maxwidth);
376  //gap sizes
377  float gaps[BLOCK_STATS_CLUSTERS];
378  STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];
379  //clusters
380 
382  smooth_factor =
383  (inT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);
384  // if (testing_on)
385  // tprintf("Row smooth factor=%d\n",smooth_factor);
386  prev_valid = FALSE;
387  prev_x = -MAX_INT16;
388  testing_row = FALSE;
389  //min blob size
390  min_width = (inT32) block->pr_space;
391  total_count = 0;
392  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
393  blob = blob_it.data ();
394  if (!blob->joined_to_prev ()) {
395  blob_box = blob->bounding_box ();
396  this_valid = blob_box.width () >= min_width;
397  if (this_valid && prev_valid
398  && blob_box.left () - prev_x < maxwidth) {
399  gap_stats.add (blob_box.left () - prev_x, 1);
400  }
401  total_count++; //count possibles
402  prev_x = blob_box.right ();
403  prev_valid = this_valid;
404  }
405  }
406  valid_count = gap_stats.get_total ();
407  if (valid_count < total_count * textord_words_minlarge) {
408  gap_stats.clear ();
409  prev_x = -MAX_INT16;
410  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
411  blob_it.forward ()) {
412  blob = blob_it.data ();
413  if (!blob->joined_to_prev ()) {
414  blob_box = blob->bounding_box ();
415  if (blob_box.left () - prev_x < maxwidth) {
416  gap_stats.add (blob_box.left () - prev_x, 1);
417  }
418  prev_x = blob_box.right ();
419  }
420  }
421  }
422  if (gap_stats.get_total () == 0) {
423  row->min_space = 0; //no evidence
424  row->max_nonspace = 0;
425  return 0;
426  }
427 
428  cluster_count = 0;
429  lower = block->xheight * words_initial_lower;
430  upper = block->xheight * words_initial_upper;
431  gap_stats.smooth (smooth_factor);
432  do {
433  prev_count = cluster_count;
434  cluster_count = gap_stats.cluster (lower, upper,
435  textord_spacesize_ratioprop,
436  BLOCK_STATS_CLUSTERS, cluster_stats);
437  }
438  while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);
439  if (cluster_count < 1) {
440  row->min_space = 0;
441  row->max_nonspace = 0;
442  return 0;
443  }
444  for (gap_index = 0; gap_index < cluster_count; gap_index++)
445  gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);
446  //get medians
447  if (testing_on) {
448  tprintf ("cluster_count=%d:", cluster_count);
449  for (gap_index = 0; gap_index < cluster_count; gap_index++)
450  tprintf (" %g(%d)", gaps[gap_index],
451  cluster_stats[gap_index + 1].get_total ());
452  tprintf ("\n");
453  }
454 
455  //Try to find proportional non-space and space for row.
456  for (gap_index = 0; gap_index < cluster_count
457  && gaps[gap_index] > block->max_nonspace; gap_index++);
458  if (gap_index < cluster_count)
459  lower = gaps[gap_index]; //most frequent below
460  else {
461  if (testing_on)
462  tprintf ("No cluster below block threshold!, using default=%g\n",
463  block->pr_nonsp);
464  lower = block->pr_nonsp;
465  }
466  for (gap_index = 0; gap_index < cluster_count
467  && gaps[gap_index] <= block->max_nonspace; gap_index++);
468  if (gap_index < cluster_count)
469  upper = gaps[gap_index]; //most frequent above
470  else {
471  if (testing_on)
472  tprintf ("No cluster above block threshold!, using default=%g\n",
473  block->pr_space);
474  upper = block->pr_space;
475  }
476  row->min_space =
477  (inT32) ceil (upper - (upper - lower) * textord_words_definite_spread);
478  row->max_nonspace =
479  (inT32) floor (lower + (upper - lower) * textord_words_definite_spread);
480  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
481  row->space_size = upper;
482  row->kern_size = lower;
483  if (testing_on) {
484  if (testing_row) {
485  tprintf ("GAP STATS\n");
486  gap_stats.print();
487  tprintf ("SPACE stats\n");
488  cluster_stats[2].print_summary();
489  tprintf ("NONSPACE stats\n");
490  cluster_stats[1].print_summary();
491  }
492  tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",
493  row->intercept (), row->min_space, upper,
494  row->max_nonspace, lower);
495  }
496  return 1;
497 }
EXTERN double textord_words_definite_spread
Definition: tovars.cpp:76
float kern_size
Definition: blobbox.h:662
EXTERN double words_initial_upper
Definition: tovars.cpp:71
EXTERN double textord_wordstats_smooth_factor
Definition: tovars.cpp:39
void print_summary() const
Definition: statistc.cpp:564
bool joined_to_prev() const
Definition: blobbox.h:241
inT32 min_space
Definition: blobbox.h:659
#define tprintf(...)
Definition: tprintf.h:31
Definition: statistc.h:33
unsigned char BOOL8
Definition: host.h:113
float intercept() const
Definition: blobbox.h:584
inT16 right() const
Definition: rect.h:75
BLOBNBOX_LIST * blob_list()
Definition: blobbox.h:595
float xheight
Definition: blobbox.h:784
EXTERN double textord_spacesize_ratioprop
Definition: tovars.cpp:80
inT16 left() const
Definition: rect.h:68
int textord_test_x
Definition: makerow.cpp:62
float pr_nonsp
Definition: blobbox.h:793
#define BLOCK_STATS_CLUSTERS
Definition: wordseg.cpp:50
EXTERN double textord_words_minlarge
Definition: tovars.cpp:57
integer coordinate
Definition: points.h:30
inT32 max_nonspace
Definition: blobbox.h:660
float space_size
Definition: blobbox.h:663
inT16 width() const
Definition: rect.h:111
#define FALSE
Definition: capi.h:29
Definition: rect.h:30
#define MAX_INT16
Definition: host.h:119
float pr_space
Definition: blobbox.h:792
const TBOX & bounding_box() const
Definition: blobbox.h:215
EXTERN double words_initial_lower
Definition: tovars.cpp:70
inT32 max_nonspace
Definition: blobbox.h:789
int textord_test_y
Definition: makerow.cpp:63
inT32 space_threshold
Definition: blobbox.h:661
int inT32
Definition: host.h:102
void set_row_spaces ( TO_BLOCK *  block,
FCOORD  rotation,
BOOL8  testing_on 
)

Definition at line 134 of file wordseg.cpp.

138  {
139  TO_ROW *row; //current row
140  TO_ROW_IT row_it = block->get_rows ();
141 
142  if (row_it.empty ())
143  return; //empty block
144  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
145  row = row_it.data ();
146  if (row->fixed_pitch == 0) {
147  row->min_space =
148  (inT32) ceil (row->pr_space -
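149  (row->pr_space -
150  row->pr_nonsp) * textord_words_definite_spread);
151  row->max_nonspace =
152  (inT32) floor (row->pr_nonsp +
153  (row->pr_space -
154  row->pr_nonsp) * textord_words_definite_spread);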
155  if (testing_on && textord_show_initial_words) {
156  tprintf ("Assigning defaults %d non, %d space to row at %g\n",
157  row->max_nonspace, row->min_space, row->intercept ());
158  }
159  row->space_threshold = (row->max_nonspace + row->min_space) / 2;
160  row->space_size = row->pr_space;
161  row->kern_size = row->pr_nonsp;
162  }
163 #ifndef GRAPHICS_DISABLED
164  if (textord_show_initial_words && testing_on) {
166  }
167 #endif
168  }
169 }
EXTERN double textord_words_definite_spread
Definition: tovars.cpp:76
float kern_size
Definition: blobbox.h:662
inT32 min_space
Definition: blobbox.h:659
#define tprintf(...)
Definition: tprintf.h:31
float intercept() const
Definition: blobbox.h:584
float fixed_pitch
Definition: blobbox.h:647
EXTERN ScrollView * to_win
Definition: drawtord.cpp:38
float pr_nonsp
Definition: blobbox.h:651
float pr_space
Definition: blobbox.h:650
inT32 max_nonspace
Definition: blobbox.h:660
EXTERN bool textord_show_initial_words
Definition: tovars.cpp:25
TO_ROW_LIST * get_rows()
Definition: blobbox.h:700
float space_size
Definition: blobbox.h:663
void plot_word_decisions(ScrollView *win, inT16 pitch, TO_ROW *row)
Definition: drawtord.cpp:250
inT32 space_threshold
Definition: blobbox.h:661
short inT16
Definition: host.h:100
int inT32
Definition: host.h:102

Variable Documentation

EXTERN bool textord_chopper_test = FALSE

"Chopper is being tested."

Definition at line 47 of file wordseg.cpp.

EXTERN bool textord_force_make_prop_words = FALSE

"Force proportional word segmentation on all rows"

Definition at line 45 of file wordseg.cpp.

EXTERN bool textord_fp_chopping = TRUE

"Do fixed pitch chopping"

Definition at line 43 of file wordseg.cpp.