All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tordmain.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tordmain.cpp (Formerly textordp.c)
3  * Description: C++ top level textord code.
4  * Author: Ray Smith
5  * Created: Tue Jul 28 17:12:33 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config_auto.h"
22 #endif
23 
24 #ifdef __UNIX__
25 #include <assert.h>
26 #endif
27 #include "stderr.h"
28 #include "globaloc.h"
29 #include "blread.h"
30 #include "blobbox.h"
31 #include "ccstruct.h"
32 #include "edgblob.h"
33 #include "drawtord.h"
34 #include "makerow.h"
35 #include "wordseg.h"
36 #include "textord.h"
37 #include "tordmain.h"
38 
39 #include "allheaders.h"
40 
41 // Gridsize for word grid when reassigning diacritics to words. Not critical.
42 const int kWordGridSize = 50;
43 
44 #undef EXTERN
45 #define EXTERN
46 
47 #define MAX_NEAREST_DIST 600 //for block skew stats
48 
49 namespace tesseract {
50 
51 CLISTIZE(WordWithBox)
52 
53 /**********************************************************************
54  * SetBlobStrokeWidth
55  *
56  * Set the horizontal and vertical stroke widths in the blob.
57  **********************************************************************/
58 void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) {
59  // Cut the blob rectangle into a Pix.
60  int pix_height = pixGetHeight(pix);
61  const TBOX& box = blob->bounding_box();
62  int width = box.width();
63  int height = box.height();
64  Box* blob_pix_box = boxCreate(box.left(), pix_height - box.top(),
65  width, height);
66  Pix* pix_blob = pixClipRectangle(pix, blob_pix_box, NULL);
67  boxDestroy(&blob_pix_box);
68  Pix* dist_pix = pixDistanceFunction(pix_blob, 4, 8, L_BOUNDARY_BG);
69  pixDestroy(&pix_blob);
70  // Compute the stroke widths.
71  uinT32* data = pixGetData(dist_pix);
72  int wpl = pixGetWpl(dist_pix);
73  // Horizontal width of stroke.
74  STATS h_stats(0, width + 1);
75  for (int y = 0; y < height; ++y) {
76  uinT32* pixels = data + y*wpl;
77  int prev_pixel = 0;
78  int pixel = GET_DATA_BYTE(pixels, 0);
79  for (int x = 1; x < width; ++x) {
80  int next_pixel = GET_DATA_BYTE(pixels, x);
81  // We are looking for a pixel that is equal to its vertical neighbours,
82  // yet greater than its left neighbour.
83  if (prev_pixel < pixel &&
84  (y == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
85  (y == height - 1 || pixel == GET_DATA_BYTE(pixels + wpl, x - 1))) {
86  if (pixel > next_pixel) {
87  // Single local max, so an odd width.
88  h_stats.add(pixel * 2 - 1, 1);
89  } else if (pixel == next_pixel && x + 1 < width &&
90  pixel > GET_DATA_BYTE(pixels, x + 1)) {
91  // Double local max, so an even width.
92  h_stats.add(pixel * 2, 1);
93  }
94  }
95  prev_pixel = pixel;
96  pixel = next_pixel;
97  }
98  }
99  // Vertical width of stroke.
100  STATS v_stats(0, height + 1);
101  for (int x = 0; x < width; ++x) {
102  int prev_pixel = 0;
103  int pixel = GET_DATA_BYTE(data, x);
104  for (int y = 1; y < height; ++y) {
105  uinT32* pixels = data + y*wpl;
106  int next_pixel = GET_DATA_BYTE(pixels, x);
107  // We are looking for a pixel that is equal to its horizontal neighbours,
108  // yet greater than its upper neighbour.
109  if (prev_pixel < pixel &&
110  (x == 0 || pixel == GET_DATA_BYTE(pixels - wpl, x - 1)) &&
111  (x == width - 1 || pixel == GET_DATA_BYTE(pixels - wpl, x + 1))) {
112  if (pixel > next_pixel) {
113  // Single local max, so an odd width.
114  v_stats.add(pixel * 2 - 1, 1);
115  } else if (pixel == next_pixel && y + 1 < height &&
116  pixel > GET_DATA_BYTE(pixels + wpl, x)) {
117  // Double local max, so an even width.
118  v_stats.add(pixel * 2, 1);
119  }
120  }
121  prev_pixel = pixel;
122  pixel = next_pixel;
123  }
124  }
125  pixDestroy(&dist_pix);
126  // Store the horizontal and vertical width in the blob, keeping both
127  // widths if there is enough information, otherwse only the one with
128  // the most samples.
129  // If there are insufficent samples, store zero, rather than using
130  // 2*area/perimeter, as the numbers that gives do not match the numbers
131  // from the distance method.
132  if (h_stats.get_total() >= (width + height) / 4) {
133  blob->set_horz_stroke_width(h_stats.ile(0.5f));
134  if (v_stats.get_total() >= (width + height) / 4)
135  blob->set_vert_stroke_width(v_stats.ile(0.5f));
136  else
137  blob->set_vert_stroke_width(0.0f);
138  } else {
139  if (v_stats.get_total() >= (width + height) / 4 ||
140  v_stats.get_total() > h_stats.get_total()) {
141  blob->set_horz_stroke_width(0.0f);
142  blob->set_vert_stroke_width(v_stats.ile(0.5f));
143  } else {
144  blob->set_horz_stroke_width(h_stats.get_total() > 2 ? h_stats.ile(0.5f)
145  : 0.0f);
146  blob->set_vert_stroke_width(0.0f);
147  }
148  }
149 }
150 
151 /**********************************************************************
152  * assign_blobs_to_blocks2
153  *
154  * Make a list of TO_BLOCKs for portrait and landscape orientation.
 *
 * For every BLOCK in blocks a new TO_BLOCK is appended to port_blocks.
 * The blobs on the block's blob_list() are extracted, wrapped in
 * BLOBNBOXes with their stroke widths set, and put on the TO_BLOCK's
 * blobs list. The blobs on reject_blobs() are treated the same way but
 * go to the TO_BLOCK's noise_blobs list.
155  **********************************************************************/
156 
// NOTE(review): listing line 157, which opens the function signature, is
// missing from this extract. The body below also uses an argument named pix.
158  BLOCK_LIST *blocks, // blocks to process
159  TO_BLOCK_LIST *port_blocks) { // output list
160  BLOCK *block; // current block
161  BLOBNBOX *newblob; // created blob
162  C_BLOB *blob; // current blob
163  BLOCK_IT block_it = blocks;
164  C_BLOB_IT blob_it; // iterator
165  BLOBNBOX_IT port_box_it; // iterator
166  // destination iterator
167  TO_BLOCK_IT port_block_it = port_blocks;
168  TO_BLOCK *port_block; // created block
169 
170  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
171  block = block_it.data();
172  port_block = new TO_BLOCK(block);
173 
174  // Move the blobs from block->blob_list() into port_block->blobs.
175  port_box_it.set_to_list(&port_block->blobs);
176  blob_it.set_to_list(block->blob_list());
177  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
 // extract() takes the blob off the source list as it is converted.
178  blob = blob_it.extract();
179  newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
180  SetBlobStrokeWidth(pix, newblob);
181  port_box_it.add_after_then_move(newblob);
182  }
183 
184  // Put the rejected outlines in block->noise_blobs, which allows them to
185  // be reconsidered and sorted back into rows and recover outlines mistakenly
186  // rejected.
187  port_box_it.set_to_list(&port_block->noise_blobs);
188  blob_it.set_to_list(block->reject_blobs());
189  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
190  blob = blob_it.extract();
191  newblob = new BLOBNBOX(blob); // Convert blob to BLOBNBOX.
192  SetBlobStrokeWidth(pix, newblob);
193  port_box_it.add_after_then_move(newblob);
194  }
195 
 // The output list receives the new TO_BLOCK pointer.
196  port_block_it.add_after_then_move(port_block);
197  }
198 }
199 
200 /**********************************************************************
201  * find_components
202  *
203  * Find the C_OUTLINEs of the connected components in each block, put them
204  * in C_BLOBs, and filter them by size, putting the different size
205  * grades on different lists in the matching TO_BLOCK in to_blocks.
 *
 * pix:       image to extract the components from.
 * blocks:    blocks to process; edges are only extracted for blocks that
 *            have no poly_block() or whose poly_block() is text.
 * to_blocks: output list, filled by assign_blobs_to_blocks2 and then
 *            graded by filter_blobs.
 *
 * Returns without touching either list if the image is wider or taller
 * than MAX_INT16.
206  **********************************************************************/
207 
208 void Textord::find_components(Pix* pix, BLOCK_LIST *blocks,
209  TO_BLOCK_LIST *to_blocks) {
210  int width = pixGetWidth(pix);
211  int height = pixGetHeight(pix);
212  if (width > MAX_INT16 || height > MAX_INT16) {
213  tprintf("Input image too large! (%d, %d)\n", width, height);
214  return; // Can't handle it.
215  }
216 
// NOTE(review): listing line 217 is missing from this extract.
218 
219  BLOCK_IT block_it(blocks); // iterator
220  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
221  block_it.forward()) {
222  BLOCK* block = block_it.data();
 // Blocks with a non-text poly_block are skipped here.
223  if (block->poly_block() == NULL || block->poly_block()->IsText()) {
224  extract_edges(pix, block);
225  }
226  }
227 
228  assign_blobs_to_blocks2(pix, blocks, to_blocks);
 // The image size is passed on as the page's top-right corner.
229  ICOORD page_tr(width, height);
230  filter_blobs(page_tr, to_blocks, !textord_test_landscape);
231 }
232 
233 /**********************************************************************
234  * filter_blobs
235  *
236  * Sort the blobs into sizes in all the blocks for later work.
 *
 * page_tr:    top right of the page, used only to create the debug
 *             window when one is needed.
 * blocks:     TO_BLOCKs whose blobs, noise_blobs, small_blobs and
 *             large_blobs lists are regraded by filter_noise_blobs; the
 *             value it returns becomes the block's line_size.
 * testing_on: enables the optional plotting.
237  **********************************************************************/
238 
239 void Textord::filter_blobs(ICOORD page_tr, // top right
240  TO_BLOCK_LIST *blocks, // output list
241  BOOL8 testing_on) { // for plotting
242  TO_BLOCK_IT block_it = blocks; // destination iterator
243  TO_BLOCK *block; // created block
244 
245  #ifndef GRAPHICS_DISABLED
 // Start from an empty debug window if one already exists.
246  if (to_win != NULL)
247  to_win->Clear();
248  #endif // GRAPHICS_DISABLED
249 
250  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
251  block_it.forward()) {
252  block = block_it.data();
253  block->line_size = filter_noise_blobs(&block->blobs,
254  &block->noise_blobs,
255  &block->small_blobs,
256  &block->large_blobs);
257  block->line_spacing = block->line_size *
// NOTE(review): listing lines 258-263, which finish the expression above,
// are missing from this extract.
264 
265  #ifndef GRAPHICS_DISABLED
266  if (textord_show_blobs && testing_on) {
 // The window is created on first use.
267  if (to_win == NULL)
268  create_to_win(page_tr);
269  block->plot_graded_blobs(to_win);
270  }
271  if (textord_show_boxes && testing_on) {
272  if (to_win == NULL)
273  create_to_win(page_tr);
// NOTE(review): listing lines 274-277 are missing from this extract.
278  }
279  #endif // GRAPHICS_DISABLED
280  }
281 }
282 
283 /**********************************************************************
284  * filter_noise_blobs
285  *
286  * Move small blobs to a separate list.
 *
 * Regrades the blobs of src_list into noise_list, small_list and
 * large_list by size, leaving the remaining blobs on src_list.
 * Returns the initial size estimate (the textord_initialx_ile
 * percentile of the blob heights), raised to the textord_initialasc_ile
 * percentile of the surviving blob heights if that is larger.
287  **********************************************************************/
288 
289 float Textord::filter_noise_blobs(
290  BLOBNBOX_LIST *src_list, // original list
291  BLOBNBOX_LIST *noise_list, // noise list
292  BLOBNBOX_LIST *small_list, // small blobs
293  BLOBNBOX_LIST *large_list) { // large blobs
294  inT16 height; //height of blob
295  inT16 width; //of blob
296  BLOBNBOX *blob; //current blob
297  float initial_x; //first guess
298  BLOBNBOX_IT src_it = src_list; //iterators
299  BLOBNBOX_IT noise_it = noise_list;
300  BLOBNBOX_IT small_it = small_list;
301  BLOBNBOX_IT large_it = large_list;
302  STATS size_stats (0, MAX_NEAREST_DIST);
303  //blob heights
304  float min_y; //size limits
305  float max_y;
306  float max_x;
307  float max_height; //of good blobs
308 
 // First pass: move blobs off the source list onto the noise or small list.
 // NOTE(review): listing lines 311 and 314, which hold the tests for the
 // two branches below, are missing from this extract.
309  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
310  blob = src_it.data();
312  noise_it.add_after_then_move(src_it.extract());
313  else if (blob->enclosed_area() >= blob->bounding_box().height()
315  small_it.add_after_then_move(src_it.extract());
316  }
 // Histogram the heights of what is left to get the first size estimate.
317  for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
318  size_stats.add(src_it.data()->bounding_box().height(), 1);
319  }
320  initial_x = size_stats.ile(textord_initialx_ile);
321  max_y = ceil(initial_x *
// NOTE(review): listing lines 322-325, which finish the expression above,
// are missing from this extract.
326  min_y = floor (initial_x / 2);
327  max_x = ceil (initial_x * textord_width_limit);
 // Regrade the small list: too-tall blobs go to the large list and blobs of
 // at least min_y go back onto the source list.
328  small_it.move_to_first ();
329  for (small_it.mark_cycle_pt (); !small_it.cycled_list ();
330  small_it.forward ()) {
331  height = small_it.data()->bounding_box().height();
332  if (height > max_y)
333  large_it.add_after_then_move(small_it.extract ());
334  else if (height >= min_y)
335  src_it.add_after_then_move(small_it.extract ());
336  }
 // Final pass over the source list: move out blobs that are too short, too
 // tall or too wide, and histogram the heights of the rest.
337  size_stats.clear ();
338  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
339  height = src_it.data ()->bounding_box ().height ();
340  width = src_it.data ()->bounding_box ().width ();
341  if (height < min_y)
342  small_it.add_after_then_move (src_it.extract ());
343  else if (height > max_y || width > max_x)
344  large_it.add_after_then_move (src_it.extract ());
345  else
346  size_stats.add (height, 1);
347  }
348  max_height = size_stats.ile (textord_initialasc_ile);
349  // tprintf("max_y=%g, min_y=%g, initial_x=%g, max_height=%g,",
350  // max_y,min_y,initial_x,max_height);
// NOTE(review): listing line 351 is missing from this extract.
 // Never return less than the height percentile of the surviving blobs.
352  if (max_height > initial_x)
353  initial_x = max_height;
354  // tprintf(" ret=%g\n",initial_x);
355  return initial_x;
356 }
357 
358 // Fixes the block so it obeys all the rules:
359 // Must have at least one ROW.
360 // Must have at least one WERD.
361 // WERDs contain a fake blob.
362 void Textord::cleanup_nontext_block(BLOCK* block) {
363  // Non-text blocks must contain at least one row.
364  ROW_IT row_it(block->row_list());
365  if (row_it.empty()) {
366  TBOX box = block->bounding_box();
367  float height = box.height();
368  inT32 xstarts[2] = {box.left(), box.right()};
369  double coeffs[3] = {0.0, 0.0, static_cast<double>(box.bottom())};
370  ROW* row = new ROW(1, xstarts, coeffs, height / 2.0f, height / 4.0f,
371  height / 4.0f, 0, 1);
372  row_it.add_after_then_move(row);
373  }
374  // Each row must contain at least one word.
375  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
376  ROW* row = row_it.data();
377  WERD_IT w_it(row->word_list());
378  if (w_it.empty()) {
379  // Make a fake blob to put in the word.
380  TBOX box = block->row_list()->singleton() ? block->bounding_box()
381  : row->bounding_box();
382  C_BLOB* blob = C_BLOB::FakeBlob(box);
383  C_BLOB_LIST blobs;
384  C_BLOB_IT blob_it(&blobs);
385  blob_it.add_after_then_move(blob);
386  WERD* word = new WERD(&blobs, 0, NULL);
387  w_it.add_after_then_move(word);
388  }
389  // Each word must contain a fake blob.
390  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
391  WERD* word = w_it.data();
392  // Just assert that this is true, as it would be useful to find
393  // out why it isn't.
394  ASSERT_HOST(!word->cblob_list()->empty());
395  }
396  row->recalc_bounding_box();
397  }
398 }
399 
400 /**********************************************************************
401  * cleanup_blocks
402  *
403  * Delete empty blocks, rows from the page.
 *
 * clean_noise: when true, each row of a text block is cleaned of noise,
 *              and rows that end up empty or are judged to be noise by
 *              clean_noise_from_row are deleted.
 * blocks:      list to clean. Non-text blocks are handed to
 *              cleanup_nontext_block and are never deleted here; text
 *              blocks left with no rows are deleted.
404  **********************************************************************/
405 
406 void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) {
407  BLOCK_IT block_it = blocks; //iterator
408  ROW_IT row_it; //row iterator
409 
410  int num_rows = 0;
411  int num_rows_all = 0;
412  int num_blocks = 0;
413  int num_blocks_all = 0;
414  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
415  block_it.forward()) {
416  BLOCK* block = block_it.data();
417  if (block->poly_block() != NULL && !block->poly_block()->IsText()) {
418  cleanup_nontext_block(block);
419  continue;
420  }
 // The row counters are per block; the block counters span the whole page.
421  num_rows = 0;
422  num_rows_all = 0;
423  if (clean_noise) {
424  row_it.set_to_list(block->row_list());
425  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
426  ROW* row = row_it.data();
427  ++num_rows_all;
428  clean_small_noise_from_words(row);
 // clean_noise_from_row is only consulted for rows that still have words
 // and only when textord_noise_rejrows is set.
429  if ((textord_noise_rejrows && !row->word_list()->empty() &&
430  clean_noise_from_row(row)) ||
431  row->word_list()->empty()) {
432  delete row_it.extract(); // lose empty row.
433  } else {
// NOTE(review): listing line 434 is missing from this extract.
435  clean_noise_from_words(row_it.data());
436  if (textord_blshift_maxshift >= 0)
// NOTE(review): listing lines 437-438, the body of the if above, are
// missing from this extract.
439  ++num_rows;
440  }
441  }
442  }
443  if (block->row_list()->empty()) {
444  delete block_it.extract(); // Lose empty text blocks.
445  } else {
446  ++num_blocks;
447  }
448  ++num_blocks_all;
// NOTE(review): listing line 449 is missing from this extract.
450  tprintf("cleanup_blocks: # rows = %d / %d\n", num_rows, num_rows_all);
451  }
// NOTE(review): listing line 452 is missing from this extract.
453  tprintf("cleanup_blocks: # blocks = %d / %d\n", num_blocks, num_blocks_all);
454 }
455 
456 
457 /**********************************************************************
458  * clean_noise_from_row
459  *
460  * Decide whether a row looks like a row of garbage.
 *
 * Nothing is moved or deleted in the code visible here. It counts, over
 * all blobs of all words in the row:
 *   dot_count        - outlines smaller than textord_noise_sizelimit
 *                      x-heights, plus 2 for each over-tall blob that is
 *                      not the first blob of the first word;
 *   norm_count       - mid-sized blobs with fewer than
 *                      textord_noise_translimit transitions;
 *   super_norm_count - outlines with children whose width and height are
 *                      close to the x-height, plus every blob of a
 *                      W_DONT_CHOP word.
 * Returns true when super_norm_count is below textord_noise_sncount and
 * dot_count exceeds both 2 and norm_count * textord_noise_rowratio.
461  **********************************************************************/
462 
463 BOOL8 Textord::clean_noise_from_row( //remove empties
464  ROW *row //row to clean
465  ) {
466  BOOL8 testing_on;
467  TBOX blob_box; //bounding box
468  C_BLOB *blob; //current blob
469  C_OUTLINE *outline; //current outline
470  WERD *word; //current word
471  inT32 blob_size; //biggest size
472  inT32 trans_count = 0; //no of transitions
473  inT32 trans_threshold; //noise tolerance
474  inT32 dot_count; //small objects
475  inT32 norm_count; //normal objects
476  inT32 super_norm_count; //real char-like
477  //words of row
478  WERD_IT word_it = row->word_list ();
479  C_BLOB_IT blob_it; //blob iterator
480  C_OUTLINE_IT out_it; //outline iterator
481 
// NOTE(review): listing lines 482-483, which open the condition continued
// below, are missing from this extract.
484  && textord_test_y < row->base_line (textord_test_x) + row->x_height ())
485  testing_on = TRUE;
486  else
487  testing_on = FALSE;
488  dot_count = 0;
489  norm_count = 0;
490  super_norm_count = 0;
491  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
492  word = word_it.data (); //current word
493  //blobs in word
494  blob_it.set_to_list (word->cblob_list ());
495  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
496  blob_it.forward ()) {
497  blob = blob_it.data ();
498  if (!word->flag (W_DONT_CHOP)) {
499  //get outlines
500  out_it.set_to_list (blob->out_list ());
501  for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
502  out_it.forward ()) {
503  outline = out_it.data ();
504  blob_box = outline->bounding_box ();
 // blob_size is the larger of the outline's width and height.
505  blob_size =
506  blob_box.width () >
507  blob_box.height ()? blob_box.width () : blob_box.
508  height();
509  if (blob_size < textord_noise_sizelimit * row->x_height ())
510  dot_count++; //count small outlines
 // An outline with children and both dimensions near the x-height counts
 // as char-like.
511  if (!outline->child ()->empty ()
512  && blob_box.height () <
513  (1 + textord_noise_syfract) * row->x_height ()
514  && blob_box.height () >
515  (1 - textord_noise_syfract) * row->x_height ()
516  && blob_box.width () <
517  (1 + textord_noise_sxfract) * row->x_height ()
518  && blob_box.width () >
519  (1 - textord_noise_sxfract) * row->x_height ())
520  super_norm_count++; //count char-like outlines
521  }
522  }
523  else
524  super_norm_count++;
 // From here on the whole blob is measured, not its individual outlines.
525  blob_box = blob->bounding_box ();
526  blob_size =
527  blob_box.width () >
528  blob_box.height ()? blob_box.width () : blob_box.height ();
529  if (blob_size >= textord_noise_sizelimit * row->x_height ()
530  && blob_size < row->x_height () * 2) {
531  trans_threshold = blob_size / textord_noise_sizefraction;
532  trans_count = blob->count_transitions (trans_threshold);
533  if (trans_count < textord_noise_translimit)
534  norm_count++;
535  }
 // An over-tall blob counts double towards dot_count unless it is the first
 // blob of the first word.
536  else if (blob_box.height () > row->x_height () * 2
537  && (!word_it.at_first () || !blob_it.at_first ()))
538  dot_count += 2;
539  if (testing_on) {
 // trans_count is only recomputed for mid-sized blobs, so for other blobs
 // the value printed here is left over from an earlier blob (or 0).
540  tprintf
541  ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n",
542  blob_box.left (), blob_box.bottom (), blob_box.right (),
543  blob_box.top (), blob->out_list ()->length (), trans_count,
544  blob_box.bottom () - row->base_line (blob_box.left ()));
545  }
546  }
547  }
548  if (textord_noise_debug) {
 // NOTE(review): the verdict printed here tests against
 // textord_noise_normratio and ignores super_norm_count, whereas the value
 // returned below uses textord_noise_rowratio and textord_noise_sncount, so
 // the printed REJECTED/ACCEPTED can differ from the result.
549  tprintf ("Row ending at (%d,%g):",
550  blob_box.right (), row->base_line (blob_box.right ()));
551  tprintf (" R=%g, dc=%d, nc=%d, %s\n",
552  norm_count > 0 ? (float) dot_count / norm_count : 9999,
553  dot_count, norm_count,
554  dot_count > norm_count * textord_noise_normratio
555  && dot_count > 2 ? "REJECTED" : "ACCEPTED");
556  }
557  return super_norm_count < textord_noise_sncount
558  && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;
559 }
560 
561 /**********************************************************************
562  * clean_noise_from_words
563  *
564  * Move blobs of words from rows of garbage into the reject blobs list.
 *
 * Does nothing if the row has no words or textord_no_rejects is set.
 * Otherwise each word is graded from its dot_count (small or over-tall
 * items) and norm_count (normal-looking items):
 *   2 - dot_count > 2 and above twice norm_count * textord_noise_normratio;
 *   1 - dot_count > 2 and above norm_count * textord_noise_normratio;
 *   0 - anything else, including every W_REP_CHAR word.
 * A second pass then selects the words graded 2, plus those graded 1 when
 * grade-2 words outnumber the rest.
 * NOTE(review): the statement that acts on a selected word is on listing
 * line 671, which is missing from this extract.
565  **********************************************************************/
566 
567 void Textord::clean_noise_from_words( //remove empties
568  ROW *row //row to clean
569  ) {
570  TBOX blob_box; //bounding box
571  inT8 *word_dud; //was it chucked
572  C_BLOB *blob; //current blob
573  C_OUTLINE *outline; //current outline
574  WERD *word; //current word
575  inT32 blob_size; //biggest size
576  inT32 trans_count; //no of transitions
577  inT32 trans_threshold; //noise tolerance
578  inT32 dot_count; //small objects
579  inT32 norm_count; //normal objects
580  inT32 dud_words; //number discarded
581  inT32 ok_words; //number remaining
582  inT32 word_index; //current word
583  //words of row
584  WERD_IT word_it = row->word_list ();
585  C_BLOB_IT blob_it; //blob iterator
586  C_OUTLINE_IT out_it; //outline iterator
587 
 // ok_words first holds the word count, which sizes the grade array, and
 // is then reset to count the words that are not graded 2.
588  ok_words = word_it.length ();
589  if (ok_words == 0 || textord_no_rejects)
590  return;
591  word_dud = (inT8 *) alloc_mem (ok_words * sizeof (inT8));
592  dud_words = 0;
593  ok_words = 0;
594  word_index = 0;
595  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
596  word = word_it.data (); //current word
597  dot_count = 0;
598  norm_count = 0;
599  //blobs in word
600  blob_it.set_to_list (word->cblob_list ());
601  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
602  blob_it.forward ()) {
603  blob = blob_it.data ();
604  if (!word->flag (W_DONT_CHOP)) {
605  //get outlines
606  out_it.set_to_list (blob->out_list ());
607  for (out_it.mark_cycle_pt (); !out_it.cycled_list ();
608  out_it.forward ()) {
609  outline = out_it.data ();
610  blob_box = outline->bounding_box ();
 // blob_size is the larger of the outline's width and height.
611  blob_size =
612  blob_box.width () >
613  blob_box.height ()? blob_box.width () : blob_box.
614  height();
615  if (blob_size < textord_noise_sizelimit * row->x_height ())
616  dot_count++; //count small outlines
617  if (!outline->child ()->empty ()
618  && blob_box.height () <
619  (1 + textord_noise_syfract) * row->x_height ()
620  && blob_box.height () >
621  (1 - textord_noise_syfract) * row->x_height ()
622  && blob_box.width () <
623  (1 + textord_noise_sxfract) * row->x_height ()
624  && blob_box.width () >
625  (1 - textord_noise_sxfract) * row->x_height ())
626  norm_count++; //count char-like outlines
627  }
628  }
629  else
630  norm_count++;
 // From here on the whole blob is measured, not its individual outlines.
631  blob_box = blob->bounding_box ();
632  blob_size =
633  blob_box.width () >
634  blob_box.height ()? blob_box.width () : blob_box.height ();
635  if (blob_size >= textord_noise_sizelimit * row->x_height ()
636  && blob_size < row->x_height () * 2) {
637  trans_threshold = blob_size / textord_noise_sizefraction;
638  trans_count = blob->count_transitions (trans_threshold);
639  if (trans_count < textord_noise_translimit)
640  norm_count++;
641  }
 // An over-tall blob counts double towards dot_count unless it is the first
 // blob of the first word.
642  else if (blob_box.height () > row->x_height () * 2
643  && (!word_it.at_first () || !blob_it.at_first ()))
644  dot_count += 2;
645  }
 // Grade the word: 2 = definite dud, 1 = possible dud, 0 = keep.
646  if (dot_count > 2 && !word->flag(W_REP_CHAR)) {
647  if (dot_count > norm_count * textord_noise_normratio * 2)
648  word_dud[word_index] = 2;
649  else if (dot_count > norm_count * textord_noise_normratio)
650  word_dud[word_index] = 1;
651  else
652  word_dud[word_index] = 0;
653  } else {
654  word_dud[word_index] = 0;
655  }
656  if (word_dud[word_index] == 2)
657  dud_words++;
658  else
659  ok_words++;
660  word_index++;
661  }
662 
 // Second pass, walking the words in the same order as the grades.
663  word_index = 0;
664  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
665  if (word_dud[word_index] == 2
666  || (word_dud[word_index] == 1 && dud_words > ok_words)) {
667  word = word_it.data(); // Current word.
668  // Previously we threw away the entire word.
669  // Now just aggressively throw all small blobs into the reject list, where
670  // the classifier can decide whether they are actually needed.
// NOTE(review): listing line 671 is missing from this extract.
672  }
673  word_index++;
674  }
675  free_mem(word_dud);
676 }
677 
678 // Remove outlines that are a tiny fraction in either width or height
679 // of the word height.
680 void Textord::clean_small_noise_from_words(ROW *row) {
681  WERD_IT word_it(row->word_list());
682  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
683  WERD* word = word_it.data();
684  int min_size = static_cast<int>(
685  textord_noise_hfract * word->bounding_box().height() + 0.5);
686  C_BLOB_IT blob_it(word->cblob_list());
687  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
688  C_BLOB* blob = blob_it.data();
689  C_OUTLINE_IT out_it(blob->out_list());
690  for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
691  C_OUTLINE* outline = out_it.data();
692  outline->RemoveSmallRecursive(min_size, &out_it);
693  }
694  if (blob->out_list()->empty()) {
695  delete blob_it.extract();
696  }
697  }
698  if (word->cblob_list()->empty()) {
699  if (!word_it.at_last()) {
700  // The next word is no longer a fuzzy non space if it was before,
701  // since the word before is about to be deleted.
702  WERD* next_word = word_it.data_relative(1);
703  if (next_word->flag(W_FUZZY_NON)) {
704  next_word->set_flag(W_FUZZY_NON, false);
705  }
706  }
707  delete word_it.extract();
708  }
709  }
710 }
711 
712 // Local struct to hold a group of blocks.
// The default constructor makes an empty group with rotation (1, 0), angle 0
// and min_xheight 1. The BLOCK* constructor makes a group holding just that
// block, taking the bounding box, rotation, angle and x-height from it.
// NOTE(review): the member declarations that belong under the comments at
// listing lines 722, 724 and 730 (used by the constructors as bounding_box,
// rotation and blocks) are on listing lines 723, 725 and 731, which are
// missing from this extract.
713 struct BlockGroup {
714  BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {}
715  explicit BlockGroup(BLOCK* block)
716  : bounding_box(block->bounding_box()),
717  rotation(block->re_rotation()),
718  angle(block->re_rotation().angle()),
719  min_xheight(block->x_height()) {
720  blocks.push_back(block);
721  }
722  // Union of block bounding boxes.
724  // Common rotation of the blocks.
726  // Angle of rotation.
727  float angle;
728  // Min xheight of the blocks.
729  float min_xheight;
730  // Collection of borrowed pointers to the blocks in the group.
732 };
733 
734 // Groups blocks by rotation, then, for each group, makes a WordGrid and calls
735 // TransferDiacriticsToWords to copy the diacritic blobs to the most
736 // appropriate words in the group of blocks. Source blobs are not touched.
// Blocks with a non-text poly_block are left out of every group.
737 void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
738  BLOCK_LIST* blocks) {
739  // Angle difference larger than this is too much to consider equal.
740  // They should only be in multiples of M_PI/2 anyway.
741  const double kMaxAngleDiff = 0.01; // About 0.6 degrees.
// NOTE(review): listing line 742, which presumably declares the groups
// container used below, is missing from this extract.
743  BLOCK_IT bk_it(blocks);
744  for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) {
745  BLOCK* block = bk_it.data();
746  if (block->poly_block() != NULL && !block->poly_block()->IsText()) {
747  continue;
748  }
749  // Linear search of the groups to find a matching rotation.
750  float block_angle = block->re_rotation().angle();
751  int best_g = 0;
 // Starting from MAX_FLOAT32 means a new group is made when none exist yet.
752  float best_angle_diff = MAX_FLOAT32;
753  for (int g = 0; g < groups.size(); ++g) {
754  double angle_diff = fabs(block_angle - groups[g]->angle);
 // Differences above M_PI are measured the other way round the circle.
755  if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI);
756  if (angle_diff < best_angle_diff) {
757  best_angle_diff = angle_diff;
758  best_g = g;
759  }
760  }
761  if (best_angle_diff > kMaxAngleDiff) {
762  groups.push_back(new BlockGroup(block));
763  } else {
 // Join the closest group, growing its box and lowering its minimum
 // x-height if needed.
764  groups[best_g]->blocks.push_back(block);
765  groups[best_g]->bounding_box += block->bounding_box();
766  float x_height = block->x_height();
767  if (x_height < groups[best_g]->min_xheight)
768  groups[best_g]->min_xheight = x_height;
769  }
770  }
771  // Now process each group of blocks.
 // word_ptrs is declared outside the loop, so the wrappers made for every
 // group stay alive until the function returns.
772  PointerVector<WordWithBox> word_ptrs;
773  for (int g = 0; g < groups.size(); ++g) {
774  const BlockGroup* group = groups[g];
 // The grid is built from the group's min_xheight and bounding box.
775  WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(),
776  group->bounding_box.topright());
777  for (int b = 0; b < group->blocks.size(); ++b) {
778  ROW_IT row_it(group->blocks[b]->row_list());
779  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
780  ROW* row = row_it.data();
781  // Put the words of the row into the grid.
782  WERD_IT w_it(row->word_list());
783  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
784  WERD* word = w_it.data();
785  WordWithBox* box_word = new WordWithBox(word);
786  word_grid.InsertBBox(true, true, box_word);
787  // Save the pointer where it will be auto-deleted.
788  word_ptrs.push_back(box_word);
789  }
790  }
791  }
792  FCOORD rotation = group->rotation;
793  // Make it a forward rotation that will transform blob coords to block.
794  rotation.set_y(-rotation.y());
795  TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid);
796  }
797 }
798 
799 // Places a copy of blobs that are near a word (after applying rotation to the
800 // blob) in the most appropriate word, unless there is doubt, in which case a
801 // blob can end up in two words. Source blobs are not touched.
802 void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs,
803  const FCOORD& rotation,
804  WordGrid* word_grid) {
805  WordSearch ws(word_grid);
806  BLOBNBOX_IT b_it(diacritic_blobs);
807  // Apply rotation to each blob before finding the nearest words. The rotation
808  // allows us to only consider above/below placement and not left/right on
809  // vertical text, because all text is horizontal here.
810  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
811  BLOBNBOX* blobnbox = b_it.data();
812  TBOX blob_box = blobnbox->bounding_box();
813  blob_box.rotate(rotation);
814  ws.StartRectSearch(blob_box);
815  // Above/below refer to word position relative to diacritic. Since some
816  // scripts eg Kannada/Telugu habitually put diacritics below words, and
817  // others eg Thai/Vietnamese/Latin put most diacritics above words, try
818  // for both if there isn't much in it.
819  WordWithBox* best_above_word = NULL;
820  WordWithBox* best_below_word = NULL;
821  int best_above_distance = 0;
822  int best_below_distance = 0;
823  for (WordWithBox* word = ws.NextRectSearch(); word != NULL;
824  word = ws.NextRectSearch()) {
825  if (word->word()->flag(W_REP_CHAR)) continue;
826  TBOX word_box = word->true_bounding_box();
827  int x_distance = blob_box.x_gap(word_box);
828  int y_distance = blob_box.y_gap(word_box);
829  if (x_distance > 0) {
830  // Arbitrarily divide x-distance by 2 if there is a major y overlap,
831  // and the word is to the left of the diacritic. If the
832  // diacritic is a dropped broken character between two words, this will
833  // help send all the pieces to a single word, instead of splitting them
834  // over the 2 words.
835  if (word_box.major_y_overlap(blob_box) &&
836  blob_box.left() > word_box.right()) {
837  x_distance /= 2;
838  }
839  y_distance += x_distance;
840  }
841  if (word_box.y_middle() > blob_box.y_middle() &&
842  (best_above_word == NULL || y_distance < best_above_distance)) {
843  best_above_word = word;
844  best_above_distance = y_distance;
845  }
846  if (word_box.y_middle() <= blob_box.y_middle() &&
847  (best_below_word == NULL || y_distance < best_below_distance)) {
848  best_below_word = word;
849  best_below_distance = y_distance;
850  }
851  }
852  bool above_good =
853  best_above_word != NULL &&
854  (best_below_word == NULL ||
855  best_above_distance < best_below_distance + blob_box.height());
856  bool below_good =
857  best_below_word != NULL && best_below_word != best_above_word &&
858  (best_above_word == NULL ||
859  best_below_distance < best_above_distance + blob_box.height());
860  if (below_good) {
861  C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
862  copied_blob->rotate(rotation);
863  // Put the blob into the word's reject blobs list.
864  C_BLOB_IT blob_it(best_below_word->RejBlobs());
865  blob_it.add_to_end(copied_blob);
866  }
867  if (above_good) {
868  C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
869  copied_blob->rotate(rotation);
870  // Put the blob into the word's reject blobs list.
871  C_BLOB_IT blob_it(best_above_word->RejBlobs());
872  blob_it.add_to_end(copied_blob);
873  }
874  }
875 }
876 
877 } // tesseract
878 
879 /**********************************************************************
880  * tweak_row_baseline
881  *
882  * Shift baseline to fit the blobs more accurately where they are
883  * close enough.
 *
 * Rebuilds row->baseline as a new QSPLINE. A blob whose bottom is within
 * blshift_maxshift x-heights of the current baseline (measured at the
 * blob's centre) and whose height is more than blshift_xfraction
 * x-heights gets a flat segment at its own bottom. Elsewhere the
 * segments of the existing baseline are copied across. Returns without
 * changes if the row has no blobs.
884  **********************************************************************/
885 
// NOTE(review): listing line 886, which opens the function signature, is
// missing from this extract. The body below also uses an argument named row.
887  double blshift_maxshift,
888  double blshift_xfraction) {
889  TBOX blob_box; //bounding box
890  C_BLOB *blob; //current blob
891  WERD *word; //current word
892  inT32 blob_count; //no of blobs
893  inT32 src_index; //source segment
894  inT32 dest_index; //destination segment
895  inT32 *xstarts; //spline segments
896  double *coeffs; //spline coeffs
897  float ydiff; //baseline error
898  float x_centre; //centre of blob
899  //words of row
900  WERD_IT word_it = row->word_list ();
901  C_BLOB_IT blob_it; //blob iterator
902 
903  blob_count = 0;
904  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
905  word = word_it.data (); //current word
906  //get total blobs
907  blob_count += word->cblob_list ()->length ();
908  }
909  if (blob_count == 0)
910  return;
 // Room for one segment per blob plus every existing segment: xstarts needs
 // one more entry than there are segments, coeffs needs 3 per segment.
911  xstarts =
912  (inT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) *
913  sizeof (inT32));
914  coeffs =
915  (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 *
916  sizeof (double));
917 
918  src_index = 0;
919  dest_index = 0;
920  xstarts[0] = row->baseline.xcoords[0];
921  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
922  word = word_it.data (); //current word
923  //blobs in word
924  blob_it.set_to_list (word->cblob_list ());
925  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();
926  blob_it.forward ()) {
927  blob = blob_it.data ();
928  blob_box = blob->bounding_box ();
929  x_centre = (blob_box.left () + blob_box.right ()) / 2.0;
 // ydiff becomes the absolute baseline error as a fraction of the x-height.
930  ydiff = blob_box.bottom () - row->base_line (x_centre);
931  if (ydiff < 0)
932  ydiff = -ydiff / row->x_height ();
933  else
934  ydiff = ydiff / row->x_height ();
935  if (ydiff < blshift_maxshift
936  && blob_box.height () / row->x_height () > blshift_xfraction) {
 // Close enough and tall enough: emit a flat segment (a = b = 0) at the
 // blob's bottom, ending just past its right edge.
937  if (xstarts[dest_index] >= x_centre)
938  xstarts[dest_index] = blob_box.left ();
939  coeffs[dest_index * 3] = 0;
940  coeffs[dest_index * 3 + 1] = 0;
941  coeffs[dest_index * 3 + 2] = blob_box.bottom ();
942  //shift it
943  dest_index++;
944  xstarts[dest_index] = blob_box.right () + 1;
945  }
946  else {
947  if (xstarts[dest_index] <= x_centre) {
 // Copy the source segments that end at or before the blob centre, skipping
 // those that end before the current destination start.
948  while (row->baseline.xcoords[src_index + 1] <= x_centre
949  && src_index < row->baseline.segments - 1) {
950  if (row->baseline.xcoords[src_index + 1] >
951  xstarts[dest_index]) {
952  coeffs[dest_index * 3] =
953  row->baseline.quadratics[src_index].a;
954  coeffs[dest_index * 3 + 1] =
955  row->baseline.quadratics[src_index].b;
956  coeffs[dest_index * 3 + 2] =
957  row->baseline.quadratics[src_index].c;
958  dest_index++;
959  xstarts[dest_index] =
960  row->baseline.xcoords[src_index + 1];
961  }
962  src_index++;
963  }
 // Then copy the source segment that the loop above stopped on.
964  coeffs[dest_index * 3] =
965  row->baseline.quadratics[src_index].a;
966  coeffs[dest_index * 3 + 1] =
967  row->baseline.quadratics[src_index].b;
968  coeffs[dest_index * 3 + 2] =
969  row->baseline.quadratics[src_index].c;
970  dest_index++;
971  xstarts[dest_index] = row->baseline.xcoords[src_index + 1];
972  }
973  }
974  }
975  }
 // Skip source segments that end at or before the current destination start,
 // then copy whatever source segments remain.
976  while (src_index < row->baseline.segments
977  && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index])
978  src_index++;
979  while (src_index < row->baseline.segments) {
980  coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a;
981  coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b;
982  coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c;
983  dest_index++;
984  src_index++;
985  xstarts[dest_index] = row->baseline.xcoords[src_index];
986  }
987  //turn to spline
988  row->baseline = QSPLINE (dest_index, xstarts, coeffs);
989  free_mem(xstarts);
990  free_mem(coeffs);
991 }
C_BLOB_LIST * blob_list()
get blobs
Definition: ocrblock.h:132
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:79
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:334
inT32 get_total() const
Definition: statistc.h:86
int textord_max_noise_size
Definition: textord.h:376
double a
Definition: quadratc.h:58
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:113
bool textord_noise_rejrows
Definition: textord.h:389
ScrollView * create_to_win(ICOORD page_tr)
Definition: drawtord.cpp:47
void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction)
Definition: tordmain.cpp:886
void rotate(const FCOORD &rotation)
Definition: stepblob.cpp:387
inT32 count_transitions(inT32 threshold)
Definition: stepblob.cpp:330
static const double kXHeightCapRatio
Definition: ccstruct.h:37
float c
Definition: quadratc.h:60
void free_mem(void *oldchunk)
Definition: memry.cpp:55
BlockGroup(BLOCK *block)
Definition: tordmain.cpp:715
double textord_noise_syfract
Definition: textord.h:390
int push_back(T *object)
bool textord_noise_debug
Definition: textord.h:397
#define tprintf(...)
Definition: tprintf.h:31
BBGrid< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > WordGrid
Definition: textord.h:65
STATS
Definition: statistc.h:33
#define LOC_EDGE_PROG
Definition: errcode.h:44
GenericVector< BLOCK * > blocks
Definition: tordmain.cpp:731
static const double kDescenderFraction
Definition: ccstruct.h:33
void add(inT32 value, inT32 count)
Definition: statistc.cpp:104
unsigned char BOOL8
Definition: host.h:113
float x_height() const
Definition: ocrrow.h:61
TBOX bounding_box() const
Definition: werd.cpp:160
const int kWordGridSize
Definition: tordmain.cpp:42
double textord_noise_hfract
Definition: textord.h:394
bool IsText() const
Definition: polyblk.h:52
double textord_blshift_maxshift
Definition: textord.h:398
inT16 right() const
Definition: rect.h:75
int textord_noise_sncount
Definition: textord.h:395
double textord_noise_normratio
Definition: textord.h:387
void RemoveSmallRecursive(int min_size, C_OUTLINE_IT *it)
Definition: coutln.cpp:636
#define ASSERT_HOST(x)
Definition: errcode.h:84
ROW
Definition: ocrrow.h:32
float base_line(float xpos) const
Definition: ocrrow.h:56
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:771
float angle() const
find angle
Definition: points.h:249
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
bool textord_noise_rejwords
Definition: textord.h:388
double textord_initialasc_ile
Definition: textord.h:383
void Clear()
Definition: scrollview.cpp:595
double textord_noise_rowratio
Definition: textord.h:396
static const double kAscenderFraction
Definition: ccstruct.h:35
EXTERN ScrollView * to_win
Definition: drawtord.cpp:38
bool textord_no_rejects
Definition: textord.h:373
double ile(double frac) const
Definition: statistc.cpp:177
#define CLISTIZE(CLASSNAME)
Definition: clst.h:958
double textord_initialx_ile
Definition: textord.h:382
inT32 x_height() const
return xheight
Definition: ocrblock.h:110
unsigned int uinT32
Definition: host.h:103
inT16 left() const
Definition: rect.h:68
FCOORD re_rotation() const
Definition: ocrblock.h:138
double textord_excess_blobsize
Definition: makerow.cpp:85
BLOCK
Definition: ocrblock.h:30
int y_gap(const TBOX &box) const
Definition: rect.h:225
bool textord_test_landscape
Definition: makerow.cpp:50
int textord_test_x
Definition: makerow.cpp:62
#define MAX_NEAREST_DIST
Definition: tordmain.cpp:47
GridSearch< WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT > WordSearch
Definition: textord.h:66
C_BLOB * cblob() const
Definition: blobbox.h:253
double textord_width_limit
Definition: makerow.cpp:77
void set_y(float yin)
rewrite function
Definition: points.h:220
double textord_blshift_xfraction
Definition: textord.h:399
TBOX bounding_box() const
Definition: ocrrow.h:85
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:67
bool textord_show_blobs
Definition: textord.h:374
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:770
void plot_box_list(ScrollView *win, BLOBNBOX_LIST *list, ScrollView::Color body_colour)
Definition: drawtord.cpp:70
double textord_noise_sxfract
Definition: textord.h:392
const TBOX & bounding_box() const
Definition: coutln.h:111
WERD
Definition: werd.h:60
void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, BOOL8 testing_on)
Definition: tordmain.cpp:239
ICOORD
integer coordinate
Definition: points.h:30
void CleanNoise(float size_threshold)
Definition: werd.cpp:506
inT16 bottom() const
Definition: rect.h:61
inT32 enclosed_area() const
Definition: blobbox.h:238
int textord_noise_sizefraction
Definition: textord.h:384
inT16 height() const
Definition: rect.h:104
bool textord_show_boxes
Definition: textord.h:375
void plot_graded_blobs(ScrollView *to_win)
Definition: blobbox.cpp:1065
inT16 width() const
Definition: rect.h:111
double textord_noise_sizelimit
Definition: textord.h:385
#define FALSE
Definition: capi.h:29
int x_gap(const TBOX &box) const
Definition: rect.h:217
bool major_y_overlap(const TBOX &box) const
Definition: rect.h:429
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:238
TBOX
Definition: rect.h:30
float line_spacing
Definition: blobbox.h:775
#define TRUE
Definition: capi.h:28
#define MAX_INT16
Definition: host.h:119
float y() const
Definition: points.h:212
TBOX true_bounding_box() const
Definition: werd.cpp:181
#define MAX_FLOAT32
Definition: host.h:124
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
static const double kXHeightFraction
Definition: ccstruct.h:34
TBOX bounding_box() const
Definition: stepblob.cpp:250
void find_components(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:208
void * alloc_mem(inT32 count)
Definition: memry.cpp:47
double textord_min_linesize
Definition: makerow.cpp:83
#define NULL
Definition: host.h:144
const TBOX & bounding_box() const
Definition: blobbox.h:215
SIGNED char inT8
Definition: host.h:98
int y_middle() const
Definition: rect.h:84
C_OUTLINE_LIST * child()
Definition: coutln.h:106
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
inT16 top() const
Definition: rect.h:54
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:772
double textord_noise_area_ratio
Definition: textord.h:380
void assign_blobs_to_blocks2(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks)
Definition: tordmain.cpp:157
FCOORD
Definition: points.h:189
float max_blob_size
Definition: blobbox.h:782
float b
Definition: quadratc.h:59
int textord_test_y
Definition: makerow.cpp:63
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD_LIST * word_list()
Definition: ocrrow.h:52
void recalc_bounding_box()
Definition: ocrrow.cpp:101
float line_size
Definition: blobbox.h:781
C_BLOB_LIST * reject_blobs()
Definition: ocrblock.h:135
void SetBlobStrokeWidth(Pix *pix, BLOBNBOX *blob)
Definition: tordmain.cpp:58
short inT16
Definition: host.h:100
int textord_noise_translimit
Definition: textord.h:386
int inT32
Definition: host.h:102
void rotate(const FCOORD &vec)
Definition: rect.h:189
BLOBNBOX_LIST blobs
Definition: blobbox.h:768