All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
colpartition.cpp
Go to the documentation of this file.
1 // File: colpartition.cpp
3 // Description: Class to hold partitions of the page that correspond
4 // roughly to text lines.
5 // Author: Ray Smith
6 // Created: Thu Aug 14 10:54:01 PDT 2008
7 //
8 // (C) Copyright 2008, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifdef _MSC_VER
22 #pragma warning(disable:4244) // Conversion warnings
23 #endif
24 
25 #ifdef HAVE_CONFIG_H
26 #include "config_auto.h"
27 #endif
28 
29 #include "colpartition.h"
30 #include "colpartitiongrid.h"
31 #include "colpartitionset.h"
32 #include "detlinefit.h"
33 #include "dppoint.h"
34 #include "imagefind.h"
35 #include "workingpartset.h"
36 
37 namespace tesseract {
38 
39 ELIST2IZE(ColPartition)
40 CLISTIZE(ColPartition)
41 
43 
44 // If multiple partners survive the partner depth test beyond this level,
45 // then arbitrarily pick one.
46 const int kMaxPartnerDepth = 4;
47 // Maximum change in spacing (in inches) to ignore.
48 const double kMaxSpacingDrift = 1.0 / 72; // 1/72 is one point.
49 // Maximum fraction of line height used as an additional allowance
50 // for top spacing.
51 const double kMaxTopSpacingFraction = 0.25;
52 // What multiple of the largest line height should be used as an upper bound
53 // for whether lines are in the same text block?
54 const double kMaxSameBlockLineSpacing = 3;
55 // Maximum ratio of sizes for lines to be considered the same size.
56 const double kMaxSizeRatio = 1.5;
57 // Fraction of max of leader width and gap for max IQR of gaps.
58 const double kMaxLeaderGapFractionOfMax = 0.25;
59 // Fraction of min of leader width and gap for max IQR of gaps.
60 const double kMaxLeaderGapFractionOfMin = 0.5;
61 // Minimum number of blobs to be considered a leader.
62 const int kMinLeaderCount = 5;
63 // Cost of a cut through a leader.
64 const int kLeaderCutCost = 8;
65 // Minimum score for a STRONG_CHAIN textline.
66 const int kMinStrongTextValue = 6;
67 // Minimum score for a CHAIN textline.
68 const int kMinChainTextValue = 3;
69 // Minimum number of blobs for strong horizontal text lines.
71 // Minimum height (in image pixels) for strong horizontal text lines.
73 // Minimum aspect ratio for strong horizontal text lines.
75 // Maximum upper quartile error allowed on a baseline fit as a fraction
76 // of height.
77 const double kMaxBaselineError = 0.4375;
78 // Min coverage for a good baseline between vectors
79 const double kMinBaselineCoverage = 0.5;
80 // Max RMS color noise to compare colors.
81 const int kMaxRMSColorNoise = 128;
82 // Maximum distance to allow a partition color to be to use that partition
83 // in smoothing neighbouring types. This is a squared distance.
84 const int kMaxColorDistance = 900;
85 
86 // blob_type is the blob_region_type_ of the blobs in this partition.
87 // Vertical is the direction of logical vertical on the possibly skewed image.
88 ColPartition::ColPartition(BlobRegionType blob_type, const ICOORD& vertical)
89  : left_margin_(-MAX_INT32), right_margin_(MAX_INT32),
90  median_bottom_(MAX_INT32), median_top_(-MAX_INT32), median_size_(0),
91  median_left_(MAX_INT32), median_right_(-MAX_INT32), median_width_(0),
92  blob_type_(blob_type), flow_(BTFT_NONE), good_blob_score_(0),
93  good_width_(false), good_column_(false),
94  left_key_tab_(false), right_key_tab_(false),
95  left_key_(0), right_key_(0), type_(PT_UNKNOWN), vertical_(vertical),
96  working_set_(NULL), last_add_was_vertical_(false), block_owned_(false),
97  desperately_merged_(false),
98  first_column_(-1), last_column_(-1), column_set_(NULL),
99  side_step_(0), top_spacing_(0), bottom_spacing_(0),
100  type_before_table_(PT_UNKNOWN), inside_table_column_(false),
101  nearest_neighbor_above_(NULL), nearest_neighbor_below_(NULL),
102  space_above_(0), space_below_(0), space_to_left_(0), space_to_right_(0),
103  owns_blobs_(true) {
104  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
105 }
106 
107 // Constructs a fake ColPartition with a single fake BLOBNBOX, all made
108 // from a single TBOX.
109 // WARNING: Despite being on C_LISTs, the BLOBNBOX owns the C_BLOB and
110 // the ColPartition owns the BLOBNBOX!!!
111 // Call DeleteBoxes before deleting the ColPartition.
113  PolyBlockType block_type,
114  BlobRegionType blob_type,
115  BlobTextFlowType flow) {
116  ColPartition* part = new ColPartition(blob_type, ICOORD(0, 1));
117  part->set_type(block_type);
118  part->set_flow(flow);
119  part->AddBox(new BLOBNBOX(C_BLOB::FakeBlob(box)));
120  part->set_left_margin(box.left());
121  part->set_right_margin(box.right());
122  part->SetBlobTypes();
123  part->ComputeLimits();
124  part->ClaimBoxes();
125  return part;
126 }
127 
128 // Constructs and returns a ColPartition with the given real BLOBNBOX,
129 // and sets it up to be a "big" partition (single-blob partition bigger
130 // than the surrounding text that may be a dropcap, two or more vertically
131 // touching characters, or some graphic element.
132 // If the given list is not NULL, the partition is also added to the list.
134  ColPartition_LIST* big_part_list) {
135  box->set_owner(NULL);
136  ColPartition* single = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
137  single->set_flow(BTFT_NONE);
138  single->AddBox(box);
139  single->ComputeLimits();
140  single->ClaimBoxes();
141  single->SetBlobTypes();
142  single->set_block_owned(true);
143  if (big_part_list != NULL) {
144  ColPartition_IT part_it(big_part_list);
145  part_it.add_to_end(single);
146  }
147  return single;
148 }
149 
151  // Remove this as a partner of all partners, as we don't want them
152  // referring to a deleted object.
153  ColPartition_C_IT it(&upper_partners_);
154  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
155  it.data()->RemovePartner(false, this);
156  }
157  it.set_to_list(&lower_partners_);
158  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
159  it.data()->RemovePartner(true, this);
160  }
161 }
162 
163 // Constructs a fake ColPartition with no BLOBNBOXes to represent a
164 // horizontal or vertical line, given a type and a bounding box.
166  const ICOORD& vertical,
167  int left, int bottom,
168  int right, int top) {
169  ColPartition* part = new ColPartition(blob_type, vertical);
170  part->bounding_box_ = TBOX(left, bottom, right, top);
171  part->median_bottom_ = bottom;
172  part->median_top_ = top;
173  part->median_size_ = top - bottom;
174  part->median_width_ = right - left;
175  part->left_key_ = part->BoxLeftKey();
176  part->right_key_ = part->BoxRightKey();
177  return part;
178 }
179 
180 
181 // Adds the given box to the partition, updating the partition bounds.
182 // The list of boxes in the partition is updated, ensuring that no box is
183 // recorded twice, and the boxes are kept in increasing left position.
185  TBOX box = bbox->bounding_box();
186  // Update the partition limits.
187  if (boxes_.length() == 0) {
188  bounding_box_ = box;
189  } else {
190  bounding_box_ += box;
191  }
192 
193  if (IsVerticalType()) {
194  if (!last_add_was_vertical_) {
195  boxes_.sort(SortByBoxBottom<BLOBNBOX>);
196  last_add_was_vertical_ = true;
197  }
198  boxes_.add_sorted(SortByBoxBottom<BLOBNBOX>, true, bbox);
199  } else {
200  if (last_add_was_vertical_) {
201  boxes_.sort(SortByBoxLeft<BLOBNBOX>);
202  last_add_was_vertical_ = false;
203  }
204  boxes_.add_sorted(SortByBoxLeft<BLOBNBOX>, true, bbox);
205  }
206  if (!left_key_tab_)
207  left_key_ = BoxLeftKey();
208  if (!right_key_tab_)
209  right_key_ = BoxRightKey();
210  if (TabFind::WithinTestRegion(2, box.left(), box.bottom()))
211  tprintf("Added box (%d,%d)->(%d,%d) left_blob_x_=%d, right_blob_x_ = %d\n",
212  box.left(), box.bottom(), box.right(), box.top(),
213  bounding_box_.left(), bounding_box_.right());
214 }
215 
216 // Removes the given box from the partition, updating the bounds.
218  BLOBNBOX_C_IT bb_it(&boxes_);
219  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
220  if (box == bb_it.data()) {
221  bb_it.extract();
222  ComputeLimits();
223  return;
224  }
225  }
226 }
227 
228 // Returns the tallest box in the partition, as measured perpendicular to the
229 // presumed flow of text.
231  BLOBNBOX* biggest = NULL;
232  BLOBNBOX_C_IT bb_it(&boxes_);
233  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
234  BLOBNBOX* bbox = bb_it.data();
235  if (IsVerticalType()) {
236  if (biggest == NULL ||
237  bbox->bounding_box().width() > biggest->bounding_box().width())
238  biggest = bbox;
239  } else {
240  if (biggest == NULL ||
241  bbox->bounding_box().height() > biggest->bounding_box().height())
242  biggest = bbox;
243  }
244  }
245  return biggest;
246 }
247 
248 // Returns the bounding box excluding the given box.
250  TBOX result;
251  BLOBNBOX_C_IT bb_it(&boxes_);
252  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
253  if (box != bb_it.data()) {
254  result += bb_it.data()->bounding_box();
255  }
256  }
257  return result;
258 }
259 
260 // Claims the boxes in the boxes_list by marking them with a this owner
261 // pointer. If a box is already owned, then it must be owned by this.
263  BLOBNBOX_C_IT bb_it(&boxes_);
264  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
265  BLOBNBOX* bblob = bb_it.data();
266  ColPartition* other = bblob->owner();
267  if (other == NULL) {
268  // Normal case: ownership is available.
269  bblob->set_owner(this);
270  } else {
271  ASSERT_HOST(other == this);
272  }
273  }
274 }
275 
276 // NULL the owner of the blobs in this partition, so they can be deleted
277 // independently of the ColPartition.
279  BLOBNBOX_C_IT bb_it(&boxes_);
280  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
281  BLOBNBOX* bblob = bb_it.data();
282  ASSERT_HOST(bblob->owner() == this || bblob->owner() == NULL);
283  bblob->set_owner(NULL);
284  }
285 }
286 
287 // NULL the owner of the blobs in this partition that are owned by this
288 // partition, so they can be deleted independently of the ColPartition.
289 // Any blobs that are not owned by this partition get to keep their owner
290 // without an assert failure.
292  BLOBNBOX_C_IT bb_it(&boxes_);
293  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
294  BLOBNBOX* bblob = bb_it.data();
295  if (bblob->owner() == this)
296  bblob->set_owner(NULL);
297  }
298 }
299 
300 // NULLs the owner of the blobs in this partition that are owned by this
301 // partition and not leader blobs, removing them from the boxes_ list, thus
302 // turning this partition back to a leader partition if it contains a leader,
303 // or otherwise leaving it empty. Returns true if any boxes remain.
305  BLOBNBOX_C_IT bb_it(&boxes_);
306  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
307  BLOBNBOX* bblob = bb_it.data();
308  if (bblob->flow() != BTFT_LEADER) {
309  if (bblob->owner() == this) bblob->set_owner(NULL);
310  bb_it.extract();
311  }
312  }
313  if (bb_it.empty()) return false;
314  flow_ = BTFT_LEADER;
315  ComputeLimits();
316  return true;
317 }
318 
319 // Delete the boxes that this partition owns.
321  // Although the boxes_ list is a C_LIST, in some cases it owns the
322  // BLOBNBOXes, as the ColPartition takes ownership from the grid,
323  // and the BLOBNBOXes own the underlying C_BLOBs.
324  for (BLOBNBOX_C_IT bb_it(&boxes_); !bb_it.empty(); bb_it.forward()) {
325  BLOBNBOX* bblob = bb_it.extract();
326  delete bblob->cblob();
327  delete bblob;
328  }
329 }
330 
331 // Reflects the partition in the y-axis, assuming that its blobs have
332 // already been done. Corrects only a limited part of the members, since
333 // this function is assumed to be used shortly after initial creation, which
334 // is before a lot of the members are used.
336  BLOBNBOX_CLIST reversed_boxes;
337  BLOBNBOX_C_IT reversed_it(&reversed_boxes);
338  // Reverse the order of the boxes_.
339  BLOBNBOX_C_IT bb_it(&boxes_);
340  for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) {
341  reversed_it.add_before_then_move(bb_it.extract());
342  }
343  bb_it.add_list_after(&reversed_boxes);
344  ASSERT_HOST(!left_key_tab_ && !right_key_tab_);
345  int tmp = left_margin_;
346  left_margin_ = -right_margin_;
347  right_margin_ = -tmp;
348  ComputeLimits();
349 }
350 
351 // Returns true if this is a legal partition - meaning that the conditions
352 // left_margin <= bounding_box left
353 // left_key <= bounding box left key
354 // bounding box left <= bounding box right
355 // and likewise for right margin and key
356 // are all met.
358  if (bounding_box_.left() > bounding_box_.right()) {
359  if (textord_debug_bugs) {
360  tprintf("Bounding box invalid\n");
361  Print();
362  }
363  return false; // Bounding box invalid.
364  }
365  if (left_margin_ > bounding_box_.left() ||
366  right_margin_ < bounding_box_.right()) {
367  if (textord_debug_bugs) {
368  tprintf("Margins invalid\n");
369  Print();
370  }
371  return false; // Margins invalid.
372  }
373  if (left_key_ > BoxLeftKey() || right_key_ < BoxRightKey()) {
374  if (textord_debug_bugs) {
375  tprintf("Key inside box: %d v %d or %d v %d\n",
376  left_key_, BoxLeftKey(), right_key_, BoxRightKey());
377  Print();
378  }
379  return false; // Keys inside the box.
380  }
381  return true;
382 }
383 
384 // Returns true if the left and right edges are approximately equal.
385 bool ColPartition::MatchingColumns(const ColPartition& other) const {
386  int y = (MidY() + other.MidY()) / 2;
387  if (!NearlyEqual(other.LeftAtY(y) / kColumnWidthFactor,
388  LeftAtY(y) / kColumnWidthFactor, 1))
389  return false;
390  if (!NearlyEqual(other.RightAtY(y) / kColumnWidthFactor,
391  RightAtY(y) / kColumnWidthFactor, 1))
392  return false;
393  return true;
394 }
395 
396 // Returns true if the colors match for two text partitions.
398  if (color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise &&
399  other.color1_[L_ALPHA_CHANNEL] > kMaxRMSColorNoise)
400  return false; // Too noisy.
401 
402  // Colors must match for other to count.
403  double d_this1_o = ImageFind::ColorDistanceFromLine(other.color1_,
404  other.color2_,
405  color1_);
406  double d_this2_o = ImageFind::ColorDistanceFromLine(other.color1_,
407  other.color2_,
408  color2_);
409  double d_o1_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
410  other.color1_);
411  double d_o2_this = ImageFind::ColorDistanceFromLine(color1_, color2_,
412  other.color2_);
413 // All 4 distances must be small enough.
414  return d_this1_o < kMaxColorDistance && d_this2_o < kMaxColorDistance &&
415  d_o1_this < kMaxColorDistance && d_o2_this < kMaxColorDistance;
416 }
417 
418 // Returns true if the sizes match for two text partitions,
419 // taking orientation into account. See also SizesSimilar.
420 bool ColPartition::MatchingSizes(const ColPartition& other) const {
421  if (blob_type_ == BRT_VERT_TEXT || other.blob_type_ == BRT_VERT_TEXT)
422  return !TabFind::DifferentSizes(median_width_, other.median_width_);
423  else
424  return !TabFind::DifferentSizes(median_size_, other.median_size_);
425 }
426 
427 // Returns true if there is no tabstop violation in merging this and other.
429  if (bounding_box_.right() < other.bounding_box_.left() &&
430  bounding_box_.right() < other.LeftBlobRule())
431  return false;
432  if (other.bounding_box_.right() < bounding_box_.left() &&
433  other.bounding_box_.right() < LeftBlobRule())
434  return false;
435  if (bounding_box_.left() > other.bounding_box_.right() &&
436  bounding_box_.left() > other.RightBlobRule())
437  return false;
438  if (other.bounding_box_.left() > bounding_box_.right() &&
439  other.bounding_box_.left() > RightBlobRule())
440  return false;
441  return true;
442 }
443 
444 // Returns true if other has a similar stroke width to this.
446  double fractional_tolerance,
447  double constant_tolerance) const {
448  int match_count = 0;
449  int nonmatch_count = 0;
450  BLOBNBOX_C_IT box_it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
451  BLOBNBOX_C_IT other_it(const_cast<BLOBNBOX_CLIST*>(&other.boxes_));
452  box_it.mark_cycle_pt();
453  other_it.mark_cycle_pt();
454  while (!box_it.cycled_list() && !other_it.cycled_list()) {
455  if (box_it.data()->MatchingStrokeWidth(*other_it.data(),
456  fractional_tolerance,
457  constant_tolerance))
458  ++match_count;
459  else
460  ++nonmatch_count;
461  box_it.forward();
462  other_it.forward();
463  }
464  return match_count > nonmatch_count;
465 }
466 
467 // Returns true if base is an acceptable diacritic base char merge
468 // with this as the diacritic.
469 // Returns true if:
470 // (1) this is a ColPartition containing only diacritics, and
471 // (2) the base characters indicated on the diacritics all believably lie
472 // within the text line of the candidate ColPartition.
474  bool debug) const {
475  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
476  int min_top = MAX_INT32;
477  int max_bottom = -MAX_INT32;
478  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
479  BLOBNBOX* blob = it.data();
480  if (!blob->IsDiacritic()) {
481  if (debug) {
482  tprintf("Blob is not a diacritic:");
483  blob->bounding_box().print();
484  }
485  return false; // All blobs must have diacritic bases.
486  }
487  if (blob->base_char_top() < min_top)
488  min_top = blob->base_char_top();
489  if (blob->base_char_bottom() > max_bottom)
490  max_bottom = blob->base_char_bottom();
491  }
492  // If the intersection of all vertical ranges of all base characters
493  // overlaps the median range of this, then it is OK.
494  bool result = min_top > candidate.median_bottom_ &&
495  max_bottom < candidate.median_top_;
496  if (debug) {
497  if (result)
498  tprintf("OKDiacritic!\n");
499  else
500  tprintf("y ranges don\'t overlap: %d-%d / %d-%d\n",
501  max_bottom, min_top, median_bottom_, median_top_);
502  }
503  return result;
504 }
505 
506 // Sets the sort key using either the tab vector, or the bounding box if
507 // the tab vector is NULL. If the tab_vector lies inside the bounding_box,
508 // use the edge of the box as a key any way.
509 void ColPartition::SetLeftTab(const TabVector* tab_vector) {
510  if (tab_vector != NULL) {
511  left_key_ = tab_vector->sort_key();
512  left_key_tab_ = left_key_ <= BoxLeftKey();
513  } else {
514  left_key_tab_ = false;
515  }
516  if (!left_key_tab_)
517  left_key_ = BoxLeftKey();
518 }
519 
520 // As SetLeftTab, but with the right.
521 void ColPartition::SetRightTab(const TabVector* tab_vector) {
522  if (tab_vector != NULL) {
523  right_key_ = tab_vector->sort_key();
524  right_key_tab_ = right_key_ >= BoxRightKey();
525  } else {
526  right_key_tab_ = false;
527  }
528  if (!right_key_tab_)
529  right_key_ = BoxRightKey();
530 }
531 
532 // Copies the left/right tab from the src partition, but if take_box is
533 // true, copies the box instead and uses that as a key.
534 void ColPartition::CopyLeftTab(const ColPartition& src, bool take_box) {
535  left_key_tab_ = take_box ? false : src.left_key_tab_;
536  if (left_key_tab_) {
537  left_key_ = src.left_key_;
538  } else {
539  bounding_box_.set_left(XAtY(src.BoxLeftKey(), MidY()));
540  left_key_ = BoxLeftKey();
541  }
542  if (left_margin_ > bounding_box_.left())
543  left_margin_ = src.left_margin_;
544 }
545 
546 // As CopyLeftTab, but with the right.
547 void ColPartition::CopyRightTab(const ColPartition& src, bool take_box) {
548  right_key_tab_ = take_box ? false : src.right_key_tab_;
549  if (right_key_tab_) {
550  right_key_ = src.right_key_;
551  } else {
552  bounding_box_.set_right(XAtY(src.BoxRightKey(), MidY()));
553  right_key_ = BoxRightKey();
554  }
555  if (right_margin_ < bounding_box_.right())
556  right_margin_ = src.right_margin_;
557 }
558 
559 // Returns the left rule line x coord of the leftmost blob.
561  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
562  return it.data()->left_rule();
563 }
564 // Returns the right rule line x coord of the rightmost blob.
566  BLOBNBOX_C_IT it(const_cast<BLOBNBOX_CLIST*>(&boxes_));
567  it.move_to_last();
568  return it.data()->right_rule();
569 }
570 
572  ASSERT_HOST(type < BSTT_COUNT);
573  return special_blobs_densities_[type];
574 }
575 
577  ASSERT_HOST(type < BSTT_COUNT);
578  BLOBNBOX_C_IT blob_it(&boxes_);
579  int count = 0;
580  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
581  BLOBNBOX* blob = blob_it.data();
583  if (blob_type == type) {
584  count++;
585  }
586  }
587 
588  return count;
589 }
590 
592  const BlobSpecialTextType type, const float density) {
593  ASSERT_HOST(type < BSTT_COUNT);
594  special_blobs_densities_[type] = density;
595 }
596 
598  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
599  if (boxes_.empty()) {
600  return;
601  }
602 
603  BLOBNBOX_C_IT blob_it(&boxes_);
604  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
605  BLOBNBOX* blob = blob_it.data();
607  special_blobs_densities_[type]++;
608  }
609 
610  for (int type = 0; type < BSTT_COUNT; ++type) {
611  special_blobs_densities_[type] /= boxes_.length();
612  }
613 }
614 
615 // Add a partner above if upper, otherwise below.
616 // Add them uniquely and keep the list sorted by box left.
617 // Partnerships are added symmetrically to partner and this.
618 void ColPartition::AddPartner(bool upper, ColPartition* partner) {
619  if (upper) {
620  partner->lower_partners_.add_sorted(SortByBoxLeft<ColPartition>,
621  true, this);
622  upper_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
623  } else {
624  partner->upper_partners_.add_sorted(SortByBoxLeft<ColPartition>,
625  true, this);
626  lower_partners_.add_sorted(SortByBoxLeft<ColPartition>, true, partner);
627  }
628 }
629 
630 // Removes the partner from this, but does not remove this from partner.
631 // This asymmetric removal is so as not to mess up the iterator that is
632 // working on partner's partner list.
633 void ColPartition::RemovePartner(bool upper, ColPartition* partner) {
634  ColPartition_C_IT it(upper ? &upper_partners_ : &lower_partners_);
635  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
636  if (it.data() == partner) {
637  it.extract();
638  break;
639  }
640  }
641 }
642 
643 // Returns the partner if the given partner is a singleton, otherwise NULL.
645  ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
646  if (!partners->singleton())
647  return NULL;
648  ColPartition_C_IT it(partners);
649  return it.data();
650 }
651 
652 // Merge with the other partition and delete it.
654  // The result has to either own all of the blobs or none of them.
655  // Verify the flag is consisent.
656  ASSERT_HOST(owns_blobs() == other->owns_blobs());
657  // TODO(nbeato): check owns_blobs better. Right now owns_blobs
658  // should always be true when this is called. So there is no issues.
659  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
660  bounding_box_.bottom()) ||
661  TabFind::WithinTestRegion(2, other->bounding_box_.left(),
662  other->bounding_box_.bottom())) {
663  tprintf("Merging:");
664  Print();
665  other->Print();
666  }
667 
668  // Update the special_blobs_densities_.
669  memset(special_blobs_densities_, 0, sizeof(special_blobs_densities_));
670  for (int type = 0; type < BSTT_COUNT; ++type) {
671  int w1 = boxes_.length(), w2 = other->boxes_.length();
672  float new_val = special_blobs_densities_[type] * w1 +
673  other->special_blobs_densities_[type] * w2;
674  if (!w1 || !w2) {
675  special_blobs_densities_[type] = new_val / (w1 + w2);
676  }
677  }
678 
679  // Merge the two sorted lists.
680  BLOBNBOX_C_IT it(&boxes_);
681  BLOBNBOX_C_IT it2(&other->boxes_);
682  for (; !it2.empty(); it2.forward()) {
683  BLOBNBOX* bbox2 = it2.extract();
684  ColPartition* prev_owner = bbox2->owner();
685  if (prev_owner != other && prev_owner != NULL) {
686  // A blob on other's list is owned by someone else; let them have it.
687  continue;
688  }
689  ASSERT_HOST(prev_owner == other || prev_owner == NULL);
690  if (prev_owner == other)
691  bbox2->set_owner(this);
692  it.add_to_end(bbox2);
693  }
694  left_margin_ = MIN(left_margin_, other->left_margin_);
695  right_margin_ = MAX(right_margin_, other->right_margin_);
696  if (other->left_key_ < left_key_) {
697  left_key_ = other->left_key_;
698  left_key_tab_ = other->left_key_tab_;
699  }
700  if (other->right_key_ > right_key_) {
701  right_key_ = other->right_key_;
702  right_key_tab_ = other->right_key_tab_;
703  }
704  // Combine the flow and blob_type in a sensible way.
705  // Dominant flows stay.
706  if (!DominatesInMerge(flow_, other->flow_)) {
707  flow_ = other->flow_;
708  blob_type_ = other->blob_type_;
709  }
710  SetBlobTypes();
711  if (IsVerticalType()) {
712  boxes_.sort(SortByBoxBottom<BLOBNBOX>);
713  last_add_was_vertical_ = true;
714  } else {
715  boxes_.sort(SortByBoxLeft<BLOBNBOX>);
716  last_add_was_vertical_ = false;
717  }
718  ComputeLimits();
719  // Fix partner lists. other is going away, so remove it as a
720  // partner of all its partners and add this in its place.
721  for (int upper = 0; upper < 2; ++upper) {
722  ColPartition_CLIST partners;
723  ColPartition_C_IT part_it(&partners);
724  part_it.add_list_after(upper ? &other->upper_partners_
725  : &other->lower_partners_);
726  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
727  ColPartition* partner = part_it.extract();
728  partner->RemovePartner(!upper, other);
729  partner->RemovePartner(!upper, this);
730  partner->AddPartner(!upper, this);
731  }
732  }
733  delete other;
734  if (cb != NULL) {
735  SetColumnGoodness(cb);
736  }
737 }
738 
739 // Merge1 and merge2 are candidates to be merged, yet their combined box
740 // overlaps this. Is that allowed?
741 // Returns true if the overlap between this and the merged pair of
742 // merge candidates is sufficiently trivial to be allowed.
743 // The merged box can graze the edge of this by the ok_box_overlap
744 // if that exceeds the margin to the median top and bottom.
745 // ok_box_overlap should be set by the caller appropriate to the sizes of
746 // the text involved, and is usually a fraction of the median size of merge1
747 // and/or merge2, or this.
748 // TODO(rays) Determine whether vertical text needs to be considered.
750  const ColPartition& merge2,
751  int ok_box_overlap, bool debug) {
752  // Vertical partitions are not allowed to be involved.
753  if (IsVerticalType() || merge1.IsVerticalType() || merge2.IsVerticalType()) {
754  if (debug)
755  tprintf("Vertical partition\n");
756  return false;
757  }
758  // The merging partitions must strongly overlap each other.
759  if (!merge1.VSignificantCoreOverlap(merge2)) {
760  if (debug)
761  tprintf("Voverlap %d (%d)\n",
762  merge1.VCoreOverlap(merge2),
763  merge1.VSignificantCoreOverlap(merge2));
764  return false;
765  }
766  // The merged box must not overlap the median bounds of this.
767  TBOX merged_box(merge1.bounding_box());
768  merged_box += merge2.bounding_box();
769  if (merged_box.bottom() < median_top_ && merged_box.top() > median_bottom_ &&
770  merged_box.bottom() < bounding_box_.top() - ok_box_overlap &&
771  merged_box.top() > bounding_box_.bottom() + ok_box_overlap) {
772  if (debug)
773  tprintf("Excessive box overlap\n");
774  return false;
775  }
776  // Looks OK!
777  return true;
778 }
779 
780 // Find the blob at which to split this to minimize the overlap with the
781 // given box. Returns the first blob to go in the second partition.
783  if (boxes_.empty() || boxes_.singleton())
784  return NULL;
785  BLOBNBOX_C_IT it(&boxes_);
786  TBOX left_box(it.data()->bounding_box());
787  for (it.forward(); !it.at_first(); it.forward()) {
788  BLOBNBOX* bbox = it.data();
789  left_box += bbox->bounding_box();
790  if (left_box.overlap(box))
791  return bbox;
792  }
793  return NULL;
794 }
795 
796 // Split this partition keeping the first half in this and returning
797 // the second half.
798 // Splits by putting the split_blob and the blobs that follow
799 // in the second half, and the rest in the first half.
801  ColPartition* split_part = ShallowCopy();
802  split_part->set_owns_blobs(owns_blobs());
803  BLOBNBOX_C_IT it(&boxes_);
804  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
805  BLOBNBOX* bbox = it.data();
806  ColPartition* prev_owner = bbox->owner();
807  ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == NULL);
808  if (bbox == split_blob || !split_part->boxes_.empty()) {
809  split_part->AddBox(it.extract());
810  if (owns_blobs() && prev_owner != NULL)
811  bbox->set_owner(split_part);
812  }
813  }
814  ASSERT_HOST(!it.empty());
815  if (split_part->IsEmpty()) {
816  // Split part ended up with nothing. Possible if split_blob is not
817  // in the list of blobs.
818  delete split_part;
819  return NULL;
820  }
821  right_key_tab_ = false;
822  split_part->left_key_tab_ = false;
823  ComputeLimits();
824  // TODO(nbeato) Merge Ray's CL like this:
825  // if (owns_blobs())
826  // SetBlobTextlineGoodness();
827  split_part->ComputeLimits();
828  // TODO(nbeato) Merge Ray's CL like this:
829  // if (split_part->owns_blobs())
830  // split_part->SetBlobTextlineGoodness();
831  return split_part;
832 }
833 
834 // Split this partition at the given x coordinate, returning the right
835 // half and keeping the left half in this.
837  if (split_x <= bounding_box_.left() || split_x >= bounding_box_.right())
838  return NULL; // There will be no change.
839  ColPartition* split_part = ShallowCopy();
840  split_part->set_owns_blobs(owns_blobs());
841  BLOBNBOX_C_IT it(&boxes_);
842  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
843  BLOBNBOX* bbox = it.data();
844  ColPartition* prev_owner = bbox->owner();
845  ASSERT_HOST(!owns_blobs() || prev_owner == this || prev_owner == NULL);
846  const TBOX& box = bbox->bounding_box();
847  if (box.left() >= split_x) {
848  split_part->AddBox(it.extract());
849  if (owns_blobs() && prev_owner != NULL)
850  bbox->set_owner(split_part);
851  }
852  }
853  if (it.empty()) {
854  // Possible if split-x passes through the first blob.
855  it.add_list_after(&split_part->boxes_);
856  }
857  ASSERT_HOST(!it.empty());
858  if (split_part->IsEmpty()) {
859  // Split part ended up with nothing. Possible if split_x passes
860  // through the last blob.
861  delete split_part;
862  return NULL;
863  }
864  right_key_tab_ = false;
865  split_part->left_key_tab_ = false;
866  right_margin_ = split_x;
867  split_part->left_margin_ = split_x;
868  ComputeLimits();
869  split_part->ComputeLimits();
870  return split_part;
871 }
872 
873 // Recalculates all the coordinate limits of the partition.
875  bounding_box_ = TBOX(); // Clear it
876  BLOBNBOX_C_IT it(&boxes_);
877  BLOBNBOX* bbox = NULL;
878  int non_leader_count = 0;
879  if (it.empty()) {
880  bounding_box_.set_left(left_margin_);
881  bounding_box_.set_right(right_margin_);
882  bounding_box_.set_bottom(0);
883  bounding_box_.set_top(0);
884  } else {
885  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
886  bbox = it.data();
887  bounding_box_ += bbox->bounding_box();
888  if (bbox->flow() != BTFT_LEADER)
889  ++non_leader_count;
890  }
891  }
892  if (!left_key_tab_)
893  left_key_ = BoxLeftKey();
894  if (left_key_ > BoxLeftKey() && textord_debug_bugs) {
895  // TODO(rays) investigate the causes of these error messages, to find
896  // out if they are genuinely harmful, or just indicative of junk input.
897  tprintf("Computed left-illegal partition\n");
898  Print();
899  }
900  if (!right_key_tab_)
901  right_key_ = BoxRightKey();
902  if (right_key_ < BoxRightKey() && textord_debug_bugs) {
903  tprintf("Computed right-illegal partition\n");
904  Print();
905  }
906  if (it.empty())
907  return;
908  if (IsImageType() || blob_type() == BRT_RECTIMAGE ||
909  blob_type() == BRT_POLYIMAGE) {
910  median_top_ = bounding_box_.top();
911  median_bottom_ = bounding_box_.bottom();
912  median_size_ = bounding_box_.height();
913  median_left_ = bounding_box_.left();
914  median_right_ = bounding_box_.right();
915  median_width_ = bounding_box_.width();
916  } else {
917  STATS top_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
918  STATS bottom_stats(bounding_box_.bottom(), bounding_box_.top() + 1);
919  STATS size_stats(0, bounding_box_.height() + 1);
920  STATS left_stats(bounding_box_.left(), bounding_box_.right() + 1);
921  STATS right_stats(bounding_box_.left(), bounding_box_.right() + 1);
922  STATS width_stats(0, bounding_box_.width() + 1);
923  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
924  bbox = it.data();
925  if (non_leader_count == 0 || bbox->flow() != BTFT_LEADER) {
926  TBOX box = bbox->bounding_box();
927  int area = box.area();
928  top_stats.add(box.top(), area);
929  bottom_stats.add(box.bottom(), area);
930  size_stats.add(box.height(), area);
931  left_stats.add(box.left(), area);
932  right_stats.add(box.right(), area);
933  width_stats.add(box.width(), area);
934  }
935  }
936  median_top_ = static_cast<int>(top_stats.median() + 0.5);
937  median_bottom_ = static_cast<int>(bottom_stats.median() + 0.5);
938  median_size_ = static_cast<int>(size_stats.median() + 0.5);
939  median_left_ = static_cast<int>(left_stats.median() + 0.5);
940  median_right_ = static_cast<int>(right_stats.median() + 0.5);
941  median_width_ = static_cast<int>(width_stats.median() + 0.5);
942  }
943 
944  if (right_margin_ < bounding_box_.right() && textord_debug_bugs) {
945  tprintf("Made partition with bad right coords");
946  Print();
947  }
948  if (left_margin_ > bounding_box_.left() && textord_debug_bugs) {
949  tprintf("Made partition with bad left coords");
950  Print();
951  }
952  // Fix partner lists. The bounding box has changed and partners are stored
953  // in bounding box order, so remove and reinsert this as a partner
954  // of all its partners.
955  for (int upper = 0; upper < 2; ++upper) {
956  ColPartition_CLIST partners;
957  ColPartition_C_IT part_it(&partners);
958  part_it.add_list_after(upper ? &upper_partners_ : &lower_partners_);
959  for (part_it.move_to_first(); !part_it.empty(); part_it.forward()) {
960  ColPartition* partner = part_it.extract();
961  partner->RemovePartner(!upper, this);
962  partner->AddPartner(!upper, this);
963  }
964  }
965  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
966  bounding_box_.bottom())) {
967  tprintf("Recomputed box for partition %p\n", this);
968  Print();
969  }
970 }
971 
972 // Returns the number of boxes that overlap the given box.
974  BLOBNBOX_C_IT it(&boxes_);
975  int overlap_count = 0;
976  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
977  BLOBNBOX* bbox = it.data();
978  if (box.overlap(bbox->bounding_box()))
979  ++overlap_count;
980  }
981  return overlap_count;
982 }
983 
984 // Computes and sets the type_ and first_colum_, last_column_ and column_set_.
985 // resolution refers to the ppi resolution of the image.
986 void ColPartition::SetPartitionType(int resolution, ColPartitionSet* columns) {
987  int first_spanned_col = -1;
988  ColumnSpanningType span_type =
989  columns->SpanningType(resolution,
990  bounding_box_.left(), bounding_box_.right(),
991  MIN(bounding_box_.height(), bounding_box_.width()),
992  MidY(), left_margin_, right_margin_,
993  &first_column_, &last_column_,
994  &first_spanned_col);
995  column_set_ = columns;
996  if (first_column_ < last_column_ && span_type == CST_PULLOUT &&
997  !IsLineType()) {
998  // Unequal columns may indicate that the pullout spans one of the columns
999  // it lies in, so force it to be allocated to just that column.
1000  if (first_spanned_col >= 0) {
1001  first_column_ = first_spanned_col;
1002  last_column_ = first_spanned_col;
1003  } else {
1004  if ((first_column_ & 1) == 0)
1005  last_column_ = first_column_;
1006  else if ((last_column_ & 1) == 0)
1007  first_column_ = last_column_;
1008  else
1009  first_column_ = last_column_ = (first_column_ + last_column_) / 2;
1010  }
1011  }
1012  type_ = PartitionType(span_type);
1013 }
1014 
1015 // Returns the PartitionType from the current BlobRegionType and a column
1016 // flow spanning type ColumnSpanningType, generated by
1017 // ColPartitionSet::SpanningType, that indicates how the partition sits
1018 // in the columns.
1020  if (flow == CST_NOISE) {
1021  if (blob_type_ != BRT_HLINE && blob_type_ != BRT_VLINE &&
1022  blob_type_ != BRT_RECTIMAGE && blob_type_ != BRT_VERT_TEXT)
1023  return PT_NOISE;
1024  flow = CST_FLOWING;
1025  }
1026 
1027  switch (blob_type_) {
1028  case BRT_NOISE:
1029  return PT_NOISE;
1030  case BRT_HLINE:
1031  return PT_HORZ_LINE;
1032  case BRT_VLINE:
1033  return PT_VERT_LINE;
1034  case BRT_RECTIMAGE:
1035  case BRT_POLYIMAGE:
1036  switch (flow) {
1037  case CST_FLOWING:
1038  return PT_FLOWING_IMAGE;
1039  case CST_HEADING:
1040  return PT_HEADING_IMAGE;
1041  case CST_PULLOUT:
1042  return PT_PULLOUT_IMAGE;
1043  default:
1044  ASSERT_HOST(!"Undefined flow type for image!");
1045  }
1046  break;
1047  case BRT_VERT_TEXT:
1048  return PT_VERTICAL_TEXT;
1049  case BRT_TEXT:
1050  case BRT_UNKNOWN:
1051  default:
1052  switch (flow) {
1053  case CST_FLOWING:
1054  return PT_FLOWING_TEXT;
1055  case CST_HEADING:
1056  return PT_HEADING_TEXT;
1057  case CST_PULLOUT:
1058  return PT_PULLOUT_TEXT;
1059  default:
1060  ASSERT_HOST(!"Undefined flow type for text!");
1061  }
1062  }
1063  ASSERT_HOST(!"Should never get here!");
1064  return PT_NOISE;
1065 }
1066 
1067 // Returns the first and last column touched by this partition.
1068 // resolution refers to the ppi resolution of the image.
1069 void ColPartition::ColumnRange(int resolution, ColPartitionSet* columns,
1070  int* first_col, int* last_col) {
1071  int first_spanned_col = -1;
1072  ColumnSpanningType span_type =
1073  columns->SpanningType(resolution,
1074  bounding_box_.left(), bounding_box_.right(),
1075  MIN(bounding_box_.height(), bounding_box_.width()),
1076  MidY(), left_margin_, right_margin_,
1077  first_col, last_col,
1078  &first_spanned_col);
1079  type_ = PartitionType(span_type);
1080 }
1081 
1082 // Sets the internal flags good_width_ and good_column_.
1084  int y = MidY();
1085  int width = RightAtY(y) - LeftAtY(y);
1086  good_width_ = cb->Run(width);
1087  good_column_ = blob_type_ == BRT_TEXT && left_key_tab_ && right_key_tab_;
1088 }
1089 
1090 // Determines whether the blobs in this partition mostly represent
1091 // a leader (fixed pitch sequence) and sets the member blobs accordingly.
1092 // Note that height is assumed to have been tested elsewhere, and that this
1093 // function will find most fixed-pitch text as leader without a height filter.
1094 // Leader detection is limited to sequences of identical width objects,
1095 // such as .... or ----, so patterns, such as .-.-.-.-. will not be found.
1097  bool result = false;
1098  // Gather statistics on the gaps between blobs and the widths of the blobs.
1099  int part_width = bounding_box_.width();
1100  STATS gap_stats(0, part_width);
1101  STATS width_stats(0, part_width);
1102  BLOBNBOX_C_IT it(&boxes_);
1103  BLOBNBOX* prev_blob = it.data();
1104  prev_blob->set_flow(BTFT_NEIGHBOURS);
1105  width_stats.add(prev_blob->bounding_box().width(), 1);
1106  int blob_count = 1;
1107  for (it.forward(); !it.at_first(); it.forward()) {
1108  BLOBNBOX* blob = it.data();
1109  int left = blob->bounding_box().left();
1110  int right = blob->bounding_box().right();
1111  gap_stats.add(left - prev_blob->bounding_box().right(), 1);
1112  width_stats.add(right - left, 1);
1113  blob->set_flow(BTFT_NEIGHBOURS);
1114  prev_blob = blob;
1115  ++blob_count;
1116  }
1117  double median_gap = gap_stats.median();
1118  double median_width = width_stats.median();
1119  double max_width = MAX(median_gap, median_width);
1120  double min_width = MIN(median_gap, median_width);
1121  double gap_iqr = gap_stats.ile(0.75f) - gap_stats.ile(0.25f);
1122  if (textord_debug_tabfind >= 4) {
1123  tprintf("gap iqr = %g, blob_count=%d, limits=%g,%g\n",
1124  gap_iqr, blob_count, max_width * kMaxLeaderGapFractionOfMax,
1125  min_width * kMaxLeaderGapFractionOfMin);
1126  }
1127  if (gap_iqr < max_width * kMaxLeaderGapFractionOfMax &&
1128  gap_iqr < min_width * kMaxLeaderGapFractionOfMin &&
1129  blob_count >= kMinLeaderCount) {
1130  // This is stable enough to be called a leader, so check the widths.
1131  // Since leader dashes can join, run a dp cutting algorithm and go
1132  // on the cost.
1133  int offset = static_cast<int>(ceil(gap_iqr * 2));
1134  int min_step = static_cast<int>(median_gap + median_width + 0.5);
1135  int max_step = min_step + offset;
1136  min_step -= offset;
1137  // Pad the buffer with min_step/2 on each end.
1138  int part_left = bounding_box_.left() - min_step / 2;
1139  part_width += min_step;
1140  DPPoint* projection = new DPPoint[part_width];
1141  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1142  BLOBNBOX* blob = it.data();
1143  int left = blob->bounding_box().left();
1144  int right = blob->bounding_box().right();
1145  int height = blob->bounding_box().height();
1146  for (int x = left; x < right; ++x) {
1147  projection[left - part_left].AddLocalCost(height);
1148  }
1149  }
1150  DPPoint* best_end = DPPoint::Solve(min_step, max_step, false,
1152  part_width, projection);
1153  if (best_end != NULL && best_end->total_cost() < blob_count) {
1154  // Good enough. Call it a leader.
1155  result = true;
1156  bool modified_blob_list = false;
1157  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1158  BLOBNBOX* blob = it.data();
1159  TBOX box = blob->bounding_box();
1160  // If the first or last blob is spaced too much, don't mark it.
1161  if (it.at_first()) {
1162  int gap = it.data_relative(1)->bounding_box().left() -
1163  blob->bounding_box().right();
1164  if (blob->bounding_box().width() + gap > max_step) {
1165  it.extract();
1166  modified_blob_list = true;
1167  continue;
1168  }
1169  }
1170  if (it.at_last()) {
1171  int gap = blob->bounding_box().left() -
1172  it.data_relative(-1)->bounding_box().right();
1173  if (blob->bounding_box().width() + gap > max_step) {
1174  it.extract();
1175  modified_blob_list = true;
1176  break;
1177  }
1178  }
1179  blob->set_region_type(BRT_TEXT);
1180  blob->set_flow(BTFT_LEADER);
1181  }
1182  if (modified_blob_list) ComputeLimits();
1183  blob_type_ = BRT_TEXT;
1184  flow_ = BTFT_LEADER;
1185  } else if (textord_debug_tabfind) {
1186  if (best_end == NULL) {
1187  tprintf("No path\n");
1188  } else {
1189  tprintf("Total cost = %d vs allowed %d\n",
1190  best_end->total_cost() < blob_count);
1191  }
1192  }
1193  delete [] projection;
1194  }
1195  return result;
1196 }
1197 
1198 // Given the result of TextlineProjection::EvaluateColPartition, (positive for
1199 // horizontal text, negative for vertical text, and near zero for non-text),
1200 // sets the blob_type_ and flow_ for this partition to indicate whether it
1201 // is strongly or weakly vertical or horizontal text, or non-text.
1202 // The function assumes that the blob neighbours are valid (from
1203 // StrokeWidth::SetNeighbours) and that those neighbours have their
1204 // region_type() set.
1206  int blob_count = 0; // Total # blobs.
1207  int good_blob_score_ = 0; // Total # good strokewidth neighbours.
1208  int noisy_count = 0; // Total # neighbours marked as noise.
1209  int hline_count = 0;
1210  int vline_count = 0;
1211  BLOBNBOX_C_IT it(&boxes_);
1212  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1213  BLOBNBOX* blob = it.data();
1214  ++blob_count;
1215  noisy_count += blob->NoisyNeighbours();
1216  good_blob_score_ += blob->GoodTextBlob();
1217  if (blob->region_type() == BRT_HLINE) ++hline_count;
1218  if (blob->region_type() == BRT_VLINE) ++vline_count;
1219  }
1220  flow_ = BTFT_NEIGHBOURS;
1221  blob_type_ = BRT_UNKNOWN;
1222  if (hline_count > vline_count) {
1223  flow_ = BTFT_NONE;
1224  blob_type_ = BRT_HLINE;
1225  } else if (vline_count > hline_count) {
1226  flow_ = BTFT_NONE;
1227  blob_type_ = BRT_VLINE;
1228  } else if (value < -1 || 1 < value) {
1229  int long_side;
1230  int short_side;
1231  if (value > 0) {
1232  long_side = bounding_box_.width();
1233  short_side = bounding_box_.height();
1234  blob_type_ = BRT_TEXT;
1235  } else {
1236  long_side = bounding_box_.height();
1237  short_side = bounding_box_.width();
1238  blob_type_ = BRT_VERT_TEXT;
1239  }
1240  // We will combine the old metrics using aspect ratio and blob counts
1241  // with the input value by allowing a strong indication to flip the
1242  // STRONG_CHAIN/CHAIN flow values.
1243  int strong_score = blob_count >= kHorzStrongTextlineCount ? 1 : 0;
1244  if (short_side > kHorzStrongTextlineHeight) ++strong_score;
1245  if (short_side * kHorzStrongTextlineAspect < long_side) ++strong_score;
1246  if (abs(value) >= kMinStrongTextValue)
1247  flow_ = BTFT_STRONG_CHAIN;
1248  else if (abs(value) >= kMinChainTextValue)
1249  flow_ = BTFT_CHAIN;
1250  else
1251  flow_ = BTFT_NEIGHBOURS;
1252  // Upgrade chain to strong chain if the other indicators are good
1253  if (flow_ == BTFT_CHAIN && strong_score == 3)
1254  flow_ = BTFT_STRONG_CHAIN;
1255  // Downgrade strong vertical text to chain if the indicators are bad.
1256  if (flow_ == BTFT_STRONG_CHAIN && value < 0 && strong_score < 2)
1257  flow_ = BTFT_CHAIN;
1258  }
1259  if (flow_ == BTFT_NEIGHBOURS) {
1260  // Check for noisy neighbours.
1261  if (noisy_count >= blob_count) {
1262  flow_ = BTFT_NONTEXT;
1263  blob_type_= BRT_NOISE;
1264  }
1265  }
1266  if (TabFind::WithinTestRegion(2, bounding_box_.left(),
1267  bounding_box_.bottom())) {
1268  tprintf("RegionFlowTypesFromProjectionValue count=%d, noisy=%d, score=%d,",
1269  blob_count, noisy_count, good_blob_score_);
1270  tprintf(" Projection value=%d, flow=%d, blob_type=%d\n",
1271  value, flow_, blob_type_);
1272  Print();
1273  }
1274  SetBlobTypes();
1275 }
1276 
1277 // Sets all blobs with the partition blob type and flow, but never overwrite
1278 // leader blobs, as we need to be able to identify them later.
1280  if (!owns_blobs())
1281  return;
1282  BLOBNBOX_C_IT it(&boxes_);
1283  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1284  BLOBNBOX* blob = it.data();
1285  if (blob->flow() != BTFT_LEADER)
1286  blob->set_flow(flow_);
1287  blob->set_region_type(blob_type_);
1288  ASSERT_HOST(blob->owner() == NULL || blob->owner() == this);
1289  }
1290 }
1291 
1292 // Returns true if a decent baseline can be fitted through the blobs.
1293 // Works for both horizontal and vertical text.
1295  // Approximation of the baseline.
1296  DetLineFit linepoints;
1297  // Calculation of the mean height on this line segment. Note that these
1298  // variable names apply to the context of a horizontal line, and work
1299  // analogously, rather than literally in the case of a vertical line.
1300  int total_height = 0;
1301  int coverage = 0;
1302  int height_count = 0;
1303  int width = 0;
1304  BLOBNBOX_C_IT it(&boxes_);
1305  TBOX box(it.data()->bounding_box());
1306  // Accumulate points representing the baseline at the middle of each blob,
1307  // but add an additional point for each end of the line. This makes it
1308  // harder to fit a severe skew angle, as it is most likely not right.
1309  if (IsVerticalType()) {
1310  // For a vertical line, use the right side as the baseline.
1311  ICOORD first_pt(box.right(), box.bottom());
1312  // Use the bottom-right of the first (bottom) box, the top-right of the
1313  // last, and the middle-right of all others.
1314  linepoints.Add(first_pt);
1315  for (it.forward(); !it.at_last(); it.forward()) {
1316  BLOBNBOX* blob = it.data();
1317  box = blob->bounding_box();
1318  ICOORD box_pt(box.right(), (box.top() + box.bottom()) / 2);
1319  linepoints.Add(box_pt);
1320  total_height += box.width();
1321  coverage += box.height();
1322  ++height_count;
1323  }
1324  box = it.data()->bounding_box();
1325  ICOORD last_pt(box.right(), box.top());
1326  linepoints.Add(last_pt);
1327  width = last_pt.y() - first_pt.y();
1328 
1329  } else {
1330  // Horizontal lines use the bottom as the baseline.
1331  TBOX box(it.data()->bounding_box());
1332  // Use the bottom-left of the first box, the the bottom-right of the last,
1333  // and the middle of all others.
1334  ICOORD first_pt(box.left(), box.bottom());
1335  linepoints.Add(first_pt);
1336  for (it.forward(); !it.at_last(); it.forward()) {
1337  BLOBNBOX* blob = it.data();
1338  box = blob->bounding_box();
1339  ICOORD box_pt((box.left() + box.right()) / 2, box.bottom());
1340  linepoints.Add(box_pt);
1341  total_height += box.height();
1342  coverage += box.width();
1343  ++height_count;
1344  }
1345  box = it.data()->bounding_box();
1346  ICOORD last_pt(box.right(), box.bottom());
1347  linepoints.Add(last_pt);
1348  width = last_pt.x() - first_pt.x();
1349  }
1350  // Maximum median error allowed to be a good text line.
1351  double max_error = kMaxBaselineError * total_height / height_count;
1352  ICOORD start_pt, end_pt;
1353  double error = linepoints.Fit(&start_pt, &end_pt);
1354  return error < max_error && coverage >= kMinBaselineCoverage * width;
1355 }
1356 
1357 // Adds this ColPartition to a matching WorkingPartSet if one can be found,
1358 // otherwise starts a new one in the appropriate column, ending the previous.
1359 void ColPartition::AddToWorkingSet(const ICOORD& bleft, const ICOORD& tright,
1360  int resolution,
1361  ColPartition_LIST* used_parts,
1362  WorkingPartSet_LIST* working_sets) {
1363  if (block_owned_)
1364  return; // Done it already.
1365  block_owned_ = true;
1366  WorkingPartSet_IT it(working_sets);
1367  // If there is an upper partner use its working_set_ directly.
1368  ColPartition* partner = SingletonPartner(true);
1369  if (partner != NULL && partner->working_set_ != NULL) {
1370  working_set_ = partner->working_set_;
1371  working_set_->AddPartition(this);
1372  return;
1373  }
1374  if (partner != NULL && textord_debug_bugs) {
1375  tprintf("Partition with partner has no working set!:");
1376  Print();
1377  partner->Print();
1378  }
1379  // Search for the column that the left edge fits in.
1380  WorkingPartSet* work_set = NULL;
1381  it.move_to_first();
1382  int col_index = 0;
1383  for (it.mark_cycle_pt(); !it.cycled_list() &&
1384  col_index != first_column_;
1385  it.forward(), ++col_index);
1386  if (textord_debug_tabfind >= 2) {
1387  tprintf("Match is %s for:", (col_index & 1) ? "Real" : "Between");
1388  Print();
1389  }
1390  if (it.cycled_list() && textord_debug_bugs) {
1391  tprintf("Target column=%d, only had %d\n", first_column_, col_index);
1392  }
1393  ASSERT_HOST(!it.cycled_list());
1394  work_set = it.data();
1395  // If last_column_ != first_column, then we need to scoop up all blocks
1396  // between here and the last_column_ and put back in work_set.
1397  if (!it.cycled_list() && last_column_ != first_column_ && !IsPulloutType()) {
1398  // Find the column that the right edge falls in.
1399  BLOCK_LIST completed_blocks;
1400  TO_BLOCK_LIST to_blocks;
1401  for (; !it.cycled_list() && col_index <= last_column_;
1402  it.forward(), ++col_index) {
1403  WorkingPartSet* end_set = it.data();
1404  end_set->ExtractCompletedBlocks(bleft, tright, resolution, used_parts,
1405  &completed_blocks, &to_blocks);
1406  }
1407  work_set->InsertCompletedBlocks(&completed_blocks, &to_blocks);
1408  }
1409  working_set_ = work_set;
1410  work_set->AddPartition(this);
1411 }
1412 
1413 // From the given block_parts list, builds one or more BLOCKs and
1414 // corresponding TO_BLOCKs, such that the line spacing is uniform in each.
1415 // Created blocks are appended to the end of completed_blocks and to_blocks.
1416 // The used partitions are put onto used_parts, as they may still be referred
1417 // to in the partition grid. bleft, tright and resolution are the bounds
1418 // and resolution of the original image.
1419 void ColPartition::LineSpacingBlocks(const ICOORD& bleft, const ICOORD& tright,
1420  int resolution,
1421  ColPartition_LIST* block_parts,
1422  ColPartition_LIST* used_parts,
1423  BLOCK_LIST* completed_blocks,
1424  TO_BLOCK_LIST* to_blocks) {
1425  int page_height = tright.y() - bleft.y();
1426  // Compute the initial spacing stats.
1427  ColPartition_IT it(block_parts);
1428  int part_count = 0;
1429  int max_line_height = 0;
1430 
1431  // TODO(joeliu): We should add some special logic for PT_INLINE_EQUATION type
1432  // because their line spacing with their neighbors maybe smaller and their
1433  // height may be slightly larger.
1434 
1435  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1436  ColPartition* part = it.data();
1437  ASSERT_HOST(!part->boxes()->empty());
1438  STATS side_steps(0, part->bounding_box().height());
1439  if (part->bounding_box().height() > max_line_height)
1440  max_line_height = part->bounding_box().height();
1441  BLOBNBOX_C_IT blob_it(part->boxes());
1442  int prev_bottom = blob_it.data()->bounding_box().bottom();
1443  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1444  BLOBNBOX* blob = blob_it.data();
1445  int bottom = blob->bounding_box().bottom();
1446  int step = bottom - prev_bottom;
1447  if (step < 0)
1448  step = -step;
1449  side_steps.add(step, 1);
1450  prev_bottom = bottom;
1451  }
1452  part->set_side_step(static_cast<int>(side_steps.median() + 0.5));
1453  if (!it.at_last()) {
1454  ColPartition* next_part = it.data_relative(1);
1455  part->set_bottom_spacing(part->median_bottom() -
1456  next_part->median_bottom());
1457  part->set_top_spacing(part->median_top() - next_part->median_top());
1458  } else {
1459  part->set_bottom_spacing(page_height);
1460  part->set_top_spacing(page_height);
1461  }
1462  if (textord_debug_tabfind) {
1463  part->Print();
1464  tprintf("side step = %.2f, top spacing = %d, bottom spacing=%d\n",
1465  side_steps.median(), part->top_spacing(), part->bottom_spacing());
1466  }
1467  ++part_count;
1468  }
1469  if (part_count == 0)
1470  return;
1471 
1472  SmoothSpacings(resolution, page_height, block_parts);
1473 
1474  // Move the partitions into individual block lists and make the blocks.
1475  BLOCK_IT block_it(completed_blocks);
1476  TO_BLOCK_IT to_block_it(to_blocks);
1477  ColPartition_LIST spacing_parts;
1478  ColPartition_IT sp_block_it(&spacing_parts);
1479  int same_block_threshold = max_line_height * kMaxSameBlockLineSpacing;
1480  for (it.mark_cycle_pt(); !it.empty();) {
1481  ColPartition* part = it.extract();
1482  sp_block_it.add_to_end(part);
1483  it.forward();
1484  if (it.empty() || part->bottom_spacing() > same_block_threshold ||
1485  !part->SpacingsEqual(*it.data(), resolution)) {
1486  // There is a spacing boundary. Check to see if it.data() belongs
1487  // better in the current block or the next one.
1488  if (!it.empty() && part->bottom_spacing() <= same_block_threshold) {
1489  ColPartition* next_part = it.data();
1490  // If there is a size match one-way, then the middle line goes with
1491  // its matched size, otherwise it goes with the smallest spacing.
1492  ColPartition* third_part = it.at_last() ? NULL : it.data_relative(1);
1493  if (textord_debug_tabfind) {
1494  tprintf("Spacings unequal: upper:%d/%d, lower:%d/%d,"
1495  " sizes %d %d %d\n",
1496  part->top_spacing(), part->bottom_spacing(),
1497  next_part->top_spacing(), next_part->bottom_spacing(),
1498  part->median_size(), next_part->median_size(),
1499  third_part != NULL ? third_part->median_size() : 0);
1500  }
1501  // We can only consider adding the next line to the block if the sizes
1502  // match and the lines are close enough for their size.
1503  if (part->SizesSimilar(*next_part) &&
1504  next_part->median_size() * kMaxSameBlockLineSpacing >
1505  part->bottom_spacing() &&
1506  part->median_size() * kMaxSameBlockLineSpacing >
1507  part->top_spacing()) {
1508  // Even now, we can only add it as long as the third line doesn't
1509  // match in the same way and have a smaller bottom spacing.
1510  if (third_part == NULL ||
1511  !next_part->SizesSimilar(*third_part) ||
1512  third_part->median_size() * kMaxSameBlockLineSpacing <=
1513  next_part->bottom_spacing() ||
1514  next_part->median_size() * kMaxSameBlockLineSpacing <=
1515  next_part->top_spacing() ||
1516  next_part->bottom_spacing() > part->bottom_spacing()) {
1517  // Add to the current block.
1518  sp_block_it.add_to_end(it.extract());
1519  it.forward();
1520  if (textord_debug_tabfind) {
1521  tprintf("Added line to current block.\n");
1522  }
1523  }
1524  }
1525  }
1526  TO_BLOCK* to_block = MakeBlock(bleft, tright, &spacing_parts, used_parts);
1527  if (to_block != NULL) {
1528  to_block_it.add_to_end(to_block);
1529  block_it.add_to_end(to_block->block);
1530  }
1531  sp_block_it.set_to_list(&spacing_parts);
1532  } else {
1533  if (textord_debug_tabfind && !it.empty()) {
1534  ColPartition* next_part = it.data();
1535  tprintf("Spacings equal: upper:%d/%d, lower:%d/%d\n",
1536  part->top_spacing(), part->bottom_spacing(),
1537  next_part->top_spacing(), next_part->bottom_spacing(),
1538  part->median_size(), next_part->median_size());
1539  }
1540  }
1541  }
1542 }
1543 
1544 // Helper function to clip the input pos to the given bleft, tright bounds.
1545 static void ClipCoord(const ICOORD& bleft, const ICOORD& tright, ICOORD* pos) {
1546  if (pos->x() < bleft.x())
1547  pos->set_x(bleft.x());
1548  if (pos->x() > tright.x())
1549  pos->set_x(tright.x());
1550  if (pos->y() < bleft.y())
1551  pos->set_y(bleft.y());
1552  if (pos->y() > tright.y())
1553  pos->set_y(tright.y());
1554 }
1555 
1556 // Helper moves the blobs from the given list of block_parts into the block
1557 // itself. Sets up the block for (old) textline formation correctly for
1558 // vertical and horizontal text. The partitions are moved to used_parts
1559 // afterwards, as they cannot be deleted yet.
1560 static TO_BLOCK* MoveBlobsToBlock(bool vertical_text, int line_spacing,
1561  BLOCK* block,
1562  ColPartition_LIST* block_parts,
1563  ColPartition_LIST* used_parts) {
1564  // Make a matching TO_BLOCK and put all the BLOBNBOXes from the parts in it.
1565  // Move all the parts to a done list as they are no longer needed, except
1566  // that have have to continue to exist until the part grid is deleted.
1567  // Compute the median blob size as we go, as the block needs to know.
1568  TBOX block_box(block->bounding_box());
1569  STATS sizes(0, MAX(block_box.width(), block_box.height()));
1570  bool text_type = block->poly_block()->IsText();
1571  ColPartition_IT it(block_parts);
1572  TO_BLOCK* to_block = new TO_BLOCK(block);
1573  BLOBNBOX_IT blob_it(&to_block->blobs);
1574  ColPartition_IT used_it(used_parts);
1575  for (it.move_to_first(); !it.empty(); it.forward()) {
1576  ColPartition* part = it.extract();
1577  // Transfer blobs from all regions to the output blocks.
1578  // Blobs for non-text regions will be used to define the polygonal
1579  // bounds of the region.
1580  for (BLOBNBOX_C_IT bb_it(part->boxes()); !bb_it.empty();
1581  bb_it.forward()) {
1582  BLOBNBOX* bblob = bb_it.extract();
1583  if (bblob->owner() != part) {
1584  tprintf("Ownership incorrect for blob:");
1585  bblob->bounding_box().print();
1586  tprintf("Part=");
1587  part->Print();
1588  if (bblob->owner() == NULL) {
1589  tprintf("Not owned\n");
1590  } else {
1591  tprintf("Owner part:");
1592  bblob->owner()->Print();
1593  }
1594  }
1595  ASSERT_HOST(bblob->owner() == part);
1596  // Assert failure here is caused by arbitrarily changing the partition
1597  // type without also changing the blob type, such as in
1598  // InsertSmallBlobsAsUnknowns.
1599  ASSERT_HOST(!text_type || bblob->region_type() >= BRT_UNKNOWN);
1600  C_OUTLINE_LIST* outlines = bblob->cblob()->out_list();
1601  C_OUTLINE_IT ol_it(outlines);
1602  ASSERT_HOST(!text_type || ol_it.data()->pathlength() > 0);
1603  if (vertical_text)
1604  sizes.add(bblob->bounding_box().width(), 1);
1605  else
1606  sizes.add(bblob->bounding_box().height(), 1);
1607  blob_it.add_after_then_move(bblob);
1608  }
1609  used_it.add_to_end(part);
1610  }
1611  if (text_type && blob_it.empty()) {
1612  delete block;
1613  delete to_block;
1614  return NULL;
1615  }
1616  to_block->line_size = sizes.median();
1617  if (vertical_text) {
1618  int block_width = block->bounding_box().width();
1619  if (block_width < line_spacing)
1620  line_spacing = block_width;
1621  to_block->line_spacing = static_cast<float>(line_spacing);
1622  to_block->max_blob_size = static_cast<float>(block_width + 1);
1623  } else {
1624  int block_height = block->bounding_box().height();
1625  if (block_height < line_spacing)
1626  line_spacing = block_height;
1627  to_block->line_spacing = static_cast<float>(line_spacing);
1628  to_block->max_blob_size = static_cast<float>(block_height + 1);
1629  }
1630  return to_block;
1631 }
1632 
1633 // Constructs a block from the given list of partitions.
1634 // Arguments are as LineSpacingBlocks above.
1635 TO_BLOCK* ColPartition::MakeBlock(const ICOORD& bleft, const ICOORD& tright,
1636  ColPartition_LIST* block_parts,
1637  ColPartition_LIST* used_parts) {
1638  if (block_parts->empty())
1639  return NULL; // Nothing to do.
1640  ColPartition_IT it(block_parts);
1641  ColPartition* part = it.data();
1642  PolyBlockType type = part->type();
1643  if (type == PT_VERTICAL_TEXT)
1644  return MakeVerticalTextBlock(bleft, tright, block_parts, used_parts);
1645  // LineSpacingBlocks has handed us a collection of evenly spaced lines and
1646  // put the average spacing in each partition, so we can just take the
1647  // linespacing from the first partition.
1648  int line_spacing = part->bottom_spacing();
1649  if (line_spacing < part->median_size())
1650  line_spacing = part->bounding_box().height();
1651  ICOORDELT_LIST vertices;
1652  ICOORDELT_IT vert_it(&vertices);
1653  ICOORD start, end;
1654  int min_x = MAX_INT32;
1655  int max_x = -MAX_INT32;
1656  int min_y = MAX_INT32;
1657  int max_y = -MAX_INT32;
1658  int iteration = 0;
1659  do {
1660  if (iteration == 0)
1661  ColPartition::LeftEdgeRun(&it, &start, &end);
1662  else
1663  ColPartition::RightEdgeRun(&it, &start, &end);
1664  ClipCoord(bleft, tright, &start);
1665  ClipCoord(bleft, tright, &end);
1666  vert_it.add_after_then_move(new ICOORDELT(start));
1667  vert_it.add_after_then_move(new ICOORDELT(end));
1668  UpdateRange(start.x(), &min_x, &max_x);
1669  UpdateRange(end.x(), &min_x, &max_x);
1670  UpdateRange(start.y(), &min_y, &max_y);
1671  UpdateRange(end.y(), &min_y, &max_y);
1672  if ((iteration == 0 && it.at_first()) ||
1673  (iteration == 1 && it.at_last())) {
1674  ++iteration;
1675  it.move_to_last();
1676  }
1677  } while (iteration < 2);
1679  tprintf("Making block at (%d,%d)->(%d,%d)\n",
1680  min_x, min_y, max_x, max_y);
1681  BLOCK* block = new BLOCK("", true, 0, 0, min_x, min_y, max_x, max_y);
1682  block->set_poly_block(new POLY_BLOCK(&vertices, type));
1683  return MoveBlobsToBlock(false, line_spacing, block, block_parts, used_parts);
1684 }
1685 
1686 // Constructs a block from the given list of vertical text partitions.
1687 // Currently only creates rectangular blocks.
1689  const ICOORD& tright,
1690  ColPartition_LIST* block_parts,
1691  ColPartition_LIST* used_parts) {
1692  if (block_parts->empty())
1693  return NULL; // Nothing to do.
1694  ColPartition_IT it(block_parts);
1695  ColPartition* part = it.data();
1696  TBOX block_box = part->bounding_box();
1697  int line_spacing = block_box.width();
1698  PolyBlockType type = it.data()->type();
1699  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1700  block_box += it.data()->bounding_box();
1701  }
1702  if (textord_debug_tabfind) {
1703  tprintf("Making block at:");
1704  block_box.print();
1705  }
1706  BLOCK* block = new BLOCK("", true, 0, 0, block_box.left(), block_box.bottom(),
1707  block_box.right(), block_box.top());
1708  block->set_poly_block(new POLY_BLOCK(block_box, type));
1709  return MoveBlobsToBlock(true, line_spacing, block, block_parts, used_parts);
1710 }
1711 
1712 // Makes a TO_ROW matching this and moves all the blobs to it, transferring
1713 // ownership to to returned TO_ROW.
1715  BLOBNBOX_C_IT blob_it(&boxes_);
1716  TO_ROW* row = NULL;
1717  int line_size = IsVerticalType() ? median_width_ : median_size_;
1718  // Add all the blobs to a single TO_ROW.
1719  for (; !blob_it.empty(); blob_it.forward()) {
1720  BLOBNBOX* blob = blob_it.extract();
1721 // blob->compute_bounding_box();
1722  int top = blob->bounding_box().top();
1723  int bottom = blob->bounding_box().bottom();
1724  if (row == NULL) {
1725  row = new TO_ROW(blob, static_cast<float>(top),
1726  static_cast<float>(bottom),
1727  static_cast<float>(line_size));
1728  } else {
1729  row->add_blob(blob, static_cast<float>(top),
1730  static_cast<float>(bottom),
1731  static_cast<float>(line_size));
1732  }
1733  }
1734  return row;
1735 }
1736 
1737 // Returns a copy of everything except the list of boxes. The resulting
1738 // ColPartition is only suitable for keeping in a column candidate list.
1740  ColPartition* part = new ColPartition(blob_type_, vertical_);
1741  part->left_margin_ = left_margin_;
1742  part->right_margin_ = right_margin_;
1743  part->bounding_box_ = bounding_box_;
1744  memcpy(part->special_blobs_densities_, special_blobs_densities_,
1745  sizeof(special_blobs_densities_));
1746  part->median_bottom_ = median_bottom_;
1747  part->median_top_ = median_top_;
1748  part->median_size_ = median_size_;
1749  part->median_left_ = median_left_;
1750  part->median_right_ = median_right_;
1751  part->median_width_ = median_width_;
1752  part->good_width_ = good_width_;
1753  part->good_column_ = good_column_;
1754  part->left_key_tab_ = left_key_tab_;
1755  part->right_key_tab_ = right_key_tab_;
1756  part->type_ = type_;
1757  part->flow_ = flow_;
1758  part->left_key_ = left_key_;
1759  part->right_key_ = right_key_;
1760  part->first_column_ = first_column_;
1761  part->last_column_ = last_column_;
1762  part->owns_blobs_ = false;
1763  return part;
1764 }
1765 
1767  ColPartition* copy = ShallowCopy();
1768  copy->set_owns_blobs(false);
1769  BLOBNBOX_C_IT inserter(copy->boxes());
1770  BLOBNBOX_C_IT traverser(boxes());
1771  for (traverser.mark_cycle_pt(); !traverser.cycled_list(); traverser.forward())
1772  inserter.add_after_then_move(traverser.data());
1773  return copy;
1774 }
1775 
1776 #ifndef GRAPHICS_DISABLED
1777 // Provides a color for BBGrid to draw the rectangle.
1778 // Must be kept in sync with PolyBlockType.
1780  if (type_ == PT_UNKNOWN)
1781  return BLOBNBOX::TextlineColor(blob_type_, flow_);
1782  return POLY_BLOCK::ColorForPolyBlockType(type_);
1783 }
1784 #endif // GRAPHICS_DISABLED
1785 
1786 // Keep in sync with BlobRegionType.
1787 static char kBlobTypes[BRT_COUNT + 1] = "NHSRIUVT";
1788 
1789 // Prints debug information on this.
1790 void ColPartition::Print() const {
1791  int y = MidY();
1792  tprintf("ColPart:%c(M%d-%c%d-B%d/%d,%d/%d)->(%dB-%d%c-%dM/%d,%d/%d)"
1793  " w-ok=%d, v-ok=%d, type=%d%c%d, fc=%d, lc=%d, boxes=%d"
1794  " ts=%d bs=%d ls=%d rs=%d\n",
1795  boxes_.empty() ? 'E' : ' ',
1796  left_margin_, left_key_tab_ ? 'T' : 'B', LeftAtY(y),
1797  bounding_box_.left(), median_left_,
1798  bounding_box_.bottom(), median_bottom_,
1799  bounding_box_.right(), RightAtY(y), right_key_tab_ ? 'T' : 'B',
1800  right_margin_, median_right_, bounding_box_.top(), median_top_,
1801  good_width_, good_column_, type_,
1802  kBlobTypes[blob_type_], flow_,
1803  first_column_, last_column_, boxes_.length(),
1804  space_above_, space_below_, space_to_left_, space_to_right_);
1805 }
1806 
1807 // Prints debug information on the colors.
1809  tprintf("Colors:(%d, %d, %d)%d -> (%d, %d, %d)\n",
1810  color1_[COLOR_RED], color1_[COLOR_GREEN], color1_[COLOR_BLUE],
1811  color1_[L_ALPHA_CHANNEL],
1812  color2_[COLOR_RED], color2_[COLOR_GREEN], color2_[COLOR_BLUE]);
1813 }
1814 
1815 // Sets the types of all partitions in the run to be the max of the types.
1816 void ColPartition::SmoothPartnerRun(int working_set_count) {
1817  STATS left_stats(0, working_set_count);
1818  STATS right_stats(0, working_set_count);
1819  PolyBlockType max_type = type_;
1820  ColPartition* partner;
1821  for (partner = SingletonPartner(false); partner != NULL;
1822  partner = partner->SingletonPartner(false)) {
1823  if (partner->type_ > max_type)
1824  max_type = partner->type_;
1825  if (column_set_ == partner->column_set_) {
1826  left_stats.add(partner->first_column_, 1);
1827  right_stats.add(partner->last_column_, 1);
1828  }
1829  }
1830  type_ = max_type;
1831  // TODO(rays) Either establish that it isn't necessary to set the columns,
1832  // or find a way to do it that does not cause an assert failure in
1833  // AddToWorkingSet.
1834 #if 0
1835  first_column_ = left_stats.mode();
1836  last_column_ = right_stats.mode();
1837  if (last_column_ < first_column_)
1838  last_column_ = first_column_;
1839 #endif
1840 
1841  for (partner = SingletonPartner(false); partner != NULL;
1842  partner = partner->SingletonPartner(false)) {
1843  partner->type_ = max_type;
1844 #if 0 // See TODO above
1845  if (column_set_ == partner->column_set_) {
1846  partner->first_column_ = first_column_;
1847  partner->last_column_ = last_column_;
1848  }
1849 #endif
1850  }
1851 }
1852 
1853 // ======= Scenario common to all Refine*Partners* functions =======
1854 // ColPartitions are aiming to represent textlines, or horizontal slices
1855 // of images, and we are trying to form bi-directional (upper/lower) chains
1856 // of UNIQUE partner ColPartitions that can be made into blocks.
1857 // The ColPartitions have previously been typed (see SetPartitionType)
1858 // according to a combination of the content type and
1859 // how they lie on the columns. We want to chain text into
1860 // groups of a single type, but image ColPartitions may have been typed
1861 // differently in different parts of the image, due to being non-rectangular.
1862 //
1863 // We previously ran a search for upper and lower partners, but there may
1864 // be more than one, and they may be of mixed types, so now we wish to
1865 // refine the partners down to at most one.
1866 // A heading may have multiple partners:
1867 // ===============================
1868 // ======== ========== =========
1869 // ======== ========== =========
1870 // but it should be a different type.
1871 // A regular flowing text line may have multiple partners:
1872 // ================== ===================
1873 // ======= ================= ===========
1874 // This could be the start of a pull-out, or it might all be in a single
1875 // column and might be caused by tightly spaced text, bold words, bullets,
1876 // funny punctuation etc, all of which can cause textlines to be split into
1877 // multiple ColPartitions. Pullouts and figure captions should now be different
1878 // types so we can more aggressively merge groups of partners that all sit
1879 // in a single column.
1880 //
1881 // Cleans up the partners of the given type so that there is at most
1882 // one partner. This makes block creation simpler.
1883 // If get_desperate is true, goes to more desperate merge methods
1884 // to merge flowing text before breaking partnerships.
1885 void ColPartition::RefinePartners(PolyBlockType type, bool get_desperate,
1886  ColPartitionGrid* grid) {
1887  if (TypesSimilar(type_, type)) {
1888  RefinePartnersInternal(true, get_desperate, grid);
1889  RefinePartnersInternal(false, get_desperate, grid);
1890  } else if (type == PT_COUNT) {
1891  // This is the final pass. Make sure only the correctly typed
1892  // partners surivive, however many there are.
1893  RefinePartnersByType(true, &upper_partners_);
1894  RefinePartnersByType(false, &lower_partners_);
1895  // It is possible for a merge to have given a partition multiple
1896  // partners again, so the last resort is to use overlap which is
1897  // guaranteed to leave at most one partner left.
1898  if (!upper_partners_.empty() && !upper_partners_.singleton())
1899  RefinePartnersByOverlap(true, &upper_partners_);
1900  if (!lower_partners_.empty() && !lower_partners_.singleton())
1901  RefinePartnersByOverlap(false, &lower_partners_);
1902  }
1903 }
1904 
1906 
1907 // Cleans up the partners above if upper is true, else below.
1908 // If get_desperate is true, goes to more desperate merge methods
1909 // to merge flowing text before breaking partnerships.
1910 void ColPartition::RefinePartnersInternal(bool upper, bool get_desperate,
1911  ColPartitionGrid* grid) {
1912  ColPartition_CLIST* partners = upper ? &upper_partners_ : &lower_partners_;
1913  if (!partners->empty() && !partners->singleton()) {
1914  RefinePartnersByType(upper, partners);
1915  if (!partners->empty() && !partners->singleton()) {
1916  // Check for transitive partnerships and break the cycle.
1917  RefinePartnerShortcuts(upper, partners);
1918  if (!partners->empty() && !partners->singleton()) {
1919  // Types didn't fix it. Flowing text keeps the one with the longest
1920  // sequence of singleton matching partners. All others max overlap.
1921  if (TypesSimilar(type_, PT_FLOWING_TEXT) && get_desperate) {
1922  RefineTextPartnersByMerge(upper, false, partners, grid);
1923  if (!partners->empty() && !partners->singleton())
1924  RefineTextPartnersByMerge(upper, true, partners, grid);
1925  }
1926  // The last resort is to use overlap.
1927  if (!partners->empty() && !partners->singleton())
1928  RefinePartnersByOverlap(upper, partners);
1929  }
1930  }
1931  }
1932 }
1933 
1934 // Cleans up the partners above if upper is true, else below.
1935 // Restricts the partners to only desirable types. For text and BRT_HLINE this
1936 // means the same type_ , and for image types it means any image type.
1937 void ColPartition::RefinePartnersByType(bool upper,
1938  ColPartition_CLIST* partners) {
1939  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
1940  bounding_box_.bottom());
1941  if (debug) {
1942  tprintf("Refining %d %s partners by type for:\n",
1943  partners->length(), upper ? "Upper" : "Lower");
1944  Print();
1945  }
1946  ColPartition_C_IT it(partners);
1947  // Purify text by type.
1948  if (!IsImageType() && !IsLineType() && type() != PT_TABLE) {
1949  // Keep only partners matching type_.
1950  // Exception: PT_VERTICAL_TEXT is allowed to stay with the other
1951  // text types if it is the only partner.
1952  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1953  ColPartition* partner = it.data();
1954  if (!TypesSimilar(type_, partner->type_)) {
1955  if (debug) {
1956  tprintf("Removing partner:");
1957  partner->Print();
1958  }
1959  partner->RemovePartner(!upper, this);
1960  it.extract();
1961  } else if (debug) {
1962  tprintf("Keeping partner:");
1963  partner->Print();
1964  }
1965  }
1966  } else {
1967  // Only polyimages are allowed to have partners of any kind!
1968  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1969  ColPartition* partner = it.data();
1970  if (partner->blob_type() != BRT_POLYIMAGE ||
1971  blob_type() != BRT_POLYIMAGE) {
1972  if (debug) {
1973  tprintf("Removing partner:");
1974  partner->Print();
1975  }
1976  partner->RemovePartner(!upper, this);
1977  it.extract();
1978  } else if (debug) {
1979  tprintf("Keeping partner:");
1980  partner->Print();
1981  }
1982  }
1983  }
1984 }
1985 
1986 // Cleans up the partners above if upper is true, else below.
1987 // Remove transitive partnerships: this<->a, and a<->b and this<->b.
1988 // Gets rid of this<->b, leaving a clean chain.
1989 // Also if we have this<->a and a<->this, then gets rid of this<->a, as
1990 // this has multiple partners.
1991 void ColPartition::RefinePartnerShortcuts(bool upper,
1992  ColPartition_CLIST* partners) {
1993  bool done_any = false;
1994  do {
1995  done_any = false;
1996  ColPartition_C_IT it(partners);
1997  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1998  ColPartition* a = it.data();
1999  // Check for a match between all of a's partners (it1/b1) and all
2000  // of this's partners (it2/b2).
2001  ColPartition_C_IT it1(upper ? &a->upper_partners_ : &a->lower_partners_);
2002  for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
2003  ColPartition* b1 = it1.data();
2004  if (b1 == this) {
2005  done_any = true;
2006  it.extract();
2007  a->RemovePartner(!upper, this);
2008  break;
2009  }
2010  ColPartition_C_IT it2(partners);
2011  for (it2.mark_cycle_pt(); !it2.cycled_list(); it2.forward()) {
2012  ColPartition* b2 = it2.data();
2013  if (b1 == b2) {
2014  // Jackpot! b2 should not be a partner of this.
2015  it2.extract();
2016  b2->RemovePartner(!upper, this);
2017  done_any = true;
2018  // That potentially invalidated all the iterators, so break out
2019  // and start again.
2020  break;
2021  }
2022  }
2023  if (done_any)
2024  break;
2025  }
2026  if (done_any)
2027  break;
2028  }
2029  } while (done_any && !partners->empty() && !partners->singleton());
2030 }
2031 
2032 // Cleans up the partners above if upper is true, else below.
2033 // If multiple text partners can be merged, (with each other, NOT with this),
2034 // then do so.
2035 // If desperate is true, then an increase in overlap with the merge is
2036 // allowed. If the overlap increases, then the desperately_merged_ flag
2037 // is set, indicating that the textlines probably need to be regenerated
2038 // by aggressive line fitting/splitting, as there are probably vertically
2039 // joined blobs that cross textlines.
2040 void ColPartition::RefineTextPartnersByMerge(bool upper, bool desperate,
2041  ColPartition_CLIST* partners,
2042  ColPartitionGrid* grid) {
2043  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
2044  bounding_box_.bottom());
2045  if (debug) {
2046  tprintf("Refining %d %s partners by merge for:\n",
2047  partners->length(), upper ? "Upper" : "Lower");
2048  Print();
2049  }
2050  while (!partners->empty() && !partners->singleton()) {
2051  // Absorb will mess up the iterators, so we have to merge one partition
2052  // at a time and rebuild the iterators each time.
2053  ColPartition_C_IT it(partners);
2054  ColPartition* part = it.data();
2055  // Gather a list of merge candidates, from the list of partners, that
2056  // are all in the same single column. See general scenario comment above.
2057  ColPartition_CLIST candidates;
2058  ColPartition_C_IT cand_it(&candidates);
2059  for (it.forward(); !it.at_first(); it.forward()) {
2060  ColPartition* candidate = it.data();
2061  if (part->first_column_ == candidate->last_column_ &&
2062  part->last_column_ == candidate->first_column_)
2063  cand_it.add_after_then_move(it.data());
2064  }
2065  int overlap_increase;
2066  ColPartition* candidate = grid->BestMergeCandidate(part, &candidates, debug,
2067  NULL, &overlap_increase);
2068  if (candidate != NULL && (overlap_increase <= 0 || desperate)) {
2069  if (debug) {
2070  tprintf("Merging:hoverlap=%d, voverlap=%d, OLI=%d\n",
2071  part->HCoreOverlap(*candidate), part->VCoreOverlap(*candidate),
2072  overlap_increase);
2073  }
2074  // Remove before merge and re-insert to keep the integrity of the grid.
2075  grid->RemoveBBox(candidate);
2076  grid->RemoveBBox(part);
2077  part->Absorb(candidate, NULL);
2078  // We modified the box of part, so re-insert it into the grid.
2079  grid->InsertBBox(true, true, part);
2080  if (overlap_increase > 0)
2081  part->desperately_merged_ = true;
2082  } else {
2083  break; // Can't merge.
2084  }
2085  }
2086 }
2087 
2088 // Cleans up the partners above if upper is true, else below.
2089 // Keep the partner with the biggest overlap.
2090 void ColPartition::RefinePartnersByOverlap(bool upper,
2091  ColPartition_CLIST* partners) {
2092  bool debug = TabFind::WithinTestRegion(2, bounding_box_.left(),
2093  bounding_box_.bottom());
2094  if (debug) {
2095  tprintf("Refining %d %s partners by overlap for:\n",
2096  partners->length(), upper ? "Upper" : "Lower");
2097  Print();
2098  }
2099  ColPartition_C_IT it(partners);
2100  ColPartition* best_partner = it.data();
2101  // Find the partner with the best overlap.
2102  int best_overlap = 0;
2103  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2104  ColPartition* partner = it.data();
2105  int overlap = MIN(bounding_box_.right(), partner->bounding_box_.right())
2106  - MAX(bounding_box_.left(), partner->bounding_box_.left());
2107  if (overlap > best_overlap) {
2108  best_overlap = overlap;
2109  best_partner = partner;
2110  }
2111  }
2112  // Keep only the best partner.
2113  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2114  ColPartition* partner = it.data();
2115  if (partner != best_partner) {
2116  if (debug) {
2117  tprintf("Removing partner:");
2118  partner->Print();
2119  }
2120  partner->RemovePartner(!upper, this);
2121  it.extract();
2122  }
2123  }
2124 }
2125 
2126 // Return true if bbox belongs better in this than other.
2127 bool ColPartition::ThisPartitionBetter(BLOBNBOX* bbox,
2128  const ColPartition& other) {
2129  TBOX box = bbox->bounding_box();
2130  // Margins take priority.
2131  int left = box.left();
2132  int right = box.right();
2133  if (left < left_margin_ || right > right_margin_)
2134  return false;
2135  if (left < other.left_margin_ || right > other.right_margin_)
2136  return true;
2137  int top = box.top();
2138  int bottom = box.bottom();
2139  int this_overlap = MIN(top, median_top_) - MAX(bottom, median_bottom_);
2140  int other_overlap = MIN(top, other.median_top_) -
2141  MAX(bottom, other.median_bottom_);
2142  int this_miss = median_top_ - median_bottom_ - this_overlap;
2143  int other_miss = other.median_top_ - other.median_bottom_ - other_overlap;
2144  if (TabFind::WithinTestRegion(3, box.left(), box.bottom())) {
2145  tprintf("Unique on (%d,%d)->(%d,%d) overlap %d/%d, miss %d/%d, mt=%d/%d\n",
2146  box.left(), box.bottom(), box.right(), box.top(),
2147  this_overlap, other_overlap, this_miss, other_miss,
2148  median_top_, other.median_top_);
2149  }
2150  if (this_miss < other_miss)
2151  return true;
2152  if (this_miss > other_miss)
2153  return false;
2154  if (this_overlap > other_overlap)
2155  return true;
2156  if (this_overlap < other_overlap)
2157  return false;
2158  return median_top_ >= other.median_top_;
2159 }
2160 
2161 // Returns the median line-spacing between the current position and the end
2162 // of the list.
2163 // The iterator is passed by value so the iteration does not modify the
2164 // caller's iterator.
2165 static int MedianSpacing(int page_height, ColPartition_IT it) {
2166  STATS stats(0, page_height);
2167  while (!it.cycled_list()) {
2168  ColPartition* part = it.data();
2169  it.forward();
2170  stats.add(part->bottom_spacing(), 1);
2171  stats.add(part->top_spacing(), 1);
2172  }
2173  return static_cast<int>(stats.median() + 0.5);
2174 }
2175 
2176 // Returns true if this column partition is in the same column as
2177 // part. This function will only work after the SetPartitionType function
2178 // has been called on both column partitions. This is useful for
2179 // doing a SideSearch when you want things in the same page column.
2180 //
2181 // Currently called by the table detection code to identify if potential table
2182 // partitions exist in the same column.
2184  // Overlap does not occur when last < part.first or first > part.last.
2185  // In other words, one is completely to the side of the other.
2186  // This is just DeMorgan's law applied to that so the function returns true.
2187  return (last_column_ >= part.first_column_) &&
2188  (first_column_ <= part.last_column_);
2189 }
2190 
2191 // Smoothes the spacings in the list into groups of equal linespacing.
2192 // resolution is the resolution of the original image, used as a basis
2193 // for thresholds in change of spacing. page_height is in pixels.
2194 void ColPartition::SmoothSpacings(int resolution, int page_height,
2195  ColPartition_LIST* parts) {
2196  // The task would be trivial if we didn't have to allow for blips -
2197  // occasional offsets in spacing caused by anomolous text, such as all
2198  // caps, groups of descenders, joined words, Arabic etc.
2199  // The neighbourhood stores a consecutive group of partitions so that
2200  // blips can be detected correctly, yet conservatively enough to not
2201  // mistake genuine spacing changes for blips. See example below.
2202  ColPartition* neighbourhood[PN_COUNT];
2203  ColPartition_IT it(parts);
2204  it.mark_cycle_pt();
2205  // Although we know nothing about the spacings is this list, the median is
2206  // used as an approximation to allow blips.
2207  // If parts of this block aren't spaced to the median, then we can't
2208  // accept blips in those parts, but we'll recalculate it each time we
2209  // split the block, so the median becomes more likely to match all the text.
2210  int median_space = MedianSpacing(page_height, it);
2211  ColPartition_IT start_it(it);
2212  ColPartition_IT end_it(it);
2213  for (int i = 0; i < PN_COUNT; ++i) {
2214  if (i < PN_UPPER || it.cycled_list()) {
2215  neighbourhood[i] = NULL;
2216  } else {
2217  if (i == PN_LOWER)
2218  end_it = it;
2219  neighbourhood[i] = it.data();
2220  it.forward();
2221  }
2222  }
2223  while (neighbourhood[PN_UPPER] != NULL) {
2224  // Test for end of a group. Normally SpacingsEqual is true within a group,
2225  // but in the case of a blip, it will be false. Here is an example:
2226  // Line enum Spacing below (spacing between tops of lines)
2227  // 1 ABOVE2 20
2228  // 2 ABOVE1 20
2229  // 3 UPPER 15
2230  // 4 LOWER 25
2231  // 5 BELOW1 20
2232  // 6 BELOW2 20
2233  // Line 4 is all in caps (regular caps), so the spacing between line 3
2234  // and line 4 (looking at the tops) is smaller than normal, and the
2235  // spacing between line 4 and line 5 is larger than normal, but the
2236  // two of them add to twice the normal spacing.
2237  // The following if has to accept unequal spacings 3 times to pass the
2238  // blip (20/15, 15/25 and 25/20)
2239  // When the blip is in the middle, OKSpacingBlip tests that one of
2240  // ABOVE1 and BELOW1 matches the median.
2241  // The first time, everything is shifted down 1, so we present
2242  // OKSpacingBlip with neighbourhood+1 and check that PN_UPPER is median.
2243  // The last time, everything is shifted up 1, so we present OKSpacingBlip
2244  // with neighbourhood-1 and check that PN_LOWER matches the median.
2245  if (neighbourhood[PN_LOWER] == NULL ||
2246  (!neighbourhood[PN_UPPER]->SpacingsEqual(*neighbourhood[PN_LOWER],
2247  resolution) &&
2248  !OKSpacingBlip(resolution, median_space, neighbourhood) &&
2249  (!OKSpacingBlip(resolution, median_space, neighbourhood - 1) ||
2250  !neighbourhood[PN_LOWER]->SpacingEqual(median_space, resolution)) &&
2251  (!OKSpacingBlip(resolution, median_space, neighbourhood + 1) ||
2252  !neighbourhood[PN_UPPER]->SpacingEqual(median_space, resolution)))) {
2253  // The group has ended. PN_UPPER is the last member.
2254  // Compute the mean spacing over the group.
2255  ColPartition_IT sum_it(start_it);
2256  ColPartition* last_part = neighbourhood[PN_UPPER];
2257  double total_bottom = 0.0;
2258  double total_top = 0.0;
2259  int total_count = 0;
2260  ColPartition* upper = sum_it.data();
2261  // We do not process last_part, as its spacing is different.
2262  while (upper != last_part) {
2263  total_bottom += upper->bottom_spacing();
2264  total_top += upper->top_spacing();
2265  ++total_count;
2266  sum_it.forward();
2267  upper = sum_it.data();
2268  }
2269  if (total_count > 0) {
2270  // There were at least 2 lines, so set them all to the mean.
2271  int top_spacing = static_cast<int>(total_top / total_count + 0.5);
2272  int bottom_spacing = static_cast<int>(total_bottom / total_count + 0.5);
2273  if (textord_debug_tabfind) {
2274  tprintf("Spacing run ended. Cause:");
2275  if (neighbourhood[PN_LOWER] == NULL) {
2276  tprintf("No more lines\n");
2277  } else {
2278  tprintf("Spacing change. Spacings:\n");
2279  for (int i = 0; i < PN_COUNT; ++i) {
2280  if (neighbourhood[i] == NULL) {
2281  tprintf("NULL");
2282  if (i > 0 && neighbourhood[i - 1] != NULL) {
2283  if (neighbourhood[i - 1]->SingletonPartner(false) != NULL) {
2284  tprintf(" Lower partner:");
2285  neighbourhood[i - 1]->SingletonPartner(false)->Print();
2286  } else {
2287  tprintf(" NULL lower partner:\n");
2288  }
2289  } else {
2290  tprintf("\n");
2291  }
2292  } else {
2293  tprintf("Top = %d, bottom = %d\n",
2294  neighbourhood[i]->top_spacing(),
2295  neighbourhood[i]->bottom_spacing());
2296  }
2297  }
2298  }
2299  tprintf("Mean spacing = %d/%d\n", top_spacing, bottom_spacing);
2300  }
2301  sum_it = start_it;
2302  upper = sum_it.data();
2303  while (upper != last_part) {
2304  upper->set_top_spacing(top_spacing);
2305  upper->set_bottom_spacing(bottom_spacing);
2306  if (textord_debug_tabfind) {
2307  tprintf("Setting mean on:");
2308  upper->Print();
2309  }
2310  sum_it.forward();
2311  upper = sum_it.data();
2312  }
2313  }
2314  // PN_LOWER starts the next group and end_it is the next start_it.
2315  start_it = end_it;
2316  // Recalculate the median spacing to maximize the chances of detecting
2317  // spacing blips.
2318  median_space = MedianSpacing(page_height, end_it);
2319  }
2320  // Shuffle pointers.
2321  for (int j = 1; j < PN_COUNT; ++j) {
2322  neighbourhood[j - 1] = neighbourhood[j];
2323  }
2324  if (it.cycled_list()) {
2325  neighbourhood[PN_COUNT - 1] = NULL;
2326  } else {
2327  neighbourhood[PN_COUNT - 1] = it.data();
2328  it.forward();
2329  }
2330  end_it.forward();
2331  }
2332 }
2333 
2334 // Returns true if the parts array of pointers to partitions matches the
2335 // condition for a spacing blip. See SmoothSpacings for what this means
2336 // and how it is used.
2337 bool ColPartition::OKSpacingBlip(int resolution, int median_spacing,
2338  ColPartition** parts) {
2339  if (parts[PN_UPPER] == NULL || parts[PN_LOWER] == NULL)
2340  return false;
2341  // The blip is OK if upper and lower sum to an OK value and at least
2342  // one of above1 and below1 is equal to the median.
2343  return parts[PN_UPPER]->SummedSpacingOK(*parts[PN_LOWER],
2344  median_spacing, resolution) &&
2345  ((parts[PN_ABOVE1] != NULL &&
2346  parts[PN_ABOVE1]->SpacingEqual(median_spacing, resolution)) ||
2347  (parts[PN_BELOW1] != NULL &&
2348  parts[PN_BELOW1]->SpacingEqual(median_spacing, resolution)));
2349 }
2350 
2351 // Returns true if both the top and bottom spacings of this match the given
2352 // spacing to within suitable margins dictated by the image resolution.
2353 bool ColPartition::SpacingEqual(int spacing, int resolution) const {
2354  int bottom_error = BottomSpacingMargin(resolution);
2355  int top_error = TopSpacingMargin(resolution);
2356  return NearlyEqual(bottom_spacing_, spacing, bottom_error) &&
2357  NearlyEqual(top_spacing_, spacing, top_error);
2358 }
2359 
2360 // Returns true if both the top and bottom spacings of this and other
2361 // match to within suitable margins dictated by the image resolution.
2362 bool ColPartition::SpacingsEqual(const ColPartition& other,
2363  int resolution) const {
2364  int bottom_error = MAX(BottomSpacingMargin(resolution),
2365  other.BottomSpacingMargin(resolution));
2366  int top_error = MAX(TopSpacingMargin(resolution),
2367  other.TopSpacingMargin(resolution));
2368  return NearlyEqual(bottom_spacing_, other.bottom_spacing_, bottom_error) &&
2369  (NearlyEqual(top_spacing_, other.top_spacing_, top_error) ||
2370  NearlyEqual(top_spacing_ + other.top_spacing_, bottom_spacing_ * 2,
2371  bottom_error));
2372 }
2373 
2374 // Returns true if the sum spacing of this and other match the given
2375 // spacing (or twice the given spacing) to within a suitable margin dictated
2376 // by the image resolution.
2377 bool ColPartition::SummedSpacingOK(const ColPartition& other,
2378  int spacing, int resolution) const {
2379  int bottom_error = MAX(BottomSpacingMargin(resolution),
2380  other.BottomSpacingMargin(resolution));
2381  int top_error = MAX(TopSpacingMargin(resolution),
2382  other.TopSpacingMargin(resolution));
2383  int bottom_total = bottom_spacing_ + other.bottom_spacing_;
2384  int top_total = top_spacing_ + other.top_spacing_;
2385  return (NearlyEqual(spacing, bottom_total, bottom_error) &&
2386  NearlyEqual(spacing, top_total, top_error)) ||
2387  (NearlyEqual(spacing * 2, bottom_total, bottom_error) &&
2388  NearlyEqual(spacing * 2, top_total, top_error));
2389 }
2390 
2391 // Returns a suitable spacing margin that can be applied to bottoms of
2392 // text lines, based on the resolution and the stored side_step_.
2393 int ColPartition::BottomSpacingMargin(int resolution) const {
2394  return static_cast<int>(kMaxSpacingDrift * resolution + 0.5) + side_step_;
2395 }
2396 
2397 // Returns a suitable spacing margin that can be applied to tops of
2398 // text lines, based on the resolution and the stored side_step_.
2399 int ColPartition::TopSpacingMargin(int resolution) const {
2400  return static_cast<int>(kMaxTopSpacingFraction * median_size_ + 0.5) +
2401  BottomSpacingMargin(resolution);
2402 }
2403 
2404 // Returns true if the median text sizes of this and other agree to within
2405 // a reasonable multiplicative factor.
2406 bool ColPartition::SizesSimilar(const ColPartition& other) const {
2407  return median_size_ <= other.median_size_ * kMaxSizeRatio &&
2408  other.median_size_ <= median_size_ * kMaxSizeRatio;
2409 }
2410 
2411 // Helper updates margin_left and margin_right, being the bounds of the left
2412 // margin of part of a block. Returns false and does not update the bounds if
2413 // this partition has a disjoint margin with the established margin.
2414 static bool UpdateLeftMargin(const ColPartition& part,
2415  int* margin_left, int* margin_right) {
2416  const TBOX& part_box = part.bounding_box();
2417  int top = part_box.top();
2418  int bottom = part_box.bottom();
2419  int tl_key = part.SortKey(part.left_margin(), top);
2420  int tr_key = part.SortKey(part_box.left(), top);
2421  int bl_key = part.SortKey(part.left_margin(), bottom);
2422  int br_key = part.SortKey(part_box.left(), bottom);
2423  int left_key = MAX(tl_key, bl_key);
2424  int right_key = MIN(tr_key, br_key);
2425  if (left_key <= *margin_right && right_key >= *margin_left) {
2426  // This part is good - let's keep it.
2427  *margin_right = MIN(*margin_right, right_key);
2428  *margin_left = MAX(*margin_left, left_key);
2429  return true;
2430  }
2431  return false;
2432 }
2433 
2434 // Computes and returns in start, end a line segment formed from a
2435 // forwards-iterated group of left edges of partitions that satisfy the
2436 // condition that the intersection of the left margins is non-empty, ie the
2437 // rightmost left margin is to the left of the leftmost left bounding box edge.
2438 // On return the iterator is set to the start of the next run.
2439 void ColPartition::LeftEdgeRun(ColPartition_IT* part_it,
2440  ICOORD* start, ICOORD* end) {
2441  ColPartition* part = part_it->data();
2442  ColPartition* start_part = part;
2443  int start_y = part->bounding_box_.top();
2444  if (!part_it->at_first()) {
2445  int prev_bottom = part_it->data_relative(-1)->bounding_box_.bottom();
2446  if (prev_bottom < start_y)
2447  start_y = prev_bottom;
2448  else if (prev_bottom > start_y)
2449  start_y = (start_y + prev_bottom) / 2;
2450  }
2451  int end_y = part->bounding_box_.bottom();
2452  int margin_right = MAX_INT32;
2453  int margin_left = -MAX_INT32;
2454  UpdateLeftMargin(*part, &margin_left, &margin_right);
2455  do {
2456  part_it->forward();
2457  part = part_it->data();
2458  } while (!part_it->at_first() &&
2459  UpdateLeftMargin(*part, &margin_left, &margin_right));
2460  // The run ended. If we were pushed inwards, compute the next run and
2461  // extend it backwards into the run we just calculated to find the end of
2462  // this run that provides a tight box.
2463  int next_margin_right = MAX_INT32;
2464  int next_margin_left = -MAX_INT32;
2465  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right);
2466  if (next_margin_left > margin_right) {
2467  ColPartition_IT next_it(*part_it);
2468  do {
2469  next_it.forward();
2470  part = next_it.data();
2471  } while (!next_it.at_first() &&
2472  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
2473  // Now extend the next run backwards into the original run to get the
2474  // tightest fit.
2475  do {
2476  part_it->backward();
2477  part = part_it->data();
2478  } while (part != start_part &&
2479  UpdateLeftMargin(*part, &next_margin_left, &next_margin_right));
2480  part_it->forward();
2481  }
2482  // Now calculate the end_y.
2483  part = part_it->data_relative(-1);
2484  end_y = part->bounding_box_.bottom();
2485  if (!part_it->at_first() && part_it->data()->bounding_box_.top() < end_y)
2486  end_y = (end_y + part_it->data()->bounding_box_.top()) / 2;
2487  start->set_y(start_y);
2488  start->set_x(part->XAtY(margin_right, start_y));
2489  end->set_y(end_y);
2490  end->set_x(part->XAtY(margin_right, end_y));
2491  if (textord_debug_tabfind && !part_it->at_first())
2492  tprintf("Left run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2493  start_y, end_y, part->XAtY(margin_left, end_y),
2494  end->x(), part->left_margin_, part->bounding_box_.left());
2495 }
2496 
2497 // Helper updates margin_left and margin_right, being the bounds of the right
2498 // margin of part of a block. Returns false and does not update the bounds if
2499 // this partition has a disjoint margin with the established margin.
2500 static bool UpdateRightMargin(const ColPartition& part,
2501  int* margin_left, int* margin_right) {
2502  const TBOX& part_box = part.bounding_box();
2503  int top = part_box.top();
2504  int bottom = part_box.bottom();
2505  int tl_key = part.SortKey(part_box.right(), top);
2506  int tr_key = part.SortKey(part.right_margin(), top);
2507  int bl_key = part.SortKey(part_box.right(), bottom);
2508  int br_key = part.SortKey(part.right_margin(), bottom);
2509  int left_key = MAX(tl_key, bl_key);
2510  int right_key = MIN(tr_key, br_key);
2511  if (left_key <= *margin_right && right_key >= *margin_left) {
2512  // This part is good - let's keep it.
2513  *margin_right = MIN(*margin_right, right_key);
2514  *margin_left = MAX(*margin_left, left_key);
2515  return true;
2516  }
2517  return false;
2518 }
2519 
2520 // Computes and returns in start, end a line segment formed from a
2521 // backwards-iterated group of right edges of partitions that satisfy the
2522 // condition that the intersection of the right margins is non-empty, ie the
2523 // leftmost right margin is to the right of the rightmost right bounding box
2524 // edge.
2525 // On return the iterator is set to the start of the next run.
2526 void ColPartition::RightEdgeRun(ColPartition_IT* part_it,
2527  ICOORD* start, ICOORD* end) {
2528  ColPartition* part = part_it->data();
2529  ColPartition* start_part = part;
2530  int start_y = part->bounding_box_.bottom();
2531  if (!part_it->at_last()) {
2532  int next_y = part_it->data_relative(1)->bounding_box_.top();
2533  if (next_y > start_y)
2534  start_y = next_y;
2535  else if (next_y < start_y)
2536  start_y = (start_y + next_y) / 2;
2537  }
2538  int end_y = part->bounding_box_.top();
2539  int margin_right = MAX_INT32;
2540  int margin_left = -MAX_INT32;
2541  UpdateRightMargin(*part, &margin_left, &margin_right);
2542  do {
2543  part_it->backward();
2544  part = part_it->data();
2545  } while (!part_it->at_last() &&
2546  UpdateRightMargin(*part, &margin_left, &margin_right));
2547  // The run ended. If we were pushed inwards, compute the next run and
2548  // extend it backwards to find the end of this run for a tight box.
2549  int next_margin_right = MAX_INT32;
2550  int next_margin_left = -MAX_INT32;
2551  UpdateRightMargin(*part, &next_margin_left, &next_margin_right);
2552  if (next_margin_right < margin_left) {
2553  ColPartition_IT next_it(*part_it);
2554  do {
2555  next_it.backward();
2556  part = next_it.data();
2557  } while (!next_it.at_last() &&
2558  UpdateRightMargin(*part, &next_margin_left,
2559  &next_margin_right));
2560  // Now extend the next run forwards into the original run to get the
2561  // tightest fit.
2562  do {
2563  part_it->forward();
2564  part = part_it->data();
2565  } while (part != start_part &&
2566  UpdateRightMargin(*part, &next_margin_left,
2567  &next_margin_right));
2568  part_it->backward();
2569  }
2570  // Now calculate the end_y.
2571  part = part_it->data_relative(1);
2572  end_y = part->bounding_box().top();
2573  if (!part_it->at_last() &&
2574  part_it->data()->bounding_box_.bottom() > end_y)
2575  end_y = (end_y + part_it->data()->bounding_box_.bottom()) / 2;
2576  start->set_y(start_y);
2577  start->set_x(part->XAtY(margin_left, start_y));
2578  end->set_y(end_y);
2579  end->set_x(part->XAtY(margin_left, end_y));
2580  if (textord_debug_tabfind && !part_it->at_last())
2581  tprintf("Right run from y=%d to %d terminated with sum %d-%d, new %d-%d\n",
2582  start_y, end_y, end->x(), part->XAtY(margin_right, end_y),
2583  part->bounding_box_.right(), part->right_margin_);
2584 }
2585 
2586 } // namespace tesseract.
void set_x(inT16 xin)
rewrite function
Definition: points.h:61
const int kMaxPartnerDepth
void RefinePartners(PolyBlockType type, bool get_desparate, ColPartitionGrid *grid)
const double kMaxTopSpacingFraction
const double kMinBaselineCoverage
BLOBNBOX_CLIST * boxes()
Definition: colpartition.h:187
bool IsLineType() const
Definition: colpartition.h:419
bool IsVerticalType() const
Definition: colpartition.h:435
int XAtY(int sort_key, int y) const
Definition: colpartition.h:320
static bool WithinTestRegion(int detail_level, int x, int y)
static bool TypesSimilar(PolyBlockType type1, PolyBlockType type2)
Definition: colpartition.h:412
const int kHorzStrongTextlineHeight
float SpecialBlobsDensity(const BlobSpecialTextType type) const
bool ConfirmNoTabViolation(const ColPartition &other) const
void set_top_spacing(int spacing)
Definition: colpartition.h:229
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:437
BlobRegionType
Definition: blobbox.h:57
#define MAX(x, y)
Definition: ndminx.h:24
static TO_BLOCK * MakeBlock(const ICOORD &bleft, const ICOORD &tright, ColPartition_LIST *block_parts, ColPartition_LIST *used_parts)
ColPartition * ShallowCopy() const
void SetRegionAndFlowTypesFromProjectionValue(int value)
BlobTextFlowType
Definition: blobbox.h:99
BlobSpecialTextType
Definition: blobbox.h:81
void set_poly_block(POLY_BLOCK *blk)
set the poly block
Definition: pdblock.h:63
static bool DifferentSizes(int size1, int size2)
Definition: tabfind.cpp:429
void CopyRightTab(const ColPartition &src, bool take_box)
void SetRightTab(const TabVector *tab_vector)
const int kMinStrongTextValue
const TBOX & bounding_box() const
Definition: colpartition.h:109
int NoisyNeighbours() const
Definition: blobbox.cpp:230
static ScrollView::Color ColorForPolyBlockType(PolyBlockType type)
Returns a color to draw the given type.
Definition: polyblk.cpp:395
#define tprintf(...)
Definition: tprintf.h:31
#define MIN(x, y)
Definition: ndminx.h:28
void set_owner(tesseract::ColPartition *new_owner)
Definition: blobbox.h:340
Definition: statistc.h:33
void set_right(int x)
Definition: rect.h:78
void print() const
Definition: rect.h:270
bool IsInSameColumnAs(const ColPartition &part) const
void set_bottom_spacing(int spacing)
Definition: colpartition.h:223
static DPPoint * Solve(int min_step, int max_step, bool debug, CostFunc cost_func, int size, DPPoint *points)
Definition: dppoint.cpp:30
int median_width() const
Definition: colpartition.h:142
#define ELIST2IZE(CLASSNAME)
Definition: elst2.h:1010
inT32 mode() const
Definition: statistc.cpp:118
void add(inT32 value, inT32 count)
Definition: statistc.cpp:104
BlobRegionType blob_type() const
Definition: colpartition.h:148
void SetLeftTab(const TabVector *tab_vector)
void AddBox(BLOBNBOX *box)
BLOBNBOX * OverlapSplitBlob(const TBOX &box)
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:125
bool IsText() const
Definition: polyblk.h:52
int top_spacing() const
Definition: colpartition.h:226
ColPartition * SplitAt(int split_x)
int LeftAtY(int y) const
Definition: colpartition.h:340
inT16 right() const
Definition: rect.h:75
int median_size() const
Definition: colpartition.h:136
void set_left(int x)
Definition: rect.h:71
void SmoothPartnerRun(int working_set_count)
Definition: capi.h:78
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_flow(BlobTextFlowType value)
Definition: blobbox.h:283
const int kHorzStrongTextlineAspect
Definition: capi.h:79
void SetSpecialBlobsDensity(const BlobSpecialTextType type, const float density)
int median_bottom() const
Definition: colpartition.h:127
int bottom_spacing() const
Definition: colpartition.h:220
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
void ColumnRange(int resolution, ColPartitionSet *columns, int *first_col, int *last_col)
void add_blob(BLOBNBOX *blob, float top, float bottom, float row_size)
Definition: blobbox.cpp:726
const double kMaxSameBlockLineSpacing
TBOX BoundsWithoutBox(BLOBNBOX *box)
double median() const
Definition: statistc.cpp:243
bool owns_blobs() const
Definition: colpartition.h:291
inT32 area() const
Definition: rect.h:118
void CopyLeftTab(const ColPartition &src, bool take_box)
void SetPartitionType(int resolution, ColPartitionSet *columns)
static double ColorDistanceFromLine(const uinT8 *line1, const uinT8 *line2, const uinT8 *point)
Definition: imagefind.cpp:331
int textord_debug_tabfind
Definition: alignedblob.cpp:27
void set_left_margin(int margin)
Definition: colpartition.h:115
void set_owns_blobs(bool owns_blobs)
Definition: colpartition.h:294
ScrollView::Color BoxColor() const
double ile(double frac) const
Definition: statistc.cpp:177
void set_right_margin(int margin)
Definition: colpartition.h:121
#define CLISTIZE(CLASSNAME)
Definition: clst.h:958
int BoxRightKey() const
Definition: colpartition.h:336
bool IsImageType() const
Definition: colpartition.h:423
void set_bottom(int y)
Definition: rect.h:64
bool OKMergeOverlap(const ColPartition &merge1, const ColPartition &merge2, int ok_box_overlap, bool debug)
inT16 y() const
access_function
Definition: points.h:56
bool NearlyEqual(T x, T y, T tolerance)
Definition: host.h:148
inT16 left() const
Definition: rect.h:68
ColPartition * SingletonPartner(bool upper)
const int kMaxColorDistance
Definition: ocrblock.h:30
const int kLeaderCutCost
BlobRegionType region_type() const
Definition: blobbox.h:268
const double kMaxSpacingDrift
void AddToWorkingSet(const ICOORD &bleft, const ICOORD &tright, int resolution, ColPartition_LIST *used_parts, WorkingPartSet_LIST *working_set)
const int kMinChainTextValue
C_BLOB * cblob() const
Definition: blobbox.h:253
static void LineSpacingBlocks(const ICOORD &bleft, const ICOORD &tright, int resolution, ColPartition_LIST *block_parts, ColPartition_LIST *used_parts, BLOCK_LIST *completed_blocks, TO_BLOCK_LIST *to_blocks)
ColPartition * SplitAtBlob(BLOBNBOX *split_blob)
const double kMaxSizeRatio
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:67
Definition: capi.h:79
int total_cost() const
Definition: dppoint.h:68
bool IsPulloutType() const
Definition: colpartition.h:431
static TO_BLOCK * MakeVerticalTextBlock(const ICOORD &bleft, const ICOORD &tright, ColPartition_LIST *block_parts, ColPartition_LIST *used_parts)
BlobSpecialTextType special_text_type() const
Definition: blobbox.h:274
bool MatchingSizes(const ColPartition &other) const
ColumnSpanningType SpanningType(int resolution, int left, int right, int height, int y, int left_margin, int right_margin, int *first_col, int *last_col, int *first_spanned_col)
#define MAX_INT32
Definition: host.h:120
inT64 CostWithVariance(const DPPoint *prev)
Definition: dppoint.cpp:68
integer coordinate
Definition: points.h:30
inT16 bottom() const
Definition: rect.h:61
int RightAtY(int y) const
Definition: colpartition.h:344
virtual R Run(A1)=0
const int kMinLeaderCount
tesseract::ColPartition * owner() const
Definition: blobbox.h:337
ColPartition * CopyButDontOwnBlobs()
PolyBlockType type() const
Definition: colpartition.h:181
inT16 height() const
Definition: rect.h:104
void Absorb(ColPartition *other, WidthCallback *cb)
inT16 width() const
Definition: rect.h:111
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:52
void set_flow(BlobTextFlowType f)
Definition: colpartition.h:157
const double kMaxLeaderGapFractionOfMin
const double kMaxBaselineError
void ExtractCompletedBlocks(const ICOORD &bleft, const ICOORD &tright, int resolution, ColPartition_LIST *used_parts, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
int textord_debug_bugs
Definition: alignedblob.cpp:28
bool MatchingTextColor(const ColPartition &other) const
bool MatchingStrokeWidth(const ColPartition &other, double fractional_tolerance, double constant_tolerance) const
PolyBlockType PartitionType(ColumnSpanningType flow) const
void set_y(inT16 yin)
rewrite function
Definition: points.h:65
static ColPartition * MakeLinePartition(BlobRegionType blob_type, const ICOORD &vertical, int left, int bottom, int right, int top)
int count(LIST var_list)
Definition: oldlist.cpp:108
void set_block_owned(bool owned)
Definition: colpartition.h:208
const int kMaxRMSColorNoise
static C_BLOB * FakeBlob(const TBOX &box)
Definition: stepblob.cpp:238
inT16 x() const
access function
Definition: points.h:52
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:75
void set_side_step(int step)
Definition: colpartition.h:217
bool DominatesInMerge(BlobTextFlowType type1, BlobTextFlowType type2)
Definition: blobbox.h:114
Definition: rect.h:30
void set_region_type(BlobRegionType new_type)
Definition: blobbox.h:271
float line_spacing
Definition: blobbox.h:775
bool MatchingColumns(const ColPartition &other) const
static ColPartition * FakePartition(const TBOX &box, PolyBlockType block_type, BlobRegionType blob_type, BlobTextFlowType flow)
const int kHorzStrongTextlineCount
static ColPartition * MakeBigPartition(BLOBNBOX *box, ColPartition_LIST *big_part_list)
int VCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:375
void SetColumnGoodness(WidthCallback *cb)
int SpecialBlobsCount(const BlobSpecialTextType type)
void RemovePartner(bool upper, ColPartition *partner)
int sort_key() const
Definition: tabvector.h:158
void AddLocalCost(int new_cost)
Definition: dppoint.h:77
#define NULL
Definition: host.h:144
bool IsDiacritic() const
Definition: blobbox.h:365
const TBOX & bounding_box() const
Definition: blobbox.h:215
void set_top(int y)
Definition: rect.h:57
PolyBlockType
Definition: publictypes.h:41
const double kMaxLeaderGapFractionOfMax
int GoodTextBlob() const
Definition: blobbox.cpp:219
const int kColumnWidthFactor
Definition: tabfind.h:42
void AddPartition(ColPartition *part)
int CountOverlappingBoxes(const TBOX &box)
bool OKDiacriticMerge(const ColPartition &candidate, bool debug) const
inT16 top() const
Definition: rect.h:54
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
int base_char_bottom() const
Definition: blobbox.h:371
BlobTextFlowType flow() const
Definition: blobbox.h:280
float max_blob_size
Definition: blobbox.h:782
void AddPartner(bool upper, ColPartition *partner)
bool overlap(const TBOX &box) const
Definition: rect.h:345
bool VSignificantCoreOverlap(const ColPartition &other) const
Definition: colpartition.h:387
void RemoveBox(BLOBNBOX *box)
BLOCK * block
Definition: blobbox.h:773
int base_char_top() const
Definition: blobbox.h:368
float line_size
Definition: blobbox.h:781
BLOBNBOX_LIST blobs
Definition: blobbox.h:768
void set_type(PolyBlockType t)
Definition: colpartition.h:184