All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tabvector.cpp
Go to the documentation of this file.
1 // File: tabvector.cpp
3 // Description: Class to hold a near-vertical vector representing a tab-stop.
4 // Author: Ray Smith
5 // Created: Thu Apr 10 16:28:01 PST 2008
6 //
7 // (C) Copyright 2008, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #ifdef HAVE_CONFIG_H
25 #include "config_auto.h"
26 #endif
27 
28 #include "tabvector.h"
29 #include "blobbox.h"
30 #include "colfind.h"
31 #include "colpartitionset.h"
32 #include "detlinefit.h"
33 #include "statistc.h"
34 
35 namespace tesseract {
36 
37 // Multiple of height used as a gutter for evaluation search.
38 const int kGutterMultiple = 4;
39 // Multiple of neighbour gap that we expect the gutter gap to be at minimum.
41 // Pixel distance for tab vectors to be considered the same.
42 const int kSimilarVectorDist = 10;
43 // Pixel distance for ragged tab vectors to be considered the same if there
44 // is nothing in the overlap box
45 const int kSimilarRaggedDist = 50;
46 // Max multiple of height to allow filling in between blobs when evaluating.
47 const int kMaxFillinMultiple = 11;
48 // Min fraction of mean gutter size to allow a gutter on a good tab blob.
49 const double kMinGutterFraction = 0.5;
50 // Multiple of 1/n lines as a minimum gutter in evaluation.
51 const double kLineCountReciprocal = 4.0;
52 // Constant add-on for minimum gutter for aligned tabs.
53 const double kMinAlignedGutter = 0.25;
54 // Constant add-on for minimum gutter for ragged tabs.
55 const double kMinRaggedGutter = 1.5;
56 
58  "max fraction of mean blob width allowed for vertical gaps in vertical text");
59 
61  "Fraction of box matches required to declare a line vertical");
62 
64 
65 // Create a constraint for the top or bottom of this TabVector.
66 void TabConstraint::CreateConstraint(TabVector* vector, bool is_top) {
67  TabConstraint* constraint = new TabConstraint(vector, is_top);
68  TabConstraint_LIST* constraints = new TabConstraint_LIST;
69  TabConstraint_IT it(constraints);
70  it.add_to_end(constraint);
71  if (is_top)
72  vector->set_top_constraints(constraints);
73  else
74  vector->set_bottom_constraints(constraints);
75 }
76 
77 // Test to see if the constraints are compatible enough to merge.
78 bool TabConstraint::CompatibleConstraints(TabConstraint_LIST* list1,
79  TabConstraint_LIST* list2) {
80  if (list1 == list2)
81  return false;
82  int y_min = -MAX_INT32;
83  int y_max = MAX_INT32;
84  if (textord_debug_tabfind > 3)
85  tprintf("Testing constraint compatibility\n");
86  GetConstraints(list1, &y_min, &y_max);
87  GetConstraints(list2, &y_min, &y_max);
88  if (textord_debug_tabfind > 3)
89  tprintf("Resulting range = [%d,%d]\n", y_min, y_max);
90  return y_max >= y_min;
91 }
92 
93 // Merge the lists of constraints and update the TabVector pointers.
94 // The second list is deleted.
95 void TabConstraint::MergeConstraints(TabConstraint_LIST* list1,
96  TabConstraint_LIST* list2) {
97  if (list1 == list2)
98  return;
99  TabConstraint_IT it(list2);
100  if (textord_debug_tabfind > 3)
101  tprintf("Merging constraints\n");
102  // The vectors of all constraints on list2 are now going to be on list1.
103  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
104  TabConstraint* constraint = it.data();
105  if (textord_debug_tabfind> 3)
106  constraint->vector_->Print("Merge");
107  if (constraint->is_top_)
108  constraint->vector_->set_top_constraints(list1);
109  else
110  constraint->vector_->set_bottom_constraints(list1);
111  }
112  it = list1;
113  it.add_list_before(list2);
114  delete list2;
115 }
116 
117 // Set all the tops and bottoms as appropriate to a mean of the
118 // constrained range. Delete all the constraints and list.
119 void TabConstraint::ApplyConstraints(TabConstraint_LIST* constraints) {
120  int y_min = -MAX_INT32;
121  int y_max = MAX_INT32;
122  GetConstraints(constraints, &y_min, &y_max);
123  int y = (y_min + y_max) / 2;
124  TabConstraint_IT it(constraints);
125  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
126  TabConstraint* constraint = it.data();
127  TabVector* v = constraint->vector_;
128  if (constraint->is_top_) {
129  v->SetYEnd(y);
131  } else {
132  v->SetYStart(y);
134  }
135  }
136  delete constraints;
137 }
138 
139 TabConstraint::TabConstraint(TabVector* vector, bool is_top)
140  : vector_(vector), is_top_(is_top) {
141  if (is_top) {
142  y_min_ = vector->endpt().y();
143  y_max_ = vector->extended_ymax();
144  } else {
145  y_max_ = vector->startpt().y();
146  y_min_ = vector->extended_ymin();
147  }
148 }
149 
150 // Get the max of the mins and the min of the maxes.
151 void TabConstraint::GetConstraints(TabConstraint_LIST* constraints,
152  int* y_min, int* y_max) {
153  TabConstraint_IT it(constraints);
154  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
155  TabConstraint* constraint = it.data();
156  if (textord_debug_tabfind > 3) {
157  tprintf("Constraint is [%d,%d]", constraint->y_min_, constraint->y_max_);
158  constraint->vector_->Print(" for");
159  }
160  *y_min = MAX(*y_min, constraint->y_min_);
161  *y_max = MIN(*y_max, constraint->y_max_);
162  }
163 }
164 
165 ELIST2IZE(TabVector)
166 CLISTIZE(TabVector)
167 
168 // The constructor is private. See the bottom of the file...
169 
171 }
172 
173 
174 // Public factory to build a TabVector from a list of boxes.
175 // The TabVector will be of the given alignment type.
176 // The input vertical vector is used in fitting, and the output
177 // vertical_x, vertical_y have the resulting line vector added to them
178 // if the alignment is not ragged.
179 // The extended_start_y and extended_end_y are the maximum possible
180 // extension to the line segment that can be used to align with others.
181 // The input CLIST of BLOBNBOX good_points is consumed and taken over.
183  int extended_start_y, int extended_end_y,
184  BLOBNBOX_CLIST* good_points,
185  int* vertical_x, int* vertical_y) {
186  TabVector* vector = new TabVector(extended_start_y, extended_end_y,
187  alignment, good_points);
188  if (!vector->Fit(vertical, false)) {
189  delete vector;
190  return NULL;
191  }
192  if (!vector->IsRagged()) {
193  vertical = vector->endpt_ - vector->startpt_;
194  int weight = vector->BoxCount();
195  *vertical_x += vertical.x() * weight;
196  *vertical_y += vertical.y() * weight;
197  }
198  return vector;
199 }
200 
201 // Build a ragged TabVector by copying another's direction, shifting it
202 // to match the given blob, and making its initial extent the height
203 // of the blob, but its extended bounds from the bounds of the original.
205  const ICOORD& vertical_skew, BLOBNBOX* blob)
206  : extended_ymin_(src.extended_ymin_), extended_ymax_(src.extended_ymax_),
207  sort_key_(0), percent_score_(0), mean_width_(0),
208  needs_refit_(true), needs_evaluation_(true), intersects_other_lines_(false),
209  alignment_(alignment),
210  top_constraints_(NULL), bottom_constraints_(NULL) {
211  BLOBNBOX_C_IT it(&boxes_);
212  it.add_to_end(blob);
213  TBOX box = blob->bounding_box();
214  if (IsLeftTab()) {
215  startpt_ = box.botleft();
216  endpt_ = box.topleft();
217  } else {
218  startpt_ = box.botright();
219  endpt_ = box.topright();
220  }
221  sort_key_ = SortKey(vertical_skew,
222  (startpt_.x() + endpt_.x()) / 2,
223  (startpt_.y() + endpt_.y()) / 2);
224  if (textord_debug_tabfind > 3)
225  Print("Constructed a new tab vector:");
226 }
227 
228 // Copies basic attributes of a tab vector for simple operations.
229 // Copies things such startpt, endpt, range.
230 // Does not copy things such as partners, boxes, or constraints.
231 // This is useful if you only need vector information for processing, such
232 // as in the table detection code.
234  TabVector* copy = new TabVector();
235  copy->startpt_ = startpt_;
236  copy->endpt_ = endpt_;
237  copy->alignment_ = alignment_;
238  copy->extended_ymax_ = extended_ymax_;
239  copy->extended_ymin_ = extended_ymin_;
240  copy->intersects_other_lines_ = intersects_other_lines_;
241  return copy;
242 }
243 
244 // Extend this vector to include the supplied blob if it doesn't
245 // already have it.
247  TBOX new_box = new_blob->bounding_box();
248  BLOBNBOX_C_IT it(&boxes_);
249  if (!it.empty()) {
250  BLOBNBOX* blob = it.data();
251  TBOX box = blob->bounding_box();
252  while (!it.at_last() && box.top() <= new_box.top()) {
253  if (blob == new_blob)
254  return; // We have it already.
255  it.forward();
256  blob = it.data();
257  box = blob->bounding_box();
258  }
259  if (box.top() >= new_box.top()) {
260  it.add_before_stay_put(new_blob);
261  needs_refit_ = true;
262  return;
263  }
264  }
265  needs_refit_ = true;
266  it.add_after_stay_put(new_blob);
267 }
268 
269 // Set the ycoord of the start and move the xcoord to match.
270 void TabVector::SetYStart(int start_y) {
271  startpt_.set_x(XAtY(start_y));
272  startpt_.set_y(start_y);
273 }
274 // Set the ycoord of the end and move the xcoord to match.
275 void TabVector::SetYEnd(int end_y) {
276  endpt_.set_x(XAtY(end_y));
277  endpt_.set_y(end_y);
278 }
279 
280 // Rotate the ends by the given vector. Auto flip start and end if needed.
281 void TabVector::Rotate(const FCOORD& rotation) {
282  startpt_.rotate(rotation);
283  endpt_.rotate(rotation);
284  int dx = endpt_.x() - startpt_.x();
285  int dy = endpt_.y() - startpt_.y();
286  if ((dy < 0 && abs(dy) > abs(dx)) || (dx < 0 && abs(dx) > abs(dy))) {
287  // Need to flip start/end.
288  ICOORD tmp = startpt_;
289  startpt_ = endpt_;
290  endpt_ = tmp;
291  }
292 }
293 
294 // Setup the initial constraints, being the limits of
295 // the vector and the extended ends.
297  TabConstraint::CreateConstraint(this, false);
299 }
300 
301 // Setup the constraints between the partners of this TabVector.
303  // With the first and last partner, we want a common bottom and top,
304  // respectively, and for each change of partner, we want a common
305  // top of first with bottom of next.
306  TabVector_C_IT it(&partners_);
307  TabVector* prev_partner = NULL;
308  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
309  TabVector* partner = it.data();
310  if (partner->top_constraints_ == NULL ||
311  partner->bottom_constraints_ == NULL) {
312  partner->Print("Impossible: has no constraints");
313  Print("This vector has it as a partner");
314  continue;
315  }
316  if (prev_partner == NULL) {
317  // This is the first partner, so common bottom.
318  if (TabConstraint::CompatibleConstraints(bottom_constraints_,
319  partner->bottom_constraints_))
320  TabConstraint::MergeConstraints(bottom_constraints_,
321  partner->bottom_constraints_);
322  } else {
323  // We need prev top to be common with partner bottom.
324  if (TabConstraint::CompatibleConstraints(prev_partner->top_constraints_,
325  partner->bottom_constraints_))
326  TabConstraint::MergeConstraints(prev_partner->top_constraints_,
327  partner->bottom_constraints_);
328  }
329  prev_partner = partner;
330  if (it.at_last()) {
331  // This is the last partner, so common top.
332  if (TabConstraint::CompatibleConstraints(top_constraints_,
333  partner->top_constraints_))
334  TabConstraint::MergeConstraints(top_constraints_,
335  partner->top_constraints_);
336  }
337  }
338 }
339 
340 // Setup the constraints between this and its partner.
342  if (TabConstraint::CompatibleConstraints(bottom_constraints_,
343  partner->bottom_constraints_))
344  TabConstraint::MergeConstraints(bottom_constraints_,
345  partner->bottom_constraints_);
346  if (TabConstraint::CompatibleConstraints(top_constraints_,
347  partner->top_constraints_))
348  TabConstraint::MergeConstraints(top_constraints_,
349  partner->top_constraints_);
350 }
351 
352 // Use the constraints to modify the top and bottom.
354  if (top_constraints_ != NULL)
355  TabConstraint::ApplyConstraints(top_constraints_);
356  if (bottom_constraints_ != NULL)
357  TabConstraint::ApplyConstraints(bottom_constraints_);
358 }
359 
360 // Merge close tab vectors of the same side that overlap.
362  TabVector_LIST* vectors,
363  BlobGrid* grid) {
364  TabVector_IT it1(vectors);
365  for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
366  TabVector* v1 = it1.data();
367  TabVector_IT it2(it1);
368  for (it2.forward(); !it2.at_first(); it2.forward()) {
369  TabVector* v2 = it2.data();
370  if (v2->SimilarTo(vertical, *v1, grid)) {
371  // Merge into the forward one, in case the combined vector now
372  // overlaps one in between.
373  if (textord_debug_tabfind) {
374  v2->Print("Merging");
375  v1->Print("by deleting");
376  }
377  v2->MergeWith(vertical, it1.extract());
378  if (textord_debug_tabfind) {
379  v2->Print("Producing");
380  }
381  ICOORD merged_vector = v2->endpt();
382  merged_vector -= v2->startpt();
383  if (textord_debug_tabfind && abs(merged_vector.x()) > 100) {
384  v2->Print("Garbage result of merge?");
385  }
386  break;
387  }
388  }
389  }
390 }
391 
392 // Return true if this vector is the same side, overlaps, and close
393 // enough to the other to be merged.
394 bool TabVector::SimilarTo(const ICOORD& vertical,
395  const TabVector& other, BlobGrid* grid) const {
396  if ((IsRightTab() && other.IsRightTab()) ||
397  (IsLeftTab() && other.IsLeftTab())) {
398  // If they don't overlap, at least in extensions, then there is no chance.
399  if (ExtendedOverlap(other.extended_ymax_, other.extended_ymin_) < 0)
400  return false;
401  // A fast approximation to the scale factor of the sort_key_.
402  int v_scale = abs(vertical.y());
403  if (v_scale == 0)
404  v_scale = 1;
405  // If they are close enough, then OK.
406  if (sort_key_ + kSimilarVectorDist * v_scale >= other.sort_key_ &&
407  sort_key_ - kSimilarVectorDist * v_scale <= other.sort_key_)
408  return true;
409  // Ragged tabs get a bigger threshold.
410  if (!IsRagged() || !other.IsRagged() ||
411  sort_key_ + kSimilarRaggedDist * v_scale < other.sort_key_ ||
412  sort_key_ - kSimilarRaggedDist * v_scale > other.sort_key_)
413  return false;
414  if (grid == NULL) {
415  // There is nothing else to test!
416  return true;
417  }
418  // If there is nothing in the rectangle between the vector that is going to
419  // move, and the place it is moving to, then they can be merged.
420  // Setup a vertical search for any blob.
421  const TabVector* mover = (IsRightTab() &&
422  sort_key_ < other.sort_key_) ? this : &other;
423  int top_y = mover->endpt_.y();
424  int bottom_y = mover->startpt_.y();
425  int left = MIN(mover->XAtY(top_y), mover->XAtY(bottom_y));
426  int right = MAX(mover->XAtY(top_y), mover->XAtY(bottom_y));
427  int shift = abs(sort_key_ - other.sort_key_) / v_scale;
428  if (IsRightTab()) {
429  right += shift;
430  } else {
431  left -= shift;
432  }
433 
435  vsearch.StartVerticalSearch(left, right, top_y);
436  BLOBNBOX* blob;
437  while ((blob = vsearch.NextVerticalSearch(true)) != NULL) {
438  TBOX box = blob->bounding_box();
439  if (box.top() > bottom_y)
440  return true; // Nothing found.
441  if (box.bottom() < top_y)
442  continue; // Doesn't overlap.
443  int left_at_box = XAtY(box.bottom());
444  int right_at_box = left_at_box;
445  if (IsRightTab())
446  right_at_box += shift;
447  else
448  left_at_box -= shift;
449  if (MIN(right_at_box, box.right()) > MAX(left_at_box, box.left()))
450  return false;
451  }
452  return true; // Nothing found.
453  }
454  return false;
455 }
456 
457 // Eat the other TabVector into this and delete it.
458 void TabVector::MergeWith(const ICOORD& vertical, TabVector* other) {
459  extended_ymin_ = MIN(extended_ymin_, other->extended_ymin_);
460  extended_ymax_ = MAX(extended_ymax_, other->extended_ymax_);
461  if (other->IsRagged()) {
462  alignment_ = other->alignment_;
463  }
464  // Merge sort the two lists of boxes.
465  BLOBNBOX_C_IT it1(&boxes_);
466  BLOBNBOX_C_IT it2(&other->boxes_);
467  while (!it2.empty()) {
468  BLOBNBOX* bbox2 = it2.extract();
469  it2.forward();
470  TBOX box2 = bbox2->bounding_box();
471  BLOBNBOX* bbox1 = it1.data();
472  TBOX box1 = bbox1->bounding_box();
473  while (box1.bottom() < box2.bottom() && !it1.at_last()) {
474  it1.forward();
475  bbox1 = it1.data();
476  box1 = bbox1->bounding_box();
477  }
478  if (box1.bottom() < box2.bottom()) {
479  it1.add_to_end(bbox2);
480  } else if (bbox1 != bbox2) {
481  it1.add_before_stay_put(bbox2);
482  }
483  }
484  Fit(vertical, true);
485  other->Delete(this);
486 }
487 
488 // Add a new element to the list of partner TabVectors.
489 // Partners must be added in order of increasing y coordinate of the text line
490 // that makes them partners.
491 // Groups of identical partners are merged into one.
493  if (IsSeparator() || partner->IsSeparator())
494  return;
495  TabVector_C_IT it(&partners_);
496  if (!it.empty()) {
497  it.move_to_last();
498  if (it.data() == partner)
499  return;
500  }
501  it.add_after_then_move(partner);
502 }
503 
504 // Return true if other is a partner of this.
505 bool TabVector::IsAPartner(const TabVector* other) {
506  TabVector_C_IT it(&partners_);
507  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
508  if (it.data() == other)
509  return true;
510  }
511  return false;
512 }
513 
514 // These names must be synced with the TabAlignment enum in tabvector.h.
515 const char* kAlignmentNames[] = {
516  "Left Aligned",
517  "Left Ragged",
518  "Center",
519  "Right Aligned",
520  "Right Ragged",
521  "Separator"
522 };
523 
524 // Print basic information about this tab vector.
525 void TabVector::Print(const char* prefix) {
526  if (this == NULL) {
527  tprintf("%s <null>\n", prefix);
528  } else {
529  tprintf("%s %s (%d,%d)->(%d,%d) w=%d s=%d, sort key=%d, boxes=%d,"
530  " partners=%d\n",
531  prefix, kAlignmentNames[alignment_],
532  startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y(),
533  mean_width_, percent_score_, sort_key_,
534  boxes_.length(), partners_.length());
535  }
536 }
537 
538 // Print basic information about this tab vector and every box in it.
539 void TabVector::Debug(const char* prefix) {
540  Print(prefix);
541  BLOBNBOX_C_IT it(&boxes_);
542  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
543  BLOBNBOX* bbox = it.data();
544  const TBOX& box = bbox->bounding_box();
545  tprintf("Box at (%d,%d)->(%d,%d)\n",
546  box.left(), box.bottom(), box.right(), box.top());
547  }
548 }
549 
550 // Draw this tabvector in place in the given window.
552 #ifndef GRAPHICS_DISABLED
554  tab_win->Pen(ScrollView::BLUE);
555  else if (alignment_ == TA_LEFT_ALIGNED)
556  tab_win->Pen(ScrollView::LIME_GREEN);
557  else if (alignment_ == TA_LEFT_RAGGED)
558  tab_win->Pen(ScrollView::DARK_GREEN);
559  else if (alignment_ == TA_RIGHT_ALIGNED)
560  tab_win->Pen(ScrollView::PINK);
561  else if (alignment_ == TA_RIGHT_RAGGED)
562  tab_win->Pen(ScrollView::CORAL);
563  else
564  tab_win->Pen(ScrollView::WHITE);
565  tab_win->Line(startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y());
566  tab_win->Pen(ScrollView::GREY);
567  tab_win->Line(startpt_.x(), startpt_.y(), startpt_.x(), extended_ymin_);
568  tab_win->Line(endpt_.x(), extended_ymax_, endpt_.x(), endpt_.y());
569  char score_buf[64];
570  snprintf(score_buf, sizeof(score_buf), "%d", percent_score_);
571  tab_win->TextAttributes("Times", 50, false, false, false);
572  tab_win->Text(startpt_.x(), startpt_.y(), score_buf);
573 #endif
574 }
575 
576 // Refit the line and/or re-evaluate the vector if the dirty flags are set.
578  TabFind* finder) {
579  if (needs_refit_)
580  Fit(vertical, true);
581  if (needs_evaluation_)
582  Evaluate(vertical, finder);
583 }
584 
585 // Evaluate the vector in terms of coverage of its length by good-looking
586 // box edges. A good looking box is one where its nearest neighbour on the
587 // inside is nearer than half the distance its nearest neighbour on the
588 // outside of the putative column. Bad boxes are removed from the line.
589 // A second pass then further filters boxes by requiring that the gutter
590 // width be a minimum fraction of the mean gutter along the line.
591 void TabVector::Evaluate(const ICOORD& vertical, TabFind* finder) {
592  bool debug = false;
593  needs_evaluation_ = false;
594  int length = endpt_.y() - startpt_.y();
595  if (length == 0 || boxes_.empty()) {
596  percent_score_ = 0;
597  Print("Zero length in evaluate");
598  return;
599  }
600  // Compute the mean box height.
601  BLOBNBOX_C_IT it(&boxes_);
602  int mean_height = 0;
603  int height_count = 0;
604  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
605  BLOBNBOX* bbox = it.data();
606  const TBOX& box = bbox->bounding_box();
607  int height = box.height();
608  mean_height += height;
609  ++height_count;
610  }
611  if (height_count > 0) mean_height /= height_count;
612  int max_gutter = kGutterMultiple * mean_height;
613  if (IsRagged()) {
614  // Ragged edges face a tougher test in that the gap must always be within
615  // the height of the blob.
616  max_gutter = kGutterToNeighbourRatio * mean_height;
617  }
618 
619  STATS gutters(0, max_gutter + 1);
620  // Evaluate the boxes for their goodness, calculating the coverage as we go.
621  // Remove boxes that are not good and shorten the list to the first and
622  // last good boxes.
623  int num_deleted_boxes = 0;
624  bool text_on_image = false;
625  int good_length = 0;
626  const TBOX* prev_good_box = NULL;
627  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
628  BLOBNBOX* bbox = it.data();
629  const TBOX& box = bbox->bounding_box();
630  int mid_y = (box.top() + box.bottom()) / 2;
631  if (TabFind::WithinTestRegion(2, XAtY(box.bottom()), box.bottom())) {
632  if (!debug) {
633  tprintf("After already deleting %d boxes, ", num_deleted_boxes);
634  Print("Starting evaluation");
635  }
636  debug = true;
637  }
638  // A good box is one where the nearest neighbour on the inside is closer
639  // than half the distance to the nearest neighbour on the outside
640  // (of the putative column).
641  bool left = IsLeftTab();
642  int tab_x = XAtY(mid_y);
643  int gutter_width;
644  int neighbour_gap;
645  finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
646  bbox, &gutter_width, &neighbour_gap);
647  if (debug) {
648  tprintf("Box (%d,%d)->(%d,%d) has gutter %d, ndist %d\n",
649  box.left(), box.bottom(), box.right(), box.top(),
650  gutter_width, neighbour_gap);
651  }
652  // Now we can make the test.
653  if (neighbour_gap * kGutterToNeighbourRatio <= gutter_width) {
654  // A good box contributes its height to the good_length.
655  good_length += box.top() - box.bottom();
656  gutters.add(gutter_width, 1);
657  // Two good boxes together contribute the gap between them
658  // to the good_length as well, as long as the gap is not
659  // too big.
660  if (prev_good_box != NULL) {
661  int vertical_gap = box.bottom() - prev_good_box->top();
662  double size1 = sqrt(static_cast<double>(prev_good_box->area()));
663  double size2 = sqrt(static_cast<double>(box.area()));
664  if (vertical_gap < kMaxFillinMultiple * MIN(size1, size2))
665  good_length += vertical_gap;
666  if (debug) {
667  tprintf("Box and prev good, gap=%d, target %g, goodlength=%d\n",
668  vertical_gap, kMaxFillinMultiple * MIN(size1, size2),
669  good_length);
670  }
671  } else {
672  // Adjust the start to the first good box.
673  SetYStart(box.bottom());
674  }
675  prev_good_box = &box;
676  if (bbox->flow() == BTFT_TEXT_ON_IMAGE)
677  text_on_image = true;
678  } else {
679  // Get rid of boxes that are not good.
680  if (debug) {
681  tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, ndist %d\n",
682  box.left(), box.bottom(), box.right(), box.top(),
683  gutter_width, neighbour_gap);
684  }
685  it.extract();
686  ++num_deleted_boxes;
687  }
688  }
689  if (debug) {
690  Print("Evaluating:");
691  }
692  // If there are any good boxes, do it again, except this time get rid of
693  // boxes that have a gutter that is a small fraction of the mean gutter.
694  // This filters out ends that run into a coincidental gap in the text.
695  int search_top = endpt_.y();
696  int search_bottom = startpt_.y();
697  int median_gutter = IntCastRounded(gutters.median());
698  if (gutters.get_total() > 0) {
699  prev_good_box = NULL;
700  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
701  BLOBNBOX* bbox = it.data();
702  const TBOX& box = bbox->bounding_box();
703  int mid_y = (box.top() + box.bottom()) / 2;
704  // A good box is one where the gutter width is at least some constant
705  // fraction of the mean gutter width.
706  bool left = IsLeftTab();
707  int tab_x = XAtY(mid_y);
708  int max_gutter = kGutterMultiple * mean_height;
709  if (IsRagged()) {
710  // Ragged edges face a tougher test in that the gap must always be
711  // within the height of the blob.
712  max_gutter = kGutterToNeighbourRatio * mean_height;
713  }
714  int gutter_width;
715  int neighbour_gap;
716  finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left,
717  bbox, &gutter_width, &neighbour_gap);
718  // Now we can make the test.
719  if (gutter_width >= median_gutter * kMinGutterFraction) {
720  if (prev_good_box == NULL) {
721  // Adjust the start to the first good box.
722  SetYStart(box.bottom());
723  search_bottom = box.top();
724  }
725  prev_good_box = &box;
726  search_top = box.bottom();
727  } else {
728  // Get rid of boxes that are not good.
729  if (debug) {
730  tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, mean gutter %d\n",
731  box.left(), box.bottom(), box.right(), box.top(),
732  gutter_width, median_gutter);
733  }
734  it.extract();
735  ++num_deleted_boxes;
736  }
737  }
738  }
739  // If there has been a good box, adjust the end.
740  if (prev_good_box != NULL) {
741  SetYEnd(prev_good_box->top());
742  // Compute the percentage of the vector that is occupied by good boxes.
743  int length = endpt_.y() - startpt_.y();
744  percent_score_ = 100 * good_length / length;
745  if (num_deleted_boxes > 0) {
746  needs_refit_ = true;
747  FitAndEvaluateIfNeeded(vertical, finder);
748  if (boxes_.empty())
749  return;
750  }
751  // Test the gutter over the whole vector, instead of just at the boxes.
752  int required_shift;
753  if (search_bottom > search_top) {
754  search_bottom = startpt_.y();
755  search_top = endpt_.y();
756  }
757  double min_gutter_width = kLineCountReciprocal / boxes_.length();
758  min_gutter_width += IsRagged() ? kMinRaggedGutter : kMinAlignedGutter;
759  min_gutter_width *= mean_height;
760  int max_gutter_width = IntCastRounded(min_gutter_width) + 1;
761  if (median_gutter > max_gutter_width)
762  max_gutter_width = median_gutter;
763  int gutter_width = finder->GutterWidth(search_bottom, search_top, *this,
764  text_on_image, max_gutter_width,
765  &required_shift);
766  if (gutter_width < min_gutter_width) {
767  if (debug) {
768  tprintf("Rejecting bad tab Vector with %d gutter vs %g min\n",
769  gutter_width, min_gutter_width);
770  }
771  boxes_.shallow_clear();
772  percent_score_ = 0;
773  } else if (debug) {
774  tprintf("Final gutter %d, vs limit of %g, required shift = %d\n",
775  gutter_width, min_gutter_width, required_shift);
776  }
777  } else {
778  // There are no good boxes left, so score is 0.
779  percent_score_ = 0;
780  }
781 
782  if (debug) {
783  Print("Evaluation complete:");
784  }
785 }
786 
787 // (Re)Fit a line to the stored points. Returns false if the line
788 // is degenerate. Althougth the TabVector code mostly doesn't care about the
789 // direction of lines, XAtY would give silly results for a horizontal line.
790 // The class is mostly aimed at use for vertical lines representing
791 // horizontal tab stops.
792 bool TabVector::Fit(ICOORD vertical, bool force_parallel) {
793  needs_refit_ = false;
794  if (boxes_.empty()) {
795  // Don't refit something with no boxes, as that only happens
796  // in Evaluate, and we don't want to end up with a zero vector.
797  if (!force_parallel)
798  return false;
799  // If we are forcing parallel, then we just need to set the sort_key_.
800  ICOORD midpt = startpt_;
801  midpt += endpt_;
802  midpt /= 2;
803  sort_key_ = SortKey(vertical, midpt.x(), midpt.y());
804  return startpt_.y() != endpt_.y();
805  }
806  if (!force_parallel && !IsRagged()) {
807  // Use a fitted line as the vertical.
808  DetLineFit linepoints;
809  BLOBNBOX_C_IT it(&boxes_);
810  // Fit a line to all the boxes in the list.
811  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
812  BLOBNBOX* bbox = it.data();
813  TBOX box = bbox->bounding_box();
814  int x1 = IsRightTab() ? box.right() : box.left();
815  ICOORD boxpt(x1, box.bottom());
816  linepoints.Add(boxpt);
817  if (it.at_last()) {
818  ICOORD top_pt(x1, box.top());
819  linepoints.Add(top_pt);
820  }
821  }
822  linepoints.Fit(&startpt_, &endpt_);
823  if (startpt_.y() != endpt_.y()) {
824  vertical = endpt_;
825  vertical -= startpt_;
826  }
827  }
828  int start_y = startpt_.y();
829  int end_y = endpt_.y();
830  sort_key_ = IsLeftTab() ? MAX_INT32 : -MAX_INT32;
831  BLOBNBOX_C_IT it(&boxes_);
832  // Choose a line parallel to the vertical such that all boxes are on the
833  // correct side of it.
834  mean_width_ = 0;
835  int width_count = 0;
836  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
837  BLOBNBOX* bbox = it.data();
838  TBOX box = bbox->bounding_box();
839  mean_width_ += box.width();
840  ++width_count;
841  int x1 = IsRightTab() ? box.right() : box.left();
842  // Test both the bottom and the top, as one will be more extreme, depending
843  // on the direction of skew.
844  int bottom_y = box.bottom();
845  int top_y = box.top();
846  int key = SortKey(vertical, x1, bottom_y);
847  if (IsLeftTab() == (key < sort_key_)) {
848  sort_key_ = key;
849  startpt_ = ICOORD(x1, bottom_y);
850  }
851  key = SortKey(vertical, x1, top_y);
852  if (IsLeftTab() == (key < sort_key_)) {
853  sort_key_ = key;
854  startpt_ = ICOORD(x1, top_y);
855  }
856  if (it.at_first())
857  start_y = bottom_y;
858  if (it.at_last())
859  end_y = top_y;
860  }
861  if (width_count > 0) {
862  mean_width_ = (mean_width_ + width_count - 1) / width_count;
863  }
864  endpt_ = startpt_ + vertical;
865  needs_evaluation_ = true;
866  if (start_y != end_y) {
867  // Set the ends of the vector to fully include the first and last blobs.
868  startpt_.set_x(XAtY(vertical, sort_key_, start_y));
869  startpt_.set_y(start_y);
870  endpt_.set_x(XAtY(vertical, sort_key_, end_y));
871  endpt_.set_y(end_y);
872  return true;
873  }
874  return false;
875 }
876 
877 // Returns the singleton partner if there is one, or NULL otherwise.
879  if (!partners_.singleton())
880  return NULL;
881  TabVector_C_IT partner_it(&partners_);
882  TabVector* partner = partner_it.data();
883  return partner;
884 }
885 
886 // Return the partner of this TabVector if the vector qualifies as
887 // being a vertical text line, otherwise NULL.
889  if (!partners_.singleton())
890  return NULL;
891  TabVector_C_IT partner_it(&partners_);
892  TabVector* partner = partner_it.data();
893  BLOBNBOX_C_IT box_it1(&boxes_);
894  BLOBNBOX_C_IT box_it2(&partner->boxes_);
895  // Count how many boxes are also in the other list.
896  // At the same time, gather the mean width and median vertical gap.
897  if (textord_debug_tabfind > 1) {
898  Print("Testing for vertical text");
899  partner->Print(" partner");
900  }
901  int num_matched = 0;
902  int num_unmatched = 0;
903  int total_widths = 0;
904  int width = startpt().x() - partner->startpt().x();
905  if (width < 0)
906  width = -width;
907  STATS gaps(0, width * 2);
908  BLOBNBOX* prev_bbox = NULL;
909  box_it2.mark_cycle_pt();
910  for (box_it1.mark_cycle_pt(); !box_it1.cycled_list(); box_it1.forward()) {
911  BLOBNBOX* bbox = box_it1.data();
912  TBOX box = bbox->bounding_box();
913  if (prev_bbox != NULL) {
914  gaps.add(box.bottom() - prev_bbox->bounding_box().top(), 1);
915  }
916  while (!box_it2.cycled_list() && box_it2.data() != bbox &&
917  box_it2.data()->bounding_box().bottom() < box.bottom()) {
918  box_it2.forward();
919  }
920  if (!box_it2.cycled_list() && box_it2.data() == bbox &&
921  bbox->region_type() >= BRT_UNKNOWN &&
922  (prev_bbox == NULL || prev_bbox->region_type() >= BRT_UNKNOWN))
923  ++num_matched;
924  else
925  ++num_unmatched;
926  total_widths += box.width();
927  prev_bbox = bbox;
928  }
929  if (num_unmatched + num_matched == 0) return NULL;
930  double avg_width = total_widths * 1.0 / (num_unmatched + num_matched);
931  double max_gap = textord_tabvector_vertical_gap_fraction * avg_width;
932  int min_box_match = static_cast<int>((num_matched + num_unmatched) *
934  bool is_vertical = (gaps.get_total() > 0 &&
935  num_matched >= min_box_match &&
936  gaps.median() <= max_gap);
937  if (textord_debug_tabfind > 1) {
938  tprintf("gaps=%d, matched=%d, unmatched=%d, min_match=%d "
939  "median gap=%.2f, width=%.2f max_gap=%.2f Vertical=%s\n",
940  gaps.get_total(), num_matched, num_unmatched, min_box_match,
941  gaps.median(), avg_width, max_gap, is_vertical?"Yes":"No");
942  }
943  return (is_vertical) ? partner : NULL;
944 }
945 
946 // The constructor is private.
947 TabVector::TabVector(int extended_ymin, int extended_ymax,
948  TabAlignment alignment, BLOBNBOX_CLIST* boxes)
949  : extended_ymin_(extended_ymin), extended_ymax_(extended_ymax),
950  sort_key_(0), percent_score_(0), mean_width_(0),
951  needs_refit_(true), needs_evaluation_(true), alignment_(alignment),
952  top_constraints_(NULL), bottom_constraints_(NULL) {
953  BLOBNBOX_C_IT it(&boxes_);
954  it.add_list_after(boxes);
955 }
956 
957 // Delete this, but first, repoint all the partners to point to
958 // replacement. If replacement is NULL, then partner relationships
959 // are removed.
960 void TabVector::Delete(TabVector* replacement) {
961  TabVector_C_IT it(&partners_);
962  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
963  TabVector* partner = it.data();
964  TabVector_C_IT p_it(&partner->partners_);
965  // If partner already has replacement in its list, then make
966  // replacement null, and just remove this TabVector when we find it.
967  TabVector* partner_replacement = replacement;
968  for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
969  TabVector* p_partner = p_it.data();
970  if (p_partner == partner_replacement) {
971  partner_replacement = NULL;
972  break;
973  }
974  }
975  // Remove all references to this, and replace with replacement if not NULL.
976  for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
977  TabVector* p_partner = p_it.data();
978  if (p_partner == this) {
979  p_it.extract();
980  if (partner_replacement != NULL)
981  p_it.add_before_stay_put(partner_replacement);
982  }
983  }
984  if (partner_replacement != NULL) {
985  partner_replacement->AddPartner(partner);
986  }
987  }
988  delete this;
989 }
990 
991 
992 } // namespace tesseract.
void set_x(inT16 xin)
rewrite function
Definition: points.h:61
void ExtendToBox(BLOBNBOX *blob)
Definition: tabvector.cpp:246
inT32 get_total() const
Definition: statistc.h:86
int extended_ymin() const
Definition: tabvector.h:155
const int kSimilarRaggedDist
Definition: tabvector.cpp:45
void Pen(Color color)
Definition: scrollview.cpp:726
static bool WithinTestRegion(int detail_level, int x, int y)
const int kGutterToNeighbourRatio
Definition: tabvector.cpp:40
bool Fit(ICOORD vertical, bool force_parallel)
Definition: tabvector.cpp:792
const ICOORD & botleft() const
Definition: rect.h:88
bool IsRagged() const
Definition: tabvector.h:229
#define MAX(x, y)
Definition: ndminx.h:24
const ICOORD & endpt() const
Definition: tabvector.h:149
const int kSimilarVectorDist
Definition: tabvector.cpp:42
bool IsAPartner(const TabVector *other)
Definition: tabvector.cpp:505
int XAtY(int y) const
Definition: tabvector.h:189
#define double_VAR(name, val, comment)
Definition: params.h:286
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:805
void AddPartner(TabVector *partner)
Definition: tabvector.cpp:492
void SetYStart(int start_y)
Definition: tabvector.cpp:270
#define tprintf(...)
Definition: tprintf.h:31
#define MIN(x, y)
Definition: ndminx.h:28
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:658
TabVector * VerticalTextlinePartner()
Definition: tabvector.cpp:888
Definition: statistc.h:33
#define ELIST2IZE(CLASSNAME)
Definition: elst2.h:1010
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:641
void add(inT32 value, inT32 count)
Definition: statistc.cpp:104
void Rotate(const FCOORD &rotation)
Definition: tabvector.cpp:281
void Evaluate(const ICOORD &vertical, TabFind *finder)
Definition: tabvector.cpp:591
bool SimilarTo(const ICOORD &vertical, const TabVector &other, BlobGrid *grid) const
Definition: tabvector.cpp:394
TabVector * ShallowCopy() const
Definition: tabvector.cpp:233
inT16 right() const
Definition: rect.h:75
const double kLineCountReciprocal
Definition: tabvector.cpp:51
static bool CompatibleConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2)
Definition: tabvector.cpp:78
TabVector * GetSinglePartner()
Definition: tabvector.cpp:878
ICOORD topleft() const
Definition: rect.h:96
void rotate(const FCOORD &vec)
Definition: ipoints.h:241
double median() const
Definition: statistc.cpp:243
static void MergeSimilarTabVectors(const ICOORD &vertical, TabVector_LIST *vectors, BlobGrid *grid)
Definition: tabvector.cpp:361
inT32 area() const
Definition: rect.h:118
int textord_debug_tabfind
Definition: alignedblob.cpp:27
int ExtendedOverlap(int top_y, int bottom_y) const
Definition: tabvector.h:208
int extended_ymax() const
Definition: tabvector.h:152
static int SortKey(const ICOORD &vertical, int x, int y)
Definition: tabvector.h:280
void Print(const char *prefix)
Definition: tabvector.cpp:525
#define CLISTIZE(CLASSNAME)
Definition: clst.h:958
inT16 y() const
access_function
Definition: points.h:56
inT16 left() const
Definition: rect.h:68
void SetYEnd(int end_y)
Definition: tabvector.cpp:275
void GutterWidthAndNeighbourGap(int tab_x, int mean_height, int max_gutter, bool left, BLOBNBOX *bbox, int *gutter_width, int *neighbour_gap)
Definition: tabfind.cpp:230
bool textord_debug_printable
Definition: alignedblob.cpp:34
BlobRegionType region_type() const
Definition: blobbox.h:268
const char * kAlignmentNames[]
Definition: tabvector.cpp:515
void SetupPartnerConstraints()
Definition: tabvector.cpp:302
const int kGutterMultiple
Definition: tabvector.cpp:38
#define MAX_INT32
Definition: host.h:120
void set_bottom_constraints(TabConstraint_LIST *constraints)
Definition: tabvector.h:167
integer coordinate
Definition: points.h:30
const ICOORD & startpt() const
Definition: tabvector.h:146
inT16 bottom() const
Definition: rect.h:61
const double kMinGutterFraction
Definition: tabvector.cpp:49
static void MergeConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2)
Definition: tabvector.cpp:95
ELISTIZE(AmbigSpec)
const int kMaxFillinMultiple
Definition: tabvector.cpp:47
inT16 height() const
Definition: rect.h:104
bool IsLeftTab() const
Definition: tabvector.h:213
const double kMinAlignedGutter
Definition: tabvector.cpp:53
inT16 width() const
Definition: rect.h:111
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:52
static void CreateConstraint(TabVector *vector, bool is_top)
Definition: tabvector.cpp:66
void set_y(inT16 yin)
rewrite function
Definition: points.h:65
int IntCastRounded(double x)
Definition: helpers.h:172
inT16 x() const
access function
Definition: points.h:52
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:75
Definition: rect.h:30
bool IsSeparator() const
Definition: tabvector.h:221
double textord_tabvector_vertical_gap_fraction
Definition: tabvector.cpp:58
const double kMinRaggedGutter
Definition: tabvector.cpp:55
ICOORD botright() const
Definition: rect.h:92
void Debug(const char *prefix)
Definition: tabvector.cpp:539
double textord_tabvector_vertical_box_ratio
Definition: tabvector.cpp:61
#define NULL
Definition: host.h:144
const ICOORD & topright() const
Definition: rect.h:100
const TBOX & bounding_box() const
Definition: blobbox.h:215
void set_top_constraints(TabConstraint_LIST *constraints)
Definition: tabvector.h:164
static TabVector * FitVector(TabAlignment alignment, ICOORD vertical, int extended_start_y, int extended_end_y, BLOBNBOX_CLIST *good_points, int *vertical_x, int *vertical_y)
Definition: tabvector.cpp:182
inT16 top() const
Definition: rect.h:54
BlobTextFlowType flow() const
Definition: blobbox.h:280
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:538
Definition: points.h:189
void Display(ScrollView *tab_win)
Definition: tabvector.cpp:551
void MergeWith(const ICOORD &vertical, TabVector *other)
Definition: tabvector.cpp:458
void FitAndEvaluateIfNeeded(const ICOORD &vertical, TabFind *finder)
Definition: tabvector.cpp:577
static void ApplyConstraints(TabConstraint_LIST *constraints)
Definition: tabvector.cpp:119
int GutterWidth(int bottom_y, int top_y, const TabVector &v, bool ignore_unmergeables, int max_gutter_width, int *required_shift)
Definition: tabfind.cpp:183
bool IsRightTab() const
Definition: tabvector.h:217
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:791