tesseract v5.3.3.20231005
tabvector.cpp
Go to the documentation of this file.
1
2// File: tabvector.cpp
3// Description: Class to hold a near-vertical vector representing a tab-stop.
4// Author: Ray Smith
5//
6// (C) Copyright 2008, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifdef HAVE_CONFIG_H
20# include "config_auto.h"
21#endif
22
23#include "blobbox.h"
24#include "colfind.h"
25#include "colpartitionset.h"
26#include "detlinefit.h"
27#include "helpers.h" // for IntCastRounded
28#include "statistc.h"
29#include "tabvector.h"
30
31#include <algorithm>
32
33namespace tesseract {
34
35// Multiple of height used as a gutter for evaluation search.
36const int kGutterMultiple = 4;
37// Multiple of neighbour gap that we expect the gutter gap to be at minimum.
39// Pixel distance for tab vectors to be considered the same.
40const int kSimilarVectorDist = 10;
41// Pixel distance for ragged tab vectors to be considered the same if there
42// is nothing in the overlap box
43const int kSimilarRaggedDist = 50;
44// Max multiple of height to allow filling in between blobs when evaluating.
45const int kMaxFillinMultiple = 11;
46// Min fraction of mean gutter size to allow a gutter on a good tab blob.
47const double kMinGutterFraction = 0.5;
48// Multiple of 1/n lines as a minimum gutter in evaluation.
49const double kLineCountReciprocal = 4.0;
50// Constant add-on for minimum gutter for aligned tabs.
51const double kMinAlignedGutter = 0.25;
52// Constant add-on for minimum gutter for ragged tabs.
53const double kMinRaggedGutter = 1.5;
54
56 "max fraction of mean blob width allowed for vertical gaps in "
57 "vertical text");
58
60 "Fraction of box matches required to declare a line vertical");
61
62// Create a constraint for the top or bottom of this TabVector.
63void TabConstraint::CreateConstraint(TabVector *vector, bool is_top) {
64 auto *constraint = new TabConstraint(vector, is_top);
65 auto *constraints = new TabConstraint_LIST;
66 TabConstraint_IT it(constraints);
67 it.add_to_end(constraint);
68 if (is_top) {
69 vector->set_top_constraints(constraints);
70 } else {
71 vector->set_bottom_constraints(constraints);
72 }
73}
74
75// Test to see if the constraints are compatible enough to merge.
76bool TabConstraint::CompatibleConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2) {
77 if (list1 == list2) {
78 return false;
79 }
80 int y_min = -INT32_MAX;
81 int y_max = INT32_MAX;
82 if (textord_debug_tabfind > 3) {
83 tprintf("Testing constraint compatibility\n");
84 }
85 GetConstraints(list1, &y_min, &y_max);
86 GetConstraints(list2, &y_min, &y_max);
87 if (textord_debug_tabfind > 3) {
88 tprintf("Resulting range = [%d,%d]\n", y_min, y_max);
89 }
90 return y_max >= y_min;
91}
92
93// Merge the lists of constraints and update the TabVector pointers.
94// The second list is deleted.
95void TabConstraint::MergeConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2) {
96 if (list1 == list2) {
97 return;
98 }
99 TabConstraint_IT it(list2);
100 if (textord_debug_tabfind > 3) {
101 tprintf("Merging constraints\n");
102 }
103 // The vectors of all constraints on list2 are now going to be on list1.
104 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
105 TabConstraint *constraint = it.data();
106 if (textord_debug_tabfind > 3) {
107 constraint->vector_->Print("Merge");
108 }
109 if (constraint->is_top_) {
110 constraint->vector_->set_top_constraints(list1);
111 } else {
112 constraint->vector_->set_bottom_constraints(list1);
113 }
114 }
115 it = list1;
116 it.add_list_before(list2);
117 delete list2;
118}
119
120// Set all the tops and bottoms as appropriate to a mean of the
121// constrained range. Delete all the constraints and list.
122void TabConstraint::ApplyConstraints(TabConstraint_LIST *constraints) {
123 int y_min = -INT32_MAX;
124 int y_max = INT32_MAX;
125 GetConstraints(constraints, &y_min, &y_max);
126 int y = (y_min + y_max) / 2;
127 TabConstraint_IT it(constraints);
128 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
129 TabConstraint *constraint = it.data();
130 TabVector *v = constraint->vector_;
131 if (constraint->is_top_) {
132 v->SetYEnd(y);
133 v->set_top_constraints(nullptr);
134 } else {
135 v->SetYStart(y);
136 v->set_bottom_constraints(nullptr);
137 }
138 }
139 delete constraints;
140}
141
142TabConstraint::TabConstraint(TabVector *vector, bool is_top) : vector_(vector), is_top_(is_top) {
143 if (is_top) {
144 y_min_ = vector->endpt().y();
145 y_max_ = vector->extended_ymax();
146 } else {
147 y_max_ = vector->startpt().y();
148 y_min_ = vector->extended_ymin();
149 }
150}
151
152// Get the max of the mins and the min of the maxes.
153void TabConstraint::GetConstraints(TabConstraint_LIST *constraints, int *y_min, int *y_max) {
154 TabConstraint_IT it(constraints);
155 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
156 TabConstraint *constraint = it.data();
157 if (textord_debug_tabfind > 3) {
158 tprintf("Constraint is [%d,%d]", constraint->y_min_, constraint->y_max_);
159 constraint->vector_->Print(" for");
160 }
161 *y_min = std::max(*y_min, constraint->y_min_);
162 *y_max = std::min(*y_max, constraint->y_max_);
163 }
164}
165
166// The constructor is private. See the bottom of the file...
167
168// Public factory to build a TabVector from a list of boxes.
169// The TabVector will be of the given alignment type.
170// The input vertical vector is used in fitting, and the output
171// vertical_x, vertical_y have the resulting line vector added to them
172// if the alignment is not ragged.
173// The extended_start_y and extended_end_y are the maximum possible
174// extension to the line segment that can be used to align with others.
175// The input CLIST of BLOBNBOX good_points is consumed and taken over.
176TabVector *TabVector::FitVector(TabAlignment alignment, ICOORD vertical, int extended_start_y,
177 int extended_end_y, BLOBNBOX_CLIST *good_points, int *vertical_x,
178 int *vertical_y) {
179 auto *vector = new TabVector(extended_start_y, extended_end_y, alignment, good_points);
180 if (!vector->Fit(vertical, false)) {
181 delete vector;
182 return nullptr;
183 }
184 if (!vector->IsRagged()) {
185 vertical = vector->endpt_ - vector->startpt_;
186 int weight = vector->BoxCount();
187 *vertical_x += vertical.x() * weight;
188 *vertical_y += vertical.y() * weight;
189 }
190 return vector;
191}
192
193// Build a ragged TabVector by copying another's direction, shifting it
194// to match the given blob, and making its initial extent the height
195// of the blob, but its extended bounds from the bounds of the original.
196TabVector::TabVector(const TabVector &src, TabAlignment alignment, const ICOORD &vertical_skew,
197 BLOBNBOX *blob)
198 : extended_ymin_(src.extended_ymin_)
199 , extended_ymax_(src.extended_ymax_)
200 , needs_refit_(true)
201 , needs_evaluation_(true)
202 , alignment_(alignment) {
203 BLOBNBOX_C_IT it(&boxes_);
204 it.add_to_end(blob);
205 TBOX box = blob->bounding_box();
206 if (IsLeftTab()) {
207 startpt_ = box.botleft();
208 endpt_ = box.topleft();
209 } else {
210 startpt_ = box.botright();
211 endpt_ = box.topright();
212 }
213 sort_key_ =
214 SortKey(vertical_skew, (startpt_.x() + endpt_.x()) / 2, (startpt_.y() + endpt_.y()) / 2);
215 if (textord_debug_tabfind > 3) {
216 Print("Constructed a new tab vector:");
217 }
218}
219
220// Copies basic attributes of a tab vector for simple operations.
221// Copies things such startpt, endpt, range.
222// Does not copy things such as partners, boxes, or constraints.
223// This is useful if you only need vector information for processing, such
224// as in the table detection code.
226 auto *copy = new TabVector();
227 copy->startpt_ = startpt_;
228 copy->endpt_ = endpt_;
229 copy->alignment_ = alignment_;
230 copy->extended_ymax_ = extended_ymax_;
231 copy->extended_ymin_ = extended_ymin_;
232 copy->intersects_other_lines_ = intersects_other_lines_;
233 return copy;
234}
235
236// Extend this vector to include the supplied blob if it doesn't
237// already have it.
239 TBOX new_box = new_blob->bounding_box();
240 BLOBNBOX_C_IT it(&boxes_);
241 if (!it.empty()) {
242 BLOBNBOX *blob = it.data();
243 TBOX box = blob->bounding_box();
244 while (!it.at_last() && box.top() <= new_box.top()) {
245 if (blob == new_blob) {
246 return; // We have it already.
247 }
248 it.forward();
249 blob = it.data();
250 box = blob->bounding_box();
251 }
252 if (box.top() >= new_box.top()) {
253 it.add_before_stay_put(new_blob);
254 needs_refit_ = true;
255 return;
256 }
257 }
258 needs_refit_ = true;
259 it.add_after_stay_put(new_blob);
260}
261
262// Set the ycoord of the start and move the xcoord to match.
263void TabVector::SetYStart(int start_y) {
264 startpt_.set_x(XAtY(start_y));
265 startpt_.set_y(start_y);
266}
267// Set the ycoord of the end and move the xcoord to match.
268void TabVector::SetYEnd(int end_y) {
269 endpt_.set_x(XAtY(end_y));
270 endpt_.set_y(end_y);
271}
272
273// Rotate the ends by the given vector. Auto flip start and end if needed.
274void TabVector::Rotate(const FCOORD &rotation) {
275 startpt_.rotate(rotation);
276 endpt_.rotate(rotation);
277 int dx = endpt_.x() - startpt_.x();
278 int dy = endpt_.y() - startpt_.y();
279 if ((dy < 0 && abs(dy) > abs(dx)) || (dx < 0 && abs(dx) > abs(dy))) {
280 // Need to flip start/end.
281 ICOORD tmp = startpt_;
282 startpt_ = endpt_;
283 endpt_ = tmp;
284 }
285}
286
287// Setup the initial constraints, being the limits of
288// the vector and the extended ends.
292}
293
294// Setup the constraints between the partners of this TabVector.
296 // With the first and last partner, we want a common bottom and top,
297 // respectively, and for each change of partner, we want a common
298 // top of first with bottom of next.
299 TabVector_C_IT it(&partners_);
300 TabVector *prev_partner = nullptr;
301 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
302 TabVector *partner = it.data();
303 if (partner->top_constraints_ == nullptr || partner->bottom_constraints_ == nullptr) {
304 partner->Print("Impossible: has no constraints");
305 Print("This vector has it as a partner");
306 continue;
307 }
308 if (prev_partner == nullptr) {
309 // This is the first partner, so common bottom.
310 if (TabConstraint::CompatibleConstraints(bottom_constraints_, partner->bottom_constraints_)) {
311 TabConstraint::MergeConstraints(bottom_constraints_, partner->bottom_constraints_);
312 }
313 } else {
314 // We need prev top to be common with partner bottom.
315 if (TabConstraint::CompatibleConstraints(prev_partner->top_constraints_,
316 partner->bottom_constraints_)) {
317 TabConstraint::MergeConstraints(prev_partner->top_constraints_,
318 partner->bottom_constraints_);
319 }
320 }
321 prev_partner = partner;
322 if (it.at_last()) {
323 // This is the last partner, so common top.
324 if (TabConstraint::CompatibleConstraints(top_constraints_, partner->top_constraints_)) {
325 TabConstraint::MergeConstraints(top_constraints_, partner->top_constraints_);
326 }
327 }
328 }
329}
330
331// Setup the constraints between this and its partner.
333 if (TabConstraint::CompatibleConstraints(bottom_constraints_, partner->bottom_constraints_)) {
334 TabConstraint::MergeConstraints(bottom_constraints_, partner->bottom_constraints_);
335 }
336 if (TabConstraint::CompatibleConstraints(top_constraints_, partner->top_constraints_)) {
337 TabConstraint::MergeConstraints(top_constraints_, partner->top_constraints_);
338 }
339}
340
341// Use the constraints to modify the top and bottom.
343 if (top_constraints_ != nullptr) {
344 TabConstraint::ApplyConstraints(top_constraints_);
345 }
346 if (bottom_constraints_ != nullptr) {
347 TabConstraint::ApplyConstraints(bottom_constraints_);
348 }
349}
350
351// Merge close tab vectors of the same side that overlap.
352void TabVector::MergeSimilarTabVectors(const ICOORD &vertical, TabVector_LIST *vectors,
353 BlobGrid *grid) {
354 TabVector_IT it1(vectors);
355 for (it1.mark_cycle_pt(); !it1.cycled_list(); it1.forward()) {
356 TabVector *v1 = it1.data();
357 TabVector_IT it2(it1);
358 for (it2.forward(); !it2.at_first(); it2.forward()) {
359 TabVector *v2 = it2.data();
360 if (v2->SimilarTo(vertical, *v1, grid)) {
361 // Merge into the forward one, in case the combined vector now
362 // overlaps one in between.
364 v2->Print("Merging");
365 v1->Print("by deleting");
366 }
367 v2->MergeWith(vertical, it1.extract());
369 v2->Print("Producing");
370 }
371 ICOORD merged_vector = v2->endpt();
372 merged_vector -= v2->startpt();
373 if (textord_debug_tabfind && abs(merged_vector.x()) > 100) {
374 v2->Print("Garbage result of merge?");
375 }
376 break;
377 }
378 }
379 }
380}
381
382// Return true if this vector is the same side, overlaps, and close
383// enough to the other to be merged.
384bool TabVector::SimilarTo(const ICOORD &vertical, const TabVector &other, BlobGrid *grid) const {
385 if ((IsRightTab() && other.IsRightTab()) || (IsLeftTab() && other.IsLeftTab())) {
386 // If they don't overlap, at least in extensions, then there is no chance.
387 if (ExtendedOverlap(other.extended_ymax_, other.extended_ymin_) < 0) {
388 return false;
389 }
390 // A fast approximation to the scale factor of the sort_key_.
391 int v_scale = abs(vertical.y());
392 if (v_scale == 0) {
393 v_scale = 1;
394 }
395 // If they are close enough, then OK.
396 if (sort_key_ + kSimilarVectorDist * v_scale >= other.sort_key_ &&
397 sort_key_ - kSimilarVectorDist * v_scale <= other.sort_key_) {
398 return true;
399 }
400 // Ragged tabs get a bigger threshold.
401 if (!IsRagged() || !other.IsRagged() ||
402 sort_key_ + kSimilarRaggedDist * v_scale < other.sort_key_ ||
403 sort_key_ - kSimilarRaggedDist * v_scale > other.sort_key_) {
404 return false;
405 }
406 if (grid == nullptr) {
407 // There is nothing else to test!
408 return true;
409 }
410 // If there is nothing in the rectangle between the vector that is going to
411 // move, and the place it is moving to, then they can be merged.
412 // Setup a vertical search for any blob.
413 const TabVector *mover = (IsRightTab() && sort_key_ < other.sort_key_) ? this : &other;
414 int top_y = mover->endpt_.y();
415 int bottom_y = mover->startpt_.y();
416 int left = std::min(mover->XAtY(top_y), mover->XAtY(bottom_y));
417 int right = std::max(mover->XAtY(top_y), mover->XAtY(bottom_y));
418 int shift = abs(sort_key_ - other.sort_key_) / v_scale;
419 if (IsRightTab()) {
420 right += shift;
421 } else {
422 left -= shift;
423 }
424
426 vsearch.StartVerticalSearch(left, right, top_y);
427 BLOBNBOX *blob;
428 while ((blob = vsearch.NextVerticalSearch(true)) != nullptr) {
429 const TBOX &box = blob->bounding_box();
430 if (box.top() > bottom_y) {
431 return true; // Nothing found.
432 }
433 if (box.bottom() < top_y) {
434 continue; // Doesn't overlap.
435 }
436 int left_at_box = XAtY(box.bottom());
437 int right_at_box = left_at_box;
438 if (IsRightTab()) {
439 right_at_box += shift;
440 } else {
441 left_at_box -= shift;
442 }
443 if (std::min(right_at_box, static_cast<int>(box.right())) >
444 std::max(left_at_box, static_cast<int>(box.left()))) {
445 return false;
446 }
447 }
448 return true; // Nothing found.
449 }
450 return false;
451}
452
453// Eat the other TabVector into this and delete it.
454void TabVector::MergeWith(const ICOORD &vertical, TabVector *other) {
455 extended_ymin_ = std::min(extended_ymin_, other->extended_ymin_);
456 extended_ymax_ = std::max(extended_ymax_, other->extended_ymax_);
457 if (other->IsRagged()) {
458 alignment_ = other->alignment_;
459 }
460 // Merge sort the two lists of boxes.
461 BLOBNBOX_C_IT it1(&boxes_);
462 BLOBNBOX_C_IT it2(&other->boxes_);
463 while (!it2.empty()) {
464 BLOBNBOX *bbox2 = it2.extract();
465 it2.forward();
466 TBOX box2 = bbox2->bounding_box();
467 BLOBNBOX *bbox1 = it1.data();
468 TBOX box1 = bbox1->bounding_box();
469 while (box1.bottom() < box2.bottom() && !it1.at_last()) {
470 it1.forward();
471 bbox1 = it1.data();
472 box1 = bbox1->bounding_box();
473 }
474 if (box1.bottom() < box2.bottom()) {
475 it1.add_to_end(bbox2);
476 } else if (bbox1 != bbox2) {
477 it1.add_before_stay_put(bbox2);
478 }
479 }
480 Fit(vertical, true);
481 other->Delete(this);
482}
483
484// Add a new element to the list of partner TabVectors.
485// Partners must be added in order of increasing y coordinate of the text line
486// that makes them partners.
487// Groups of identical partners are merged into one.
489 if (IsSeparator() || partner->IsSeparator()) {
490 return;
491 }
492 TabVector_C_IT it(&partners_);
493 if (!it.empty()) {
494 it.move_to_last();
495 if (it.data() == partner) {
496 return;
497 }
498 }
499 it.add_after_then_move(partner);
500}
501
502// Return true if other is a partner of this.
504 TabVector_C_IT it(&partners_);
505 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
506 if (it.data() == other) {
507 return true;
508 }
509 }
510 return false;
511}
512
513// These names must be synced with the TabAlignment enum in tabvector.h.
514static const char *const kAlignmentNames[] = {"Left Aligned", "Left Ragged", "Center",
515 "Right Aligned", "Right Ragged", "Separator"};
516
517// Print basic information about this tab vector.
518void TabVector::Print(const char *prefix) {
519 tprintf(
520 "%s %s (%d,%d)->(%d,%d) w=%d s=%d, sort key=%d, boxes=%d,"
521 " partners=%d\n",
522 prefix, kAlignmentNames[alignment_], startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y(),
523 mean_width_, percent_score_, sort_key_, boxes_.length(), partners_.length());
524}
525
526// Print basic information about this tab vector and every box in it.
527void TabVector::Debug(const char *prefix) {
528 Print(prefix);
529 BLOBNBOX_C_IT it(&boxes_);
530 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
531 BLOBNBOX *bbox = it.data();
532 const TBOX &box = bbox->bounding_box();
533 tprintf("Box at (%d,%d)->(%d,%d)\n", box.left(), box.bottom(), box.right(), box.top());
534 }
535}
536
537#ifndef GRAPHICS_DISABLED
538
539// Draw this tabvector in place in the given window.
542 tab_win->Pen(ScrollView::BLUE);
543 } else if (alignment_ == TA_LEFT_ALIGNED) {
544 tab_win->Pen(ScrollView::LIME_GREEN);
545 } else if (alignment_ == TA_LEFT_RAGGED) {
546 tab_win->Pen(ScrollView::DARK_GREEN);
547 } else if (alignment_ == TA_RIGHT_ALIGNED) {
548 tab_win->Pen(ScrollView::PINK);
549 } else if (alignment_ == TA_RIGHT_RAGGED) {
550 tab_win->Pen(ScrollView::CORAL);
551 } else {
552 tab_win->Pen(ScrollView::WHITE);
553 }
554 tab_win->Line(startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y());
555 tab_win->Pen(ScrollView::GREY);
556 tab_win->Line(startpt_.x(), startpt_.y(), startpt_.x(), extended_ymin_);
557 tab_win->Line(endpt_.x(), extended_ymax_, endpt_.x(), endpt_.y());
558 auto score_string = std::to_string(percent_score_);
559 tab_win->TextAttributes("Times", 50, false, false, false);
560 tab_win->Text(startpt_.x(), startpt_.y(), score_string.c_str());
561}
562
563#endif
564
565// Refit the line and/or re-evaluate the vector if the dirty flags are set.
566void TabVector::FitAndEvaluateIfNeeded(const ICOORD &vertical, TabFind *finder) {
567 if (needs_refit_) {
568 Fit(vertical, true);
569 }
570 if (needs_evaluation_) {
571 Evaluate(vertical, finder);
572 }
573}
574
575// Evaluate the vector in terms of coverage of its length by good-looking
576// box edges. A good looking box is one where its nearest neighbour on the
577// inside is nearer than half the distance its nearest neighbour on the
578// outside of the putative column. Bad boxes are removed from the line.
579// A second pass then further filters boxes by requiring that the gutter
580// width be a minimum fraction of the mean gutter along the line.
581void TabVector::Evaluate(const ICOORD &vertical, TabFind *finder) {
582 bool debug = false;
583 needs_evaluation_ = false;
584 int length = endpt_.y() - startpt_.y();
585 if (length == 0 || boxes_.empty()) {
586 percent_score_ = 0;
587 Print("Zero length in evaluate");
588 return;
589 }
590 // Compute the mean box height.
591 BLOBNBOX_C_IT it(&boxes_);
592 int mean_height = 0;
593 int height_count = 0;
594 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
595 BLOBNBOX *bbox = it.data();
596 const TBOX &box = bbox->bounding_box();
597 int height = box.height();
598 mean_height += height;
599 ++height_count;
600 }
601 if (height_count > 0) {
602 mean_height /= height_count;
603 }
604 int max_gutter = kGutterMultiple * mean_height;
605 if (IsRagged()) {
606 // Ragged edges face a tougher test in that the gap must always be within
607 // the height of the blob.
608 max_gutter = kGutterToNeighbourRatio * mean_height;
609 }
610
611 STATS gutters(0, max_gutter);
612 // Evaluate the boxes for their goodness, calculating the coverage as we go.
613 // Remove boxes that are not good and shorten the list to the first and
614 // last good boxes.
615 int num_deleted_boxes = 0;
616 bool text_on_image = false;
617 int good_length = 0;
618 const TBOX *prev_good_box = nullptr;
619 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
620 BLOBNBOX *bbox = it.data();
621 const TBOX &box = bbox->bounding_box();
622 int mid_y = (box.top() + box.bottom()) / 2;
623 if (TabFind::WithinTestRegion(2, XAtY(box.bottom()), box.bottom())) {
624 if (!debug) {
625 tprintf("After already deleting %d boxes, ", num_deleted_boxes);
626 Print("Starting evaluation");
627 }
628 debug = true;
629 }
630 // A good box is one where the nearest neighbour on the inside is closer
631 // than half the distance to the nearest neighbour on the outside
632 // (of the putative column).
633 bool left = IsLeftTab();
634 int tab_x = XAtY(mid_y);
635 int gutter_width;
636 int neighbour_gap;
637 finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left, bbox, &gutter_width,
638 &neighbour_gap);
639 if (debug) {
640 tprintf("Box (%d,%d)->(%d,%d) has gutter %d, ndist %d\n", box.left(), box.bottom(),
641 box.right(), box.top(), gutter_width, neighbour_gap);
642 }
643 // Now we can make the test.
644 if (neighbour_gap * kGutterToNeighbourRatio <= gutter_width) {
645 // A good box contributes its height to the good_length.
646 good_length += box.top() - box.bottom();
647 gutters.add(gutter_width, 1);
648 // Two good boxes together contribute the gap between them
649 // to the good_length as well, as long as the gap is not
650 // too big.
651 if (prev_good_box != nullptr) {
652 int vertical_gap = box.bottom() - prev_good_box->top();
653 double size1 = sqrt(static_cast<double>(prev_good_box->area()));
654 double size2 = sqrt(static_cast<double>(box.area()));
655 if (vertical_gap < kMaxFillinMultiple * std::min(size1, size2)) {
656 good_length += vertical_gap;
657 }
658 if (debug) {
659 tprintf("Box and prev good, gap=%d, target %g, goodlength=%d\n", vertical_gap,
660 kMaxFillinMultiple * std::min(size1, size2), good_length);
661 }
662 } else {
663 // Adjust the start to the first good box.
664 SetYStart(box.bottom());
665 }
666 prev_good_box = &box;
667 if (bbox->flow() == BTFT_TEXT_ON_IMAGE) {
668 text_on_image = true;
669 }
670 } else {
671 // Get rid of boxes that are not good.
672 if (debug) {
673 tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, ndist %d\n", box.left(), box.bottom(),
674 box.right(), box.top(), gutter_width, neighbour_gap);
675 }
676 it.extract();
677 ++num_deleted_boxes;
678 }
679 }
680 if (debug) {
681 Print("Evaluating:");
682 }
683 // If there are any good boxes, do it again, except this time get rid of
684 // boxes that have a gutter that is a small fraction of the mean gutter.
685 // This filters out ends that run into a coincidental gap in the text.
686 int search_top = endpt_.y();
687 int search_bottom = startpt_.y();
688 int median_gutter = IntCastRounded(gutters.median());
689 if (gutters.get_total() > 0) {
690 prev_good_box = nullptr;
691 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
692 BLOBNBOX *bbox = it.data();
693 const TBOX &box = bbox->bounding_box();
694 int mid_y = (box.top() + box.bottom()) / 2;
695 // A good box is one where the gutter width is at least some constant
696 // fraction of the mean gutter width.
697 bool left = IsLeftTab();
698 int tab_x = XAtY(mid_y);
699 int max_gutter = kGutterMultiple * mean_height;
700 if (IsRagged()) {
701 // Ragged edges face a tougher test in that the gap must always be
702 // within the height of the blob.
703 max_gutter = kGutterToNeighbourRatio * mean_height;
704 }
705 int gutter_width;
706 int neighbour_gap;
707 finder->GutterWidthAndNeighbourGap(tab_x, mean_height, max_gutter, left, bbox, &gutter_width,
708 &neighbour_gap);
709 // Now we can make the test.
710 if (gutter_width >= median_gutter * kMinGutterFraction) {
711 if (prev_good_box == nullptr) {
712 // Adjust the start to the first good box.
713 SetYStart(box.bottom());
714 search_bottom = box.top();
715 }
716 prev_good_box = &box;
717 search_top = box.bottom();
718 } else {
719 // Get rid of boxes that are not good.
720 if (debug) {
721 tprintf("Bad Box (%d,%d)->(%d,%d) with gutter %d, mean gutter %d\n", box.left(),
722 box.bottom(), box.right(), box.top(), gutter_width, median_gutter);
723 }
724 it.extract();
725 ++num_deleted_boxes;
726 }
727 }
728 }
729 // If there has been a good box, adjust the end.
730 if (prev_good_box != nullptr) {
731 SetYEnd(prev_good_box->top());
732 // Compute the percentage of the vector that is occupied by good boxes.
733 int length = endpt_.y() - startpt_.y();
734 percent_score_ = 100 * good_length / length;
735 if (num_deleted_boxes > 0) {
736 needs_refit_ = true;
737 FitAndEvaluateIfNeeded(vertical, finder);
738 if (boxes_.empty()) {
739 return;
740 }
741 }
742 // Test the gutter over the whole vector, instead of just at the boxes.
743 int required_shift;
744 if (search_bottom > search_top) {
745 search_bottom = startpt_.y();
746 search_top = endpt_.y();
747 }
748 double min_gutter_width = kLineCountReciprocal / boxes_.length();
749 min_gutter_width += IsRagged() ? kMinRaggedGutter : kMinAlignedGutter;
750 min_gutter_width *= mean_height;
751 int max_gutter_width = IntCastRounded(min_gutter_width) + 1;
752 if (median_gutter > max_gutter_width) {
753 max_gutter_width = median_gutter;
754 }
755 int gutter_width = finder->GutterWidth(search_bottom, search_top, *this, text_on_image,
756 max_gutter_width, &required_shift);
757 if (gutter_width < min_gutter_width) {
758 if (debug) {
759 tprintf("Rejecting bad tab Vector with %d gutter vs %g min\n", gutter_width,
760 min_gutter_width);
761 }
762 boxes_.shallow_clear();
763 percent_score_ = 0;
764 } else if (debug) {
765 tprintf("Final gutter %d, vs limit of %g, required shift = %d\n", gutter_width,
766 min_gutter_width, required_shift);
767 }
768 } else {
769 // There are no good boxes left, so score is 0.
770 percent_score_ = 0;
771 }
772
773 if (debug) {
774 Print("Evaluation complete:");
775 }
776}
777
778// (Re)Fit a line to the stored points. Returns false if the line
779// is degenerate. Although the TabVector code mostly doesn't care about the
780// direction of lines, XAtY would give silly results for a horizontal line.
781// The class is mostly aimed at use for vertical lines representing
782// horizontal tab stops.
783bool TabVector::Fit(ICOORD vertical, bool force_parallel) {
784 needs_refit_ = false;
785 if (boxes_.empty()) {
786 // Don't refit something with no boxes, as that only happens
787 // in Evaluate, and we don't want to end up with a zero vector.
788 if (!force_parallel) {
789 return false;
790 }
791 // If we are forcing parallel, then we just need to set the sort_key_.
792 ICOORD midpt = startpt_;
793 midpt += endpt_;
794 midpt /= 2;
795 sort_key_ = SortKey(vertical, midpt.x(), midpt.y());
796 return startpt_.y() != endpt_.y();
797 }
798 if (!force_parallel && !IsRagged()) {
799 // Use a fitted line as the vertical.
800 DetLineFit linepoints;
801 BLOBNBOX_C_IT it(&boxes_);
802 // Fit a line to all the boxes in the list.
803 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
804 BLOBNBOX *bbox = it.data();
805 const TBOX &box = bbox->bounding_box();
806 int x1 = IsRightTab() ? box.right() : box.left();
807 ICOORD boxpt(x1, box.bottom());
808 linepoints.Add(boxpt);
809 if (it.at_last()) {
810 ICOORD top_pt(x1, box.top());
811 linepoints.Add(top_pt);
812 }
813 }
814 linepoints.Fit(&startpt_, &endpt_);
815 if (startpt_.y() != endpt_.y()) {
816 vertical = endpt_;
817 vertical -= startpt_;
818 }
819 }
820 int start_y = startpt_.y();
821 int end_y = endpt_.y();
822 sort_key_ = IsLeftTab() ? INT32_MAX : -INT32_MAX;
823 BLOBNBOX_C_IT it(&boxes_);
824 // Choose a line parallel to the vertical such that all boxes are on the
825 // correct side of it.
826 mean_width_ = 0;
827 int width_count = 0;
828 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
829 BLOBNBOX *bbox = it.data();
830 const TBOX &box = bbox->bounding_box();
831 mean_width_ += box.width();
832 ++width_count;
833 int x1 = IsRightTab() ? box.right() : box.left();
834 // Test both the bottom and the top, as one will be more extreme, depending
835 // on the direction of skew.
836 int bottom_y = box.bottom();
837 int top_y = box.top();
838 int key = SortKey(vertical, x1, bottom_y);
839 if (IsLeftTab() == (key < sort_key_)) {
840 sort_key_ = key;
841 startpt_ = ICOORD(x1, bottom_y);
842 }
843 key = SortKey(vertical, x1, top_y);
844 if (IsLeftTab() == (key < sort_key_)) {
845 sort_key_ = key;
846 startpt_ = ICOORD(x1, top_y);
847 }
848 if (it.at_first()) {
849 start_y = bottom_y;
850 }
851 if (it.at_last()) {
852 end_y = top_y;
853 }
854 }
855 if (width_count > 0) {
856 mean_width_ = (mean_width_ + width_count - 1) / width_count;
857 }
858 endpt_ = startpt_ + vertical;
859 needs_evaluation_ = true;
860 if (start_y != end_y) {
861 // Set the ends of the vector to fully include the first and last blobs.
862 startpt_.set_x(XAtY(vertical, sort_key_, start_y));
863 startpt_.set_y(start_y);
864 endpt_.set_x(XAtY(vertical, sort_key_, end_y));
865 endpt_.set_y(end_y);
866 return true;
867 }
868 return false;
869}
870
871// Returns the singleton partner if there is one, or nullptr otherwise.
873 if (!partners_.singleton()) {
874 return nullptr;
875 }
876 TabVector_C_IT partner_it(&partners_);
877 TabVector *partner = partner_it.data();
878 return partner;
879}
880
881// Return the partner of this TabVector if the vector qualifies as
882// being a vertical text line, otherwise nullptr.
884 if (!partners_.singleton()) {
885 return nullptr;
886 }
887 TabVector_C_IT partner_it(&partners_);
888 TabVector *partner = partner_it.data();
889 BLOBNBOX_C_IT box_it1(&boxes_);
890 BLOBNBOX_C_IT box_it2(&partner->boxes_);
891 // Count how many boxes are also in the other list.
892 // At the same time, gather the mean width and median vertical gap.
893 if (textord_debug_tabfind > 1) {
894 Print("Testing for vertical text");
895 partner->Print(" partner");
896 }
897 int num_matched = 0;
898 int num_unmatched = 0;
899 int total_widths = 0;
900 int width = startpt().x() - partner->startpt().x();
901 if (width < 0) {
902 width = -width;
903 }
904 STATS gaps(0, width * 2 - 1);
905 BLOBNBOX *prev_bbox = nullptr;
906 box_it2.mark_cycle_pt();
907 for (box_it1.mark_cycle_pt(); !box_it1.cycled_list(); box_it1.forward()) {
908 BLOBNBOX *bbox = box_it1.data();
909 TBOX box = bbox->bounding_box();
910 if (prev_bbox != nullptr) {
911 gaps.add(box.bottom() - prev_bbox->bounding_box().top(), 1);
912 }
913 while (!box_it2.cycled_list() && box_it2.data() != bbox &&
914 box_it2.data()->bounding_box().bottom() < box.bottom()) {
915 box_it2.forward();
916 }
917 if (!box_it2.cycled_list() && box_it2.data() == bbox && bbox->region_type() >= BRT_UNKNOWN &&
918 (prev_bbox == nullptr || prev_bbox->region_type() >= BRT_UNKNOWN)) {
919 ++num_matched;
920 } else {
921 ++num_unmatched;
922 }
923 total_widths += box.width();
924 prev_bbox = bbox;
925 }
926 if (num_unmatched + num_matched == 0) {
927 return nullptr;
928 }
929 double avg_width = total_widths * 1.0 / (num_unmatched + num_matched);
930 double max_gap = textord_tabvector_vertical_gap_fraction * avg_width;
931 int min_box_match =
932 static_cast<int>((num_matched + num_unmatched) * textord_tabvector_vertical_box_ratio);
933 bool is_vertical =
934 (gaps.get_total() > 0 && num_matched >= min_box_match && gaps.median() <= max_gap);
935 if (textord_debug_tabfind > 1) {
936 tprintf(
937 "gaps=%d, matched=%d, unmatched=%d, min_match=%d "
938 "median gap=%.2f, width=%.2f max_gap=%.2f Vertical=%s\n",
939 gaps.get_total(), num_matched, num_unmatched, min_box_match, gaps.median(), avg_width,
940 max_gap, is_vertical ? "Yes" : "No");
941 }
942 return (is_vertical) ? partner : nullptr;
943}
944
945// The constructor is private.
946TabVector::TabVector(int extended_ymin, int extended_ymax, TabAlignment alignment,
947 BLOBNBOX_CLIST *boxes)
948 : extended_ymin_(extended_ymin)
949 , extended_ymax_(extended_ymax)
950 , sort_key_(0)
951 , percent_score_(0)
952 , mean_width_(0)
953 , needs_refit_(true)
954 , needs_evaluation_(true)
955 , alignment_(alignment)
956 , top_constraints_(nullptr)
957 , bottom_constraints_(nullptr) {
958 BLOBNBOX_C_IT it(&boxes_);
959 it.add_list_after(boxes);
960}
961
962// Delete this, but first, repoint all the partners to point to
963// replacement. If replacement is nullptr, then partner relationships
964// are removed.
965void TabVector::Delete(TabVector *replacement) {
966 TabVector_C_IT it(&partners_);
967 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
968 TabVector *partner = it.data();
969 TabVector_C_IT p_it(&partner->partners_);
970 // If partner already has replacement in its list, then make
971 // replacement null, and just remove this TabVector when we find it.
972 TabVector *partner_replacement = replacement;
973 for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
974 TabVector *p_partner = p_it.data();
975 if (p_partner == partner_replacement) {
976 partner_replacement = nullptr;
977 break;
978 }
979 }
980 // Remove all references to this, and replace with replacement if not
981 // nullptr.
982 for (p_it.mark_cycle_pt(); !p_it.cycled_list(); p_it.forward()) {
983 TabVector *p_partner = p_it.data();
984 if (p_partner == this) {
985 p_it.extract();
986 if (partner_replacement != nullptr) {
987 p_it.add_before_stay_put(partner_replacement);
988 }
989 }
990 }
991 if (partner_replacement != nullptr) {
992 partner_replacement->AddPartner(partner);
993 }
994 }
995 delete this;
996}
997
998} // namespace tesseract.
#define double_VAR(name, val, comment)
Definition: params.h:366
const double y
double textord_tabvector_vertical_gap_fraction
Definition: tabvector.cpp:57
const double kMinAlignedGutter
Definition: tabvector.cpp:51
@ BRT_UNKNOWN
Definition: blobbox.h:80
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int IntCastRounded(double x)
Definition: helpers.h:170
const int kGutterMultiple
Definition: tabvector.cpp:36
bool textord_debug_printable
Definition: alignedblob.cpp:43
const int kSimilarVectorDist
Definition: tabvector.cpp:40
const double kMinRaggedGutter
Definition: tabvector.cpp:53
int textord_debug_tabfind
Definition: alignedblob.cpp:29
@ TA_RIGHT_ALIGNED
Definition: tabvector.h:45
@ TA_RIGHT_RAGGED
Definition: tabvector.h:46
@ TA_LEFT_ALIGNED
Definition: tabvector.h:42
@ TA_LEFT_RAGGED
Definition: tabvector.h:43
@ BTFT_TEXT_ON_IMAGE
Definition: blobbox.h:116
const double kLineCountReciprocal
Definition: tabvector.cpp:49
const int kMaxFillinMultiple
Definition: tabvector.cpp:45
const double kMinGutterFraction
Definition: tabvector.cpp:47
const int kGutterToNeighbourRatio
Definition: tabvector.cpp:38
const int kSimilarRaggedDist
Definition: tabvector.cpp:43
double textord_tabvector_vertical_box_ratio
Definition: tabvector.cpp:60
const TBOX & bounding_box() const
Definition: blobbox.h:239
BlobRegionType region_type() const
Definition: blobbox.h:298
BlobTextFlowType flow() const
Definition: blobbox.h:310
void Add(const ICOORD &pt)
Definition: detlinefit.cpp:50
double Fit(ICOORD *pt1, ICOORD *pt2)
Definition: detlinefit.h:73
integer coordinate
Definition: points.h:36
void rotate(const FCOORD &vec)
Definition: points.h:511
void set_x(TDimension xin)
rewrite function
Definition: points.h:67
TDimension y() const
access_function
Definition: points.h:62
void set_y(TDimension yin)
rewrite function
Definition: points.h:71
TDimension x() const
access function
Definition: points.h:58
TDimension left() const
Definition: rect.h:82
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
const ICOORD & botleft() const
Definition: rect.h:102
ICOORD botright() const
Definition: rect.h:106
ICOORD topleft() const
Definition: rect.h:110
const ICOORD & topright() const
Definition: rect.h:114
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
int32_t area() const
Definition: rect.h:134
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
int32_t get_total() const
Definition: statistc.h:85
double median() const
Definition: statistc.cpp:241
static bool WithinTestRegion(int detail_level, int x, int y)
void StartVerticalSearch(int xmin, int xmax, int y)
Definition: bbgrid.h:837
BBC * NextVerticalSearch(bool top_to_bottom)
Definition: bbgrid.h:849
void GutterWidthAndNeighbourGap(int tab_x, int mean_height, int max_gutter, bool left, BLOBNBOX *bbox, int *gutter_width, int *neighbour_gap)
Definition: tabfind.cpp:206
int GutterWidth(int bottom_y, int top_y, const TabVector &v, bool ignore_unmergeables, int max_gutter_width, int *required_shift)
Definition: tabfind.cpp:156
static void CreateConstraint(TabVector *vector, bool is_top)
Definition: tabvector.cpp:63
static void ApplyConstraints(TabConstraint_LIST *constraints)
Definition: tabvector.cpp:122
static void MergeConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2)
Definition: tabvector.cpp:95
static bool CompatibleConstraints(TabConstraint_LIST *list1, TabConstraint_LIST *list2)
Definition: tabvector.cpp:76
int XAtY(int y) const
Definition: tabvector.h:181
void set_bottom_constraints(TabConstraint_LIST *constraints)
Definition: tabvector.h:159
static int SortKey(const ICOORD &vertical, int x, int y)
Definition: tabvector.h:274
const ICOORD & endpt() const
Definition: tabvector.h:141
void Rotate(const FCOORD &rotation)
Definition: tabvector.cpp:274
void AddPartner(TabVector *partner)
Definition: tabvector.cpp:488
bool IsSeparator() const
Definition: tabvector.h:213
static void MergeSimilarTabVectors(const ICOORD &vertical, TabVector_LIST *vectors, BlobGrid *grid)
Definition: tabvector.cpp:352
int ExtendedOverlap(int top_y, int bottom_y) const
Definition: tabvector.h:200
int extended_ymin() const
Definition: tabvector.h:147
bool IsAPartner(const TabVector *other)
Definition: tabvector.cpp:503
static TabVector * FitVector(TabAlignment alignment, ICOORD vertical, int extended_start_y, int extended_end_y, BLOBNBOX_CLIST *good_points, int *vertical_x, int *vertical_y)
Definition: tabvector.cpp:176
bool IsRightTab() const
Definition: tabvector.h:209
TabVector * VerticalTextlinePartner()
Definition: tabvector.cpp:883
void Evaluate(const ICOORD &vertical, TabFind *finder)
Definition: tabvector.cpp:581
int extended_ymax() const
Definition: tabvector.h:144
void SetupPartnerConstraints()
Definition: tabvector.cpp:295
void SetYEnd(int end_y)
Definition: tabvector.cpp:268
void set_top_constraints(TabConstraint_LIST *constraints)
Definition: tabvector.h:156
bool Fit(ICOORD vertical, bool force_parallel)
Definition: tabvector.cpp:783
bool IsRagged() const
Definition: tabvector.h:221
void SetYStart(int start_y)
Definition: tabvector.cpp:263
bool IsLeftTab() const
Definition: tabvector.h:205
void Print(const char *prefix)
Definition: tabvector.cpp:518
void FitAndEvaluateIfNeeded(const ICOORD &vertical, TabFind *finder)
Definition: tabvector.cpp:566
const ICOORD & startpt() const
Definition: tabvector.h:138
void Display(ScrollView *tab_win)
Definition: tabvector.cpp:540
TabVector * ShallowCopy() const
Definition: tabvector.cpp:225
void MergeWith(const ICOORD &vertical, TabVector *other)
Definition: tabvector.cpp:454
void Debug(const char *prefix)
Definition: tabvector.cpp:527
TabVector * GetSinglePartner()
Definition: tabvector.cpp:872
void ExtendToBox(BLOBNBOX *blob)
Definition: tabvector.cpp:238
bool SimilarTo(const ICOORD &vertical, const TabVector &other, BlobGrid *grid) const
Definition: tabvector.cpp:384
void Line(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:498
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:610
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:635
void Pen(Color color)
Definition: scrollview.cpp:710