tesseract v5.3.3.20231005
strokewidth.cpp
Go to the documentation of this file.
1
2// File: strokewidth.cpp
3// Description: Subclass of BBGrid to find uniformity of strokewidth.
4// Author: Ray Smith
5//
6// (C) Copyright 2008, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifdef HAVE_CONFIG_H
20# include "config_auto.h"
21#endif
22
23#include "strokewidth.h"
24
25#include <algorithm>
26#include <cmath>
27
28#include "blobbox.h"
29#include "colpartition.h"
30#include "colpartitiongrid.h"
31#include "helpers.h" // for IntCastRounded
32#include "imagefind.h"
33#include "linlsq.h"
34#include "statistc.h"
35#include "tabfind.h"
36#include "textlineprojection.h"
37#include "tordmain.h" // For SetBlobStrokeWidth.
38
39namespace tesseract {
40
// Debug parameter controlling the stroke-width displays in this file. Both
// preprocessor branches define the same variable with the same default (0);
// only the help text differs. A non-zero value enables the displays, and a
// value greater than 1 enables them even when the caller did not request
// display (see the checks in FindTextlineFlowDirection).
41#ifndef GRAPHICS_DISABLED
42static INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths (ScrollView)");
43#else
44static INT_VAR(textord_tabfind_show_strokewidths, 0, "Show stroke widths");
45#endif
// When true, the code that waits for the widths window to be destroyed calls
// exit(0) straight afterwards.
46static BOOL_VAR(textord_tabfind_only_strokewidths, false, "Only run stroke widths");
47
// Fractional and absolute stroke-width tolerances.
// NOTE(review): the original comments for these two constants are not present
// in this listing; presumably they are the non-CJK counterparts of the CJK
// pair below, which is passed to BLOBNBOX::MatchingStrokeWidth — confirm.
49const double kStrokeWidthFractionTolerance = 0.125;
54const double kStrokeWidthTolerance = 1.5;
55// Same but for CJK we are a bit more generous.
56const double kStrokeWidthFractionCJK = 0.25;
57const double kStrokeWidthCJK = 2.0;
58// Radius in grid cells of search for broken CJK. Doesn't need to be very
59// large as the grid size should be about the size of a character anyway.
60const int kCJKRadius = 2;
61// Max distance fraction of size to join close but broken CJK characters.
62const double kCJKBrokenDistanceFraction = 0.25;
63// Max number of components in a broken CJK character.
64const int kCJKMaxComponents = 8;
65// Max aspect ratio of CJK broken characters when put back together.
66const double kCJKAspectRatio = 1.25;
67// Max increase in aspect ratio of CJK broken characters when merged.
68const double kCJKAspectRatioIncrease = 1.0625;
69// Max multiple of the grid size that will be used in computing median CJKsize.
70const int kMaxCJKSizeRatio = 5;
71// Min fraction of blobs broken CJK to iterate and run it again.
72const double kBrokenCJKIterationFraction = 0.125;
73// Multiple of gridsize as x-padding for a search box for diacritic base
74// characters.
75const double kDiacriticXPadRatio = 7.0;
76// Multiple of gridsize as y-padding for a search box for diacritic base
77// characters.
78const double kDiacriticYPadRatio = 1.75;
79// Min multiple of diacritic height that a neighbour must be to be a
80// convincing base character.
81const double kMinDiacriticSizeRatio = 1.0625;
82// Max multiple of a textline's median height as a threshold for the sum of
83// a diacritic's farthest x and y distances (gap + size).
84const double kMaxDiacriticDistanceRatio = 1.25;
85// Max x-gap between a diacritic and its base char as a fraction of the height
86// of the base char (allowing other blobs to fill the gap.)
// NOTE(review): no constant definition follows the comment above in this
// listing; the line appears to have been lost — restore it from the real file.
88// Ratio between longest side of a line and longest side of a character.
89// (neighbor_min > blob_min * kLineTrapShortest &&
90// neighbor_max < blob_max / kLineTrapLongest)
91// => neighbor is a grapheme and blob is a line.
92const int kLineTrapLongest = 4;
93// Ratio between shortest side of a line and shortest side of a character.
94const int kLineTrapShortest = 2;
95// Max aspect ratio of the total box before CountNeighbourGaps
96// decides immediately based on the aspect ratio.
97const int kMostlyOneDirRatio = 3;
98// Aspect ratio for a blob to be considered as line residue.
99const double kLineResidueAspectRatio = 8.0;
100// Padding ratio for line residue search box.
// NOTE(review): the constant this comment describes is missing from this
// listing. RemoveLineResidue reads kLineResiduePadRatio, which is not defined
// anywhere in the visible text — restore the definition from the real file.
102// Min multiple of neighbour size for a line residue to be genuine.
103const double kLineResidueSizeRatio = 1.75;
104// Aspect ratio filter for OSD.
105const float kSizeRatioToReject = 2.0;
106// Expansion factor for search box for good neighbours.
107const double kNeighbourSearchFactor = 2.5;
108// Factor of increase of overlap when adding diacritics to make an image noisy.
109const double kNoiseOverlapGrowthFactor = 4.0;
110// Fraction of the image size to add overlap when adding diacritics for an
111// image to qualify as noisy.
112const double kNoiseOverlapAreaFactor = 1.0 / 512;
113
// Constructs the grid over the rectangle bleft..tright with cells of size
// gridsize (all three are forwarded to the BlobGrid base). The nontext map,
// projection and denorm pointers start out null, grid_box_ is the same
// rectangle, and rerotation_ starts as (1, 0), the value FixBrokenCJK treats
// as "no rotation to apply".
114StrokeWidth::StrokeWidth(int gridsize, const ICOORD &bleft, const ICOORD &tright)
115 : BlobGrid(gridsize, bleft, tright)
116 , nontext_map_(nullptr)
117 , projection_(nullptr)
118 , denorm_(nullptr)
119 , grid_box_(bleft, tright)
120 , rerotation_(1.0f, 0.0f) {
121}
122
// NOTE(review): the line that opens this block is not present in this listing;
// judging by its contents it is presumably the body of StrokeWidth's
// destructor — confirm against the real file.
// With graphics enabled, releases the debug windows owned by this object.
124#ifndef GRAPHICS_DISABLED
125 if (widths_win_ != nullptr) {
// Wait for the SVET_DESTROY event on the widths window before deleting it.
126 widths_win_->AwaitEvent(SVET_DESTROY);
// In "only stroke widths" mode the process ends here with exit status 0, so
// none of the deletes below are reached.
127 if (textord_tabfind_only_strokewidths) {
128 exit(0);
129 }
130 delete widths_win_;
131 }
// The remaining windows are deleted unconditionally.
132 delete leaders_win_;
133 delete initial_widths_win_;
134 delete chains_win_;
135 delete textlines_win_;
136 delete smoothed_win_;
137 delete diacritics_win_;
138#endif
139}
140
141// Sets the neighbours member of the medium-sized blobs in the block.
142// Searches on 4 sides of each blob for similar-sized, similar-strokewidth
143// blobs and sets pointers to the good neighbours.
// NOTE(review): the signature line of this function is missing from this
// listing; the body uses a variable named block that exposes a blobs list —
// confirm the name and parameters against the real file.
// Inserts block->blobs into the grid, calls SetNeighbours(false, false, ...)
// on every blob of that list, then empties the grid again.
145 // Run a preliminary strokewidth neighbour detection on the medium blobs.
146 InsertBlobList(&block->blobs);
147 BLOBNBOX_IT blob_it(&block->blobs);
148 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
149 SetNeighbours(false, false, blob_it.data());
150 }
151 Clear();
152}
153
154// Sets the neighbour/textline writing direction members of the medium
155// and large blobs with optional repair of broken CJK characters first.
156// Repair of broken CJK is needed here because broken CJK characters
157// can fool the textline direction detection algorithm.
// NOTE(review): the first line of this function's signature is missing from
// this listing; only the trailing input_block parameter is visible. The body
// also reads pageseg_mode and cjk_merge, which are presumably the missing
// parameters — confirm against the real file.
159 TO_BLOCK *input_block) {
160 // Setup the grid with the remaining (non-noise) blobs.
161 InsertBlobs(input_block);
162 // Repair broken CJK characters if needed.
// FixBrokenCJK returns true when it may be worth running again, so this
// repeats the repair until it returns false (or never runs if !cjk_merge).
163 while (cjk_merge && FixBrokenCJK(input_block)) {
164 }
165 // Grade blobs by inspection of neighbours.
166 FindTextlineFlowDirection(pageseg_mode, false);
167 // Clear the grid ready for rotation or leader finding.
168 Clear();
169}
170
171// Helper to collect and count horizontal and vertical blobs from a list.
172static void CollectHorizVertBlobs(BLOBNBOX_LIST *input_blobs, int *num_vertical_blobs,
173 int *num_horizontal_blobs, BLOBNBOX_CLIST *vertical_blobs,
174 BLOBNBOX_CLIST *horizontal_blobs,
175 BLOBNBOX_CLIST *nondescript_blobs) {
176 BLOBNBOX_C_IT v_it(vertical_blobs);
177 BLOBNBOX_C_IT h_it(horizontal_blobs);
178 BLOBNBOX_C_IT n_it(nondescript_blobs);
179 BLOBNBOX_IT blob_it(input_blobs);
180 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
181 BLOBNBOX *blob = blob_it.data();
182 const TBOX &box = blob->bounding_box();
183 float y_x = static_cast<float>(box.height()) / box.width();
184 float x_y = 1.0f / y_x;
185 // Select a >= 1.0 ratio
186 float ratio = x_y > y_x ? x_y : y_x;
187 // If the aspect ratio is small and we want them for osd, save the blob.
188 bool ok_blob = ratio <= kSizeRatioToReject;
189 if (blob->UniquelyVertical()) {
190 ++*num_vertical_blobs;
191 if (ok_blob) {
192 v_it.add_after_then_move(blob);
193 }
194 } else if (blob->UniquelyHorizontal()) {
195 ++*num_horizontal_blobs;
196 if (ok_blob) {
197 h_it.add_after_then_move(blob);
198 }
199 } else if (ok_blob) {
200 n_it.add_after_then_move(blob);
201 }
202 }
203}
204
205// Types all the blobs as vertical or horizontal text or unknown and
206// returns true if the majority are vertical.
207// If the blobs are rotated, it is necessary to call CorrectForRotation
208// after rotating everything, otherwise the work done here will be enough.
209// If osd_blobs is not null, a list of blobs from the dominant textline
210// direction are returned for use in orientation and script detection.
211bool StrokeWidth::TestVerticalTextDirection(double find_vertical_text_ratio, TO_BLOCK *block,
212 BLOBNBOX_CLIST *osd_blobs) {
213 int vertical_boxes = 0;
214 int horizontal_boxes = 0;
215 // Count vertical normal and large blobs.
216 BLOBNBOX_CLIST vertical_blobs;
217 BLOBNBOX_CLIST horizontal_blobs;
218 BLOBNBOX_CLIST nondescript_blobs;
// Both calls accumulate into the same counters and the same three lists.
219 CollectHorizVertBlobs(&block->blobs, &vertical_boxes, &horizontal_boxes, &vertical_blobs,
220 &horizontal_blobs, &nondescript_blobs);
221 CollectHorizVertBlobs(&block->large_blobs, &vertical_boxes, &horizontal_boxes, &vertical_blobs,
222 &horizontal_blobs, &nondescript_blobs);
// NOTE(review): the line that opens the block closed below (presumably a debug
// guard around this tprintf) is missing from this listing, leaving the braces
// unbalanced — restore it from the real file.
224 tprintf("TextDir hbox=%d vs vbox=%d, %dH, %dV, %dN osd blobs\n", horizontal_boxes,
225 vertical_boxes, horizontal_blobs.length(), vertical_blobs.length(),
226 nondescript_blobs.length());
227 }
228 if (osd_blobs != nullptr && vertical_boxes == 0 && horizontal_boxes == 0) {
229 // Only nondescript blobs available, so return those.
230 BLOBNBOX_C_IT osd_it(osd_blobs);
231 osd_it.add_list_after(&nondescript_blobs);
232 return false;
233 }
// Vertical wins when the vertical count reaches the given fraction of all
// directional blobs (the product is truncated towards zero by the cast).
// NOTE(review): if osd_blobs is null and no directional blobs were counted,
// min_vert_boxes is 0 and the test below returns true, whereas the same
// situation with a non-null osd_blobs returns false above — confirm intended.
234 int min_vert_boxes =
235 static_cast<int>((vertical_boxes + horizontal_boxes) * find_vertical_text_ratio);
236 if (vertical_boxes >= min_vert_boxes) {
237 if (osd_blobs != nullptr) {
238 BLOBNBOX_C_IT osd_it(osd_blobs);
239 osd_it.add_list_after(&vertical_blobs);
240 }
241 return true;
242 } else {
243 if (osd_blobs != nullptr) {
244 BLOBNBOX_C_IT osd_it(osd_blobs);
245 osd_it.add_list_after(&horizontal_blobs);
246 }
247 return false;
248 }
249}
250
251// Corrects the data structures for the given rotation.
252void StrokeWidth::CorrectForRotation(const FCOORD &rotation, ColPartitionGrid *part_grid) {
253 Init(part_grid->gridsize(), part_grid->bleft(), part_grid->tright());
254 grid_box_ = TBOX(bleft(), tright());
255 rerotation_.set_x(rotation.x());
256 rerotation_.set_y(-rotation.y());
257}
258
259// Finds leader partitions and inserts them into the given part_grid.
// NOTE(review): the signature line of this function is missing from this
// listing; the body uses block and part_grid, which are presumably its
// parameters — confirm against the real file.
261 Clear();
262 // Find and isolate leaders in the noise list.
263 ColPartition_LIST leader_parts;
264 FindLeadersAndMarkNoise(block, &leader_parts);
265 // Setup the strokewidth grid with the block's remaining (non-noise) blobs.
266 InsertBlobList(&block->blobs);
267 // Mark blobs that have leader neighbours.
// Each partition is extracted from leader_parts as it is visited, so the loop
// ends when the list is empty; the partition is then inserted into part_grid.
268 for (ColPartition_IT it(&leader_parts); !it.empty(); it.forward()) {
269 ColPartition *part = it.extract();
270 part->ClaimBoxes();
271 MarkLeaderNeighbours(part, LR_LEFT);
272 MarkLeaderNeighbours(part, LR_RIGHT);
273 part_grid->InsertBBox(true, true, part);
274 }
275}
276
277// Finds and marks noise those blobs that look like bits of vertical lines
278// that would otherwise screw up layout analysis.
279void StrokeWidth::RemoveLineResidue(ColPartition_LIST *big_part_list) {
280 BlobGridSearch gsearch(this);
281 BLOBNBOX *bbox;
282 // For every vertical line-like bbox in the grid, search its neighbours
283 // to find the tallest, and if the original box is taller by sufficient
284 // margin, then call it line residue and delete it.
285 gsearch.StartFullSearch();
286 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
287 TBOX box = bbox->bounding_box();
288 if (box.height() < box.width() * kLineResidueAspectRatio) {
289 continue;
290 }
291 // Set up a rectangle search around the blob to find the size of its
292 // neighbours.
293 int padding = box.height() * kLineResiduePadRatio;
294 TBOX search_box = box;
295 search_box.pad(padding, padding);
296 bool debug = AlignedBlob::WithinTestRegion(2, box.left(), box.bottom());
297 // Find the largest object in the search box not equal to bbox.
298 BlobGridSearch rsearch(this);
299 int max_height = 0;
300 BLOBNBOX *n;
301 rsearch.StartRectSearch(search_box);
302 while ((n = rsearch.NextRectSearch()) != nullptr) {
303 if (n == bbox) {
304 continue;
305 }
306 TBOX nbox = n->bounding_box();
307 if (nbox.height() > max_height) {
308 max_height = nbox.height();
309 }
310 }
311 if (debug) {
312 tprintf("Max neighbour size=%d for candidate line box at:", max_height);
313 box.print();
314 }
315 if (max_height * kLineResidueSizeRatio < box.height()) {
316#ifndef GRAPHICS_DISABLED
317 if (leaders_win_ != nullptr) {
318 // We are debugging, so display deleted in pink blobs in the same
319 // window that we use to display leader detection.
320 leaders_win_->Pen(ScrollView::PINK);
321 leaders_win_->Rectangle(box.left(), box.bottom(), box.right(), box.top());
322 }
323#endif // !GRAPHICS_DISABLED
324 ColPartition::MakeBigPartition(bbox, big_part_list);
325 }
326 }
327}
328
329// Types all the blobs as vertical text or horizontal text or unknown and
330// puts them into initial ColPartitions in the supplied part_grid.
331// rerotation determines how to get back to the image coordinates from the
332// blob coordinates (since they may have been rotated for vertical text).
333// block is the single block for the whole page or rectangle to be OCRed.
334// nontext_pix (full-size), is a binary mask used to prevent merges across
335// photo/text boundaries. It is not kept beyond this function.
336// denorm provides a mapping back to the image from the current blob
337// coordinate space.
338// projection provides a measure of textline density over the image and
339// provides functions to assist with diacritic detection. It should be a
340// pointer to a new TextlineProjection, and will be setup here.
341// part_grid is the output grid of textline partitions.
342// Large blobs that cause overlap are put in separate partitions and added
343// to the big_parts list.
344void StrokeWidth::GradeBlobsIntoPartitions(PageSegMode pageseg_mode, const FCOORD &rerotation,
345 TO_BLOCK *block, Image nontext_pix, const DENORM *denorm,
346 bool cjk_script, TextlineProjection *projection,
347 BLOBNBOX_LIST *diacritic_blobs,
348 ColPartitionGrid *part_grid,
349 ColPartition_LIST *big_parts) {
350 nontext_map_ = nontext_pix;
351 projection_ = projection;
352 denorm_ = denorm;
353 // Clear and re Insert to take advantage of the tab stops in the blobs.
354 Clear();
355 // Setup the strokewidth grid with the remaining non-noise, non-leader blobs.
356 InsertBlobs(block);
357
358 // Run FixBrokenCJK() again if the page is CJK.
359 if (cjk_script) {
360 FixBrokenCJK(block);
361 }
362 FindTextlineFlowDirection(pageseg_mode, false);
363 projection_->ConstructProjection(block, rerotation, nontext_map_);
364#ifndef GRAPHICS_DISABLED
365 if (textord_tabfind_show_strokewidths) {
366 ScrollView *line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs");
367 projection_->PlotGradedBlobs(&block->blobs, line_blobs_win);
368 projection_->PlotGradedBlobs(&block->small_blobs, line_blobs_win);
369 }
370#endif
371 projection_->MoveNonTextlineBlobs(&block->blobs, &block->noise_blobs);
372 projection_->MoveNonTextlineBlobs(&block->small_blobs, &block->noise_blobs);
373 // Clear and re Insert to take advantage of the removed diacritics.
374 Clear();
375 InsertBlobs(block);
376 FCOORD skew;
377 FindTextlineFlowDirection(pageseg_mode, true);
378 PartitionFindResult r = FindInitialPartitions(pageseg_mode, rerotation, true, block,
379 diacritic_blobs, part_grid, big_parts, &skew);
380 if (r == PFR_NOISE) {
381 tprintf("Detected %d diacritics\n", diacritic_blobs->length());
382 // Noise was found, and removed.
383 Clear();
384 InsertBlobs(block);
385 FindTextlineFlowDirection(pageseg_mode, true);
386 r = FindInitialPartitions(pageseg_mode, rerotation, false, block, diacritic_blobs, part_grid,
387 big_parts, &skew);
388 }
389 nontext_map_ = nullptr;
390 projection_ = nullptr;
391 denorm_ = nullptr;
392}
393
394static void PrintBoxWidths(BLOBNBOX *neighbour) {
395 const TBOX &nbox = neighbour->bounding_box();
396 tprintf("Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=%1.f\n", nbox.left(),
397 nbox.bottom(), nbox.right(), nbox.top(), neighbour->horz_stroke_width(),
398 neighbour->vert_stroke_width(),
399 2.0 * neighbour->cblob()->area() / neighbour->cblob()->perimeter());
400}
401
// NOTE(review): the signature and opening lines of this function are missing
// from this listing; the body reads integer-like x and y, presumably the click
// coordinates passed in as parameters — confirm against the real file.
// Prints debug information for the first blob found whose bounding box
// contains (x, y) and which has a non-null cblob.
405 // Run a radial search for blobs that overlap.
406 BlobGridSearch radsearch(this);
407 radsearch.StartRadSearch(x, y, 1);
408 BLOBNBOX *neighbour;
409 FCOORD click(static_cast<float>(x), static_cast<float>(y));
410 while ((neighbour = radsearch.NextRadSearch()) != nullptr) {
411 TBOX nbox = neighbour->bounding_box();
412 if (nbox.contains(click) && neighbour->cblob() != nullptr) {
// Print the clicked blob first, then each of its four neighbours that exists.
413 PrintBoxWidths(neighbour);
414 if (neighbour->neighbour(BND_LEFT) != nullptr) {
415 PrintBoxWidths(neighbour->neighbour(BND_LEFT));
416 }
417 if (neighbour->neighbour(BND_RIGHT) != nullptr) {
418 PrintBoxWidths(neighbour->neighbour(BND_RIGHT));
419 }
420 if (neighbour->neighbour(BND_ABOVE) != nullptr) {
421 PrintBoxWidths(neighbour->neighbour(BND_ABOVE));
422 }
423 if (neighbour->neighbour(BND_BELOW) != nullptr) {
424 PrintBoxWidths(neighbour->neighbour(BND_BELOW));
425 }
426 int gaps[BND_COUNT];
427 neighbour->NeighbourGaps(gaps);
// NOTE(review): this tprintf call is truncated in this listing. The format has
// ten %d fields but only six arguments are visible and the closing ");" is
// missing — restore the lost lines from the real file.
428 tprintf(
429 "Left gap=%d, right=%d, above=%d, below=%d, horz=%d, vert=%d\n"
430 "Good= %d %d %d %d\n",
431 gaps[BND_LEFT], gaps[BND_RIGHT], gaps[BND_ABOVE], gaps[BND_BELOW],
432 neighbour->horz_possible(), neighbour->vert_possible(),
// Only the first matching blob is reported.
435 break;
436 }
437 }
438}
439
440// Detects and marks leader dots/dashes.
441// Leaders are horizontal chains of small or noise blobs that look
442// monospace according to ColPartition::MarkAsLeaderIfMonospaced().
443// Detected leaders become the only occupants of the block->small_blobs list.
444// Non-leader small blobs get moved to the blobs list.
445// Non-leader noise blobs remain singletons in the noise list.
446// All small and noise blobs in high density regions are marked BTFT_NONTEXT.
447// block is the single block for the whole page or rectangle to be OCRed.
448// leader_parts is the output.
449void StrokeWidth::FindLeadersAndMarkNoise(TO_BLOCK *block, ColPartition_LIST *leader_parts) {
// NOTE(review): the listing's own numbering skips two lines right after the
// signature, and nothing visible here inserts blobs into the grid before it is
// searched — check the real file for statements lost at this point.
452 BlobGridSearch gsearch(this);
453 BLOBNBOX *bbox;
454 // For every bbox in the grid, set its neighbours.
455 gsearch.StartFullSearch();
456 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
// First argument true selects the leader rule for good neighbours.
457 SetNeighbours(true, false, bbox);
458 }
459 ColPartition_IT part_it(leader_parts);
460 gsearch.StartFullSearch();
461 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
// Only blobs whose flow is still BTFT_NONE are used to start a chain.
462 if (bbox->flow() == BTFT_NONE) {
// A blob with no left or right neighbour cannot be part of a chain.
463 if (bbox->neighbour(BND_RIGHT) == nullptr && bbox->neighbour(BND_LEFT) == nullptr) {
464 continue;
465 }
466 // Put all the linked blobs into a ColPartition.
467 auto *part = new ColPartition(BRT_UNKNOWN, ICOORD(0, 1));
468 BLOBNBOX *blob;
// Walk right from bbox (inclusive), then left from bbox (exclusive), stopping
// at the first missing neighbour or the first blob whose flow is not BTFT_NONE.
469 for (blob = bbox; blob != nullptr && blob->flow() == BTFT_NONE;
470 blob = blob->neighbour(BND_RIGHT)) {
471 part->AddBox(blob);
472 }
473 for (blob = bbox->neighbour(BND_LEFT); blob != nullptr && blob->flow() == BTFT_NONE;
474 blob = blob->neighbour(BND_LEFT)) {
475 part->AddBox(blob);
476 }
// Keep the partition only if it was marked as a leader; otherwise free it.
477 if (part->MarkAsLeaderIfMonospaced()) {
478 part_it.add_after_then_move(part);
479 } else {
480 delete part;
481 }
482 }
483 }
484#ifndef GRAPHICS_DISABLED
485 if (textord_tabfind_show_strokewidths) {
486 leaders_win_ = DisplayGoodBlobs("LeaderNeighbours", 0, 0);
487 }
488#endif
489 // Move any non-leaders from the small to the blobs list, as they are
490 // most likely dashes or broken characters.
491 BLOBNBOX_IT blob_it(&block->blobs);
492 BLOBNBOX_IT small_it(&block->small_blobs);
493 for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
494 BLOBNBOX *blob = small_it.data();
495 if (blob->flow() != BTFT_LEADER) {
// Undo the provisional BTFT_NEIGHBOURS flow and drop the neighbour links
// before the blob joins the main list.
496 if (blob->flow() == BTFT_NEIGHBOURS) {
497 blob->set_flow(BTFT_NONE);
498 }
499 blob->ClearNeighbours();
500 blob_it.add_to_end(small_it.extract());
501 }
502 }
503 // Move leaders from the noise list to the small list, leaving the small
504 // list exclusively leaders, so they don't get processed further,
505 // and the remaining small blobs all in the noise list.
506 BLOBNBOX_IT noise_it(&block->noise_blobs);
507 for (noise_it.mark_cycle_pt(); !noise_it.cycled_list(); noise_it.forward()) {
508 BLOBNBOX *blob = noise_it.data();
// Blobs flagged joined_to_prev are moved along with the leaders.
509 if (blob->flow() == BTFT_LEADER || blob->joined_to_prev()) {
510 small_it.add_to_end(noise_it.extract());
511 } else if (blob->flow() == BTFT_NEIGHBOURS) {
512 blob->set_flow(BTFT_NONE);
513 blob->ClearNeighbours();
514 }
515 }
516 // Clear the grid as we don't want the small stuff hanging around in it.
517 Clear();
518}
519
// Inserts the block's blobs list and then its large_blobs list into this grid.
// The small and noise lists are not touched here.
522void StrokeWidth::InsertBlobs(TO_BLOCK *block) {
523 InsertBlobList(&block->blobs);
524 InsertBlobList(&block->large_blobs);
525}
526
527// Checks the left or right side of the given leader partition and sets the
528// (opposite) leader_on_right or leader_on_left flags for blobs
529// that are next to the given side of the given leader partition.
530void StrokeWidth::MarkLeaderNeighbours(const ColPartition *part, LeftOrRight side) {
531 const TBOX &part_box = part->bounding_box();
532 BlobGridSearch blobsearch(this);
533 // Search to the side of the leader for the nearest neighbour.
534 BLOBNBOX *best_blob = nullptr;
535 int best_gap = 0;
536 blobsearch.StartSideSearch(side == LR_LEFT ? part_box.left() : part_box.right(),
537 part_box.bottom(), part_box.top());
538 BLOBNBOX *blob;
539 while ((blob = blobsearch.NextSideSearch(side == LR_LEFT)) != nullptr) {
540 const TBOX &blob_box = blob->bounding_box();
541 if (!blob_box.y_overlap(part_box)) {
542 continue;
543 }
544 int x_gap = blob_box.x_gap(part_box);
545 if (x_gap > 2 * gridsize()) {
546 break;
547 } else if (best_blob == nullptr || x_gap < best_gap) {
548 best_blob = blob;
549 best_gap = x_gap;
550 }
551 }
552 if (best_blob != nullptr) {
553 if (side == LR_LEFT) {
554 best_blob->set_leader_on_right(true);
555 } else {
556 best_blob->set_leader_on_left(true);
557 }
558#ifndef GRAPHICS_DISABLED
559 if (leaders_win_ != nullptr) {
560 leaders_win_->Pen(side == LR_LEFT ? ScrollView::RED : ScrollView::GREEN);
561 const TBOX &blob_box = best_blob->bounding_box();
562 leaders_win_->Rectangle(blob_box.left(), blob_box.bottom(), blob_box.right(), blob_box.top());
563 }
564#endif // !GRAPHICS_DISABLED
565 }
566}
567
568// Helper to compute the UQ of the square-ish CJK characters.
569static int UpperQuartileCJKSize(int gridsize, BLOBNBOX_LIST *blobs) {
570 STATS sizes(0, gridsize * kMaxCJKSizeRatio - 1);
571 BLOBNBOX_IT it(blobs);
572 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
573 BLOBNBOX *blob = it.data();
574 int width = blob->bounding_box().width();
575 int height = blob->bounding_box().height();
576 if (width <= height * kCJKAspectRatio && height < width * kCJKAspectRatio) {
577 sizes.add(height, 1);
578 }
579 }
580 return static_cast<int>(sizes.ile(0.75f) + 0.5);
581}
582
583// Fix broken CJK characters, using the fake joined blobs mechanism.
584// Blobs are really merged, ie the master takes all the outlines and the
585// others are deleted.
586// Returns true if sufficient blobs are merged that it may be worth running
587// again, due to a better estimate of character size.
588bool StrokeWidth::FixBrokenCJK(TO_BLOCK *block) {
589 BLOBNBOX_LIST *blobs = &block->blobs;
// Despite the name, this holds the upper-quartile height of the squarish
// blobs; the merge distance and size limits are derived from it.
590 int median_height = UpperQuartileCJKSize(gridsize(), blobs);
591 int max_dist = static_cast<int>(median_height * kCJKBrokenDistanceFraction);
592 int max_height = static_cast<int>(median_height * kCJKAspectRatio);
593 int num_fixed = 0;
594 BLOBNBOX_IT blob_it(blobs);
595
596 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
597 BLOBNBOX *blob = blob_it.data();
// Skip blobs with no outlines; blobs already merged into another one are
// expected to end up here (the merge comment below calls them "empty").
598 if (blob->cblob() == nullptr || blob->cblob()->out_list()->empty()) {
599 continue;
600 }
601 TBOX bbox = blob->bounding_box();
602 bool debug = AlignedBlob::WithinTestRegion(3, bbox.left(), bbox.bottom());
603 if (debug) {
604 tprintf("Checking for Broken CJK (max size=%d):", max_height);
605 bbox.print();
606 }
607 // Generate a list of blobs that overlap or are near enough to merge.
// bbox is expanded in place to the union of blob and every accepted candidate.
608 BLOBNBOX_CLIST overlapped_blobs;
609 AccumulateOverlaps(blob, debug, max_height, max_dist, &bbox, &overlapped_blobs);
610 if (!overlapped_blobs.empty()) {
611 // There are overlapping blobs, so qualify them as being satisfactory
612 // before removing them from the grid and replacing them with the union.
613 // The final box must be roughly square.
614 if (bbox.width() > bbox.height() * kCJKAspectRatio ||
615 bbox.height() > bbox.width() * kCJKAspectRatio) {
616 if (debug) {
617 tprintf("Bad final aspectratio:");
618 bbox.print();
619 }
620 continue;
621 }
622 // There can't be too many blobs to merge.
623 if (overlapped_blobs.length() >= kCJKMaxComponents) {
624 if (debug) {
625 tprintf("Too many neighbours: %d\n", overlapped_blobs.length());
626 }
627 continue;
628 }
629 // The strokewidths must match amongst the join candidates.
630 BLOBNBOX_C_IT n_it(&overlapped_blobs);
631 for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
632 BLOBNBOX *neighbour = nullptr;
633 neighbour = n_it.data();
634 if (!blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionCJK, kStrokeWidthCJK)) {
635 break;
636 }
637 }
// If the loop above broke out early the iterator has not cycled the list,
// which means at least one candidate failed the stroke-width match.
638 if (!n_it.cycled_list()) {
639 if (debug) {
640 tprintf("Bad stroke widths:");
641 PrintBoxWidths(blob);
642 }
643 continue; // Not good enough.
644 }
645
646 // Merge all the candidates into blob.
647 // We must remove blob from the grid and reinsert it after merging
648 // to maintain the integrity of the grid.
649 RemoveBBox(blob);
650 // Everything else will be calculated later.
651 for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) {
652 BLOBNBOX *neighbour = n_it.data();
653 RemoveBBox(neighbour);
654 // Mark empty blob for deletion.
655 neighbour->set_region_type(BRT_NOISE);
656 blob->really_merge(neighbour);
// Re-apply the rerotation to the merged blob's box unless it is (1, 0).
657 if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) {
658 blob->rotate_box(rerotation_);
659 }
660 }
661 InsertBBox(true, true, blob);
662 ++num_fixed;
663 if (debug) {
664 tprintf("Done! Final box:");
665 bbox.print();
666 }
667 }
668 }
669 // Count remaining blobs.
670 int num_remaining = 0;
671 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
672 BLOBNBOX *blob = blob_it.data();
673 if (blob->cblob() != nullptr && !blob->cblob()->out_list()->empty()) {
674 ++num_remaining;
675 }
676 }
677 // Permanently delete all the marked blobs after first removing all
678 // references in the neighbour members.
679 block->DeleteUnownedNoise();
// Worth another iteration only if the number of merges exceeds the given
// fraction of the blobs that still have outlines.
680 return num_fixed > num_remaining * kBrokenCJKIterationFraction;
681}
682
683// Helper function to determine whether it is reasonable to merge the
684// bbox and the nbox for repairing broken CJK.
685// The distance apart must not exceed max_dist, the combined size must
686// not exceed max_size, and the aspect ratio must either improve or at
687// least not get worse by much.
688static bool AcceptableCJKMerge(const TBOX &bbox, const TBOX &nbox, bool debug, int max_size,
689 int max_dist, int *x_gap, int *y_gap) {
690 *x_gap = bbox.x_gap(nbox);
691 *y_gap = bbox.y_gap(nbox);
692 TBOX merged(nbox);
693 merged += bbox;
694 if (debug) {
695 tprintf("gaps = %d, %d, merged_box:", *x_gap, *y_gap);
696 merged.print();
697 }
698 if (*x_gap <= max_dist && *y_gap <= max_dist && merged.width() <= max_size &&
699 merged.height() <= max_size) {
700 // Close enough to call overlapping. Check aspect ratios.
701 double old_ratio = static_cast<double>(bbox.width()) / bbox.height();
702 if (old_ratio < 1.0) {
703 old_ratio = 1.0 / old_ratio;
704 }
705 double new_ratio = static_cast<double>(merged.width()) / merged.height();
706 if (new_ratio < 1.0) {
707 new_ratio = 1.0 / new_ratio;
708 }
709 if (new_ratio <= old_ratio * kCJKAspectRatioIncrease) {
710 return true;
711 }
712 }
713 return false;
714}
715
716// Collect blobs that overlap or are within max_dist of the input bbox.
717// Return them in the list of blobs and expand the bbox to be the union
718// of all the boxes. not_this is excluded from the search, as are blobs
719// that cause the merged box to exceed max_size in either dimension.
720void StrokeWidth::AccumulateOverlaps(const BLOBNBOX *not_this, bool debug, int max_size,
721 int max_dist, TBOX *bbox, BLOBNBOX_CLIST *blobs) {
722 // While searching, nearests holds the nearest failed blob in each
723 // direction. When we have a nearest in each of the 4 directions, then
724 // the search is over, and at this point the final bbox must not overlap
725 // any of the nearests.
726 BLOBNBOX *nearests[BND_COUNT];
727 for (auto &nearest : nearests) {
728 nearest = nullptr;
729 }
// The search is centred on the middle of the box as passed in; the centre is
// not updated as bbox grows.
730 int x = (bbox->left() + bbox->right()) / 2;
731 int y = (bbox->bottom() + bbox->top()) / 2;
732 // Run a radial search for blobs that overlap or are sufficiently close.
733 BlobGridSearch radsearch(this);
734 radsearch.StartRadSearch(x, y, kCJKRadius);
735 BLOBNBOX *neighbour;
736 while ((neighbour = radsearch.NextRadSearch()) != nullptr) {
737 if (neighbour == not_this) {
738 continue;
739 }
740 TBOX nbox = neighbour->bounding_box();
// x_gap and y_gap are always written by AcceptableCJKMerge before being read
// in the else-if branches below.
741 int x_gap, y_gap;
742 if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist, &x_gap, &y_gap)) {
743 // Close enough to call overlapping. Merge boxes.
744 *bbox += nbox;
745 blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
746 if (debug) {
747 tprintf("Added:");
748 nbox.print();
749 }
750 // Since we merged, search the nearests, as some might now be mergeable.
751 for (int dir = 0; dir < BND_COUNT; ++dir) {
752 if (nearests[dir] == nullptr) {
753 continue;
754 }
755 nbox = nearests[dir]->bounding_box();
756 if (AcceptableCJKMerge(*bbox, nbox, debug, max_size, max_dist, &x_gap, &y_gap)) {
757 // Close enough to call overlapping. Merge boxes.
758 *bbox += nbox;
759 blobs->add_sorted(SortByBoxLeft<BLOBNBOX>, true, nearests[dir]);
760 if (debug) {
761 tprintf("Added:");
762 nbox.print();
763 }
764 nearests[dir] = nullptr;
// The loop's ++dir brings this back to 0, so every remaining nearest is
// re-tested against the enlarged bbox.
765 dir = -1; // Restart the search.
766 }
767 }
768 } else if (x_gap < 0 && x_gap <= y_gap) {
769 // A vertical neighbour. Record the nearest.
770 BlobNeighbourDir dir = nbox.top() > bbox->top() ? BND_ABOVE : BND_BELOW;
771 if (nearests[dir] == nullptr || y_gap < bbox->y_gap(nearests[dir]->bounding_box())) {
772 nearests[dir] = neighbour;
773 }
774 } else if (y_gap < 0 && y_gap <= x_gap) {
775 // A horizontal neighbour. Record the nearest.
776 BlobNeighbourDir dir = nbox.left() > bbox->left() ? BND_RIGHT : BND_LEFT;
777 if (nearests[dir] == nullptr || x_gap < bbox->x_gap(nearests[dir]->bounding_box())) {
778 nearests[dir] = neighbour;
779 }
780 }
781 // If all nearests are non-null, then we have finished.
782 if (nearests[BND_LEFT] && nearests[BND_RIGHT] && nearests[BND_ABOVE] && nearests[BND_BELOW]) {
783 break;
784 }
785 }
786 // Final overlap with a nearest is not allowed.
787 for (auto &nearest : nearests) {
788 if (nearest == nullptr) {
789 continue;
790 }
791 const TBOX &nbox = nearest->bounding_box();
792 if (debug) {
793 tprintf("Testing for overlap with:");
794 nbox.print();
795 }
// An overlap rejects the whole merge: the output list is emptied (without
// deleting the blobs it referenced) and the function returns. bbox is left
// in its expanded state.
796 if (bbox->overlap(nbox)) {
797 blobs->shallow_clear();
798 if (debug) {
799 tprintf("Final box overlaps nearest\n");
800 }
801 return;
802 }
803 }
804}
805
806// For each blob in this grid, Finds the textline direction to be horizontal
807// or vertical according to distance to neighbours and 1st and 2nd order
808// neighbours. Non-text tends to end up without a definite direction.
809// Result is setting of the neighbours and vert_possible/horz_possible
810// flags in the BLOBNBOXes currently in this grid.
811// This function is called more than once if page orientation is uncertain,
812// so display_if_debugging is true on the final call to display the results.
813void StrokeWidth::FindTextlineFlowDirection(PageSegMode pageseg_mode, bool display_if_debugging) {
814 BlobGridSearch gsearch(this);
815 BLOBNBOX *bbox;
816 // For every bbox in the grid, set its neighbours.
817 gsearch.StartFullSearch();
818 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
819 SetNeighbours(false, display_if_debugging, bbox);
820 }
821 // Where vertical or horizontal wins by a big margin, clarify it.
822 gsearch.StartFullSearch();
823 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
824 SimplifyObviousNeighbours(bbox);
825 }
826 // Now try to make the blobs only vertical or horizontal using neighbours.
827 gsearch.StartFullSearch();
828 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
829 if (FindingVerticalOnly(pageseg_mode)) {
830 bbox->set_vert_possible(true);
831 bbox->set_horz_possible(false);
832 } else if (FindingHorizontalOnly(pageseg_mode)) {
833 bbox->set_vert_possible(false);
834 bbox->set_horz_possible(true);
835 } else {
836 SetNeighbourFlows(bbox);
837 }
838 }
839#ifndef GRAPHICS_DISABLED
840 if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
841 textord_tabfind_show_strokewidths > 1) {
842 initial_widths_win_ = DisplayGoodBlobs("InitialStrokewidths", 400, 0);
843 }
844#endif
845 // Improve flow direction with neighbours.
846 gsearch.StartFullSearch();
847 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
848 SmoothNeighbourTypes(pageseg_mode, false, bbox);
849 }
850 // Now allow reset of firm values to fix renegades.
851 gsearch.StartFullSearch();
852 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
853 SmoothNeighbourTypes(pageseg_mode, true, bbox);
854 }
855 // Repeat.
856 gsearch.StartFullSearch();
857 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
858 SmoothNeighbourTypes(pageseg_mode, true, bbox);
859 }
860#ifndef GRAPHICS_DISABLED
861 if ((textord_tabfind_show_strokewidths && display_if_debugging) ||
862 textord_tabfind_show_strokewidths > 1) {
863 widths_win_ = DisplayGoodBlobs("ImprovedStrokewidths", 800, 0);
864 }
865#endif
866}
867
868// Sets the neighbours and good_stroke_neighbours members of the blob by
869// searching close on all 4 sides.
870// When finding leader dots/dashes, there is a slightly different rule for
871// what makes a good neighbour.
872void StrokeWidth::SetNeighbours(bool leaders, bool activate_line_trap, BLOBNBOX *blob) {
873 int line_trap_count = 0;
874 for (int dir = 0; dir < BND_COUNT; ++dir) {
875 auto bnd = static_cast<BlobNeighbourDir>(dir);
876 line_trap_count += FindGoodNeighbour(bnd, leaders, blob);
877 }
878 if (line_trap_count > 0 && activate_line_trap) {
879 // It looks like a line so isolate it by clearing its neighbours.
880 blob->ClearNeighbours();
881 const TBOX &box = blob->bounding_box();
882 blob->set_region_type(box.width() > box.height() ? BRT_HLINE : BRT_VLINE);
883 }
884}
885
886// Sets the good_stroke_neighbours member of the blob if it has a
887// GoodNeighbour on the given side.
888// Also sets the neighbour in the blob, whether or not a good one is found.
889// Returns the number of blobs in the nearby search area that would lead us to
890// believe that this blob is a line separator.
891// Leaders get extra special lenient treatment.
892int StrokeWidth::FindGoodNeighbour(BlobNeighbourDir dir, bool leaders, BLOBNBOX *blob) {
893 // Search for neighbours that overlap vertically.
894 TBOX blob_box = blob->bounding_box();
895 bool debug = AlignedBlob::WithinTestRegion(2, blob_box.left(), blob_box.bottom());
896 if (debug) {
897 tprintf("FGN in dir %d for blob:", dir);
898 blob_box.print();
899 }
900 int top = blob_box.top();
901 int bottom = blob_box.bottom();
902 int left = blob_box.left();
903 int right = blob_box.right();
904 int width = right - left;
905 int height = top - bottom;
906
907 // A trap to detect lines tests for the min dimension of neighbours
908 // being larger than a multiple of the min dimension of the line
909 // and the larger dimension being smaller than a fraction of the max
910 // dimension of the line.
911 int line_trap_max = std::max(width, height) / kLineTrapLongest;
912 int line_trap_min = std::min(width, height) * kLineTrapShortest;
913 int line_trap_count = 0;
914
915 int min_good_overlap = (dir == BND_LEFT || dir == BND_RIGHT) ? height / 2 : width / 2;
916 int min_decent_overlap = (dir == BND_LEFT || dir == BND_RIGHT) ? height / 3 : width / 3;
917 if (leaders) {
918 min_good_overlap = min_decent_overlap = 1;
919 }
920
921 int search_pad =
922 static_cast<int>(sqrt(static_cast<double>(width * height)) * kNeighbourSearchFactor);
923 if (gridsize() > search_pad) {
924 search_pad = gridsize();
925 }
926 TBOX search_box = blob_box;
927 // Pad the search in the appropriate direction.
928 switch (dir) {
929 case BND_LEFT:
930 search_box.set_left(search_box.left() - search_pad);
931 break;
932 case BND_RIGHT:
933 search_box.set_right(search_box.right() + search_pad);
934 break;
935 case BND_BELOW:
936 search_box.set_bottom(search_box.bottom() - search_pad);
937 break;
938 case BND_ABOVE:
939 search_box.set_top(search_box.top() + search_pad);
940 break;
941 case BND_COUNT:
942 return 0;
943 }
944
945 BlobGridSearch rectsearch(this);
946 rectsearch.StartRectSearch(search_box);
947 BLOBNBOX *best_neighbour = nullptr;
948 double best_goodness = 0.0;
949 bool best_is_good = false;
950 BLOBNBOX *neighbour;
951 while ((neighbour = rectsearch.NextRectSearch()) != nullptr) {
952 TBOX nbox = neighbour->bounding_box();
953 if (neighbour == blob) {
954 continue;
955 }
956 int mid_x = (nbox.left() + nbox.right()) / 2;
957 if (mid_x < blob->left_rule() || mid_x > blob->right_rule()) {
958 continue; // In a different column.
959 }
960 if (debug) {
961 tprintf("Neighbour at:");
962 nbox.print();
963 }
964
965 // Last-minute line detector. There is a small upper limit to the line
966 // width accepted by the morphological line detector.
967 int n_width = nbox.width();
968 int n_height = nbox.height();
969 if (std::min(n_width, n_height) > line_trap_min &&
970 std::max(n_width, n_height) < line_trap_max) {
971 ++line_trap_count;
972 }
973 // Heavily joined text, such as Arabic may have very different sizes when
974 // looking at the maxes, but the heights may be almost identical, so check
975 // for a difference in height if looking sideways or width vertically.
976 if (TabFind::VeryDifferentSizes(std::max(n_width, n_height), std::max(width, height)) &&
977 (((dir == BND_LEFT || dir == BND_RIGHT) && TabFind::DifferentSizes(n_height, height)) ||
978 ((dir == BND_BELOW || dir == BND_ABOVE) && TabFind::DifferentSizes(n_width, width)))) {
979 if (debug) {
980 tprintf("Bad size\n");
981 }
982 continue; // Could be a different font size or non-text.
983 }
984 // Amount of vertical overlap between the blobs.
985 int overlap;
986 // If the overlap is along the short side of the neighbour, and it
987 // is fully overlapped, then perp_overlap holds the length of the long
988 // side of the neighbour. A measure to include hyphens and dashes as
989 // legitimate neighbours.
990 int perp_overlap;
991 int gap;
992 if (dir == BND_LEFT || dir == BND_RIGHT) {
993 overlap = std::min(static_cast<int>(nbox.top()), top) -
994 std::max(static_cast<int>(nbox.bottom()), bottom);
995 if (overlap == nbox.height() && nbox.width() > nbox.height()) {
996 perp_overlap = nbox.width();
997 } else {
998 perp_overlap = overlap;
999 }
1000 gap = dir == BND_LEFT ? left - nbox.left() : nbox.right() - right;
1001 if (gap <= 0) {
1002 if (debug) {
1003 tprintf("On wrong side\n");
1004 }
1005 continue; // On the wrong side.
1006 }
1007 gap -= n_width;
1008 } else {
1009 overlap = std::min(static_cast<int>(nbox.right()), right) -
1010 std::max(static_cast<int>(nbox.left()), left);
1011 if (overlap == nbox.width() && nbox.height() > nbox.width()) {
1012 perp_overlap = nbox.height();
1013 } else {
1014 perp_overlap = overlap;
1015 }
1016 gap = dir == BND_BELOW ? bottom - nbox.bottom() : nbox.top() - top;
1017 if (gap <= 0) {
1018 if (debug) {
1019 tprintf("On wrong side\n");
1020 }
1021 continue; // On the wrong side.
1022 }
1023 gap -= n_height;
1024 }
1025 if (-gap > overlap) {
1026 if (debug) {
1027 tprintf("Overlaps wrong way\n");
1028 }
1029 continue; // Overlaps the wrong way.
1030 }
1031 if (perp_overlap < min_decent_overlap) {
1032 if (debug) {
1033 tprintf("Doesn't overlap enough\n");
1034 }
1035 continue; // Doesn't overlap enough.
1036 }
1037 bool bad_sizes =
1038 TabFind::DifferentSizes(height, n_height) && TabFind::DifferentSizes(width, n_width);
1039 bool is_good =
1040 overlap >= min_good_overlap && !bad_sizes &&
1041 blob->MatchingStrokeWidth(*neighbour, kStrokeWidthFractionTolerance, kStrokeWidthTolerance);
1042 // Best is a fuzzy combination of gap, overlap and is good.
1043 // Basically if you make one thing twice as good without making
1044 // anything else twice as bad, then it is better.
1045 if (gap < 1) {
1046 gap = 1;
1047 }
1048 double goodness = (1.0 + is_good) * overlap / gap;
1049 if (debug) {
1050 tprintf("goodness = %g vs best of %g, good=%d, overlap=%d, gap=%d\n", goodness, best_goodness,
1051 is_good, overlap, gap);
1052 }
1053 if (goodness > best_goodness) {
1054 best_neighbour = neighbour;
1055 best_goodness = goodness;
1056 best_is_good = is_good;
1057 }
1058 }
1059 blob->set_neighbour(dir, best_neighbour, best_is_good);
1060 return line_trap_count;
1061}
1062
1063// Helper to get a list of 1st-order neighbours.
1064static void ListNeighbours(const BLOBNBOX *blob, BLOBNBOX_CLIST *neighbours) {
1065 for (int dir = 0; dir < BND_COUNT; ++dir) {
1066 auto bnd = static_cast<BlobNeighbourDir>(dir);
1067 BLOBNBOX *neighbour = blob->neighbour(bnd);
1068 if (neighbour != nullptr) {
1069 neighbours->add_sorted(SortByBoxLeft<BLOBNBOX>, true, neighbour);
1070 }
1071 }
1072}
1073
1074// Helper to get a list of 1st and 2nd order neighbours.
1075static void List2ndNeighbours(const BLOBNBOX *blob, BLOBNBOX_CLIST *neighbours) {
1076 ListNeighbours(blob, neighbours);
1077 for (int dir = 0; dir < BND_COUNT; ++dir) {
1078 auto bnd = static_cast<BlobNeighbourDir>(dir);
1079 BLOBNBOX *neighbour = blob->neighbour(bnd);
1080 if (neighbour != nullptr) {
1081 ListNeighbours(neighbour, neighbours);
1082 }
1083 }
1084}
1085
1086// Helper to get a list of 1st, 2nd and 3rd order neighbours.
1087static void List3rdNeighbours(const BLOBNBOX *blob, BLOBNBOX_CLIST *neighbours) {
1088 List2ndNeighbours(blob, neighbours);
1089 for (int dir = 0; dir < BND_COUNT; ++dir) {
1090 auto bnd = static_cast<BlobNeighbourDir>(dir);
1091 BLOBNBOX *neighbour = blob->neighbour(bnd);
1092 if (neighbour != nullptr) {
1093 List2ndNeighbours(neighbour, neighbours);
1094 }
1095 }
1096}
1097
1098// Helper to count the evidence for verticalness or horizontalness
1099// in a list of neighbours.
1100static void CountNeighbourGaps(bool debug, BLOBNBOX_CLIST *neighbours, int *pure_h_count,
1101 int *pure_v_count) {
1102 if (neighbours->length() <= kMostlyOneDirRatio) {
1103 return;
1104 }
1105 BLOBNBOX_C_IT it(neighbours);
1106 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1107 BLOBNBOX *blob = it.data();
1108 int h_min, h_max, v_min, v_max;
1109 blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1110 if (debug) {
1111 tprintf("Hgaps [%d,%d], vgaps [%d,%d]:", h_min, h_max, v_min, v_max);
1112 }
1113 if (h_max < v_min || blob->leader_on_left() || blob->leader_on_right()) {
1114 // Horizontal gaps are clear winners. Count a pure horizontal.
1115 ++*pure_h_count;
1116 if (debug) {
1117 tprintf("Horz at:");
1118 }
1119 } else if (v_max < h_min) {
1120 // Vertical gaps are clear winners. Clear a pure vertical.
1121 ++*pure_v_count;
1122 if (debug) {
1123 tprintf("Vert at:");
1124 }
1125 } else {
1126 if (debug) {
1127 tprintf("Neither at:");
1128 }
1129 }
1130 if (debug) {
1131 blob->bounding_box().print();
1132 }
1133 }
1134}
1135
1136// Makes the blob to be only horizontal or vertical where evidence
1137// is clear based on gaps of 2nd order neighbours, or definite individual
1138// blobs.
1139void StrokeWidth::SetNeighbourFlows(BLOBNBOX *blob) {
1140 if (blob->DefiniteIndividualFlow()) {
1141 return;
1142 }
1143 bool debug =
1144 AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(), blob->bounding_box().bottom());
1145 if (debug) {
1146 tprintf("SetNeighbourFlows (current flow=%d, type=%d) on:", blob->flow(), blob->region_type());
1147 blob->bounding_box().print();
1148 }
1149 BLOBNBOX_CLIST neighbours;
1150 List3rdNeighbours(blob, &neighbours);
1151 // The number of pure horizontal and vertical neighbours.
1152 int pure_h_count = 0;
1153 int pure_v_count = 0;
1154 CountNeighbourGaps(debug, &neighbours, &pure_h_count, &pure_v_count);
1155 if (debug) {
1156 HandleClick(blob->bounding_box().left() + 1, blob->bounding_box().bottom() + 1);
1157 tprintf("SetFlows: h_count=%d, v_count=%d\n", pure_h_count, pure_v_count);
1158 }
1159 if (!neighbours.empty()) {
1160 blob->set_vert_possible(true);
1161 blob->set_horz_possible(true);
1162 if (pure_h_count > 2 * pure_v_count) {
1163 // Horizontal gaps are clear winners. Clear vertical neighbours.
1164 blob->set_vert_possible(false);
1165 } else if (pure_v_count > 2 * pure_h_count) {
1166 // Vertical gaps are clear winners. Clear horizontal neighbours.
1167 blob->set_horz_possible(false);
1168 }
1169 } else {
1170 // Lonely blob. Can't tell its flow direction.
1171 blob->set_vert_possible(false);
1172 blob->set_horz_possible(false);
1173 }
1174}
1175
1176// Helper to count the number of horizontal and vertical blobs in a list.
1177static void CountNeighbourTypes(BLOBNBOX_CLIST *neighbours, int *pure_h_count, int *pure_v_count) {
1178 BLOBNBOX_C_IT it(neighbours);
1179 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1180 BLOBNBOX *blob = it.data();
1181 if (blob->UniquelyHorizontal()) {
1182 ++*pure_h_count;
1183 }
1184 if (blob->UniquelyVertical()) {
1185 ++*pure_v_count;
1186 }
1187 }
1188}
1189
1190// Nullify the neighbours in the wrong directions where the direction
1191// is clear-cut based on a distance margin. Good for isolating vertical
1192// text from neighbouring horizontal text.
1193void StrokeWidth::SimplifyObviousNeighbours(BLOBNBOX *blob) {
1194 // Case 1: We have text that is likely several characters, blurry and joined
1195 // together.
1196 if ((blob->bounding_box().width() > 3 * blob->area_stroke_width() &&
1197 blob->bounding_box().height() > 3 * blob->area_stroke_width())) {
1198 // The blob is complex (not stick-like).
1199 if (blob->bounding_box().width() > 4 * blob->bounding_box().height()) {
1200 // Horizontal conjoined text.
1201 blob->set_neighbour(BND_ABOVE, nullptr, false);
1202 blob->set_neighbour(BND_BELOW, nullptr, false);
1203 return;
1204 }
1205 if (blob->bounding_box().height() > 4 * blob->bounding_box().width()) {
1206 // Vertical conjoined text.
1207 blob->set_neighbour(BND_LEFT, nullptr, false);
1208 blob->set_neighbour(BND_RIGHT, nullptr, false);
1209 return;
1210 }
1211 }
1212
1213 // Case 2: This blob is likely a single character.
1214 int margin = gridsize() / 2;
1215 int h_min, h_max, v_min, v_max;
1216 blob->MinMaxGapsClipped(&h_min, &h_max, &v_min, &v_max);
1217 if ((h_max + margin < v_min && h_max < margin / 2) || blob->leader_on_left() ||
1218 blob->leader_on_right()) {
1219 // Horizontal gaps are clear winners. Clear vertical neighbours.
1220 blob->set_neighbour(BND_ABOVE, nullptr, false);
1221 blob->set_neighbour(BND_BELOW, nullptr, false);
1222 } else if (v_max + margin < h_min && v_max < margin / 2) {
1223 // Vertical gaps are clear winners. Clear horizontal neighbours.
1224 blob->set_neighbour(BND_LEFT, nullptr, false);
1225 blob->set_neighbour(BND_RIGHT, nullptr, false);
1226 }
1227}
1228
1229// Smoothes the vertical/horizontal type of the blob based on the
1230// 2nd-order neighbours. If reset_all is true, then all blobs are
1231// changed. Otherwise, only ambiguous blobs are processed.
1232void StrokeWidth::SmoothNeighbourTypes(PageSegMode pageseg_mode, bool reset_all, BLOBNBOX *blob) {
1233 if ((blob->vert_possible() && blob->horz_possible()) || reset_all) {
1234 // There are both horizontal and vertical so try to fix it.
1235 BLOBNBOX_CLIST neighbours;
1236 List2ndNeighbours(blob, &neighbours);
1237 // The number of pure horizontal and vertical neighbours.
1238 int pure_h_count = 0;
1239 int pure_v_count = 0;
1240 CountNeighbourTypes(&neighbours, &pure_h_count, &pure_v_count);
1241 if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1242 blob->bounding_box().bottom())) {
1243 HandleClick(blob->bounding_box().left() + 1, blob->bounding_box().bottom() + 1);
1244 tprintf("pure_h=%d, pure_v=%d\n", pure_h_count, pure_v_count);
1245 }
1246 if (pure_h_count > pure_v_count && !FindingVerticalOnly(pageseg_mode)) {
1247 // Horizontal gaps are clear winners. Clear vertical neighbours.
1248 blob->set_vert_possible(false);
1249 blob->set_horz_possible(true);
1250 } else if (pure_v_count > pure_h_count && !FindingHorizontalOnly(pageseg_mode)) {
1251 // Vertical gaps are clear winners. Clear horizontal neighbours.
1252 blob->set_horz_possible(false);
1253 blob->set_vert_possible(true);
1254 }
1255 } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1256 blob->bounding_box().bottom())) {
1257 HandleClick(blob->bounding_box().left() + 1, blob->bounding_box().bottom() + 1);
1258 tprintf("Clean on pass 3!\n");
1259 }
1260}
1261
1262// Partition creation. Accumulates vertical and horizontal text chains,
1263// puts the remaining blobs in as unknowns, and then merges/splits to
1264// minimize overlap and smoothes the types with neighbours and the color
1265// image if provided. rerotation is used to rotate the coordinate space
1266// back to the nontext_map_ image.
1267// If find_problems is true, detects possible noise pollution by the amount
1268// of partition overlap that is created by the diacritics. If excessive, the
1269// noise is separated out into diacritic blobs, and PFR_NOISE is returned.
1270// [TODO(rays): if the partition overlap is caused by heavy skew, deskews
1271// the components, saves the skew_angle and returns PFR_SKEW.] If the return
1272// is not PFR_OK, the job is incomplete, and FindInitialPartitions must be
1273// called again after cleaning up the partly done work.
1274PartitionFindResult StrokeWidth::FindInitialPartitions(
1275 PageSegMode pageseg_mode, const FCOORD &rerotation, bool find_problems, TO_BLOCK *block,
1276 BLOBNBOX_LIST *diacritic_blobs, ColPartitionGrid *part_grid, ColPartition_LIST *big_parts,
1277 FCOORD *skew_angle) {
1278 if (!FindingHorizontalOnly(pageseg_mode)) {
1279 FindVerticalTextChains(part_grid);
1280 }
1281 if (!FindingVerticalOnly(pageseg_mode)) {
1282 FindHorizontalTextChains(part_grid);
1283 }
1284#ifndef GRAPHICS_DISABLED
1285 if (textord_tabfind_show_strokewidths) {
1286 chains_win_ = MakeWindow(0, 400, "Initial text chains");
1287 part_grid->DisplayBoxes(chains_win_);
1288 projection_->DisplayProjection();
1289 }
1290#endif
1291 if (find_problems) {
1292 // TODO(rays) Do something to find skew, set skew_angle and return if there
1293 // is some.
1294 }
1295 part_grid->SplitOverlappingPartitions(big_parts);
1296 EasyMerges(part_grid);
1297 RemoveLargeUnusedBlobs(block, part_grid, big_parts);
1298 TBOX grid_box(bleft(), tright());
1299 while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box, rerotation)) {
1300 ;
1301 }
1302 while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_, grid_box, rerotation)) {
1303 ;
1304 }
1305 int pre_overlap = part_grid->ComputeTotalOverlap(nullptr);
1306 TestDiacritics(part_grid, block);
1307 MergeDiacritics(block, part_grid);
1308 if (find_problems && diacritic_blobs != nullptr &&
1309 DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid, diacritic_blobs)) {
1310 return PFR_NOISE;
1311 }
1312#ifndef GRAPHICS_DISABLED
1313 if (textord_tabfind_show_strokewidths) {
1314 textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs");
1315 part_grid->DisplayBoxes(textlines_win_);
1316 diacritics_win_ = DisplayDiacritics("Diacritics", 0, 0, block);
1317 }
1318#endif
1319 PartitionRemainingBlobs(pageseg_mode, part_grid);
1320 part_grid->SplitOverlappingPartitions(big_parts);
1321 EasyMerges(part_grid);
1322 while (part_grid->GridSmoothNeighbours(BTFT_CHAIN, nontext_map_, grid_box, rerotation)) {
1323 ;
1324 }
1325 while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_, grid_box, rerotation)) {
1326 ;
1327 }
1328 // Now eliminate strong stuff in a sea of the opposite.
1329 while (part_grid->GridSmoothNeighbours(BTFT_STRONG_CHAIN, nontext_map_, grid_box, rerotation)) {
1330 ;
1331 }
1332#ifndef GRAPHICS_DISABLED
1333 if (textord_tabfind_show_strokewidths) {
1334 smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs");
1335 part_grid->DisplayBoxes(smoothed_win_);
1336 }
1337#endif
1338 return PFR_OK;
1339}
1340
1341// Detects noise by a significant increase in partition overlap from
1342// pre_overlap to now, and removes noise from the union of all the overlapping
1343// partitions, placing the blobs in diacritic_blobs. Returns true if any noise
1344// was found and removed.
1345bool StrokeWidth::DetectAndRemoveNoise(int pre_overlap, const TBOX &grid_box, TO_BLOCK *block,
1346 ColPartitionGrid *part_grid,
1347 BLOBNBOX_LIST *diacritic_blobs) {
1348 ColPartitionGrid *noise_grid = nullptr;
1349 int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid);
1350 if (pre_overlap == 0) {
1351 pre_overlap = 1;
1352 }
1353 BLOBNBOX_IT diacritic_it(diacritic_blobs);
1354 if (noise_grid != nullptr) {
1355 if (post_overlap > pre_overlap * kNoiseOverlapGrowthFactor &&
1356 post_overlap > grid_box.area() * kNoiseOverlapAreaFactor) {
1357 // This is noisy enough to fix.
1358#ifndef GRAPHICS_DISABLED
1359 if (textord_tabfind_show_strokewidths) {
1360 ScrollView *noise_win = MakeWindow(1000, 500, "Noise Areas");
1361 noise_grid->DisplayBoxes(noise_win);
1362 }
1363#endif
1364 part_grid->DeleteNonLeaderParts();
1365 BLOBNBOX_IT blob_it(&block->noise_blobs);
1366 ColPartitionGridSearch rsearch(noise_grid);
1367 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1368 BLOBNBOX *blob = blob_it.data();
1369 blob->ClearNeighbours();
1370 if (!blob->IsDiacritic() || blob->owner() != nullptr) {
1371 continue; // Not a noise candidate.
1372 }
1373 TBOX search_box(blob->bounding_box());
1374 search_box.pad(gridsize(), gridsize());
1375 rsearch.StartRectSearch(search_box);
1376 ColPartition *part = rsearch.NextRectSearch();
1377 if (part != nullptr) {
1378 // Consider blob as possible noise.
1379 blob->set_owns_cblob(true);
1380 blob->compute_bounding_box();
1381 diacritic_it.add_after_then_move(blob_it.extract());
1382 }
1383 }
1384 noise_grid->DeleteParts();
1385 delete noise_grid;
1386 return true;
1387 }
1388 noise_grid->DeleteParts();
1389 delete noise_grid;
1390 }
1391 return false;
1392}
1393
1394// Helper verifies that blob's neighbour in direction dir is good to add to a
1395// vertical text chain by returning the neighbour if it is not null, not owned,
1396// and not uniquely horizontal, as well as its neighbour in the opposite
1397// direction is blob.
1398static BLOBNBOX *MutualUnusedVNeighbour(const BLOBNBOX *blob, BlobNeighbourDir dir) {
1399 BLOBNBOX *next_blob = blob->neighbour(dir);
1400 if (next_blob == nullptr || next_blob->owner() != nullptr || next_blob->UniquelyHorizontal()) {
1401 return nullptr;
1402 }
1403 if (next_blob->neighbour(DirOtherWay(dir)) == blob) {
1404 return next_blob;
1405 }
1406 return nullptr;
1407}
1408
1409// Finds vertical chains of text-like blobs and puts them in ColPartitions.
1410void StrokeWidth::FindVerticalTextChains(ColPartitionGrid *part_grid) {
1411 // A PageSegMode that forces vertical textlines with the current rotation.
1412 PageSegMode pageseg_mode =
1413 rerotation_.y() == 0.0f ? PSM_SINGLE_BLOCK_VERT_TEXT : PSM_SINGLE_COLUMN;
1414 BlobGridSearch gsearch(this);
1415 BLOBNBOX *bbox;
1416 gsearch.StartFullSearch();
1417 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1418 // Only process boxes that have no horizontal hope and have not yet
1419 // been included in a chain.
1420 BLOBNBOX *blob;
1421 if (bbox->owner() == nullptr && bbox->UniquelyVertical() &&
1422 (blob = MutualUnusedVNeighbour(bbox, BND_ABOVE)) != nullptr) {
1423 // Put all the linked blobs into a ColPartition.
1424 auto *part = new ColPartition(BRT_VERT_TEXT, ICOORD(0, 1));
1425 part->AddBox(bbox);
1426 while (blob != nullptr) {
1427 part->AddBox(blob);
1428 blob = MutualUnusedVNeighbour(blob, BND_ABOVE);
1429 }
1430 blob = MutualUnusedVNeighbour(bbox, BND_BELOW);
1431 while (blob != nullptr) {
1432 part->AddBox(blob);
1433 blob = MutualUnusedVNeighbour(blob, BND_BELOW);
1434 }
1435 CompletePartition(pageseg_mode, part, part_grid);
1436 }
1437 }
1438}
1439
1440// Helper verifies that blob's neighbour in direction dir is good to add to a
1441// horizontal text chain by returning the neighbour if it is not null, not
1442// owned, and not uniquely vertical, as well as its neighbour in the opposite
1443// direction is blob.
1444static BLOBNBOX *MutualUnusedHNeighbour(const BLOBNBOX *blob, BlobNeighbourDir dir) {
1445 BLOBNBOX *next_blob = blob->neighbour(dir);
1446 if (next_blob == nullptr || next_blob->owner() != nullptr || next_blob->UniquelyVertical()) {
1447 return nullptr;
1448 }
1449 if (next_blob->neighbour(DirOtherWay(dir)) == blob) {
1450 return next_blob;
1451 }
1452 return nullptr;
1453}
1454
1455// Finds horizontal chains of text-like blobs and puts them in ColPartitions.
1456void StrokeWidth::FindHorizontalTextChains(ColPartitionGrid *part_grid) {
1457 // A PageSegMode that forces horizontal textlines with the current rotation.
1458 PageSegMode pageseg_mode =
1459 rerotation_.y() == 0.0f ? PSM_SINGLE_COLUMN : PSM_SINGLE_BLOCK_VERT_TEXT;
1460 BlobGridSearch gsearch(this);
1461 BLOBNBOX *bbox;
1462 gsearch.StartFullSearch();
1463 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1464 BLOBNBOX *blob;
1465 if (bbox->owner() == nullptr && bbox->UniquelyHorizontal() &&
1466 (blob = MutualUnusedHNeighbour(bbox, BND_RIGHT)) != nullptr) {
1467 // Put all the linked blobs into a ColPartition.
1468 auto *part = new ColPartition(BRT_TEXT, ICOORD(0, 1));
1469 part->AddBox(bbox);
1470 while (blob != nullptr) {
1471 part->AddBox(blob);
1472 blob = MutualUnusedHNeighbour(blob, BND_RIGHT);
1473 }
1474 blob = MutualUnusedHNeighbour(bbox, BND_LEFT);
1475 while (blob != nullptr) {
1476 part->AddBox(blob);
1477 blob = MutualUnusedVNeighbour(blob, BND_LEFT);
1478 }
1479 CompletePartition(pageseg_mode, part, part_grid);
1480 }
1481 }
1482}
1483
1484// Finds diacritics and saves their base character in the blob.
1485// The objective is to move all diacritics to the noise_blobs list, so
1486// they don't mess up early textline finding/merging, or force splits
1487// on textlines that overlap a bit. Blobs that become diacritics must be
1488// either part of no ColPartition (nullptr owner) or in a small partition in
1489// which ALL the blobs are diacritics, in which case the partition is
1490// exploded (deleted) back to its blobs.
1491void StrokeWidth::TestDiacritics(ColPartitionGrid *part_grid, TO_BLOCK *block) {
1492 BlobGrid small_grid(gridsize(), bleft(), tright());
1493 small_grid.InsertBlobList(&block->noise_blobs);
1494 small_grid.InsertBlobList(&block->blobs);
1495 int medium_diacritics = 0;
1496 int small_diacritics = 0;
1497 BLOBNBOX_IT small_it(&block->noise_blobs);
1498 for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1499 BLOBNBOX *blob = small_it.data();
1500 if (blob->owner() == nullptr && !blob->IsDiacritic() && DiacriticBlob(&small_grid, blob)) {
1501 ++small_diacritics;
1502 }
1503 }
1504 BLOBNBOX_IT blob_it(&block->blobs);
1505 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1506 BLOBNBOX *blob = blob_it.data();
1507 if (blob->IsDiacritic()) {
1508 small_it.add_to_end(blob_it.extract());
1509 continue; // Already a diacritic.
1510 }
1511 ColPartition *part = blob->owner();
1512 if (part == nullptr && DiacriticBlob(&small_grid, blob)) {
1513 ++medium_diacritics;
1514 RemoveBBox(blob);
1515 small_it.add_to_end(blob_it.extract());
1516 } else if (part != nullptr && !part->block_owned() && part->boxes_count() < 3) {
1517 // We allow blobs in small partitions to become diacritics if ALL the
1518 // blobs in the partition qualify as we can then cleanly delete the
1519 // partition, turn all the blobs in it to diacritics and they can be
1520 // merged into the base character partition more easily than merging
1521 // the partitions.
1522 BLOBNBOX_C_IT box_it(part->boxes());
1523 for (box_it.mark_cycle_pt();
1524 !box_it.cycled_list() && DiacriticBlob(&small_grid, box_it.data()); box_it.forward()) {
1525 ;
1526 }
1527 if (box_it.cycled_list()) {
1528 // They are all good.
1529 while (!box_it.empty()) {
1530 // Liberate the blob from its partition so it can be treated
1531 // as a diacritic and merged explicitly with the base part.
1532 // The blob is really owned by the block. The partition "owner"
1533 // is nulled to allow the blob to get merged with its base character
1534 // partition.
1535 BLOBNBOX *box = box_it.extract();
1536 box->set_owner(nullptr);
1537 box_it.forward();
1538 ++medium_diacritics;
1539 // We remove the blob from the grid so it isn't found by subsequent
1540 // searches where we might not want to include diacritics.
1541 RemoveBBox(box);
1542 }
1543 // We only move the one blob to the small list here, but the others
1544 // all get moved by the test at the top of the loop.
1545 small_it.add_to_end(blob_it.extract());
1546 part_grid->RemoveBBox(part);
1547 delete part;
1548 }
1549 } else if (AlignedBlob::WithinTestRegion(2, blob->bounding_box().left(),
1550 blob->bounding_box().bottom())) {
1551 tprintf("Blob not available to be a diacritic at:");
1552 blob->bounding_box().print();
1553 }
1554 }
1555 if (textord_tabfind_show_strokewidths) {
1556 tprintf("Found %d small diacritics, %d medium\n", small_diacritics, medium_diacritics);
1557 }
1558}
1559
1560// Searches this grid for an appropriately close and sized neighbour of the
1561// given [small] blob. If such a blob is found, the diacritic base is saved
1562// in the blob and true is returned.
1563// The small_grid is a secondary grid that contains the small/noise objects
1564// that are not in this grid, but may be useful for determining a connection
1565// between blob and its potential base character. (See DiacriticXGapFilled.)
1566bool StrokeWidth::DiacriticBlob(BlobGrid *small_grid, BLOBNBOX *blob) {
1567 if (BLOBNBOX::UnMergeableType(blob->region_type()) || blob->region_type() == BRT_VERT_TEXT) {
1568 return false;
1569 }
1570 TBOX small_box(blob->bounding_box());
1571 bool debug = AlignedBlob::WithinTestRegion(2, small_box.left(), small_box.bottom());
1572 if (debug) {
1573 tprintf("Testing blob for diacriticness at:");
1574 small_box.print();
1575 }
1576 int x = (small_box.left() + small_box.right()) / 2;
1577 int y = (small_box.bottom() + small_box.top()) / 2;
1578 int grid_x, grid_y;
1579 GridCoords(x, y, &grid_x, &grid_y);
1580 int height = small_box.height();
1581 // Setup a rectangle search to find its nearest base-character neighbour.
1582 // We keep 2 different best candidates:
1583 // best_x_overlap is a category of base characters that have an overlap in x
1584 // (like a acute) in which we look for the least y-gap, computed using the
1585 // projection to favor base characters in the same textline.
1586 // best_y_overlap is a category of base characters that have no x overlap,
1587 // (nominally a y-overlap is preferred but not essential) in which we
1588 // look for the least weighted sum of x-gap and y-gap, with x-gap getting
1589 // a lower weight to catch quotes at the end of a textline.
1590 // NOTE that x-gap and y-gap are measured from the nearest side of the base
1591 // character to the FARTHEST side of the diacritic to allow small diacritics
1592 // to be a reasonable distance away, but not big diacritics.
1593 BLOBNBOX *best_x_overlap = nullptr;
1594 BLOBNBOX *best_y_overlap = nullptr;
1595 int best_total_dist = 0;
1596 int best_y_gap = 0;
1597 TBOX best_xbox;
1598 // TODO(rays) the search box could be setup using the projection as a guide.
1599 TBOX search_box(small_box);
1602 search_box.pad(x_pad, y_pad);
1603 BlobGridSearch rsearch(this);
1604 rsearch.SetUniqueMode(true);
1605 int min_height = height * kMinDiacriticSizeRatio;
1606 rsearch.StartRectSearch(search_box);
1607 BLOBNBOX *neighbour;
1608 while ((neighbour = rsearch.NextRectSearch()) != nullptr) {
1609 if (BLOBNBOX::UnMergeableType(neighbour->region_type()) || neighbour == blob ||
1610 neighbour->owner() == blob->owner()) {
1611 continue;
1612 }
1613 TBOX nbox = neighbour->bounding_box();
1614 if (neighbour->owner() == nullptr || neighbour->owner()->IsVerticalType() ||
1615 (neighbour->flow() != BTFT_CHAIN && neighbour->flow() != BTFT_STRONG_CHAIN)) {
1616 if (debug) {
1617 tprintf("Neighbour not strong enough:");
1618 nbox.print();
1619 }
1620 continue; // Diacritics must be attached to strong text.
1621 }
1622 if (nbox.height() < min_height) {
1623 if (debug) {
1624 tprintf("Neighbour not big enough:");
1625 nbox.print();
1626 }
1627 continue; // Too small to be the base character.
1628 }
1629 int x_gap = small_box.x_gap(nbox);
1630 int y_gap = small_box.y_gap(nbox);
1631 int total_distance = projection_->DistanceOfBoxFromBox(small_box, nbox, true, denorm_, debug);
1632 if (debug) {
1633 tprintf("xgap=%d, y=%d, total dist=%d\n", x_gap, y_gap, total_distance);
1634 }
1635 if (total_distance > neighbour->owner()->median_height() * kMaxDiacriticDistanceRatio) {
1636 if (debug) {
1637 tprintf("Neighbour with median size %d too far away:", neighbour->owner()->median_height());
1638 neighbour->bounding_box().print();
1639 }
1640 continue; // Diacritics must not be too distant.
1641 }
1642 if (x_gap <= 0) {
1643 if (debug) {
1644 tprintf("Computing reduced box for :");
1645 nbox.print();
1646 }
1647 int left = small_box.left() - small_box.width();
1648 int right = small_box.right() + small_box.width();
1649 nbox = neighbour->BoundsWithinLimits(left, right);
1650 y_gap = small_box.y_gap(nbox);
1651 if (best_x_overlap == nullptr || y_gap < best_y_gap) {
1652 best_x_overlap = neighbour;
1653 best_xbox = nbox;
1654 best_y_gap = y_gap;
1655 if (debug) {
1656 tprintf("New best:");
1657 nbox.print();
1658 }
1659 } else if (debug) {
1660 tprintf("Shrunken box doesn't win:");
1661 nbox.print();
1662 }
1663 } else if (blob->ConfirmNoTabViolation(*neighbour)) {
1664 if (best_y_overlap == nullptr || total_distance < best_total_dist) {
1665 if (debug) {
1666 tprintf("New best y overlap:");
1667 nbox.print();
1668 }
1669 best_y_overlap = neighbour;
1670 best_total_dist = total_distance;
1671 } else if (debug) {
1672 tprintf("New y overlap box doesn't win:");
1673 nbox.print();
1674 }
1675 } else if (debug) {
1676 tprintf("Neighbour wrong side of a tab:");
1677 nbox.print();
1678 }
1679 }
1680 if (best_x_overlap != nullptr &&
1681 (best_y_overlap == nullptr || best_xbox.major_y_overlap(best_y_overlap->bounding_box()))) {
1682 blob->set_diacritic_box(best_xbox);
1683 blob->set_base_char_blob(best_x_overlap);
1684 if (debug) {
1685 tprintf("DiacriticBlob OK! (x-overlap:");
1686 small_box.print();
1687 best_xbox.print();
1688 }
1689 return true;
1690 }
1691 if (best_y_overlap != nullptr &&
1692 DiacriticXGapFilled(small_grid, small_box, best_y_overlap->bounding_box()) &&
1693 NoNoiseInBetween(small_box, best_y_overlap->bounding_box())) {
1694 blob->set_diacritic_box(best_y_overlap->bounding_box());
1695 blob->set_base_char_blob(best_y_overlap);
1696 if (debug) {
1697 tprintf("DiacriticBlob OK! (y-overlap:");
1698 small_box.print();
1699 best_y_overlap->bounding_box().print();
1700 }
1701 return true;
1702 }
1703 if (debug) {
1704 tprintf("DiacriticBlob fails:");
1705 small_box.print();
1706 tprintf("Best x+y gap = %d, y = %d\n", best_total_dist, best_y_gap);
1707 if (best_y_overlap != nullptr) {
1708 tprintf("XGapFilled=%d, NoiseBetween=%d\n",
1709 DiacriticXGapFilled(small_grid, small_box, best_y_overlap->bounding_box()),
1710 NoNoiseInBetween(small_box, best_y_overlap->bounding_box()));
1711 }
1712 }
1713 return false;
1714}
1715
1716// Returns true if there is no gap between the base char and the diacritic
1717// bigger than a fraction of the height of the base char:
1718// Eg: line end.....'
1719// The quote is a long way from the end of the line, yet it needs to be a
1720// diacritic. To determine that the quote is not part of an image, or
1721// a different text block, we check for other marks in the gap between
1722// the base char and the diacritic.
1723// '<--Diacritic
1724// |---------|
1725// | |<-toobig-gap->
1726// | Base |<ok gap>
1727// |---------| x<-----Dot occupying gap
1728// The grid is const really.
1729bool StrokeWidth::DiacriticXGapFilled(BlobGrid *grid, const TBOX &diacritic_box,
1730 const TBOX &base_box) {
1731 // Since most gaps are small, use an iterative algorithm to search the gap.
1732 int max_gap = IntCastRounded(base_box.height() * kMaxDiacriticGapToBaseCharHeight);
1733 TBOX occupied_box(base_box);
1734 int diacritic_gap;
1735 while ((diacritic_gap = diacritic_box.x_gap(occupied_box)) > max_gap) {
1736 TBOX search_box(occupied_box);
1737 if (diacritic_box.left() > search_box.right()) {
1738 // We are looking right.
1739 search_box.set_left(search_box.right());
1740 search_box.set_right(search_box.left() + max_gap);
1741 } else {
1742 // We are looking left.
1743 search_box.set_right(search_box.left());
1744 search_box.set_left(search_box.left() - max_gap);
1745 }
1746 BlobGridSearch rsearch(grid);
1747 rsearch.StartRectSearch(search_box);
1748 BLOBNBOX *neighbour;
1749 while ((neighbour = rsearch.NextRectSearch()) != nullptr) {
1750 const TBOX &nbox = neighbour->bounding_box();
1751 if (nbox.x_gap(diacritic_box) < diacritic_gap) {
1752 if (nbox.left() < occupied_box.left()) {
1753 occupied_box.set_left(nbox.left());
1754 }
1755 if (nbox.right() > occupied_box.right()) {
1756 occupied_box.set_right(nbox.right());
1757 }
1758 break;
1759 }
1760 }
1761 if (neighbour == nullptr) {
1762 return false; // Found a big gap.
1763 }
1764 }
1765 return true; // The gap was filled.
1766}
1767
1768// Merges diacritics with the ColPartition of the base character blob.
1769void StrokeWidth::MergeDiacritics(TO_BLOCK *block, ColPartitionGrid *part_grid) {
1770 BLOBNBOX_IT small_it(&block->noise_blobs);
1771 for (small_it.mark_cycle_pt(); !small_it.cycled_list(); small_it.forward()) {
1772 BLOBNBOX *blob = small_it.data();
1773 if (blob->base_char_blob() != nullptr) {
1774 ColPartition *part = blob->base_char_blob()->owner();
1775 // The base character must be owned by a partition and that partition
1776 // must not be on the big_parts list (not block owned).
1777 if (part != nullptr && !part->block_owned() && blob->owner() == nullptr &&
1778 blob->IsDiacritic()) {
1779 // The partition has to be removed from the grid and reinserted
1780 // because its bounding box may change.
1781 part_grid->RemoveBBox(part);
1782 part->AddBox(blob);
1783 blob->set_region_type(part->blob_type());
1784 blob->set_flow(part->flow());
1785 blob->set_owner(part);
1786 part_grid->InsertBBox(true, true, part);
1787 }
1788 // Set all base chars to nullptr before any blobs get deleted.
1789 blob->set_base_char_blob(nullptr);
1790 }
1791 }
1792}
1793
1794// Any blobs on the large_blobs list of block that are still unowned by a
1795// ColPartition, are probably drop-cap or vertically touching so the blobs
1796// are removed to the big_parts list and treated separately.
1797void StrokeWidth::RemoveLargeUnusedBlobs(TO_BLOCK *block, ColPartitionGrid *part_grid,
1798 ColPartition_LIST *big_parts) {
1799 BLOBNBOX_IT large_it(&block->large_blobs);
1800 for (large_it.mark_cycle_pt(); !large_it.cycled_list(); large_it.forward()) {
1801 BLOBNBOX *blob = large_it.data();
1802 ColPartition *big_part = blob->owner();
1803 if (big_part == nullptr) {
1804 // Large blobs should have gone into partitions by now if they are
1805 // genuine characters, so move any unowned ones out to the big parts
1806 // list. This will include drop caps and vertically touching characters.
1807 ColPartition::MakeBigPartition(blob, big_parts);
1808 }
1809 }
1810}
1811
1812// All remaining unused blobs are put in individual ColPartitions.
1813void StrokeWidth::PartitionRemainingBlobs(PageSegMode pageseg_mode, ColPartitionGrid *part_grid) {
1814 BlobGridSearch gsearch(this);
1815 BLOBNBOX *bbox;
1816 int prev_grid_x = -1;
1817 int prev_grid_y = -1;
1818 BLOBNBOX_CLIST cell_list;
1819 BLOBNBOX_C_IT cell_it(&cell_list);
1820 bool cell_all_noise = true;
1821 gsearch.StartFullSearch();
1822 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1823 int grid_x = gsearch.GridX();
1824 int grid_y = gsearch.GridY();
1825 if (grid_x != prev_grid_x || grid_y != prev_grid_y) {
1826 // New cell. Process old cell.
1827 MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid, &cell_list);
1828 cell_it.set_to_list(&cell_list);
1829 prev_grid_x = grid_x;
1830 prev_grid_y = grid_y;
1831 cell_all_noise = true;
1832 }
1833 if (bbox->owner() == nullptr) {
1834 cell_it.add_to_end(bbox);
1835 if (bbox->flow() != BTFT_NONTEXT) {
1836 cell_all_noise = false;
1837 }
1838 } else {
1839 cell_all_noise = false;
1840 }
1841 }
1842 MakePartitionsFromCellList(pageseg_mode, cell_all_noise, part_grid, &cell_list);
1843}
1844
1845// If combine, put all blobs in the cell_list into a single partition, otherwise
1846// put each one into its own partition.
1847void StrokeWidth::MakePartitionsFromCellList(PageSegMode pageseg_mode, bool combine,
1848 ColPartitionGrid *part_grid,
1849 BLOBNBOX_CLIST *cell_list) {
1850 if (cell_list->empty()) {
1851 return;
1852 }
1853 BLOBNBOX_C_IT cell_it(cell_list);
1854 if (combine) {
1855 BLOBNBOX *bbox = cell_it.extract();
1856 auto *part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1857 part->AddBox(bbox);
1858 part->set_flow(bbox->flow());
1859 for (cell_it.forward(); !cell_it.empty(); cell_it.forward()) {
1860 part->AddBox(cell_it.extract());
1861 }
1862 CompletePartition(pageseg_mode, part, part_grid);
1863 } else {
1864 for (; !cell_it.empty(); cell_it.forward()) {
1865 BLOBNBOX *bbox = cell_it.extract();
1866 auto *part = new ColPartition(bbox->region_type(), ICOORD(0, 1));
1867 part->set_flow(bbox->flow());
1868 part->AddBox(bbox);
1869 CompletePartition(pageseg_mode, part, part_grid);
1870 }
1871 }
1872}
1873
1874// Helper function to finish setting up a ColPartition and insert into
1875// part_grid.
1876void StrokeWidth::CompletePartition(PageSegMode pageseg_mode, ColPartition *part,
1877 ColPartitionGrid *part_grid) {
1878 part->ComputeLimits();
1879 TBOX box = part->bounding_box();
1880 bool debug = AlignedBlob::WithinTestRegion(2, box.left(), box.bottom());
1881 int value = projection_->EvaluateColPartition(*part, denorm_, debug);
1882 // Override value if pageseg_mode disagrees.
1883 if (value > 0 && FindingVerticalOnly(pageseg_mode)) {
1884 value = part->boxes_count() == 1 ? 0 : -2;
1885 } else if (value < 0 && FindingHorizontalOnly(pageseg_mode)) {
1886 value = part->boxes_count() == 1 ? 0 : 2;
1887 }
1888 part->SetRegionAndFlowTypesFromProjectionValue(value);
1889 part->ClaimBoxes();
1890 part_grid->InsertBBox(true, true, part);
1891}
1892
1893// Merge partitions where the merge appears harmless.
1894// As this
1895void StrokeWidth::EasyMerges(ColPartitionGrid *part_grid) {
1896 using namespace std::placeholders; // for _1, _2
1897 part_grid->Merges(std::bind(&StrokeWidth::OrientationSearchBox, this, _1, _2),
1898 std::bind(&StrokeWidth::ConfirmEasyMerge, this, _1, _2));
1899}
1900
1901// Compute a search box based on the orientation of the partition.
1902// Returns true if a suitable box can be calculated.
1903// Callback for EasyMerges.
1904bool StrokeWidth::OrientationSearchBox(ColPartition *part, TBOX *box) {
1905 if (part->IsVerticalType()) {
1906 box->set_top(box->top() + box->width());
1907 box->set_bottom(box->bottom() - box->width());
1908 } else {
1909 box->set_left(box->left() - box->height());
1910 box->set_right(box->right() + box->height());
1911 }
1912 return true;
1913}
1914
1915// Merge confirmation callback for EasyMerges.
1916bool StrokeWidth::ConfirmEasyMerge(const ColPartition *p1, const ColPartition *p2) {
1917 ASSERT_HOST(p1 != nullptr && p2 != nullptr);
1918 ASSERT_HOST(!p1->IsEmpty() && !p2->IsEmpty());
1919 if ((p1->flow() == BTFT_NONTEXT && p2->flow() >= BTFT_CHAIN) ||
1920 (p1->flow() >= BTFT_CHAIN && p2->flow() == BTFT_NONTEXT)) {
1921 return false; // Don't merge confirmed image with text.
1922 }
1923 if ((p1->IsVerticalType() || p2->IsVerticalType()) && p1->HCoreOverlap(*p2) <= 0 &&
1924 ((!p1->IsSingleton() && !p2->IsSingleton()) ||
1925 !p1->bounding_box().major_overlap(p2->bounding_box()))) {
1926 return false; // Overlap must be in the text line.
1927 }
1928 if ((p1->IsHorizontalType() || p2->IsHorizontalType()) && p1->VCoreOverlap(*p2) <= 0 &&
1929 ((!p1->IsSingleton() && !p2->IsSingleton()) ||
1930 (!p1->bounding_box().major_overlap(p2->bounding_box()) &&
1931 !p1->OKDiacriticMerge(*p2, false) && !p2->OKDiacriticMerge(*p1, false)))) {
1932 return false; // Overlap must be in the text line.
1933 }
1934 if (!p1->ConfirmNoTabViolation(*p2)) {
1935 return false;
1936 }
1937 if (p1->flow() <= BTFT_NONTEXT && p2->flow() <= BTFT_NONTEXT) {
1938 return true;
1939 }
1940 return NoNoiseInBetween(p1->bounding_box(), p2->bounding_box());
1941}
1942
1943// Returns true if there is no significant noise in between the boxes.
1944bool StrokeWidth::NoNoiseInBetween(const TBOX &box1, const TBOX &box2) const {
1945 return ImageFind::BlankImageInBetween(box1, box2, grid_box_, rerotation_, nontext_map_);
1946}
1947
1948#ifndef GRAPHICS_DISABLED
1949
1953ScrollView *StrokeWidth::DisplayGoodBlobs(const char *window_name, int x, int y) {
1954 auto window = MakeWindow(x, y, window_name);
1955 // For every blob in the grid, display it.
1956 window->Brush(ScrollView::NONE);
1957
1958 // For every bbox in the grid, display it.
1959 BlobGridSearch gsearch(this);
1960 gsearch.StartFullSearch();
1961 BLOBNBOX *bbox;
1962 while ((bbox = gsearch.NextFullSearch()) != nullptr) {
1963 const TBOX &box = bbox->bounding_box();
1964 int left_x = box.left();
1965 int right_x = box.right();
1966 int top_y = box.top();
1967 int bottom_y = box.bottom();
1968 int goodness = bbox->GoodTextBlob();
1969 BlobRegionType blob_type = bbox->region_type();
1970 if (bbox->UniquelyVertical()) {
1971 blob_type = BRT_VERT_TEXT;
1972 }
1973 if (bbox->UniquelyHorizontal()) {
1974 blob_type = BRT_TEXT;
1975 }
1976 BlobTextFlowType flow = bbox->flow();
1977 if (flow == BTFT_NONE) {
1978 if (goodness == 0) {
1979 flow = BTFT_NEIGHBOURS;
1980 } else if (goodness == 1) {
1981 flow = BTFT_CHAIN;
1982 } else {
1983 flow = BTFT_STRONG_CHAIN;
1984 }
1985 }
1986 window->Pen(BLOBNBOX::TextlineColor(blob_type, flow));
1987 window->Rectangle(left_x, bottom_y, right_x, top_y);
1988 }
1989 window->Update();
1990 return window;
1991}
1992
1993static void DrawDiacriticJoiner(const BLOBNBOX *blob, ScrollView *window) {
1994 const TBOX &blob_box(blob->bounding_box());
1995 int top = std::max(static_cast<int>(blob_box.top()), blob->base_char_top());
1996 int bottom = std::min(static_cast<int>(blob_box.bottom()), blob->base_char_bottom());
1997 int x = (blob_box.left() + blob_box.right()) / 2;
1998 window->Line(x, top, x, bottom);
1999}
2000
2001// Displays blobs colored according to whether or not they are diacritics.
2002ScrollView *StrokeWidth::DisplayDiacritics(const char *window_name, int x, int y, TO_BLOCK *block) {
2003 auto window = MakeWindow(x, y, window_name);
2004 // For every blob in the grid, display it.
2005 window->Brush(ScrollView::NONE);
2006
2007 BLOBNBOX_IT it(&block->blobs);
2008 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2009 BLOBNBOX *blob = it.data();
2010 if (blob->IsDiacritic()) {
2011 window->Pen(ScrollView::GREEN);
2012 DrawDiacriticJoiner(blob, window);
2013 } else {
2014 window->Pen(blob->BoxColor());
2015 }
2016 const TBOX &box = blob->bounding_box();
2017 window->Rectangle(box.left(), box.bottom(), box.right(), box.top());
2018 }
2019 it.set_to_list(&block->noise_blobs);
2020 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
2021 BLOBNBOX *blob = it.data();
2022 if (blob->IsDiacritic()) {
2023 window->Pen(ScrollView::GREEN);
2024 DrawDiacriticJoiner(blob, window);
2025 } else {
2026 window->Pen(ScrollView::WHITE);
2027 }
2028 const TBOX &box = blob->bounding_box();
2029 window->Rectangle(box.left(), box.bottom(), box.right(), box.top());
2030 }
2031 window->Update();
2032 return window;
2033}
2034
2035#endif // !GRAPHICS_DISABLED
2036
2037} // namespace tesseract.
#define BOOL_VAR(name, val, comment)
Definition: params.h:360
#define INT_VAR(name, val, comment)
Definition: params.h:357
#define ASSERT_HOST(x)
Definition: errcode.h:54
@ TBOX
int value
const double y
const double kMaxDiacriticDistanceRatio
Definition: strokewidth.cpp:84
const int kLineResiduePadRatio
BlobRegionType
Definition: blobbox.h:74
@ BRT_TEXT
Definition: blobbox.h:82
@ BRT_HLINE
Definition: blobbox.h:76
@ BRT_NOISE
Definition: blobbox.h:75
@ BRT_VLINE
Definition: blobbox.h:77
@ BRT_VERT_TEXT
Definition: blobbox.h:81
@ BRT_UNKNOWN
Definition: blobbox.h:80
PartitionFindResult
Definition: strokewidth.h:42
const double kNoiseOverlapAreaFactor
const int kCJKMaxComponents
Definition: strokewidth.cpp:64
@ PSM_SINGLE_BLOCK_VERT_TEXT
Definition: publictypes.h:164
@ PSM_SINGLE_COLUMN
Assume a single column of text of variable sizes.
Definition: publictypes.h:163
const double kMinDiacriticSizeRatio
Definition: strokewidth.cpp:81
const double kCJKBrokenDistanceFraction
Definition: strokewidth.cpp:62
const int kLineTrapLongest
Definition: strokewidth.cpp:92
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const double kCJKAspectRatio
Definition: strokewidth.cpp:66
int IntCastRounded(double x)
Definition: helpers.h:170
@ SVET_DESTROY
Definition: scrollview.h:54
const double kStrokeWidthTolerance
Definition: strokewidth.cpp:54
const double kNoiseOverlapGrowthFactor
const double kCJKAspectRatioIncrease
Definition: strokewidth.cpp:68
GridSearch< ColPartition, ColPartition_CLIST, ColPartition_C_IT > ColPartitionGridSearch
Definition: colpartition.h:919
const int kCJKRadius
Definition: strokewidth.cpp:60
const double kNeighbourSearchFactor
const double kLineResidueAspectRatio
Definition: strokewidth.cpp:99
int textord_debug_tabfind
Definition: alignedblob.cpp:29
const double kBrokenCJKIterationFraction
Definition: strokewidth.cpp:72
BlobTextFlowType
Definition: blobbox.h:110
@ BTFT_STRONG_CHAIN
Definition: blobbox.h:115
@ BTFT_NONE
Definition: blobbox.h:111
@ BTFT_CHAIN
Definition: blobbox.h:114
@ BTFT_LEADER
Definition: blobbox.h:117
@ BTFT_NEIGHBOURS
Definition: blobbox.h:113
@ BTFT_NONTEXT
Definition: blobbox.h:112
const int kMaxCJKSizeRatio
Definition: strokewidth.cpp:70
const double kDiacriticXPadRatio
Definition: strokewidth.cpp:75
const double kLineResidueSizeRatio
const double kMaxDiacriticGapToBaseCharHeight
Definition: strokewidth.cpp:87
const int kLineTrapShortest
Definition: strokewidth.cpp:94
const double kStrokeWidthFractionTolerance
Definition: strokewidth.cpp:49
const double kStrokeWidthFractionCJK
Definition: strokewidth.cpp:56
const double kStrokeWidthCJK
Definition: strokewidth.cpp:57
const double kDiacriticYPadRatio
Definition: strokewidth.cpp:78
BlobNeighbourDir DirOtherWay(BlobNeighbourDir dir)
Definition: blobbox.h:102
BlobNeighbourDir
Definition: blobbox.h:89
@ BND_LEFT
Definition: blobbox.h:89
@ BND_RIGHT
Definition: blobbox.h:89
@ BND_BELOW
Definition: blobbox.h:89
@ BND_ABOVE
Definition: blobbox.h:89
@ BND_COUNT
Definition: blobbox.h:89
const float kSizeRatioToReject
Definition: osdetect.cpp:41
const int kMostlyOneDirRatio
Definition: strokewidth.cpp:97
GridSearch< BLOBNBOX, BLOBNBOX_CLIST, BLOBNBOX_C_IT > BlobGridSearch
Definition: blobgrid.h:30
float vert_stroke_width() const
Definition: blobbox.h:358
bool good_stroke_neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:388
const TBOX & bounding_box() const
Definition: blobbox.h:239
void NeighbourGaps(int gaps[BND_COUNT]) const
Definition: blobbox.cpp:178
bool UniquelyHorizontal() const
Definition: blobbox.h:430
bool UniquelyVertical() const
Definition: blobbox.h:427
bool vert_possible() const
Definition: blobbox.h:316
BlobTextFlowType flow() const
Definition: blobbox.h:310
BLOBNBOX * neighbour(BlobNeighbourDir n) const
Definition: blobbox.h:385
C_BLOB * cblob() const
Definition: blobbox.h:277
float horz_stroke_width() const
Definition: blobbox.h:352
bool horz_possible() const
Definition: blobbox.h:322
static ScrollView::Color TextlineColor(BlobRegionType region_type, BlobTextFlowType flow_type)
Definition: blobbox.cpp:442
static bool UnMergeableType(BlobRegionType type)
Definition: blobbox.h:447
BLOBNBOX_LIST blobs
Definition: blobbox.h:776
BLOBNBOX_LIST small_blobs
Definition: blobbox.h:779
BLOBNBOX_LIST large_blobs
Definition: blobbox.h:780
BLOBNBOX_LIST noise_blobs
Definition: blobbox.h:778
integer coordinate
Definition: points.h:36
void set_y(float yin)
rewrite function
Definition: points.h:217
void set_x(float xin)
rewrite function
Definition: points.h:213
float y() const
Definition: points.h:209
float x() const
Definition: points.h:206
TDimension left() const
Definition: rect.h:82
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
void print() const
Definition: rect.h:289
TDimension right() const
Definition: rect.h:89
TDimension bottom() const
Definition: rect.h:75
void pad(int xpad, int ypad)
Definition: rect.h:144
bool contains(const FCOORD pt) const
Definition: rect.h:344
int32_t perimeter()
Definition: stepblob.cpp:285
int32_t area()
Definition: stepblob.cpp:268
static bool WithinTestRegion(int detail_level, int x, int y)
void StartRadSearch(int x, int y, int max_radius)
Definition: bbgrid.h:735
BBC * NextRectSearch()
Definition: bbgrid.h:896
void StartFullSearch()
Definition: bbgrid.h:701
void StartRectSearch(const TBOX &rect)
Definition: bbgrid.h:884
BBC * NextFullSearch()
Definition: bbgrid.h:711
BBC * NextRadSearch()
Definition: bbgrid.h:749
int gridsize() const
Definition: bbgrid.h:63
const ICOORD & bleft() const
Definition: bbgrid.h:72
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
Definition: bbgrid.cpp:53
const ICOORD & tright() const
Definition: bbgrid.h:75
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: bbgrid.h:488
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
Definition: bbgrid.h:529
virtual void HandleClick(int x, int y)
Definition: bbgrid.h:691
ScrollView * MakeWindow(int x, int y, const char *window_name)
Definition: bbgrid.h:633
BlobGrid(int gridsize, const ICOORD &bleft, const ICOORD &tright)
Definition: blobgrid.cpp:24
void InsertBlobList(BLOBNBOX_LIST *blobs)
Definition: blobgrid.cpp:35
static ColPartition * MakeBigPartition(BLOBNBOX *box, ColPartition_LIST *big_part_list)
static bool BlankImageInBetween(const TBOX &box1, const TBOX &box2, const TBOX &im_box, const FCOORD &rotation, Image pix)
Definition: imagefind.cpp:437
void FindTextlineDirectionAndFixBrokenCJK(PageSegMode pageseg_mode, bool cjk_merge, TO_BLOCK *input_block)
void CorrectForRotation(const FCOORD &rerotation, ColPartitionGrid *part_grid)
void HandleClick(int x, int y) override
StrokeWidth(int gridsize, const ICOORD &bleft, const ICOORD &tright)
void RemoveLineResidue(ColPartition_LIST *big_part_list)
void GradeBlobsIntoPartitions(PageSegMode pageseg_mode, const FCOORD &rerotation, TO_BLOCK *block, Image nontext_pix, const DENORM *denorm, bool cjk_script, TextlineProjection *projection, BLOBNBOX_LIST *diacritic_blobs, ColPartitionGrid *part_grid, ColPartition_LIST *big_parts)
void SetNeighboursOnMediumBlobs(TO_BLOCK *block)
void FindLeaderPartitions(TO_BLOCK *block, ColPartitionGrid *part_grid)
bool TestVerticalTextDirection(double find_vertical_text_ratio, TO_BLOCK *block, BLOBNBOX_CLIST *osd_blobs)
static bool DifferentSizes(int size1, int size2)
Definition: tabfind.cpp:407
static bool VeryDifferentSizes(int size1, int size2)
Definition: tabfind.cpp:413
void ConstructProjection(TO_BLOCK *input_block, const FCOORD &rotation, Image nontext_map)
void MoveNonTextlineBlobs(BLOBNBOX_LIST *blobs, BLOBNBOX_LIST *small_blobs) const
int EvaluateColPartition(const ColPartition &part, const DENORM *denorm, bool debug) const
int DistanceOfBoxFromBox(const TBOX &from_box, const TBOX &to_box, bool horizontal_textline, const DENORM *denorm, bool debug) const
void PlotGradedBlobs(BLOBNBOX_LIST *blobs, ScrollView *win)
std::unique_ptr< SVEvent > AwaitEvent(SVEventType type)
Definition: scrollview.cpp:432
void Pen(Color color)
Definition: scrollview.cpp:710
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:576