tesseract v5.3.3.20231005
devanagari_processing.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: devanagari_processing.cpp
3 * Description: Methods to process images containing devanagari symbols,
4 * prior to classification.
5 * Author: Shobhit Saxena
6 *
7 * (C) Copyright 2008, Google Inc.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20#ifdef HAVE_CONFIG_H
21# include "config_auto.h"
22#endif
23
25
26#include "debugpixa.h"
27#include "statistc.h"
28#include "tordmain.h"
29
30#include <allheaders.h>
31
32namespace tesseract {
33
34// Flags controlling the debugging information for shiro-rekha splitting
35// strategies.
36INT_VAR(devanagari_split_debuglevel, 0, "Debug level for split shiro-rekha process.");
37
39 "Whether to create a debug image for split shiro-rekha process.");
40
42 orig_pix_ = nullptr;
43 segmentation_block_list_ = nullptr;
44 splitted_image_ = nullptr;
45 global_xheight_ = kUnspecifiedXheight;
46 perform_close_ = false;
47 debug_image_ = nullptr;
48 pageseg_split_strategy_ = NO_SPLIT;
49 ocr_split_strategy_ = NO_SPLIT;
50}
51
53 Clear();
54}
55
57 orig_pix_.destroy();
58 splitted_image_.destroy();
59 pageseg_split_strategy_ = NO_SPLIT;
60 ocr_split_strategy_ = NO_SPLIT;
61 debug_image_.destroy();
62 segmentation_block_list_ = nullptr;
63 global_xheight_ = kUnspecifiedXheight;
64 perform_close_ = false;
65}
66
67// On setting the input image, a clone of it is owned by this class.
69 if (orig_pix_) {
70 orig_pix_.destroy();
71 }
72 orig_pix_ = pix.clone();
73}
74
75// Top-level method to perform splitting based on current settings.
76// Returns false, without touching any image, when the selected strategy is
// NO_SPLIT. Otherwise the splitting pass is run and true is returned, whether
// or not any region ended up being cleared.
77// split_for_pageseg should be true if the splitting is being done prior to
78// page segmentation: it selects pageseg_split_strategy_, whereas false
79// selects ocr_split_strategy_.
// pixa_debug, when non-null and devanagari_split_debugimage is set, receives
// the debug image at the end of the pass.
// The output is written to splitted_image_ (a copy of orig_pix_ with the
// chosen regions cleared); the pixels of orig_pix_ are only read here.
80bool ShiroRekhaSplitter::Split(bool split_for_pageseg, DebugPixa *pixa_debug) {
81 SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ : ocr_split_strategy_;
82 if (split_strategy == NO_SPLIT) {
83 return false; // Nothing to do.
84 }
85 ASSERT_HOST(split_strategy == MINIMAL_SPLIT || split_strategy == MAXIMAL_SPLIT);
86 ASSERT_HOST(orig_pix_);
// NOTE(review): listing line 87 is absent from this extract. The closing
// brace after the three tprintf calls below has no visible opener, so a guard
// (presumably on devanagari_split_debuglevel) appears to be missing - confirm
// against the real file.
88 tprintf("Splitting shiro-rekha ...\n");
89 tprintf("Split strategy = %s\n", split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal");
90 tprintf("Initial pageseg available = %s\n", segmentation_block_list_ ? "yes" : "no");
91 }
92 // Create a copy of original image to store the splitting output.
93 splitted_image_.destroy();
94 splitted_image_ = orig_pix_.copy();
95
96 // Initialize debug image if required.
// NOTE(review): listing line 97 is absent from this extract. The closing
// brace after the next two statements has no visible opener, so a guard
// (presumably on devanagari_split_debugimage) appears to be missing - confirm.
98 debug_image_.destroy();
99 debug_image_ = pixConvertTo32(orig_pix_);
100 }
101
102 // Determine all connected components in the input image. A close operation
103 // may be required prior to this, depending on the current settings.
104 Image pix_for_ccs = orig_pix_.clone();
105 if (perform_close_ && global_xheight_ != kUnspecifiedXheight && !segmentation_block_list_) {
// NOTE(review): listing line 106 is absent from this extract. The brace that
// follows the tprintf below would otherwise close the enclosing if too early,
// so a nested guard (presumably on devanagari_split_debuglevel) appears to be
// missing - confirm.
107 tprintf("Performing a global close operation..\n");
108 }
109 // A global measure is available for xheight, but no local information
110 // exists.
// The close is applied to a private copy, so orig_pix_ stays untouched.
111 pix_for_ccs.destroy();
112 pix_for_ccs = orig_pix_.copy();
113 PerformClose(pix_for_ccs, global_xheight_);
114 }
// ccs receives the connected components (8-connectivity). The Boxa returned
// by pixConnComp is not needed and is released straight away.
115 Pixa *ccs;
116 Boxa *tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8);
117 boxaDestroy(&tmp_boxa);
118 pix_for_ccs.destroy();
119
120 // Iterate over all connected components. Get their bounding boxes and clip
121 // out the image regions corresponding to these boxes from the original image.
122 // Conditionally run splitting on each of them.
123 Boxa *regions_to_clear = boxaCreate(0);
124 int num_ccs = 0;
125 if (ccs != nullptr) {
126 num_ccs = pixaGetCount(ccs);
127 }
128 for (int i = 0; i < num_ccs; ++i) {
// box is an L_CLONE handle and word_pix a clipped sub-image; both are
// released at the end of this iteration.
129 Box *box = pixaGetBox(ccs, i, L_CLONE);
130 Image word_pix = pixClipRectangle(orig_pix_, box, nullptr);
131 ASSERT_HOST(word_pix);
132 int xheight = GetXheightForCC(box);
// With a page segmentation present but no row matched, outline the component
// in the debug image (colour arguments 255, 0, 0).
133 if (xheight == kUnspecifiedXheight && segmentation_block_list_ && devanagari_split_debugimage) {
134 pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0);
135 }
136 // If some xheight measure is available, attempt to pre-eliminate small
137 // blobs from the shiro-rekha process. This is primarily to save the CCs
138 // corresponding to punctuation marks/small dots etc which are part of
139 // larger graphemes.
140 l_int32 x, y, w, h;
141 boxGetGeometry(box, &x, &y, &w, &h);
// A component is processed when no xheight is known, or when it is wider than
// xheight / 3 and taller than xheight / 2.
142 if (xheight == kUnspecifiedXheight || (w > xheight / 3 && h > xheight / 2)) {
143 SplitWordShiroRekha(split_strategy, word_pix, xheight, x, y, regions_to_clear);
144 } else if (devanagari_split_debuglevel > 0) {
145 tprintf("CC dropped from splitting: %d,%d (%d, %d)\n", x, y, w, h);
146 }
147 word_pix.destroy();
148 boxDestroy(&box);
149 }
150 // Actually clear the boxes now.
// Clearing is deferred to this point so that every component above was
// clipped from the unmodified orig_pix_; only splitted_image_ is altered.
151 for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) {
152 Box *box = boxaGetBox(regions_to_clear, i, L_CLONE);
153 pixClearInRect(splitted_image_, box);
154 boxDestroy(&box);
155 }
156 boxaDestroy(&regions_to_clear);
157 pixaDestroy(&ccs);
// Hand the debug image to the caller's collection, captioned by split mode.
158 if (devanagari_split_debugimage && pixa_debug != nullptr) {
159 pixa_debug->AddPix(debug_image_, split_for_pageseg ? "pageseg_split" : "ocr_split");
160 }
161 return true;
162}
163
164// Method to perform a close operation on the input image. The xheight
165// estimate decides the size of sel used.
166void ShiroRekhaSplitter::PerformClose(Image pix, int xheight_estimate) {
167 pixCloseBrick(pix, pix, xheight_estimate / 8, xheight_estimate / 3);
168}
169
170// This method resolves the cc bbox to a particular row and returns the row's
171// xheight.
172int ShiroRekhaSplitter::GetXheightForCC(Box *cc_bbox) {
173 if (!segmentation_block_list_) {
174 return global_xheight_;
175 }
176 // Compute the box coordinates in Tesseract's coordinate system.
177 l_int32 x, y, w, h;
178 boxGetGeometry(cc_bbox, &x, &y, &w, &h);
179 TBOX bbox(x, pixGetHeight(orig_pix_) - y - h - 1,
180 x + w, pixGetHeight(orig_pix_) - y - 1);
181 // Iterate over all blocks.
182 BLOCK_IT block_it(segmentation_block_list_);
183 for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
184 BLOCK *block = block_it.data();
185 // Iterate over all rows in the block.
186 ROW_IT row_it(block->row_list());
187 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
188 ROW *row = row_it.data();
189 if (!row->bounding_box().major_overlap(bbox)) {
190 continue;
191 }
192 // Row could be skewed, warped, etc. Use the position of the box to
193 // determine the baseline position of the row for that x-coordinate.
194 // Create a square TBOX whose baseline's mid-point lies at this point
195 // and side is row's xheight. Take the overlap of this box with the input
196 // box and check if it is a 'major overlap'. If so, this box lies in this
197 // row. In that case, return the xheight for this row.
198 float box_middle = 0.5 * (bbox.left() + bbox.right());
199 int baseline = static_cast<int>(row->base_line(box_middle) + 0.5);
200 TBOX test_box(box_middle - row->x_height() / 2, baseline, box_middle + row->x_height() / 2,
201 static_cast<int>(baseline + row->x_height()));
202 // Compute overlap. If it is a major overlap, this is the right row.
203 if (bbox.major_overlap(test_box)) {
204 return row->x_height();
205 }
206 }
207 }
208 // No row found for this bbox.
209 return kUnspecifiedXheight;
210}
211
212// Appends to regions_to_clear the regions (boxes) which should be cleared in
213// the original image so as to perform shiro-rekha splitting. Pix is assumed
214// to carry at most one word. Xheight measure could be the global estimate, the row
215// estimate, or unspecified. If unspecified, over splitting may occur, since a
216// conservative estimate of stroke width along with an associated multiplier
217// is used in its place. It is advisable to have a specified xheight when
218// splitting for classification/training.
219// A vertical projection histogram of all the on-pixels in the input pix is
220// computed. The maxima of this histogram is regarded as an approximate location
221// of the shiro-rekha. By descending on the maxima's peak on both sides,
222// stroke width of shiro-rekha is estimated.
223// A horizontal projection histogram is computed for a sub-image of the input
224// image, which extends from just below the shiro-rekha down to a certain
225// leeway. The leeway depends on the input xheight, if provided, else a
226// conservative multiplier on approximate stroke width is used (which may lead
227// to over-splitting).
// word_left and word_top are added to every emitted box, so that the boxes
// are positioned relative to the image pix was clipped from rather than to
// pix itself. All clearing inside this method is done on a copy of pix.
228void ShiroRekhaSplitter::SplitWordShiroRekha(SplitStrategy split_strategy, Image pix, int xheight,
229 int word_left, int word_top, Boxa *regions_to_clear) {
230 if (split_strategy == NO_SPLIT) {
231 return;
232 }
233 int width = pixGetWidth(pix);
234 int height = pixGetHeight(pix);
235 // Statistically determine the yextents of the shiro-rekha.
236 int shirorekha_top, shirorekha_bottom, shirorekha_ylevel;
237 GetShiroRekhaYExtents(pix, &shirorekha_top, &shirorekha_bottom, &shirorekha_ylevel);
238 // Since the shiro rekha is also a stroke, its width is equal to the stroke
239 // width.
240 int stroke_width = shirorekha_bottom - shirorekha_top + 1;
241
242 // Some safeguards to protect CCs we do not want to be split.
243 // These are particularly useful when the word wasn't eliminated earlier
244 // because xheight information was unavailable.
245 if (shirorekha_ylevel > height / 2) {
246 // Shirorekha shouldn't be in the bottom half of the word.
// NOTE(review): listing line 247 is absent from this extract. The brace after
// the tprintf below would otherwise close this if before its return, so a
// nested guard (presumably on devanagari_split_debuglevel) appears to be
// missing - confirm against the real file.
248 tprintf("Skipping splitting CC at (%d, %d): shirorekha in lower half..\n", word_left,
249 word_top);
250 }
251 return;
252 }
253 if (stroke_width > height / 3) {
254 // Even the boldest of fonts shouldn't do this.
// NOTE(review): listing line 255 is absent from this extract; same situation
// as above, a nested debug guard appears to be missing - confirm.
256 tprintf("Skipping splitting CC at (%d, %d): stroke width too huge..\n", word_left, word_top);
257 }
258 return;
259 }
260
261 // Clear the ascender and descender regions of the word.
262 // Obtain a vertical projection histogram for the resulting image.
// The first box covers the shiro-rekha band itself, padded to 5/3 of the
// stroke width and starting a third of a stroke width above its top.
263 Box *box_to_clear = boxCreate(0, shirorekha_top - stroke_width / 3, width, 5 * stroke_width / 3);
264 Image word_in_xheight = pix.copy();
265 pixClearInRect(word_in_xheight, box_to_clear);
266 // Also clear any pixels which are below shirorekha_bottom + some leeway.
267 // The leeway is set to xheight if the information is available, else it is a
268 // multiplier applied to the stroke width.
269 int leeway_to_keep = stroke_width * 3;
270 if (xheight != kUnspecifiedXheight) {
271 // This is because the xheight-region typically includes the shiro-rekha
272 // inside it, i.e., the top of the xheight range corresponds to the top of
273 // shiro-rekha.
274 leeway_to_keep = xheight - stroke_width;
275 }
276 auto y = shirorekha_bottom + leeway_to_keep;
// The same Box object is reused for the lower region: only its y and height
// are given new values here, the other two arguments are passed as -1.
277 boxSetGeometry(box_to_clear, -1, y, -1, height - y);
278 pixClearInRect(word_in_xheight, box_to_clear);
279 boxDestroy(&box_to_clear);
280
281 PixelHistogram vert_hist;
282 vert_hist.ConstructVerticalCountHist(word_in_xheight);
283 word_in_xheight.destroy();
284
285 // If the number of black pixel in any column of the image is less than a
286 // fraction of the stroke width, treat it as noise / a stray mark. Perform
287 // these changes inside the vert_hist data itself, as that is used later on as
288 // a bit vector for the final split decision at every column.
289 for (int i = 0; i < width; ++i) {
290 if (vert_hist.hist()[i] <= stroke_width / 4) {
291 vert_hist.hist()[i] = 0;
292 } else {
293 vert_hist.hist()[i] = 1;
294 }
295 }
296 // In order to split the line at any point, we make sure that the width of the
297 // gap is at least half the stroke width. The run of occupied columns seen
// since the last emitted split (cur_component_width) must reach the same
// minimum.
298 int i = 0;
299 int cur_component_width = 0;
300 while (i < width) {
301 if (!vert_hist.hist()[i]) {
// j measures the length of the run of empty columns starting at column i.
302 int j = 0;
303 while (i + j < width && !vert_hist.hist()[i + j]) {
304 ++j;
305 }
306 if (j >= stroke_width / 2 && cur_component_width >= stroke_width / 2) {
307 // Perform a shiro-rekha split. The intervening region lies from i to
308 // i+j-1.
309 // A minimal single-pixel split makes the estimation of intra- and
310 // inter-word spacing easier during page layout analysis,
311 // whereas a maximal split may be needed for OCR, depending on
312 // how the engine was trained.
313 bool minimal_split = (split_strategy == MINIMAL_SPLIT);
314 int split_width = minimal_split ? 1 : j;
315 int split_left = minimal_split ? i + (j / 2) - (split_width / 2) : i;
// A minimal split is not emitted for a gap that touches the left or right
// edge of the word image.
316 if (!minimal_split || (i != 0 && i + j != width)) {
317 Box *box_to_clear =
318 boxCreate(word_left + split_left, word_top + shirorekha_top - stroke_width / 3,
319 split_width, 5 * stroke_width / 3);
320 if (box_to_clear) {
// The Boxa takes its own L_CLONE handle, so the local one is destroyed below.
321 boxaAddBox(regions_to_clear, box_to_clear, L_CLONE);
322 // Mark this in the debug image if needed.
// NOTE(review): listing line 323 is absent from this extract. The brace after
// the pixRenderBoxArb call has no visible opener, so a guard (presumably on
// devanagari_split_debugimage) appears to be missing - confirm.
324 pixRenderBoxArb(debug_image_, box_to_clear, 1, 128, 255, 128);
325 }
326 boxDestroy(&box_to_clear);
327 cur_component_width = 0;
328 }
329 }
330 }
331 i += j;
332 } else {
333 ++i;
334 ++cur_component_width;
335 }
336 }
337}
338
339// Refreshes the words in the segmentation block list by using blobs in the
340// input block list.
341// The segmentation block list must be set.
343 // The segmentation block list must have been specified.
344 ASSERT_HOST(segmentation_block_list_);
346 tprintf("Before refreshing blobs:\n");
347 PrintSegmentationStats(segmentation_block_list_);
348 tprintf("New Blobs found: %d\n", new_blobs->length());
349 }
350
351 C_BLOB_LIST not_found_blobs;
353 segmentation_block_list_, new_blobs,
354 ((devanagari_split_debugimage && debug_image_) ? &not_found_blobs : nullptr));
355
357 tprintf("After refreshing blobs:\n");
358 PrintSegmentationStats(segmentation_block_list_);
359 }
360 if (devanagari_split_debugimage && debug_image_) {
361 // Plot out the original blobs for which no match was found in the new
362 // all_blobs list.
363 C_BLOB_IT not_found_it(&not_found_blobs);
364 for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); not_found_it.forward()) {
365 C_BLOB *not_found = not_found_it.data();
366 TBOX not_found_box = not_found->bounding_box();
367 Box *box_to_plot = GetBoxForTBOX(not_found_box);
368 pixRenderBoxArb(debug_image_, box_to_plot, 1, 255, 0, 255);
369 boxDestroy(&box_to_plot);
370 }
371
372 // Plot out the blobs unused from all blobs.
373 C_BLOB_IT all_blobs_it(new_blobs);
374 for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); all_blobs_it.forward()) {
375 C_BLOB *a_blob = all_blobs_it.data();
376 Box *box_to_plot = GetBoxForTBOX(a_blob->bounding_box());
377 pixRenderBoxArb(debug_image_, box_to_plot, 3, 0, 127, 0);
378 boxDestroy(&box_to_plot);
379 }
380 }
381}
382
383// Returns a new box object for the corresponding TBOX, based on the original
384// image's coordinate system.
385Box *ShiroRekhaSplitter::GetBoxForTBOX(const TBOX &tbox) const {
386 return boxCreate(tbox.left(), pixGetHeight(orig_pix_) - tbox.top() - 1, tbox.width(),
387 tbox.height());
388}
389
390// This method returns the computed mode-height of blobs in the pix.
391// It also prunes very small blobs from calculation.
393 Boxa *boxa = pixConnComp(pix, nullptr, 8);
394 STATS heights(0, pixGetHeight(pix) - 1);
395 heights.clear();
396 for (int i = 0; i < boxaGetCount(boxa); ++i) {
397 Box *box = boxaGetBox(boxa, i, L_CLONE);
398 l_int32 x, y, w, h;
399 boxGetGeometry(box, &x, &y, &w, &h);
400 if (h >= 3 || w >= 3) {
401 heights.add(h, 1);
402 }
403 boxDestroy(&box);
404 }
405 boxaDestroy(&boxa);
406 return heights.mode();
407}
408
409// This method returns y-extents of the shiro-rekha computed from the input
410// word image.
411void ShiroRekhaSplitter::GetShiroRekhaYExtents(Image word_pix, int *shirorekha_top,
412 int *shirorekha_bottom, int *shirorekha_ylevel) {
413 // Compute a histogram from projecting the word on a vertical line.
414 PixelHistogram hist_horiz;
415 hist_horiz.ConstructHorizontalCountHist(word_pix);
416 // Get the ylevel where the top-line exists. This is basically the global
417 // maxima in the horizontal histogram.
418 int topline_onpixel_count = 0;
419 int topline_ylevel = hist_horiz.GetHistogramMaximum(&topline_onpixel_count);
420
421 // Get the upper and lower extents of the shiro rekha.
422 int thresh = (topline_onpixel_count * 70) / 100;
423 int ulimit = topline_ylevel;
424 int llimit = topline_ylevel;
425 while (ulimit > 0 && hist_horiz.hist()[ulimit] >= thresh) {
426 --ulimit;
427 }
428 while (llimit < pixGetHeight(word_pix) && hist_horiz.hist()[llimit] >= thresh) {
429 ++llimit;
430 }
431
432 if (shirorekha_top) {
433 *shirorekha_top = ulimit;
434 }
435 if (shirorekha_bottom) {
436 *shirorekha_bottom = llimit;
437 }
438 if (shirorekha_ylevel) {
439 *shirorekha_ylevel = topline_ylevel;
440 }
441}
442
443// This method returns the global-maxima for the histogram. The frequency of
444// the global maxima is returned in count, if specified.
446 int best_value = 0;
447 for (int i = 0; i < length_; ++i) {
448 if (hist_[i] > hist_[best_value]) {
449 best_value = i;
450 }
451 }
452 if (count) {
453 *count = hist_[best_value];
454 }
455 return best_value;
456}
457
458// Methods to construct histograms from images.
460 Clear();
461 int width = pixGetWidth(pix);
462 int height = pixGetHeight(pix);
463 hist_ = new int[width];
464 length_ = width;
465 int wpl = pixGetWpl(pix);
466 l_uint32 *data = pixGetData(pix);
467 for (int i = 0; i < width; ++i) {
468 hist_[i] = 0;
469 }
470 for (int i = 0; i < height; ++i) {
471 l_uint32 *line = data + i * wpl;
472 for (int j = 0; j < width; ++j) {
473 if (GET_DATA_BIT(line, j)) {
474 ++(hist_[j]);
475 }
476 }
477 }
478}
479
481 Clear();
482 Numa *counts = pixCountPixelsByRow(pix, nullptr);
483 length_ = numaGetCount(counts);
484 hist_ = new int[length_];
485 for (int i = 0; i < length_; ++i) {
486 l_int32 val = 0;
487 numaGetIValue(counts, i, &val);
488 hist_[i] = val;
489 }
490 numaDestroy(&counts);
491}
492
493} // namespace tesseract.
#define BOOL_VAR(name, val, comment)
Definition: params.h:360
#define INT_VAR(name, val, comment)
Definition: params.h:357
#define ASSERT_HOST(x)
Definition: errcode.h:54
@ TBOX
const double y
int * count
bool devanagari_split_debugimage
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
@ baseline
Definition: mfoutline.h:53
void RefreshWordBlobsFromNewBlobs(BLOCK_LIST *block_list, C_BLOB_LIST *new_blobs, C_BLOB_LIST *not_found_blobs)
Definition: ocrblock.cpp:474
void PrintSegmentationStats(BLOCK_LIST *block_list)
Definition: ocrblock.cpp:407
void AddPix(const Image pix, const char *caption)
Definition: debugpixa.h:32
Image copy() const
Definition: image.cpp:28
Image clone() const
Definition: image.cpp:24
void destroy()
Definition: image.cpp:32
TDimension left() const
Definition: rect.h:82
TDimension height() const
Definition: rect.h:118
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
int32_t mode() const
Definition: statistc.cpp:112
TBOX bounding_box() const
Definition: stepblob.cpp:250
int GetHistogramMaximum(int *count) const
void ConstructHorizontalCountHist(Image pix)
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)