tesseract v5.3.3.20231005
thresholder.cpp
Go to the documentation of this file.
1
2// File: thresholder.cpp
3// Description: Base API for thresholding images in tesseract.
4// Author: Ray Smith
5//
6// (C) Copyright 2008, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19// Include automatically generated configuration file
20#ifdef HAVE_CONFIG_H
21# include "config_auto.h"
22#endif
23
24#include "otsuthr.h"
25#include "thresholder.h"
26#include "tprintf.h" // for tprintf
27
28#if defined(USE_OPENCL)
29# include "openclwrapper.h" // for OpenclDevice
30#endif
31
32#include <allheaders.h>
33#include <tesseract/baseapi.h> // for api->GetIntVariable()
34
35#include <algorithm> // for std::max, std::min
36#include <cstdint> // for uint32_t
37#include <cstring>
38#include <tuple>
39
40namespace tesseract {
41
43 : pix_(nullptr)
44 , image_width_(0)
45 , image_height_(0)
46 , pix_channels_(0)
47 , pix_wpl_(0)
48 , scale_(1)
49 , yres_(300)
50 , estimated_res_(300) {
51 SetRectangle(0, 0, 0, 0);
52}
53
55 Clear();
56}
57
58// Destroy the Pix if there is one, freeing memory.
60 pix_.destroy();
61}
62
63// Return true if no image has been set.
65 return pix_ == nullptr;
66}
67
68// SetImage makes a copy of all the image data, so it may be deleted
69// immediately after this call.
70// Greyscale of 8 and color of 24 or 32 bits per pixel may be given.
71// Palette color images will not work properly and must be converted to
72// 24 bit.
73// Binary images of 1 bit per pixel may also be given but they must be
74// byte packed with the MSB of the first byte being the first pixel, and a
75// one pixel is WHITE. For binary images set bytes_per_pixel=0.
76void ImageThresholder::SetImage(const unsigned char *imagedata, int width, int height,
77 int bytes_per_pixel, int bytes_per_line) {
78 int bpp = bytes_per_pixel * 8;
79 if (bpp == 0) {
80 bpp = 1;
81 }
82 Image pix = pixCreate(width, height, bpp == 24 ? 32 : bpp);
83 l_uint32 *data = pixGetData(pix);
84 int wpl = pixGetWpl(pix);
85 switch (bpp) {
86 case 1:
87 for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
88 for (int x = 0; x < width; ++x) {
89 if (imagedata[x / 8] & (0x80 >> (x % 8))) {
90 CLEAR_DATA_BIT(data, x);
91 } else {
92 SET_DATA_BIT(data, x);
93 }
94 }
95 }
96 break;
97
98 case 8:
99 // Greyscale just copies the bytes in the right order.
100 for (int y = 0; y < height; ++y, data += wpl, imagedata += bytes_per_line) {
101 for (int x = 0; x < width; ++x) {
102 SET_DATA_BYTE(data, x, imagedata[x]);
103 }
104 }
105 break;
106
107 case 24:
108 // Put the colors in the correct places in the line buffer.
109 for (int y = 0; y < height; ++y, imagedata += bytes_per_line) {
110 for (int x = 0; x < width; ++x, ++data) {
111 SET_DATA_BYTE(data, COLOR_RED, imagedata[3 * x]);
112 SET_DATA_BYTE(data, COLOR_GREEN, imagedata[3 * x + 1]);
113 SET_DATA_BYTE(data, COLOR_BLUE, imagedata[3 * x + 2]);
114 }
115 }
116 break;
117
118 case 32:
119 // Maintain byte order consistency across different endianness.
120 for (int y = 0; y < height; ++y, imagedata += bytes_per_line, data += wpl) {
121 for (int x = 0; x < width; ++x) {
122 data[x] = (imagedata[x * 4] << 24) | (imagedata[x * 4 + 1] << 16) |
123 (imagedata[x * 4 + 2] << 8) | imagedata[x * 4 + 3];
124 }
125 }
126 break;
127
128 default:
129 tprintf("Cannot convert RAW image to Pix with bpp = %d\n", bpp);
130 }
131 SetImage(pix);
132 pix.destroy();
133}
134
135// Store the coordinates of the rectangle to process for later use.
136// Doesn't actually do any thresholding.
137void ImageThresholder::SetRectangle(int left, int top, int width, int height) {
138 rect_left_ = left;
139 rect_top_ = top;
140 rect_width_ = width;
141 rect_height_ = height;
142}
143
144// Get enough parameters to be able to rebuild bounding boxes in the
145// original image (not just within the rectangle).
146// Left and top are enough with top-down coordinates, but
147// the height of the rectangle and the image are needed for bottom-up.
148void ImageThresholder::GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth,
149 int *imageheight) {
150 *left = rect_left_;
151 *top = rect_top_;
152 *width = rect_width_;
153 *height = rect_height_;
154 *imagewidth = image_width_;
155 *imageheight = image_height_;
156}
157
158// Pix vs raw, which to use? Pix is the preferred input for efficiency,
159// since raw buffers are copied.
160// SetImage for Pix clones its input, so the source pix may be pixDestroyed
161// immediately after, but may not go away until after the Thresholder has
162// finished with it.
164 if (pix_ != nullptr) {
165 pix_.destroy();
166 }
167 Image src = pix;
168 int depth;
169 pixGetDimensions(src, &image_width_, &image_height_, &depth);
170 // Convert the image as necessary so it is one of binary, plain RGB, or
171 // 8 bit with no colormap. Guarantee that we always end up with our own copy,
172 // not just a clone of the input.
173 if (depth > 1 && depth < 8) {
174 pix_ = pixConvertTo8(src, false);
175 } else {
176 pix_ = src.copy();
177 }
178 depth = pixGetDepth(pix_);
179 pix_channels_ = depth / 8;
180 pix_wpl_ = pixGetWpl(pix_);
181 scale_ = 1;
182 estimated_res_ = yres_ = pixGetYRes(pix_);
183 Init();
184}
185
186std::tuple<bool, Image, Image, Image> ImageThresholder::Threshold(
187 TessBaseAPI *api,
188 ThresholdMethod method) {
189 Image pix_binary = nullptr;
190 Image pix_thresholds = nullptr;
191
192 if (pix_channels_ == 0) {
193 // We have a binary image, but it still has to be copied, as this API
194 // allows the caller to modify the output.
195 Image original = GetPixRect();
196 pix_binary = original.copy();
197 original.destroy();
198 return std::make_tuple(true, nullptr, pix_binary, nullptr);
199 }
200
201 auto pix_grey = GetPixRectGrey();
202
203 int r;
204
205 l_int32 pix_w, pix_h;
206 pixGetDimensions(pix_grey, &pix_w, &pix_h, nullptr);
207
208 bool thresholding_debug;
209 api->GetBoolVariable("thresholding_debug", &thresholding_debug);
210 if (thresholding_debug) {
211 tprintf("\nimage width: %d height: %d ppi: %d\n", pix_w, pix_h, yres_);
212 }
213
214 if (method == ThresholdMethod::Sauvola) {
215 int window_size;
216 double window_size_factor;
217 api->GetDoubleVariable("thresholding_window_size", &window_size_factor);
218 window_size = window_size_factor * yres_;
219 window_size = std::max(7, window_size);
220 window_size = std::min(pix_w < pix_h ? pix_w - 3 : pix_h - 3, window_size);
221 int half_window_size = window_size / 2;
222
223 // factor for image division into tiles; >= 1
224 l_int32 nx, ny;
225 // tiles size will be approx. 250 x 250 pixels
226 nx = std::max(1, (pix_w + 125) / 250);
227 ny = std::max(1, (pix_h + 125) / 250);
228 auto xrat = pix_w / nx;
229 auto yrat = pix_h / ny;
230 if (xrat < half_window_size + 2) {
231 nx = pix_w / (half_window_size + 2);
232 }
233 if (yrat < half_window_size + 2) {
234 ny = pix_h / (half_window_size + 2);
235 }
236
237 double kfactor;
238 api->GetDoubleVariable("thresholding_kfactor", &kfactor);
239 kfactor = std::max(0.0, kfactor);
240
241 if (thresholding_debug) {
242 tprintf("window size: %d kfactor: %.3f nx:%d ny: %d\n", window_size, kfactor, nx, ny);
243 }
244
245 r = pixSauvolaBinarizeTiled(pix_grey, half_window_size, kfactor, nx, ny,
246 (PIX**)pix_thresholds,
247 (PIX**)pix_binary);
248 } else { // if (method == ThresholdMethod::LeptonicaOtsu)
249 int tile_size;
250 double tile_size_factor;
251 api->GetDoubleVariable("thresholding_tile_size", &tile_size_factor);
252 tile_size = tile_size_factor * yres_;
253 tile_size = std::max(16, tile_size);
254
255 int smooth_size;
256 double smooth_size_factor;
257 api->GetDoubleVariable("thresholding_smooth_kernel_size",
258 &smooth_size_factor);
259 smooth_size_factor = std::max(0.0, smooth_size_factor);
260 smooth_size = smooth_size_factor * yres_;
261 int half_smooth_size = smooth_size / 2;
262
263 double score_fraction;
264 api->GetDoubleVariable("thresholding_score_fraction", &score_fraction);
265
266 if (thresholding_debug) {
267 tprintf("tile size: %d smooth_size: %d score_fraction: %.2f\n", tile_size, smooth_size, score_fraction);
268 }
269
270 r = pixOtsuAdaptiveThreshold(pix_grey, tile_size, tile_size,
271 half_smooth_size, half_smooth_size,
272 score_fraction,
273 (PIX**)pix_thresholds,
274 (PIX**)pix_binary);
275 }
276
277 bool ok = (r == 0);
278 return std::make_tuple(ok, pix_grey, pix_binary, pix_thresholds);
279}
280
281// Threshold the source image as efficiently as possible to the output Pix.
282// Creates a Pix and sets pix to point to the resulting pointer.
283// Caller must use pixDestroy to free the created Pix.
286 if (image_width_ > INT16_MAX || image_height_ > INT16_MAX) {
287 tprintf("Image too large: (%d, %d)\n", image_width_, image_height_);
288 return false;
289 }
290 Image original = GetPixRect();
291 if (pix_channels_ == 0) {
292 // We have a binary image, but it still has to be copied, as this API
293 // allows the caller to modify the output.
294 *pix = original.copy();
295 } else {
296 if (pixGetColormap(original)) {
297 Image tmp;
298 Image without_cmap =
299 pixRemoveColormap(original, REMOVE_CMAP_BASED_ON_SRC);
300 int depth = pixGetDepth(without_cmap);
301 if (depth > 1 && depth < 8) {
302 tmp = pixConvertTo8(without_cmap, false);
303 } else {
304 tmp = without_cmap.copy();
305 }
306 without_cmap.destroy();
307 OtsuThresholdRectToPix(tmp, pix);
308 tmp.destroy();
309 } else {
311 }
312 }
313 original.destroy();
314 return true;
315}
316
317// Gets a pix that contains an 8 bit threshold value at each pixel. The
318// returned pix may be an integer reduction of the binary image such that
319// the scale factor may be inferred from the ratio of the sizes, even down
320// to the extreme of a 1x1 pixel thresholds image.
321// Ideally the 8 bit threshold should be the exact threshold used to generate
322// the binary image in ThresholdToPix, but this is not a hard constraint.
323// Returns nullptr if the input is binary. PixDestroy after use.
325 if (IsBinary()) {
326 return nullptr;
327 }
328 Image pix_grey = GetPixRectGrey();
329 int width = pixGetWidth(pix_grey);
330 int height = pixGetHeight(pix_grey);
331 std::vector<int> thresholds;
332 std::vector<int> hi_values;
333 OtsuThreshold(pix_grey, 0, 0, width, height, thresholds, hi_values);
334 pix_grey.destroy();
335 Image pix_thresholds = pixCreate(width, height, 8);
336 int threshold = thresholds[0] > 0 ? thresholds[0] : 128;
337 pixSetAllArbitrary(pix_thresholds, threshold);
338 return pix_thresholds;
339}
340
341// Common initialization shared between SetImage methods.
344}
345
346// Get a clone/copy of the source image rectangle.
347// The returned Pix must be pixDestroyed.
348// This function will be used in the future by the page layout analysis, and
349// the layout analysis that uses it will only be available with Leptonica,
350// so there is no raw equivalent.
352 if (IsFullImage()) {
353 // Just clone the whole thing.
354 return pix_.clone();
355 } else {
356 // Crop to the given rectangle.
357 Box *box = boxCreate(rect_left_, rect_top_, rect_width_, rect_height_);
358 Image cropped = pixClipRectangle(pix_, box, nullptr);
359 boxDestroy(&box);
360 return cropped;
361 }
362}
363
364// Get a clone/copy of the source image rectangle, reduced to greyscale,
365// and at the same resolution as the output binary.
366// The returned Pix must be pixDestroyed.
367// Provided to the classifier to extract features from the greyscale image.
369 auto pix = GetPixRect(); // May have to be reduced to grey.
370 int depth = pixGetDepth(pix);
371 if (depth != 8 || pixGetColormap(pix)) {
372 if (depth == 24) {
373 auto tmp = pixConvert24To32(pix);
374 pix.destroy();
375 pix = tmp;
376 }
377 auto result = pixConvertTo8(pix, false);
378 pix.destroy();
379 return result;
380 }
381 return pix;
382}
383
384// Otsu thresholds the rectangle, taking the rectangle from *this.
386 std::vector<int> thresholds;
387 std::vector<int> hi_values;
388
389 int num_channels = OtsuThreshold(src_pix, rect_left_, rect_top_, rect_width_, rect_height_,
390 thresholds, hi_values);
391 // only use opencl if compiled w/ OpenCL and selected device is opencl
392#ifdef USE_OPENCL
393 OpenclDevice od;
394 if (num_channels == 4 && od.selectedDeviceIsOpenCL() && rect_top_ == 0 && rect_left_ == 0) {
395 od.ThresholdRectToPixOCL((unsigned char *)pixGetData(src_pix), num_channels,
396 pixGetWpl(src_pix) * 4, &thresholds[0], &hi_values[0], out_pix /*pix_OCL*/,
398 } else {
399#endif
400 ThresholdRectToPix(src_pix, num_channels, thresholds, hi_values, out_pix);
401#ifdef USE_OPENCL
402 }
403#endif
404}
405
409// arrays and also the bytes per pixel in src_pix.
410void ImageThresholder::ThresholdRectToPix(Image src_pix, int num_channels, const std::vector<int> &thresholds,
411 const std::vector<int> &hi_values, Image *pix) const {
412 *pix = pixCreate(rect_width_, rect_height_, 1);
413 uint32_t *pixdata = pixGetData(*pix);
414 int wpl = pixGetWpl(*pix);
415 int src_wpl = pixGetWpl(src_pix);
416 uint32_t *srcdata = pixGetData(src_pix);
417 pixSetXRes(*pix, pixGetXRes(src_pix));
418 pixSetYRes(*pix, pixGetYRes(src_pix));
419 for (int y = 0; y < rect_height_; ++y) {
420 const uint32_t *linedata = srcdata + (y + rect_top_) * src_wpl;
421 uint32_t *pixline = pixdata + y * wpl;
422 for (int x = 0; x < rect_width_; ++x) {
423 bool white_result = true;
424 for (int ch = 0; ch < num_channels; ++ch) {
425 int pixel = GET_DATA_BYTE(linedata, (x + rect_left_) * num_channels + ch);
426 if (hi_values[ch] >= 0 && (pixel > thresholds[ch]) == (hi_values[ch] == 0)) {
427 white_result = false;
428 break;
429 }
430 }
431 if (white_result) {
432 CLEAR_DATA_BIT(pixline, x);
433 } else {
434 SET_DATA_BIT(pixline, x);
435 }
436 }
437 }
438}
439
440} // namespace tesseract.
const double y
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int OtsuThreshold(Image src_pix, int left, int top, int width, int height, std::vector< int > &thresholds, std::vector< int > &hi_values)
Definition: otsuthr.cpp:38
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:304
bool GetDoubleVariable(const char *name, double *value) const
Definition: baseapi.cpp:320
virtual Image GetPixRectThresholds()
int pix_wpl_
Words per line of pix_.
Definition: thresholder.h:188
virtual void GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth, int *imageheight)
bool IsFullImage() const
Return true if we are processing the full image.
Definition: thresholder.h:165
bool IsEmpty() const
Return true if no image has been set.
Definition: thresholder.cpp:64
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:76
int estimated_res_
Resolution estimate from text size.
Definition: thresholder.h:192
virtual std::tuple< bool, Image, Image, Image > Threshold(TessBaseAPI *api, ThresholdMethod method)
void SetRectangle(int left, int top, int width, int height)
virtual void Init()
Common initialization shared between SetImage methods.
void OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const
int scale_
Scale factor from original image.
Definition: thresholder.h:190
int pix_channels_
Number of 8-bit channels in pix_.
Definition: thresholder.h:187
int yres_
y pixels/inch in source image.
Definition: thresholder.h:191
int image_width_
Width of source pix_.
Definition: thresholder.h:185
virtual Image GetPixRectGrey()
void ThresholdRectToPix(Image src_pix, int num_channels, const std::vector< int > &thresholds, const std::vector< int > &hi_values, Image *pix) const
virtual bool ThresholdToPix(Image *pix)
Returns false on error.
bool IsBinary() const
Returns true if the source image is binary.
Definition: thresholder.h:84
int image_height_
Height of source pix_.
Definition: thresholder.h:186
virtual void Clear()
Destroy the Pix if there is one, freeing memory.
Definition: thresholder.cpp:59
Image copy() const
Definition: image.cpp:28
Image clone() const
Definition: image.cpp:24
void destroy()
Definition: image.cpp:32