tesseract v5.3.3.20231005
pageiterator.h
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
2// File: pageiterator.h
3// Description: Iterator for tesseract page structure that avoids using
4// tesseract internal data structures.
5// Author: Ray Smith
6//
7// (C) Copyright 2010, Google Inc.
8// Licensed under the Apache License, Version 2.0 (the "License");
9// you may not use this file except in compliance with the License.
10// You may obtain a copy of the License at
11// http://www.apache.org/licenses/LICENSE-2.0
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17
18#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_
19#define TESSERACT_CCMAIN_PAGEITERATOR_H_
20
21#include "export.h"
22#include "publictypes.h"
23
24struct Pix;
25struct Pta;
26
27namespace tesseract {
28
29struct BlamerBundle;
30class C_BLOB_IT;
31class PAGE_RES;
32class PAGE_RES_IT;
33class WERD;
34
35class Tesseract;
36
51public:
66 PageIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale,
67 int scaled_yres, int rect_left, int rect_top, int rect_width,
68 int rect_height);
69 virtual ~PageIterator();
70
77 PageIterator(const PageIterator &src);
78 const PageIterator &operator=(const PageIterator &src);
79
81 bool PositionedAtSameWord(const PAGE_RES_IT *other) const;
82
83 // ============= Moving around within the page ============.
84
89 virtual void Begin();
90
96 virtual void RestartParagraph();
97
102 bool IsWithinFirstTextlineOfParagraph() const;
103
109 virtual void RestartRow();
110
122 virtual bool Next(PageIteratorLevel level);
123
137 virtual bool IsAtBeginningOf(PageIteratorLevel level) const;
138
155 virtual bool IsAtFinalElement(PageIteratorLevel level,
156 PageIteratorLevel element) const;
157
164 int Cmp(const PageIterator &other) const;
165
166 // ============= Accessing data ==============.
167 // Coordinate system:
168 // Integer coordinates are at the cracks between the pixels.
169 // The top-left corner of the top-left pixel in the image is at (0,0).
170 // The bottom-right corner of the bottom-right pixel in the image is at
171 // (width, height).
172 // Every bounding box goes from the top-left of the top-left contained
173 // pixel to the bottom-right of the bottom-right contained pixel, so
174 // the bounding box of the single top-left pixel in the image is:
175 // (0,0)->(1,1).
176 // If an image rectangle has been set in the API, then returned coordinates
177 // relate to the original (full) image, rather than the rectangle.
178
// Stores the caller's two flags in the members include_upper_dots_ and
// include_lower_dots_. The call performs only these two assignments; nothing
// else is updated or recomputed here.
// NOTE(review): the declarations of these members and the code that reads
// them are not visible in this listing -- presumably the bounding-box queries
// consult them to decide whether upper/lower dots are part of a box; confirm
// against the full header and implementation.
188 void SetBoundingBoxComponents(bool include_upper_dots,
189 bool include_lower_dots) {
190 include_upper_dots_ = include_upper_dots;
191 include_lower_dots_ = include_lower_dots;
192 }
193
203 bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right,
204 int *bottom) const;
205 bool BoundingBox(PageIteratorLevel level, int padding, int *left, int *top,
206 int *right, int *bottom) const;
212 bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top,
213 int *right, int *bottom) const;
214
216 bool Empty(PageIteratorLevel level) const;
217
222 PolyBlockType BlockType() const;
223
231 Pta *BlockPolygon() const;
232
239 Pix *GetBinaryImage(PageIteratorLevel level) const;
240
252 Pix *GetImage(PageIteratorLevel level, int padding, Pix *original_img,
253 int *left, int *top) const;
254
261 bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2,
262 int *y2) const;
263
264 // Returns the attributes of the current row.
265 void RowAttributes(float *row_height, float *descenders,
266 float *ascenders) const;
267
276 void Orientation(tesseract::Orientation *orientation,
277 tesseract::WritingDirection *writing_direction,
278 tesseract::TextlineOrder *textline_order,
279 float *deskew_angle) const;
280
309 void ParagraphInfo(tesseract::ParagraphJustification *justification,
310 bool *is_list_item, bool *is_crown,
311 int *first_line_indent) const;
312
313 // If the current WERD_RES (it_->word()) is not nullptr, sets the BlamerBundle
314 // of the current word to the given pointer (takes ownership of the pointer)
315 // and returns true.
316 // Can only be used when iterating on the word level.
317 bool SetWordBlamerBundle(BlamerBundle *blamer_bundle);
318
319protected:
324 void BeginWord(int offset);
325
349 C_BLOB_IT *cblob_it_;
360};
361
362} // namespace tesseract.
363
364#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_
ParagraphJustification
Definition: publictypes.h:246
void SetBoundingBoxComponents(bool include_upper_dots, bool include_lower_dots)
Definition: pageiterator.h:188
#define TESS_API
Definition: export.h:32