tesseract v5.3.3.20231005
ocrpara.h
Go to the documentation of this file.
1
2// File: ocrpara.h
3// Description: OCR Paragraph Output Type
4// Author: David Eger
5//
6// (C) Copyright 2010, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#ifndef TESSERACT_CCSTRUCT_OCRPARA_H_
20#define TESSERACT_CCSTRUCT_OCRPARA_H_
21
22#include "elst.h"
23
25
26namespace tesseract {
27
28class ParagraphModel;
29
30struct PARA : public ELIST_LINK {
31public:
33 : model(nullptr)
34 , is_list_item(false)
36 , has_drop_cap(false) {}
37
38 // We do not own the model, we just reference it.
39 // model may be nullptr if there is not a good model for this paragraph.
41
43
44 // The first paragraph on a page often lacks a first line indent, but should
45 // still be modeled by the same model as other body text paragraphs on the
46 // page.
48
49 // Does this paragraph begin with a drop cap?
51};
52
54
55// A geometric model of paragraph indentation and alignment.
56//
57// Measurements are in pixels. The meaning of the integer arguments changes
58// depending upon the value of justification. Distances less than or equal
59// to tolerance apart we take as "equivalent" for the purpose of model
60// matching, and in the examples below, we assume tolerance is zero.
61//
62// justification = LEFT:
63// margin the "ignored" margin to the left block edge.
64// first_indent indent from the left margin to a typical first text line.
65// body_indent indent from the left margin of a typical body text line.
66//
67// justification = RIGHT:
68// margin the "ignored" margin to the right block edge.
69// first_indent indent from the right margin to a typical first text line.
70// body_indent indent from the right margin of a typical body text line.
71//
72// justification = CENTER:
73// margin ignored
74// first_indent ignored
75// body_indent ignored
76//
77// ====== Extended example, assuming each letter is ten pixels wide: =======
78//
79// +--------------------------------+
80// | Awesome | ParagraphModel(CENTER, 0, 0, 0)
81// | Centered Title |
82// | Paragraph Detection |
83// | OCR TEAM |
84// | 10 November 2010 |
85// | |
86// | Look here, I have a paragraph.| ParagraphModel(LEFT, 0, 20, 0)
87// |This paragraph starts at the top|
88// |of the page and takes 3 lines. |
89// | Here I have a second paragraph| ParagraphModel(LEFT, 0, 20, 0)
90// |which indicates that the first |
91// |paragraph is not a continuation |
92// |from a previous page, as it is |
93// |indented just like this second |
94// |paragraph. |
95// | Here is a block quote. It | ParagraphModel(LEFT, 30, 0, 0)
96// | looks like the prior text |
97// | but it is indented more |
98// | and is fully justified. |
99// | So how does one deal with | ParagraphModel(LEFT, 0, 20, 0)
100// |centered text, block quotes, |
101// |normal paragraphs, and lists |
102// |like what follows? |
103// |1. Make a plan. | ParagraphModel(LEFT, 0, 0, 30)
104// |2. Use a heuristic, for example,| ParagraphModel(LEFT, 0, 0, 30)
105// | looking for lines where the |
106// | first word of the next line |
107// | would fit on the previous |
108// | line. |
109// |8. Try to implement the plan in | ParagraphModel(LEFT, 0, 0, 30)
110// | Python and try it out. |
111// |4. Determine how to fix the | ParagraphModel(LEFT, 0, 0, 30)
112// | mistakes. |
113// |5. Repeat. | ParagraphModel(LEFT, 0, 0, 30)
114// | For extra painful penalty work| ParagraphModel(LEFT, 0, 20, 0)
115// |you can try to identify source |
116// |code. Ouch! |
117// +--------------------------------+
119public:
120 ParagraphModel(tesseract::ParagraphJustification justification, int margin, int first_indent,
121 int body_indent, int tolerance)
122 : justification_(justification)
123 , margin_(margin)
124 , first_indent_(first_indent)
125 , body_indent_(body_indent)
126 , tolerance_(tolerance) {
127 // Make sure at least one of {first_indent_, body_indent_} is 0 by moving the smaller of the two indents into margin_.
128 int added_margin = first_indent;
129 if (body_indent < added_margin) {
130 added_margin = body_indent;
131 }
132 margin_ += added_margin;
133 first_indent_ -= added_margin;
134 body_indent_ -= added_margin;
135 }
136
138 : justification_(tesseract::JUSTIFICATION_UNKNOWN)
139 , margin_(0)
140 , first_indent_(0)
141 , body_indent_(0)
142 , tolerance_(0) {}
143
144 // ValidFirstLine() and ValidBodyLine() take arguments describing a text line
145 // in a block of text which we are trying to model:
146 // lmargin, lindent: these add up to the distance from the leftmost ink
147 // in the text line to the surrounding text block's left
148 // edge.
149 // rmargin, rindent: these add up to the distance from the rightmost ink
150 // in the text line to the surrounding text block's right
151 // edge.
152 // The caller determines the division between "margin" and "indent", which
153 // only actually affects whether we think the line may be centered.
154 //
155 // If the amount of whitespace matches the amount of whitespace expected on
156 // the relevant side of the line (within tolerance_) we say it matches.
157
158 // Return whether a given text line could be a first paragraph line according
159 // to this paragraph model.
160 bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const;
161
162 // Return whether a given text line could be a body paragraph line according
163 // to this paragraph model.
164 bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const;
165
167 return justification_;
168 }
169 int margin() const { // Stored margin; the 5-argument constructor adds min(first_indent, body_indent) to the margin it was given.
170 return margin_;
171 }
172 int first_indent() const { // Stored first-line indent; the 5-argument constructor subtracts min(first_indent, body_indent) from it.
173 return first_indent_;
174 }
175 int body_indent() const { // Stored body-line indent; the 5-argument constructor subtracts min(first_indent, body_indent) from it.
176 return body_indent_;
177 }
178 int tolerance() const { // Stored tolerance; the 5-argument constructor keeps the value it was given unchanged.
179 return tolerance_;
180 }
181 bool is_flush() const { // True only for LEFT or RIGHT justification when first_indent_ and body_indent_ differ by at most tolerance_.
182 return (justification_ == tesseract::JUSTIFICATION_LEFT ||
183 justification_ == tesseract::JUSTIFICATION_RIGHT) &&
184 abs(first_indent_ - body_indent_) <= tolerance_;
185 }
186
187 // Return whether this model is likely to agree with the other model on most
188 // of the paragraphs they mark.
189 bool Comparable(const ParagraphModel &other) const;
190
191 std::string ToString() const;
192
193private:
195 int margin_;
196 int first_indent_;
197 int body_indent_;
198 int tolerance_;
199};
200
201} // namespace tesseract
202
203#endif // TESSERACT_CCSTRUCT_OCRPARA_H_
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:803
ParagraphJustification
Definition: publictypes.h:246
@ JUSTIFICATION_LEFT
Definition: publictypes.h:248
@ JUSTIFICATION_UNKNOWN
Definition: publictypes.h:247
@ JUSTIFICATION_RIGHT
Definition: publictypes.h:250
bool ValidBodyLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool ValidFirstLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
const ParagraphModel * model
Definition: ocrpara.h:40
bool has_drop_cap
Definition: ocrpara.h:50
bool is_list_item
Definition: ocrpara.h:42
bool is_very_first_or_continuation
Definition: ocrpara.h:47
tesseract::ParagraphJustification justification() const
Definition: ocrpara.h:166
bool is_flush() const
Definition: ocrpara.h:181
int body_indent() const
Definition: ocrpara.h:175
int tolerance() const
Definition: ocrpara.h:178
ParagraphModel(tesseract::ParagraphJustification justification, int margin, int first_indent, int body_indent, int tolerance)
Definition: ocrpara.h:120
int first_indent() const
Definition: ocrpara.h:172
#define TESS_API
Definition: export.h:32