tesseract v5.3.3.20231005
altorenderer.cpp
Go to the documentation of this file.
1// File: altorenderer.cpp
2// Description: ALTO rendering interface
3// Author: Jake Sebright
4
5// (C) Copyright 2018
6// Licensed under the Apache License, Version 2.0 (the "License");
7// you may not use this file except in compliance with the License.
8// You may obtain a copy of the License at
9// http://www.apache.org/licenses/LICENSE-2.0
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#include "errcode.h" // for ASSERT_HOST
17#ifdef _WIN32
18# include "host.h" // windows.h for MultiByteToWideChar, ...
19#endif
20#include "tprintf.h" // for tprintf
21
22#include <tesseract/baseapi.h>
23#include <tesseract/renderer.h>
24
25#include <memory>
26#include <sstream> // for std::stringstream
27
28namespace tesseract {
29
33static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level,
34 std::stringstream &alto_str) {
35 int left, top, right, bottom;
36 it->BoundingBox(level, &left, &top, &right, &bottom);
37
38 int hpos = left;
39 int vpos = top;
40 int height = bottom - top;
41 int width = right - left;
42
43 alto_str << " HPOS=\"" << hpos << "\"";
44 alto_str << " VPOS=\"" << vpos << "\"";
45 alto_str << " WIDTH=\"" << width << "\"";
46 alto_str << " HEIGHT=\"" << height << "\"";
47
48 if (level == RIL_WORD) {
49 int wc = it->Confidence(RIL_WORD);
50 alto_str << " WC=\"0." << wc << "\"";
51 } else {
52 alto_str << ">";
53 }
54}
55
60 // Delay the XML output because we need the name of the image file.
61 begin_document = true;
62 return true;
63}
64
69 if (begin_document) {
71 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
72 "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
73 "xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
74 "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
75 "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
76 "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
77 "\t<Description>\n"
78 "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
79 "\t\t<sourceImageInformation>\n"
80 "\t\t\t<fileName>");
81
83
85 "</fileName>\n"
86 "\t\t</sourceImageInformation>\n"
87 "\t\t<OCRProcessing ID=\"OCR_0\">\n"
88 "\t\t\t<ocrProcessingStep>\n"
89 "\t\t\t\t<processingSoftware>\n"
90 "\t\t\t\t\t<softwareName>tesseract ");
93 "</softwareName>\n"
94 "\t\t\t\t</processingSoftware>\n"
95 "\t\t\t</ocrProcessingStep>\n"
96 "\t\t</OCRProcessing>\n"
97 "\t</Description>\n"
98 "\t<Layout>\n");
99 begin_document = false;
100 }
101
102 const std::unique_ptr<const char[]> text(api->GetAltoText(imagenum()));
103 if (text == nullptr) {
104 return false;
105 }
106
107 AppendString(text.get());
108
109 return true;
110}
111
116 AppendString("\t</Layout>\n</alto>\n");
117
118 return true;
119}
120
122 : TessResultRenderer(outputbase, "xml"),
123 begin_document(false) {}
124
129char *TessBaseAPI::GetAltoText(int page_number) {
130 return GetAltoText(nullptr, page_number);
131}
132
137char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) {
138 if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) {
139 return nullptr;
140 }
141
142 int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
143
144 if (input_file_.empty()) {
145 SetInputName(nullptr);
146 }
147
148#ifdef _WIN32
149 // convert input name from ANSI encoding to utf-8
150 int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
151 wchar_t *uni16_str = new WCHAR[str16_len];
152 str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str, str16_len);
153 int utf8_len =
154 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, nullptr, nullptr);
155 char *utf8_str = new char[utf8_len];
156 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, nullptr, nullptr);
157 input_file_ = utf8_str;
158 delete[] uni16_str;
159 delete[] utf8_str;
160#endif
161
162 std::stringstream alto_str;
163 // Use "C" locale (needed for int values larger than 999).
164 alto_str.imbue(std::locale::classic());
165 alto_str << "\t\t<Page WIDTH=\"" << rect_width_ << "\" HEIGHT=\"" << rect_height_
166 << "\" PHYSICAL_IMG_NR=\"" << page_number << "\""
167 << " ID=\"page_" << page_number << "\">\n"
168 << "\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
169 << " WIDTH=\"" << rect_width_ << "\""
170 << " HEIGHT=\"" << rect_height_ << "\">\n";
171
172 ResultIterator *res_it = GetIterator();
173 while (!res_it->Empty(RIL_BLOCK)) {
174 if (res_it->Empty(RIL_WORD)) {
175 res_it->Next(RIL_WORD);
176 continue;
177 }
178
179 int left, top, right, bottom;
180 auto block_type = res_it->BlockType();
181
182 switch (block_type) {
183 case PT_FLOWING_IMAGE:
184 case PT_HEADING_IMAGE:
185 case PT_PULLOUT_IMAGE: {
186 // Handle all kinds of images.
187 // TODO: optionally add TYPE, for example TYPE="photo".
188 alto_str << "\t\t\t\t<Illustration ID=\"cblock_" << bcnt++ << "\"";
189 AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
190 alto_str << "</Illustration>\n";
191 res_it->Next(RIL_BLOCK);
192 continue;
193 }
194 case PT_HORZ_LINE:
195 case PT_VERT_LINE:
196 // Handle horizontal and vertical lines.
197 alto_str << "\t\t\t\t<GraphicalElement ID=\"cblock_" << bcnt++ << "\"";
198 AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
199 alto_str << "</GraphicalElement >\n";
200 res_it->Next(RIL_BLOCK);
201 continue;
202 case PT_NOISE:
203 tprintf("TODO: Please report image which triggers the noise case.\n");
204 ASSERT_HOST(false);
205 default:
206 break;
207 }
208
209 if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
210 alto_str << "\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt << "\"";
211 AddBoxToAlto(res_it, RIL_BLOCK, alto_str);
212 alto_str << "\n";
213 }
214
215 if (res_it->IsAtBeginningOf(RIL_PARA)) {
216 alto_str << "\t\t\t\t\t<TextBlock ID=\"block_" << tcnt << "\"";
217 AddBoxToAlto(res_it, RIL_PARA, alto_str);
218 alto_str << "\n";
219 }
220
221 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
222 alto_str << "\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt << "\"";
223 AddBoxToAlto(res_it, RIL_TEXTLINE, alto_str);
224 alto_str << "\n";
225 }
226
227 alto_str << "\t\t\t\t\t\t\t<String ID=\"string_" << wcnt << "\"";
228 AddBoxToAlto(res_it, RIL_WORD, alto_str);
229 alto_str << " CONTENT=\"";
230
231 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
232 bool last_word_in_tblock = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
233 bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
234
235 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
236
237 do {
238 const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
239 if (grapheme && grapheme[0] != 0) {
240 alto_str << HOcrEscape(grapheme.get()).c_str();
241 }
242 res_it->Next(RIL_SYMBOL);
243 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
244
245 alto_str << "\"/>";
246
247 wcnt++;
248
249 if (last_word_in_line) {
250 alto_str << "\n\t\t\t\t\t\t</TextLine>\n";
251 lcnt++;
252 } else {
253 int hpos = right;
254 int vpos = top;
255 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
256 int width = left - hpos;
257 alto_str << "<SP WIDTH=\"" << width << "\" VPOS=\"" << vpos << "\" HPOS=\"" << hpos
258 << "\"/>\n";
259 }
260
261 if (last_word_in_tblock) {
262 alto_str << "\t\t\t\t\t</TextBlock>\n";
263 tcnt++;
264 }
265
266 if (last_word_in_cblock) {
267 alto_str << "\t\t\t\t</ComposedBlock>\n";
268 bcnt++;
269 }
270 }
271
272 alto_str << "\t\t\t</PrintSpace>\n"
273 << "\t\t</Page>\n";
274 const std::string &text = alto_str.str();
275
276 char *result = new char[text.length() + 1];
277 strcpy(result, text.c_str());
278 delete res_it;
279 return result;
280}
281
282} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:54
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
std::string HOcrEscape(const char *text)
Definition: baseapi.cpp:2378
@ PT_PULLOUT_IMAGE
Definition: publictypes.h:63
@ PT_HEADING_IMAGE
Definition: publictypes.h:62
@ PT_HORZ_LINE
Definition: publictypes.h:64
@ PT_FLOWING_IMAGE
Definition: publictypes.h:61
@ PT_VERT_LINE
Definition: publictypes.h:65
const char * GetInputName()
Definition: baseapi.cpp:928
std::string input_file_
Name used by training code.
Definition: baseapi.h:773
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:834
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:772
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:765
static const char * Version()
Definition: baseapi.cpp:241
ResultIterator * GetIterator()
Definition: baseapi.cpp:1337
char * GetAltoText(ETEXT_DESC *monitor, int page_number)
void SetInputName(const char *name)
Definition: baseapi.cpp:270
PolyBlockType BlockType() const
bool Empty(PageIteratorLevel level) const
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
void AppendString(const char *s)
Definition: renderer.cpp:111
bool BeginDocumentHandler() override
TessAltoRenderer(const char *outputbase)
bool EndDocumentHandler() override
bool AddImageHandler(TessBaseAPI *api) override
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool IsAtBeginningOf(PageIteratorLevel level) const override
bool Next(PageIteratorLevel level) override