tesseract v5.3.3.20231005
hocrrenderer.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: hocrrenderer.cpp
3 * Description: Simple API for calling tesseract.
4 * Author: Ray Smith (original code from baseapi.cpp)
5 * Author: Stefan Weil (moved to separate file and cleaned code)
6 *
7 * (C) Copyright 2006, Google Inc.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 **********************************************************************/
19
20#include <tesseract/baseapi.h> // for TessBaseAPI
21#include <locale> // for std::locale::classic
22#include <memory> // for std::unique_ptr
23#include <sstream> // for std::stringstream
24#ifdef _WIN32
25# include "host.h" // windows.h for MultiByteToWideChar, ...
26#endif
27#include <tesseract/renderer.h>
28#include "tesseractclass.h" // for Tesseract
29
30namespace tesseract {
31
35static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
36 tesseract::Orientation orientation;
37 tesseract::WritingDirection writing_direction;
38 tesseract::TextlineOrder textline_order;
39 float deskew_angle;
40 it->Orientation(&orientation, &writing_direction, &textline_order,
41 &deskew_angle);
42 return orientation;
43}
44
53static void AddBaselineCoordsTohOCR(const PageIterator *it,
55 std::stringstream &hocr_str) {
56 tesseract::Orientation orientation = GetBlockTextOrientation(it);
57 if (orientation != ORIENTATION_PAGE_UP) {
58 hocr_str << "; textangle " << 360 - orientation * 90;
59 return;
60 }
61
62 int left, top, right, bottom;
63 it->BoundingBox(level, &left, &top, &right, &bottom);
64
65 // Try to get the baseline coordinates at this level.
66 int x1, y1, x2, y2;
67 if (!it->Baseline(level, &x1, &y1, &x2, &y2)) {
68 return;
69 }
70 // Following the description of this field of the hOCR spec, we convert the
71 // baseline coordinates so that "the bottom left of the bounding box is the
72 // origin".
73 x1 -= left;
74 x2 -= left;
75 y1 -= bottom;
76 y2 -= bottom;
77
78 // Now fit a line through the points so we can extract coefficients for the
79 // equation: y = p1 x + p0
80 if (x1 == x2) {
81 // Problem computing the polynomial coefficients.
82 return;
83 }
84 double p1 = (y2 - y1) / static_cast<double>(x2 - x1);
85 double p0 = y1 - p1 * x1;
86
87 hocr_str << "; baseline " << round(p1 * 1000.0) / 1000.0 << " "
88 << round(p0 * 1000.0) / 1000.0;
89}
90
91static void AddBoxTohOCR(const ResultIterator *it, PageIteratorLevel level,
92 std::stringstream &hocr_str) {
93 int left, top, right, bottom;
94 it->BoundingBox(level, &left, &top, &right, &bottom);
95 // This is the only place we use double quotes instead of single quotes,
96 // but it may too late to change for consistency
97 hocr_str << " title=\"bbox " << left << " " << top << " " << right << " "
98 << bottom;
99 // Add baseline coordinates & heights for textlines only.
100 if (level == RIL_TEXTLINE) {
101 AddBaselineCoordsTohOCR(it, level, hocr_str);
102 // add custom height measures
103 float row_height, descenders, ascenders; // row attributes
104 it->RowAttributes(&row_height, &descenders, &ascenders);
105 // TODO(rays): Do we want to limit these to a single decimal place?
106 hocr_str << "; x_size " << row_height << "; x_descenders " << -descenders
107 << "; x_ascenders " << ascenders;
108 }
109 hocr_str << "\">";
110}
111
121char *TessBaseAPI::GetHOCRText(int page_number) {
122 return GetHOCRText(nullptr, page_number);
123}
124
134char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
135 if (tesseract_ == nullptr ||
136 (page_res_ == nullptr && Recognize(monitor) < 0)) {
137 return nullptr;
138 }
139
140 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
141 int page_id = page_number + 1; // hOCR uses 1-based page numbers.
142 bool para_is_ltr = true; // Default direction is LTR
143 const char *paragraph_lang = nullptr;
144 bool font_info = false;
145 bool hocr_boxes = false;
146 GetBoolVariable("hocr_font_info", &font_info);
147 GetBoolVariable("hocr_char_boxes", &hocr_boxes);
148
149 if (input_file_.empty()) {
150 SetInputName(nullptr);
151 }
152
153#ifdef _WIN32
154 // convert input name from ANSI encoding to utf-8
155 int str16_len =
156 MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, nullptr, 0);
157 wchar_t *uni16_str = new WCHAR[str16_len];
158 str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_.c_str(), -1, uni16_str,
159 str16_len);
160 int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr,
161 0, nullptr, nullptr);
162 char *utf8_str = new char[utf8_len];
163 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
164 nullptr, nullptr);
165 input_file_ = utf8_str;
166 delete[] uni16_str;
167 delete[] utf8_str;
168#endif
169
170 std::stringstream hocr_str;
171 // Use "C" locale (needed for double values x_size and x_descenders).
172 hocr_str.imbue(std::locale::classic());
173 // Use 8 digits for double values.
174 hocr_str.precision(8);
175 hocr_str << " <div class='ocr_page'"
176 << " id='"
177 << "page_" << page_id << "'"
178 << " title='image \"";
179 if (!input_file_.empty()) {
180 hocr_str << HOcrEscape(input_file_.c_str());
181 } else {
182 hocr_str << "unknown";
183 }
184
185 hocr_str << "\"; bbox " << rect_left_ << " " << rect_top_ << " "
186 << rect_width_ << " " << rect_height_ << "; ppageno " << page_number
187 << "; scan_res " << GetSourceYResolution() << " "
188 << GetSourceYResolution() << "'>\n";
189
190 std::unique_ptr<ResultIterator> res_it(GetIterator());
191 while (!res_it->Empty(RIL_BLOCK)) {
192 int left, top, right, bottom;
193 auto block_type = res_it->BlockType();
194 switch (block_type) {
195 case PT_FLOWING_IMAGE:
196 case PT_HEADING_IMAGE:
197 case PT_PULLOUT_IMAGE: {
198 // Handle all kinds of images.
199 res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
200 hocr_str << " <div class='ocr_photo' id='block_" << page_id << '_'
201 << bcnt++ << "' title=\"bbox " << left << " " << top << " "
202 << right << " " << bottom << "\"></div>\n";
203 res_it->Next(RIL_BLOCK);
204 continue;
205 }
206 case PT_HORZ_LINE:
207 case PT_VERT_LINE:
208 // Handle horizontal and vertical lines.
209 res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
210 hocr_str << " <div class='ocr_separator' id='block_" << page_id << '_'
211 << bcnt++ << "' title=\"bbox " << left << " " << top << " "
212 << right << " " << bottom << "\"></div>\n";
213 res_it->Next(RIL_BLOCK);
214 continue;
215 case PT_NOISE:
216 tprintf("TODO: Please report image which triggers the noise case.\n");
217 ASSERT_HOST(false);
218 default:
219 break;
220 }
221
222 if (res_it->Empty(RIL_WORD)) {
223 res_it->Next(RIL_WORD);
224 continue;
225 }
226
227 // Open any new block/paragraph/textline.
228 if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
229 para_is_ltr = true; // reset to default direction
230 hocr_str << " <div class='ocr_carea'"
231 << " id='"
232 << "block_" << page_id << "_" << bcnt << "'";
233 AddBoxTohOCR(res_it.get(), RIL_BLOCK, hocr_str);
234 }
235 if (res_it->IsAtBeginningOf(RIL_PARA)) {
236 hocr_str << "\n <p class='ocr_par'";
237 para_is_ltr = res_it->ParagraphIsLtr();
238 if (!para_is_ltr) {
239 hocr_str << " dir='rtl'";
240 }
241 hocr_str << " id='"
242 << "par_" << page_id << "_" << pcnt << "'";
243 paragraph_lang = res_it->WordRecognitionLanguage();
244 if (paragraph_lang) {
245 hocr_str << " lang='" << paragraph_lang << "'";
246 }
247 AddBoxTohOCR(res_it.get(), RIL_PARA, hocr_str);
248 }
249 if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
250 hocr_str << "\n <span class='";
251 switch (block_type) {
252 case PT_HEADING_TEXT:
253 hocr_str << "ocr_header";
254 break;
255 case PT_PULLOUT_TEXT:
256 hocr_str << "ocr_textfloat";
257 break;
258 case PT_CAPTION_TEXT:
259 hocr_str << "ocr_caption";
260 break;
261 case PT_FLOWING_IMAGE:
262 case PT_HEADING_IMAGE:
263 case PT_PULLOUT_IMAGE:
264 ASSERT_HOST(false);
265 break;
266 default:
267 hocr_str << "ocr_line";
268 }
269 hocr_str << "' id='"
270 << "line_" << page_id << "_" << lcnt << "'";
271 AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
272 }
273
274 // Now, process the word...
275 int32_t lstm_choice_mode = tesseract_->lstm_choice_mode;
276 std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
277 *rawTimestepMap = nullptr;
278 std::vector<std::vector<std::pair<const char *, float>>> *CTCMap = nullptr;
279 if (lstm_choice_mode) {
280 CTCMap = res_it->GetBestLSTMSymbolChoices();
281 rawTimestepMap = res_it->GetRawLSTMTimesteps();
282 }
283 hocr_str << "\n <span class='ocrx_word'"
284 << " id='"
285 << "word_" << page_id << "_" << wcnt << "'";
286 bool bold, italic, underlined, monospace, serif, smallcaps;
287 int pointsize, font_id;
288 res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
289 const char *font_name =
290 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
291 &serif, &smallcaps, &pointsize, &font_id);
292 hocr_str << " title='bbox " << left << " " << top << " " << right << " "
293 << bottom << "; x_wconf "
294 << static_cast<int>(res_it->Confidence(RIL_WORD));
295 if (font_info) {
296 if (font_name) {
297 hocr_str << "; x_font " << HOcrEscape(font_name).c_str();
298 }
299 hocr_str << "; x_fsize " << pointsize;
300 }
301 hocr_str << "'";
302 const char *lang = res_it->WordRecognitionLanguage();
303 if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
304 hocr_str << " lang='" << lang << "'";
305 }
306 switch (res_it->WordDirection()) {
307 // Only emit direction if different from current paragraph direction
309 if (!para_is_ltr) {
310 hocr_str << " dir='ltr'";
311 }
312 break;
314 if (para_is_ltr) {
315 hocr_str << " dir='rtl'";
316 }
317 break;
318 case DIR_MIX:
319 case DIR_NEUTRAL:
320 default: // Do nothing.
321 break;
322 }
323 hocr_str << ">";
324 bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
325 bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
326 bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
327 if (bold) {
328 hocr_str << "<strong>";
329 }
330 if (italic) {
331 hocr_str << "<em>";
332 }
333 do {
334 const std::unique_ptr<const char[]> grapheme(
335 res_it->GetUTF8Text(RIL_SYMBOL));
336 if (grapheme && grapheme[0] != 0) {
337 if (hocr_boxes) {
338 res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
339 hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
340 << left << " " << top << " " << right << " " << bottom
341 << "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
342 }
343 hocr_str << HOcrEscape(grapheme.get()).c_str();
344 if (hocr_boxes) {
345 hocr_str << "</span>";
346 tesseract::ChoiceIterator ci(*res_it);
347 if (lstm_choice_mode == 1 && ci.Timesteps() != nullptr) {
348 std::vector<std::vector<std::pair<const char *, float>>> *symbol =
349 ci.Timesteps();
350 hocr_str << "\n <span class='ocr_symbol'"
351 << " id='"
352 << "symbol_" << page_id << "_" << wcnt << "_" << scnt
353 << "'>";
354 for (const auto &timestep : *symbol) {
355 hocr_str << "\n <span class='ocrx_cinfo'"
356 << " id='"
357 << "timestep" << page_id << "_" << wcnt << "_" << tcnt
358 << "'>";
359 for (auto conf : timestep) {
360 hocr_str << "\n <span class='ocrx_cinfo'"
361 << " id='"
362 << "choice_" << page_id << "_" << wcnt << "_" << ccnt
363 << "'"
364 << " title='x_confs " << int(conf.second * 100) << "'>"
365 << HOcrEscape(conf.first).c_str() << "</span>";
366 ++ccnt;
367 }
368 hocr_str << "</span>";
369 ++tcnt;
370 }
371 hocr_str << "\n </span>";
372 ++scnt;
373 } else if (lstm_choice_mode == 2) {
374 hocr_str << "\n <span class='ocrx_cinfo'"
375 << " id='"
376 << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
377 << "'>";
378 do {
379 const char *choice = ci.GetUTF8Text();
380 float choiceconf = ci.Confidence();
381 if (choice != nullptr) {
382 hocr_str << "\n <span class='ocrx_cinfo'"
383 << " id='"
384 << "choice_" << page_id << "_" << wcnt << "_" << ccnt
385 << "'"
386 << " title='x_confs " << choiceconf << "'>"
387 << HOcrEscape(choice).c_str() << "</span>";
388 ccnt++;
389 }
390 } while (ci.Next());
391 hocr_str << "\n </span>";
392 tcnt++;
393 }
394 }
395 }
396 res_it->Next(RIL_SYMBOL);
397 } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
398 if (italic) {
399 hocr_str << "</em>";
400 }
401 if (bold) {
402 hocr_str << "</strong>";
403 }
404 // If the lstm choice mode is required it is added here
405 if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap != nullptr) {
406 for (const auto &symbol : *rawTimestepMap) {
407 hocr_str << "\n <span class='ocr_symbol'"
408 << " id='"
409 << "symbol_" << page_id << "_" << wcnt << "_" << scnt << "'>";
410 for (const auto &timestep : symbol) {
411 hocr_str << "\n <span class='ocrx_cinfo'"
412 << " id='"
413 << "timestep" << page_id << "_" << wcnt << "_" << tcnt
414 << "'>";
415 for (auto &&conf : timestep) {
416 hocr_str << "\n <span class='ocrx_cinfo'"
417 << " id='"
418 << "choice_" << page_id << "_" << wcnt << "_" << ccnt
419 << "'"
420 << " title='x_confs " << int(conf.second * 100) << "'>"
421 << HOcrEscape(conf.first).c_str() << "</span>";
422 ++ccnt;
423 }
424 hocr_str << "</span>";
425 ++tcnt;
426 }
427 hocr_str << "</span>";
428 ++scnt;
429 }
430 } else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap != nullptr) {
431 for (const auto &timestep : *CTCMap) {
432 if (timestep.size() > 0) {
433 hocr_str << "\n <span class='ocrx_cinfo'"
434 << " id='"
435 << "lstm_choices_" << page_id << "_" << wcnt << "_" << tcnt
436 << "'>";
437 for (auto &j : timestep) {
438 float conf = 100 - tesseract_->lstm_rating_coefficient * j.second;
439 if (conf < 0.0f) {
440 conf = 0.0f;
441 }
442 if (conf > 100.0f) {
443 conf = 100.0f;
444 }
445 hocr_str << "\n <span class='ocrx_cinfo'"
446 << " id='"
447 << "choice_" << page_id << "_" << wcnt << "_" << ccnt
448 << "'"
449 << " title='x_confs " << conf << "'>"
450 << HOcrEscape(j.first).c_str() << "</span>";
451 ccnt++;
452 }
453 hocr_str << "</span>";
454 tcnt++;
455 }
456 }
457 }
458 // Close ocrx_word.
459 if (hocr_boxes || lstm_choice_mode > 0) {
460 hocr_str << "\n ";
461 }
462 hocr_str << "</span>";
463 tcnt = 1;
464 ccnt = 1;
465 wcnt++;
466 // Close any ending block/paragraph/textline.
467 if (last_word_in_line) {
468 hocr_str << "\n </span>";
469 lcnt++;
470 }
471 if (last_word_in_para) {
472 hocr_str << "\n </p>\n";
473 pcnt++;
474 para_is_ltr = true; // back to default direction
475 }
476 if (last_word_in_block) {
477 hocr_str << " </div>\n";
478 bcnt++;
479 }
480 }
481 hocr_str << " </div>\n";
482
483 const std::string &text = hocr_str.str();
484 char *result = new char[text.length() + 1];
485 strcpy(result, text.c_str());
486 return result;
487}
488
489/**********************************************************************
490 * HOcr Text Renderer interface implementation
491 **********************************************************************/
493 : TessResultRenderer(outputbase, "hocr") {
494 font_info_ = false;
495}
496
497TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
498 : TessResultRenderer(outputbase, "hocr") {
499 font_info_ = font_info;
500}
501
504 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
505 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
506 " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
507 "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
508 "lang=\"en\">\n <head>\n <title>");
511 "</title>\n"
512 " <meta http-equiv=\"Content-Type\" content=\"text/html;"
513 "charset=utf-8\"/>\n"
514 " <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
515 "' />\n"
516 " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
517 " ocr_line ocrx_word ocrp_wconf");
518 if (font_info_) {
519 AppendString(" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
520 }
522 "'/>\n"
523 " </head>\n"
524 " <body>\n");
525
526 return true;
527}
528
530 AppendString(" </body>\n</html>\n");
531
532 return true;
533}
534
536 const std::unique_ptr<const char[]> hocr(api->GetHOCRText(imagenum()));
537 if (hocr == nullptr) {
538 return false;
539 }
540
541 AppendString(hocr.get());
542
543 return true;
544}
545
546} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:54
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
@ ORIENTATION_PAGE_UP
Definition: publictypes.h:115
@ DIR_MIX
Definition: unichar.h:45
@ DIR_LEFT_TO_RIGHT
Definition: unichar.h:43
@ DIR_RIGHT_TO_LEFT
Definition: unichar.h:44
@ DIR_NEUTRAL
Definition: unichar.h:42
std::string HOcrEscape(const char *text)
Definition: baseapi.cpp:2378
@ PT_PULLOUT_IMAGE
Definition: publictypes.h:63
@ PT_HEADING_IMAGE
Definition: publictypes.h:62
@ PT_CAPTION_TEXT
Definition: publictypes.h:60
@ PT_HORZ_LINE
Definition: publictypes.h:64
@ PT_FLOWING_IMAGE
Definition: publictypes.h:61
@ PT_VERT_LINE
Definition: publictypes.h:65
@ PT_PULLOUT_TEXT
Definition: publictypes.h:55
@ PT_HEADING_TEXT
Definition: publictypes.h:54
std::string input_file_
Name used by training code.
Definition: baseapi.h:773
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:834
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:772
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:765
ResultIterator * GetIterator()
Definition: baseapi.cpp:1337
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
void SetInputName(const char *name)
Definition: baseapi.cpp:270
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:304
const char * GetUTF8Text() const
std::vector< std::vector< std::pair< const char *, float > > > * Timesteps() const
void AppendString(const char *s)
Definition: renderer.cpp:111
const char * title() const
Definition: renderer.h:87
bool AddImageHandler(TessBaseAPI *api) override
TessHOcrRenderer(const char *outputbase, bool font_info)
bool BeginDocumentHandler() override
bool EndDocumentHandler() override