40 it->Orientation(&orientation, &writing_direction, &textline_order,
53static void AddBaselineCoordsTohOCR(
const PageIterator *it,
55 std::stringstream &hocr_str) {
58 hocr_str <<
"; textangle " << 360 - orientation * 90;
62 int left, top, right, bottom;
63 it->BoundingBox(level, &left, &top, &right, &bottom);
67 if (!it->Baseline(level, &x1, &y1, &x2, &y2)) {
84 double p1 = (y2 - y1) /
static_cast<double>(x2 - x1);
85 double p0 = y1 - p1 * x1;
87 hocr_str <<
"; baseline " << round(p1 * 1000.0) / 1000.0 <<
" "
88 << round(p0 * 1000.0) / 1000.0;
92 std::stringstream &hocr_str) {
93 int left, top, right, bottom;
94 it->BoundingBox(level, &left, &top, &right, &bottom);
97 hocr_str <<
" title=\"bbox " << left <<
" " << top <<
" " << right <<
" "
101 AddBaselineCoordsTohOCR(it, level, hocr_str);
103 float row_height, descenders, ascenders;
104 it->RowAttributes(&row_height, &descenders, &ascenders);
106 hocr_str <<
"; x_size " << row_height <<
"; x_descenders " << -descenders
107 <<
"; x_ascenders " << ascenders;
140 int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, scnt = 1, tcnt = 1, ccnt = 1;
141 int page_id = page_number + 1;
142 bool para_is_ltr =
true;
143 const char *paragraph_lang =
nullptr;
144 bool font_info =
false;
145 bool hocr_boxes =
false;
156 MultiByteToWideChar(CP_ACP, 0,
input_file_.c_str(), -1,
nullptr, 0);
157 wchar_t *uni16_str =
new WCHAR[str16_len];
158 str16_len = MultiByteToWideChar(CP_ACP, 0,
input_file_.c_str(), -1, uni16_str,
160 int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len,
nullptr,
161 0,
nullptr,
nullptr);
162 char *utf8_str =
new char[utf8_len];
163 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
170 std::stringstream hocr_str;
172 hocr_str.imbue(std::locale::classic());
174 hocr_str.precision(8);
175 hocr_str <<
" <div class='ocr_page'"
177 <<
"page_" << page_id <<
"'"
178 <<
" title='image \"";
182 hocr_str <<
"unknown";
190 std::unique_ptr<ResultIterator> res_it(
GetIterator());
192 int left, top, right, bottom;
193 auto block_type = res_it->BlockType();
194 switch (block_type) {
199 res_it.get()->BoundingBox(
RIL_TEXTLINE, &left, &top, &right, &bottom);
200 hocr_str <<
" <div class='ocr_photo' id='block_" << page_id <<
'_'
201 << bcnt++ <<
"' title=\"bbox " << left <<
" " << top <<
" "
202 << right <<
" " << bottom <<
"\"></div>\n";
209 res_it.get()->BoundingBox(
RIL_TEXTLINE, &left, &top, &right, &bottom);
210 hocr_str <<
" <div class='ocr_separator' id='block_" << page_id <<
'_'
211 << bcnt++ <<
"' title=\"bbox " << left <<
" " << top <<
" "
212 << right <<
" " << bottom <<
"\"></div>\n";
216 tprintf(
"TODO: Please report image which triggers the noise case.\n");
228 if (res_it->IsAtBeginningOf(
RIL_BLOCK)) {
230 hocr_str <<
" <div class='ocr_carea'"
232 <<
"block_" << page_id <<
"_" << bcnt <<
"'";
233 AddBoxTohOCR(res_it.get(),
RIL_BLOCK, hocr_str);
235 if (res_it->IsAtBeginningOf(
RIL_PARA)) {
236 hocr_str <<
"\n <p class='ocr_par'";
237 para_is_ltr = res_it->ParagraphIsLtr();
239 hocr_str <<
" dir='rtl'";
242 <<
"par_" << page_id <<
"_" << pcnt <<
"'";
243 paragraph_lang = res_it->WordRecognitionLanguage();
244 if (paragraph_lang) {
245 hocr_str <<
" lang='" << paragraph_lang <<
"'";
247 AddBoxTohOCR(res_it.get(),
RIL_PARA, hocr_str);
250 hocr_str <<
"\n <span class='";
251 switch (block_type) {
253 hocr_str <<
"ocr_header";
256 hocr_str <<
"ocr_textfloat";
259 hocr_str <<
"ocr_caption";
267 hocr_str <<
"ocr_line";
270 <<
"line_" << page_id <<
"_" << lcnt <<
"'";
275 int32_t lstm_choice_mode =
tesseract_->lstm_choice_mode;
276 std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
277 *rawTimestepMap =
nullptr;
278 std::vector<std::vector<std::pair<const char *, float>>> *CTCMap =
nullptr;
279 if (lstm_choice_mode) {
280 CTCMap = res_it->GetBestLSTMSymbolChoices();
281 rawTimestepMap = res_it->GetRawLSTMTimesteps();
283 hocr_str <<
"\n <span class='ocrx_word'"
285 <<
"word_" << page_id <<
"_" << wcnt <<
"'";
286 bool bold, italic, underlined, monospace, serif, smallcaps;
287 int pointsize, font_id;
288 res_it->BoundingBox(
RIL_WORD, &left, &top, &right, &bottom);
289 const char *font_name =
290 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
291 &serif, &smallcaps, &pointsize, &font_id);
292 hocr_str <<
" title='bbox " << left <<
" " << top <<
" " << right <<
" "
293 << bottom <<
"; x_wconf "
294 <<
static_cast<int>(res_it->Confidence(
RIL_WORD));
297 hocr_str <<
"; x_font " <<
HOcrEscape(font_name).c_str();
299 hocr_str <<
"; x_fsize " << pointsize;
302 const char *lang = res_it->WordRecognitionLanguage();
303 if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
304 hocr_str <<
" lang='" << lang <<
"'";
306 switch (res_it->WordDirection()) {
310 hocr_str <<
" dir='ltr'";
315 hocr_str <<
" dir='rtl'";
328 hocr_str <<
"<strong>";
334 const std::unique_ptr<const char[]> grapheme(
336 if (grapheme && grapheme[0] != 0) {
338 res_it->BoundingBox(
RIL_SYMBOL, &left, &top, &right, &bottom);
339 hocr_str <<
"\n <span class='ocrx_cinfo' title='x_bboxes "
340 << left <<
" " << top <<
" " << right <<
" " << bottom
341 <<
"; x_conf " << res_it->Confidence(
RIL_SYMBOL) <<
"'>";
343 hocr_str <<
HOcrEscape(grapheme.get()).c_str();
345 hocr_str <<
"</span>";
347 if (lstm_choice_mode == 1 && ci.
Timesteps() !=
nullptr) {
348 std::vector<std::vector<std::pair<const char *, float>>> *symbol =
350 hocr_str <<
"\n <span class='ocr_symbol'"
352 <<
"symbol_" << page_id <<
"_" << wcnt <<
"_" << scnt
354 for (
const auto &timestep : *symbol) {
355 hocr_str <<
"\n <span class='ocrx_cinfo'"
357 <<
"timestep" << page_id <<
"_" << wcnt <<
"_" << tcnt
359 for (
auto conf : timestep) {
360 hocr_str <<
"\n <span class='ocrx_cinfo'"
362 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << ccnt
364 <<
" title='x_confs " << int(conf.second * 100) <<
"'>"
365 <<
HOcrEscape(conf.first).c_str() <<
"</span>";
368 hocr_str <<
"</span>";
371 hocr_str <<
"\n </span>";
373 }
else if (lstm_choice_mode == 2) {
374 hocr_str <<
"\n <span class='ocrx_cinfo'"
376 <<
"lstm_choices_" << page_id <<
"_" << wcnt <<
"_" << tcnt
381 if (choice !=
nullptr) {
382 hocr_str <<
"\n <span class='ocrx_cinfo'"
384 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << ccnt
386 <<
" title='x_confs " << choiceconf <<
"'>"
391 hocr_str <<
"\n </span>";
402 hocr_str <<
"</strong>";
405 if (lstm_choice_mode == 1 && !hocr_boxes && rawTimestepMap !=
nullptr) {
406 for (
const auto &symbol : *rawTimestepMap) {
407 hocr_str <<
"\n <span class='ocr_symbol'"
409 <<
"symbol_" << page_id <<
"_" << wcnt <<
"_" << scnt <<
"'>";
410 for (
const auto &timestep : symbol) {
411 hocr_str <<
"\n <span class='ocrx_cinfo'"
413 <<
"timestep" << page_id <<
"_" << wcnt <<
"_" << tcnt
415 for (
auto &&conf : timestep) {
416 hocr_str <<
"\n <span class='ocrx_cinfo'"
418 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << ccnt
420 <<
" title='x_confs " << int(conf.second * 100) <<
"'>"
421 <<
HOcrEscape(conf.first).c_str() <<
"</span>";
424 hocr_str <<
"</span>";
427 hocr_str <<
"</span>";
430 }
else if (lstm_choice_mode == 2 && !hocr_boxes && CTCMap !=
nullptr) {
431 for (
const auto &timestep : *CTCMap) {
432 if (timestep.size() > 0) {
433 hocr_str <<
"\n <span class='ocrx_cinfo'"
435 <<
"lstm_choices_" << page_id <<
"_" << wcnt <<
"_" << tcnt
437 for (
auto &j : timestep) {
438 float conf = 100 -
tesseract_->lstm_rating_coefficient * j.second;
445 hocr_str <<
"\n <span class='ocrx_cinfo'"
447 <<
"choice_" << page_id <<
"_" << wcnt <<
"_" << ccnt
449 <<
" title='x_confs " << conf <<
"'>"
453 hocr_str <<
"</span>";
459 if (hocr_boxes || lstm_choice_mode > 0) {
462 hocr_str <<
"</span>";
467 if (last_word_in_line) {
468 hocr_str <<
"\n </span>";
471 if (last_word_in_para) {
472 hocr_str <<
"\n </p>\n";
476 if (last_word_in_block) {
477 hocr_str <<
" </div>\n";
481 hocr_str <<
" </div>\n";
483 const std::string &text = hocr_str.str();
484 char *result =
new char[text.length() + 1];
485 strcpy(result, text.c_str());
499 font_info_ = font_info;
504 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
505 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
506 " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
507 "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
508 "lang=\"en\">\n <head>\n <title>");
512 " <meta http-equiv=\"Content-Type\" content=\"text/html;"
513 "charset=utf-8\"/>\n"
514 " <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
516 " <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
517 " ocr_line ocrx_word ocrp_wconf");
519 AppendString(
" ocrp_lang ocrp_dir ocrp_font ocrp_fsize");
537 if (hocr ==
nullptr) {
void tprintf(const char *format,...)
std::string HOcrEscape(const char *text)
std::string input_file_
Name used by training code.
int Recognize(ETEXT_DESC *monitor)
PAGE_RES * page_res_
The page-level data.
Tesseract * tesseract_
The underlying data object.
int GetSourceYResolution()
ResultIterator * GetIterator()
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
void SetInputName(const char *name)
bool GetBoolVariable(const char *name, bool *value) const
const char * GetUTF8Text() const
std::vector< std::vector< std::pair< const char *, float > > > * Timesteps() const
void AppendString(const char *s)
const char * title() const
bool AddImageHandler(TessBaseAPI *api) override
TessHOcrRenderer(const char *outputbase, bool font_info)
bool BeginDocumentHandler() override
bool EndDocumentHandler() override