tesseract v5.3.3.20231005
stringrenderer.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: stringrenderer.cpp
3 * Description: Class for rendering UTF-8 text to an image, and retrieving
4 * bounding boxes around each grapheme cluster.
5 * Author: Ranjith Unnikrishnan
6 *
7 * (C) Copyright 2013, Google Inc.
8 * Licensed under the Apache License, Version 2.0 (the "License");
9 * you may not use this file except in compliance with the License.
10 * You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 **********************************************************************/
19
20#include "stringrenderer.h"
21
22#include <allheaders.h> // from leptonica
23#include "boxchar.h"
24#include "helpers.h" // for TRand
25#include "ligature_table.h"
26#include "normstrngs.h"
27#include "tlog.h"
28
29#include <tesseract/unichar.h>
30
31#include "pango/pango-font.h"
32#include "pango/pango-glyph-item.h"
33#include "unicode/uchar.h" // from libicu
34
35#include <algorithm>
36#include <cassert>
37#include <cstdio>
38#include <cstring>
39#include <map>
40#include <utility>
41#include <vector>
42
43#define DISABLE_HEAP_LEAK_CHECK
44
45namespace tesseract {
46
// Resolution the constructor applies through set_resolution() until the
// caller asks for something else.
static constexpr int kDefaultOutputResolution = 300;

// UTF-8 encoding of the word joiner (U+2060). In ngram mode it is inserted
// after letters, following the recommendation in
// http://unicode.org/reports/tr14/, so that lines are not broken at hyphens
// and other non-alpha characters.
static const char *kWordJoinerUTF8 = "\u2060";
53
54static bool IsCombiner(int ch) {
55 const int char_type = u_charType(ch);
56 return ((char_type == U_NON_SPACING_MARK) || (char_type == U_ENCLOSING_MARK) ||
57 (char_type == U_COMBINING_SPACING_MARK));
58}
59
60static std::string EncodeAsUTF8(const char32 ch32) {
61 UNICHAR uni_ch(ch32);
62 return std::string(uni_ch.utf8(), uni_ch.utf8_len());
63}
64
65// Returns true with probability 'prob'.
66static bool RandBool(const double prob, TRand *rand) {
67 if (prob == 1.0) {
68 return true;
69 }
70 if (prob == 0.0) {
71 return false;
72 }
73 return rand->UnsignedRand(1.0) < prob;
74}
75
76/* static */
77static Image CairoARGB32ToPixFormat(cairo_surface_t *surface) {
78 if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
79 printf("Unexpected surface format %d\n", cairo_image_surface_get_format(surface));
80 return nullptr;
81 }
82 const int width = cairo_image_surface_get_width(surface);
83 const int height = cairo_image_surface_get_height(surface);
84 Image pix = pixCreate(width, height, 32);
85 int byte_stride = cairo_image_surface_get_stride(surface);
86
87 for (int i = 0; i < height; ++i) {
88 memcpy(reinterpret_cast<unsigned char *>(pixGetData(pix) + i * pixGetWpl(pix)) + 1,
89 cairo_image_surface_get_data(surface) + i * byte_stride,
90 byte_stride - ((i == height - 1) ? 1 : 0));
91 }
92 return pix;
93}
94
// Creates a renderer for the font described by 'font_desc' that works with
// pages of 'page_width' x 'page_height'.
//
// All other settings start from the defaults in the initializer list below.
// No Cairo or Pango objects are created here: surface_, cr_ and layout_ start
// out null. The resolution is set to kDefaultOutputResolution and the font
// description is then applied through set_font().
StringRenderer::StringRenderer(const std::string &font_desc, int page_width, int page_height)
    : font_(font_desc)
    , page_width_(page_width)
    , page_height_(page_height)
    , h_margin_(50)
    , v_margin_(50)
    , pen_color_{0.0, 0.0, 0.0} // passed to cairo_set_source_rgb() as the ink colour
    , char_spacing_(0)
    , leading_(0)
    , vertical_text_(false)
    , gravity_hint_strong_(false)
    , render_fullwidth_latin_(false)
    , underline_start_prob_(0) // 0 disables random underlining
    , underline_continuation_prob_(0)
    , underline_style_(PANGO_UNDERLINE_SINGLE)
    , drop_uncovered_chars_(true)
    , strip_unrenderable_words_(false)
    , add_ligatures_(false)
    , output_word_boxes_(false)
    , surface_(nullptr)
    , cr_(nullptr)
    , layout_(nullptr)
    , start_box_(0)
    , page_(0)
    , box_padding_(0)
    , page_boxes_(nullptr)
    , total_chars_(0)
    , font_index_(0)
    , last_offset_(0) {
  // The resolution is set before the font so that resolution_ already holds a
  // value when set_font() runs.
  set_resolution(kDefaultOutputResolution);
  set_font(font_desc);
}
127
128bool StringRenderer::set_font(const std::string &desc) {
129 bool success = font_.ParseFontDescriptionName(desc);
131 return success;
132}
133
134void StringRenderer::set_resolution(const int resolution) {
135 resolution_ = resolution;
136 font_.set_resolution(resolution);
137}
138
140 underline_start_prob_ = std::min(std::max(frac, 0.0), 1.0);
141}
142
144 underline_continuation_prob_ = std::min(std::max(frac, 0.0), 1.0);
145}
146
148 ClearBoxes();
150}
151
154 surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_, page_height_);
155 cr_ = cairo_create(surface_);
156 {
158 layout_ = pango_cairo_create_layout(cr_);
159 }
160
161 if (vertical_text_) {
162 PangoContext *context = pango_layout_get_context(layout_);
163 pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);
165 pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);
166 }
167 pango_layout_context_changed(layout_);
168 }
169
171}
172
174 std::string font_desc = font_.DescriptionName();
175 // Specify the font via a description name
176 PangoFontDescription *desc = pango_font_description_from_string(font_desc.c_str());
177 // Assign the font description to the layout
178 pango_layout_set_font_description(layout_, desc);
179 pango_font_description_free(desc); // free the description
180 pango_cairo_context_set_resolution(pango_layout_get_context(layout_), resolution_);
181
182 int max_width = page_width_ - 2 * h_margin_;
183 int max_height = page_height_ - 2 * v_margin_;
184 tlog(3, "max_width = %d, max_height = %d\n", max_width, max_height);
185 if (vertical_text_) {
186 using std::swap;
187 swap(max_width, max_height);
188 }
189 pango_layout_set_width(layout_, max_width * PANGO_SCALE);
190 // Ultra-wide Thai strings need to wrap at char level.
191 pango_layout_set_wrap(layout_, PANGO_WRAP_WORD_CHAR);
192
193 // Adjust character spacing
194 PangoAttrList *attr_list = pango_attr_list_new();
195 if (char_spacing_) {
196 PangoAttribute *spacing_attr = pango_attr_letter_spacing_new(char_spacing_ * PANGO_SCALE);
197 spacing_attr->start_index = 0;
198 spacing_attr->end_index = static_cast<guint>(-1);
199 pango_attr_list_change(attr_list, spacing_attr);
200 }
201
202 if (add_ligatures_) {
203 set_features("liga, clig, dlig, hlig");
204 PangoAttribute *feature_attr = pango_attr_font_features_new(features_.c_str());
205 pango_attr_list_change(attr_list, feature_attr);
206 }
207
208 pango_layout_set_attributes(layout_, attr_list);
209 pango_attr_list_unref(attr_list);
210 // Adjust line spacing
211 if (leading_) {
212 pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE);
213 }
214}
215
217 if (layout_) {
218 g_object_unref(layout_);
219 layout_ = nullptr;
220 }
221 if (cr_) {
222 cairo_destroy(cr_);
223 cr_ = nullptr;
224 }
225 if (surface_) {
226 cairo_surface_destroy(surface_);
227 surface_ = nullptr;
228 }
229}
230
231void StringRenderer::SetWordUnderlineAttributes(const std::string &page_text) {
232 if (underline_start_prob_ == 0) {
233 return;
234 }
235 PangoAttrList *attr_list = pango_layout_get_attributes(layout_);
236
237 const char *text = page_text.c_str();
238 size_t offset = 0;
239 TRand rand;
240 bool started_underline = false;
241 PangoAttribute *und_attr = nullptr;
242
243 while (offset < page_text.length()) {
244 offset += SpanUTF8Whitespace(text + offset);
245 if (offset == page_text.length()) {
246 break;
247 }
248
249 int word_start = offset;
250 int word_len = SpanUTF8NotWhitespace(text + offset);
251 offset += word_len;
252 if (started_underline) {
253 // Should we continue the underline to the next word?
254 if (RandBool(underline_continuation_prob_, &rand)) {
255 // Continue the current underline to this word.
256 und_attr->end_index = word_start + word_len;
257 } else {
258 // Otherwise end the current underline attribute at the end of the
259 // previous word.
260 pango_attr_list_insert(attr_list, und_attr);
261 started_underline = false;
262 und_attr = nullptr;
263 }
264 }
265 if (!started_underline && RandBool(underline_start_prob_, &rand)) {
266 // Start a new underline attribute
267 und_attr = pango_attr_underline_new(underline_style_);
268 und_attr->start_index = word_start;
269 und_attr->end_index = word_start + word_len;
270 started_underline = true;
271 }
272 }
273 // Finish the current underline attribute at the end of the page.
274 if (started_underline) {
275 und_attr->end_index = page_text.length();
276 pango_attr_list_insert(attr_list, und_attr);
277 }
278}
279
280// Returns offset in utf8 bytes to first page.
281int StringRenderer::FindFirstPageBreakOffset(const char *text, int text_length) {
282 if (!text_length) {
283 return 0;
284 }
285 const int max_height = (page_height_ - 2 * v_margin_);
286 const int max_width = (page_width_ - 2 * h_margin_);
287 const int max_layout_height = vertical_text_ ? max_width : max_height;
288
289 UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
290 const UNICHAR::const_iterator it_end = UNICHAR::end(text, text_length);
291 const int kMaxUnicodeBufLength = 15000;
292 for (int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i) {
293 ;
294 }
295 int buf_length = it.utf8_data() - text;
296 tlog(1, "len = %d buf_len = %d\n", text_length, buf_length);
297 pango_layout_set_text(layout_, text, buf_length);
298
299 PangoLayoutIter *line_iter = nullptr;
300 { // Fontconfig caches some info here that is not freed before exit.
302 line_iter = pango_layout_get_iter(layout_);
303 }
304 bool first_page = true;
305 int page_top = 0;
306 int offset = buf_length;
307 do {
308 // Get bounding box of the current line
309 PangoRectangle line_ink_rect;
310 pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, nullptr);
311 pango_extents_to_pixels(&line_ink_rect, nullptr);
312 PangoLayoutLine *line = pango_layout_iter_get_line_readonly(line_iter);
313 if (first_page) {
314 page_top = line_ink_rect.y;
315 first_page = false;
316 }
317 int line_bottom = line_ink_rect.y + line_ink_rect.height;
318 if (line_bottom - page_top > max_layout_height) {
319 offset = line->start_index;
320 tlog(1, "Found offset = %d\n", offset);
321 break;
322 }
323 } while (pango_layout_iter_next_line(line_iter));
324 pango_layout_iter_free(line_iter);
325 return offset;
326}
327
328const std::vector<BoxChar *> &StringRenderer::GetBoxes() const {
329 return boxchars_;
330}
331
333 return page_boxes_;
334}
335
338 &boxchars_);
339}
340
342 for (auto &boxchar : boxchars_) {
343 delete boxchar;
344 }
345 boxchars_.clear();
346 boxaDestroy(&page_boxes_);
347}
348
352}
353
354void StringRenderer::WriteAllBoxes(const std::string &filename) {
357}
358
359// Returns cluster strings in logical order.
360bool StringRenderer::GetClusterStrings(std::vector<std::string> *cluster_text) {
361 std::map<int, std::string> start_byte_to_text;
362 PangoLayoutIter *run_iter = pango_layout_get_iter(layout_);
363 const char *full_text = pango_layout_get_text(layout_);
364 do {
365 PangoLayoutRun *run = pango_layout_iter_get_run_readonly(run_iter);
366 if (!run) {
367 // End of line nullptr run marker
368 tlog(2, "Found end of line marker\n");
369 continue;
370 }
371 PangoGlyphItemIter cluster_iter;
372 gboolean have_cluster;
373 for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter, run, full_text);
374 have_cluster; have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
375 const int start_byte_index = cluster_iter.start_index;
376 const int end_byte_index = cluster_iter.end_index;
377 std::string text =
378 std::string(full_text + start_byte_index, end_byte_index - start_byte_index);
379 if (IsUTF8Whitespace(text.c_str())) {
380 tlog(2, "Found whitespace\n");
381 text = " ";
382 }
383 tlog(2, "start_byte=%d end_byte=%d : '%s'\n", start_byte_index, end_byte_index, text.c_str());
384 if (add_ligatures_) {
385 // Make sure the output box files have ligatured text in case the font
386 // decided to use an unmapped glyph.
387 text = LigatureTable::Get()->AddLigatures(text, nullptr);
388 }
389 start_byte_to_text[start_byte_index] = text;
390 }
391 } while (pango_layout_iter_next_run(run_iter));
392 pango_layout_iter_free(run_iter);
393
394 cluster_text->clear();
395 for (auto it = start_byte_to_text.begin(); it != start_byte_to_text.end(); ++it) {
396 cluster_text->push_back(it->second);
397 }
398 return !cluster_text->empty();
399}
400
401// Merges an array of BoxChars into words based on the identification of
402// BoxChars containing the space character as inter-word separators.
403//
404// Sometime two adjacent characters in the sequence may be detected as lying on
405// different lines based on their spatial positions. This may be the result of a
406// newline character at end of the last word on a line in the source text, or of
407// a discretionary line-break created by Pango at intra-word locations like
408// hyphens. When this is detected the word is split at that location into
409// multiple BoxChars. Otherwise, each resulting BoxChar will contain a word and
410// its bounding box.
411static void MergeBoxCharsToWords(std::vector<BoxChar *> *boxchars) {
412 std::vector<BoxChar *> result;
413 bool started_word = false;
414 for (auto &boxchar : *boxchars) {
415 if (boxchar->ch() == " " || boxchar->box() == nullptr) {
416 result.push_back(boxchar);
417 boxchar = nullptr;
418 started_word = false;
419 continue;
420 }
421
422 if (!started_word) {
423 // Begin new word
424 started_word = true;
425 result.push_back(boxchar);
426 boxchar = nullptr;
427 } else {
428 BoxChar *last_boxchar = result.back();
429 // Compute bounding box union
430 const Box *box = boxchar->box();
431 Box *last_box = last_boxchar->mutable_box();
432 int left = std::min(last_box->x, box->x);
433 int right = std::max(last_box->x + last_box->w, box->x + box->w);
434 int top = std::min(last_box->y, box->y);
435 int bottom = std::max(last_box->y + last_box->h, box->y + box->h);
436 // Conclude that the word was broken to span multiple lines based on the
437 // size of the merged bounding box in relation to those of the individual
438 // characters seen so far.
439 if (right - left > last_box->w + 5 * box->w) {
440 tlog(1, "Found line break after '%s'", last_boxchar->ch().c_str());
441 // Insert a fake interword space and start a new word with the current
442 // boxchar.
443 result.push_back(new BoxChar(" ", 1));
444 result.push_back(boxchar);
445 boxchar = nullptr;
446 continue;
447 }
448 // Append to last word
449 last_boxchar->mutable_ch()->append(boxchar->ch());
450 last_box->x = left;
451 last_box->w = right - left;
452 last_box->y = top;
453 last_box->h = bottom - top;
454 delete boxchar;
455 boxchar = nullptr;
456 }
457 }
458 boxchars->swap(result);
459}
460
462 const char *text = pango_layout_get_text(layout_);
463 PangoLayoutIter *cluster_iter = pango_layout_get_iter(layout_);
464
465 // Do a first pass to store cluster start indexes.
466 std::vector<int> cluster_start_indices;
467 do {
468 cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));
469 tlog(3, "Added %d\n", cluster_start_indices.back());
470 } while (pango_layout_iter_next_cluster(cluster_iter));
471 pango_layout_iter_free(cluster_iter);
472 cluster_start_indices.push_back(strlen(text));
473 tlog(3, "Added last index %d\n", cluster_start_indices.back());
474 // Sort the indices and create a map from start to end indices.
475 std::sort(cluster_start_indices.begin(), cluster_start_indices.end());
476 std::map<int, int> cluster_start_to_end_index;
477 for (size_t i = 0; i + 1 < cluster_start_indices.size(); ++i) {
478 cluster_start_to_end_index[cluster_start_indices[i]] = cluster_start_indices[i + 1];
479 }
480
481 // Iterate again to compute cluster boxes and their text with the obtained
482 // cluster extent information.
483 cluster_iter = pango_layout_get_iter(layout_);
484 // Store BoxChars* sorted by their byte start positions
485 std::map<int, BoxChar *> start_byte_to_box;
486 do {
487 PangoRectangle cluster_rect;
488 pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect, nullptr);
489 pango_extents_to_pixels(&cluster_rect, nullptr);
490 const int start_byte_index = pango_layout_iter_get_index(cluster_iter);
491 const int end_byte_index = cluster_start_to_end_index[start_byte_index];
492 std::string cluster_text =
493 std::string(text + start_byte_index, end_byte_index - start_byte_index);
494 if (!cluster_text.empty() && cluster_text[0] == '\n') {
495 tlog(2, "Skipping newlines at start of text.\n");
496 continue;
497 }
498 if (!cluster_rect.width || !cluster_rect.height || IsUTF8Whitespace(cluster_text.c_str())) {
499 tlog(2, "Skipping whitespace with boxdim (%d,%d) '%s'\n", cluster_rect.width,
500 cluster_rect.height, cluster_text.c_str());
501 auto *boxchar = new BoxChar(" ", 1);
502 boxchar->set_page(page_);
503 start_byte_to_box[start_byte_index] = boxchar;
504 continue;
505 }
506 // Prepare a boxchar for addition at this byte position.
507 tlog(2, "[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n", cluster_rect.x, cluster_rect.y,
508 cluster_rect.width, cluster_rect.height, start_byte_index, end_byte_index,
509 cluster_text.c_str());
510 ASSERT_HOST_MSG(cluster_rect.width, "cluster_text:%s start_byte_index:%d\n",
511 cluster_text.c_str(), start_byte_index);
512 ASSERT_HOST_MSG(cluster_rect.height, "cluster_text:%s start_byte_index:%d\n",
513 cluster_text.c_str(), start_byte_index);
514 if (box_padding_) {
515 cluster_rect.x = std::max(0, cluster_rect.x - box_padding_);
516 cluster_rect.width += 2 * box_padding_;
517 cluster_rect.y = std::max(0, cluster_rect.y - box_padding_);
518 cluster_rect.height += 2 * box_padding_;
519 }
520 if (add_ligatures_) {
521 // Make sure the output box files have ligatured text in case the font
522 // decided to use an unmapped glyph.
523 cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, nullptr);
524 }
525 auto *boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size());
526 boxchar->set_page(page_);
527 boxchar->AddBox(cluster_rect.x, cluster_rect.y, cluster_rect.width, cluster_rect.height);
528 start_byte_to_box[start_byte_index] = boxchar;
529 } while (pango_layout_iter_next_cluster(cluster_iter));
530 pango_layout_iter_free(cluster_iter);
531
532 // There is a subtle bug in the cluster text reported by the PangoLayoutIter
533 // on ligatured characters (eg. The word "Lam-Aliph" in arabic). To work
534 // around this, we use text reported using the PangoGlyphIter which is
535 // accurate.
536 // TODO(ranjith): Revisit whether this is still needed in newer versions of
537 // pango.
538 std::vector<std::string> cluster_text;
539 if (GetClusterStrings(&cluster_text)) {
540 ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());
541 int ind = 0;
542 for (auto it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it, ++ind) {
543 it->second->mutable_ch()->swap(cluster_text[ind]);
544 }
545 }
546
547 // Append to the boxchars list in byte order.
548 std::vector<BoxChar *> page_boxchars;
549 page_boxchars.reserve(start_byte_to_box.size());
550 std::string last_ch;
551 for (auto it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it) {
552 if (it->second->ch() == kWordJoinerUTF8) {
553 // Skip zero-width joiner characters (ZWJs) here.
554 delete it->second;
555 } else {
556 page_boxchars.push_back(it->second);
557 }
558 }
559 CorrectBoxPositionsToLayout(&page_boxchars);
560
562 for (auto &it : start_byte_to_box) {
563 // Convert fullwidth Latin characters to their halfwidth forms.
564 std::string half(ConvertFullwidthLatinToBasicLatin(it.second->ch()));
565 it.second->mutable_ch()->swap(half);
566 }
567 }
568
569 // Merge the character boxes into word boxes if we are rendering n-grams.
570 if (output_word_boxes_) {
571 MergeBoxCharsToWords(&page_boxchars);
572 }
573
574 boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end());
575
576 // Compute the page bounding box
577 Box *page_box = nullptr;
578 Boxa *all_boxes = nullptr;
579 for (auto &page_boxchar : page_boxchars) {
580 if (page_boxchar->box() == nullptr) {
581 continue;
582 }
583 if (all_boxes == nullptr) {
584 all_boxes = boxaCreate(0);
585 }
586 boxaAddBox(all_boxes, page_boxchar->mutable_box(), L_CLONE);
587 }
588 if (all_boxes != nullptr) {
589 boxaGetExtent(all_boxes, nullptr, nullptr, &page_box);
590 boxaDestroy(&all_boxes);
591 if (page_boxes_ == nullptr) {
592 page_boxes_ = boxaCreate(0);
593 }
594 boxaAddBox(page_boxes_, page_box, L_INSERT);
595 }
596}
597
598void StringRenderer::CorrectBoxPositionsToLayout(std::vector<BoxChar *> *boxchars) {
599 if (vertical_text_) {
600 const double rotation = -pango_gravity_to_rotation(
601 pango_context_get_base_gravity(pango_layout_get_context(layout_)));
603 BoxChar::RotateBoxes(rotation, page_width_ - h_margin_, v_margin_, 0, boxchars->size(),
604 boxchars);
605 } else {
607 }
608}
609
610int StringRenderer::StripUnrenderableWords(std::string *utf8_text) const {
611 std::string output_text;
612 std::string unrenderable_words;
613 const char *text = utf8_text->c_str();
614 size_t offset = 0;
615 int num_dropped = 0;
616 while (offset < utf8_text->length()) {
617 int space_len = SpanUTF8Whitespace(text + offset);
618 output_text.append(text + offset, space_len);
619 offset += space_len;
620 if (offset == utf8_text->length()) {
621 break;
622 }
623
624 int word_len = SpanUTF8NotWhitespace(text + offset);
625 if (font_.CanRenderString(text + offset, word_len)) {
626 output_text.append(text + offset, word_len);
627 } else {
628 ++num_dropped;
629 unrenderable_words.append(text + offset, word_len);
630 unrenderable_words.append(" ");
631 }
632 offset += word_len;
633 }
634 utf8_text->swap(output_text);
635
636 if (num_dropped > 0) {
637 tprintf("Stripped %d unrenderable word(s): '%s'\n", num_dropped, unrenderable_words.c_str());
638 }
639 return num_dropped;
640}
641
642int StringRenderer::RenderToGrayscaleImage(const char *text, int text_length, Image *pix) {
643 Image orig_pix = nullptr;
644 int offset = RenderToImage(text, text_length, &orig_pix);
645 if (orig_pix) {
646 *pix = pixConvertTo8(orig_pix, false);
647 orig_pix.destroy();
648 }
649 return offset;
650}
651
652int StringRenderer::RenderToBinaryImage(const char *text, int text_length, int threshold,
653 Image *pix) {
654 Image orig_pix = nullptr;
655 int offset = RenderToImage(text, text_length, &orig_pix);
656 if (orig_pix) {
657 Image gray_pix = pixConvertTo8(orig_pix, false);
658 orig_pix.destroy();
659 *pix = pixThresholdToBinary(gray_pix, threshold);
660 gray_pix.destroy();
661 } else {
662 *pix = orig_pix;
663 }
664 return offset;
665}
666
667// Add word joiner (WJ) characters between adjacent non-space characters except
668// immediately before a combiner.
669/* static */
670std::string StringRenderer::InsertWordJoiners(const std::string &text) {
671 std::string out_str;
672 const UNICHAR::const_iterator it_end = UNICHAR::end(text.c_str(), text.length());
673 for (UNICHAR::const_iterator it = UNICHAR::begin(text.c_str(), text.length()); it != it_end;
674 ++it) {
675 // Add the symbol to the output string.
676 out_str.append(it.utf8_data(), it.utf8_len());
677 // Check the next symbol.
678 UNICHAR::const_iterator next_it = it;
679 ++next_it;
680 bool next_char_is_boundary = (next_it == it_end || *next_it == ' ');
681 bool next_char_is_combiner = (next_it == it_end) ? false : IsCombiner(*next_it);
682 if (*it != ' ' && *it != '\n' && !next_char_is_boundary && !next_char_is_combiner) {
683 out_str += kWordJoinerUTF8;
684 }
685 }
686 return out_str;
687}
688
689// Convert halfwidth Basic Latin characters to their fullwidth forms.
690std::string StringRenderer::ConvertBasicLatinToFullwidthLatin(const std::string &str) {
691 std::string full_str;
692 const UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
693 for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length()); it != it_end; ++it) {
694 // Convert printable and non-space 7-bit ASCII characters to
695 // their fullwidth forms.
696 if (IsInterchangeValid7BitAscii(*it) && isprint(*it) && !isspace(*it)) {
697 // Convert by adding 0xFEE0 to the codepoint of 7-bit ASCII.
698 char32 full_char = *it + 0xFEE0;
699 full_str.append(EncodeAsUTF8(full_char));
700 } else {
701 full_str.append(it.utf8_data(), it.utf8_len());
702 }
703 }
704 return full_str;
705}
706
707// Convert fullwidth Latin characters to their halfwidth forms.
708std::string StringRenderer::ConvertFullwidthLatinToBasicLatin(const std::string &str) {
709 std::string half_str;
710 UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
711 for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length()); it != it_end; ++it) {
712 char32 half_char = FullwidthToHalfwidth(*it);
713 // Convert fullwidth Latin characters to their halfwidth forms
714 // only if halfwidth forms are printable and non-space 7-bit ASCII.
715 if (IsInterchangeValid7BitAscii(half_char) && isprint(half_char) && !isspace(half_char)) {
716 half_str.append(EncodeAsUTF8(half_char));
717 } else {
718 half_str.append(it.utf8_data(), it.utf8_len());
719 }
720 }
721 return half_str;
722}
723
724// Returns offset to end of text substring rendered in this method.
725int StringRenderer::RenderToImage(const char *text, int text_length, Image *pix) {
726 if (pix && *pix) {
727 pix->destroy();
728 }
730
731 const int page_offset = FindFirstPageBreakOffset(text, text_length);
732 if (!page_offset) {
733 return 0;
734 }
735 start_box_ = boxchars_.size();
736
737 if (!vertical_text_) {
738 // Translate by the specified margin
739 cairo_translate(cr_, h_margin_, v_margin_);
740 } else {
741 // Vertical text rendering is achieved by a two-step process of first
742 // performing regular horizontal layout with character orientation set to
743 // EAST, and then translating and rotating the layout before rendering onto
744 // the desired image surface. The settings required for the former step are
745 // done within InitPangoCairo().
746 //
747 // Translate to the top-right margin of page
748 cairo_translate(cr_, page_width_ - h_margin_, v_margin_);
749 // Rotate the layout
750 double rotation = -pango_gravity_to_rotation(
751 pango_context_get_base_gravity(pango_layout_get_context(layout_)));
752 tlog(2, "Rotating by %f radians\n", rotation);
753 cairo_rotate(cr_, rotation);
754 pango_cairo_update_layout(cr_, layout_);
755 }
756 std::string page_text(text, page_offset);
758 // Convert Basic Latin to their fullwidth forms.
759 page_text = ConvertBasicLatinToFullwidthLatin(page_text);
760 }
762 StripUnrenderableWords(&page_text);
763 }
764 if (drop_uncovered_chars_ && !font_.CoversUTF8Text(page_text.c_str(), page_text.length())) {
765 int num_dropped = font_.DropUncoveredChars(&page_text);
766 if (num_dropped) {
767 tprintf("WARNING: Dropped %d uncovered characters\n", num_dropped);
768 }
769 }
770 if (add_ligatures_) {
771 // Add ligatures wherever possible, including custom ligatures.
772 page_text = LigatureTable::Get()->AddLigatures(page_text, &font_);
773 }
774 if (underline_start_prob_ > 0) {
776 }
777
778 pango_layout_set_text(layout_, page_text.c_str(), page_text.length());
779
780 if (pix) {
781 // Set a white background for the target image surface.
782 cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0); // sets drawing colour to white
783 // Fill the surface with the active colour (if you don't do this, you will
784 // be given a surface with a transparent background to draw on)
785 cairo_paint(cr_);
786 // Set the ink color to black
787 cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]);
788 // If the target surface or transformation properties of the cairo instance
789 // have changed, update the pango layout to reflect this
790 pango_cairo_update_layout(cr_, layout_);
791 {
792 DISABLE_HEAP_LEAK_CHECK; // for Fontconfig
793 // Draw the pango layout onto the cairo surface
794 pango_cairo_show_layout(cr_, layout_);
795 }
796 *pix = CairoARGB32ToPixFormat(surface_);
797 }
800 // Update internal state variables.
801 ++page_;
802 return page_offset;
803}
804
805// Render a string to an image, returning it as an 8 bit pix. Behaves as
806// RenderString, except that it ignores the font set at construction and works
807// through all the fonts, returning 0 until they are exhausted, at which point
808// it returns the value it should have returned all along, but no pix this time.
809// Fonts that don't contain a given proportion of the characters in the string
810// get skipped.
811// Fonts that work each get rendered and the font name gets added
812// to the image.
813// NOTE that no boxes are produced by this function.
814//
815// Example usage: To render a null terminated char-array "txt"
816//
817// int offset = 0;
818// do {
819// Image pix;
820// offset += renderer.RenderAllFontsToImage(min_proportion, txt + offset,
821// strlen(txt + offset), nullptr,
822// &pix);
823// ...
824// } while (offset < strlen(text));
825//
826int StringRenderer::RenderAllFontsToImage(double min_coverage, const char *text, int text_length,
827 std::string *font_used, Image *image) {
828 *image = nullptr;
829 // Select a suitable font to render the title with.
830 const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%";
831 std::string title_font;
832 if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate), &title_font, nullptr)) {
833 tprintf("WARNING: Could not find a font to render image title with!\n");
834 title_font = "Arial";
835 }
836 title_font += " 8";
837 tlog(1, "Selected title font: %s\n", title_font.c_str());
838 if (font_used) {
839 font_used->clear();
840 }
841
842 std::string orig_font = font_.DescriptionName();
843 if (char_map_.empty()) {
844 total_chars_ = 0;
845 // Fill the hash table and use that for computing which fonts to use.
846 for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
847 it != UNICHAR::end(text, text_length); ++it) {
848 ++total_chars_;
849 ++char_map_[*it];
850 }
851 tprintf("Total chars = %d\n", total_chars_);
852 }
853 const std::vector<std::string> &all_fonts = FontUtils::ListAvailableFonts();
854
855 for (size_t i = font_index_; i < all_fonts.size(); ++i) {
856 ++font_index_;
857 int raw_score = 0;
858 int ok_chars = FontUtils::FontScore(char_map_, all_fonts[i], &raw_score, nullptr);
859 if (ok_chars > 0 && ok_chars >= total_chars_ * min_coverage) {
860 set_font(all_fonts[i]);
861 int offset = RenderToBinaryImage(text, text_length, 128, image);
862 ClearBoxes(); // Get rid of them as they are garbage.
863 const int kMaxTitleLength = 1024;
864 char title[kMaxTitleLength];
865 snprintf(title, kMaxTitleLength, kTitleTemplate, all_fonts[i].c_str(), ok_chars,
866 100.0 * ok_chars / total_chars_, raw_score, 100.0 * raw_score / char_map_.size());
867 tprintf("%s\n", title);
868 // This is a good font! Store the offset to return once we've tried all
869 // the fonts.
870 if (offset) {
871 last_offset_ = offset;
872 if (font_used) {
873 *font_used = all_fonts[i];
874 }
875 }
876 // Add the font to the image.
877 set_font(title_font);
878 v_margin_ /= 8;
879 Image title_image = nullptr;
880 RenderToBinaryImage(title, strlen(title), 128, &title_image);
881 *image |= title_image;
882 title_image.destroy();
883
884 v_margin_ *= 8;
885 set_font(orig_font);
886 // We return the real offset only after cycling through the list of fonts.
887 return 0;
888 } else {
889 tprintf("Font %s failed with %d hits = %.2f%%\n", all_fonts[i].c_str(), ok_chars,
890 100.0 * ok_chars / total_chars_);
891 }
892 }
893 font_index_ = 0;
894 char_map_.clear();
895 return last_offset_ == 0 ? -1 : last_offset_;
896}
897
898} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:54
#define ASSERT_HOST_MSG(x,...)
Definition: errcode.h:57
#define DISABLE_HEAP_LEAK_CHECK
#define tlog(level,...)
Definition: tlog.h:36
signed int char32
unsigned int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:237
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
signed int char32
Definition: unichar.h:49
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:276
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:282
unsigned int SpanUTF8NotWhitespace(const char *text)
Definition: normstrngs.cpp:249
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:233
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:209
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:213
const char * utf8_data() const
Definition: unichar.h:133
void destroy()
Definition: image.cpp:32
static void WriteTesseractBoxFile(const std::string &name, int height, const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:324
static void PrepareToWrite(std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:96
static std::string GetTesseractBoxStr(int height, const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:331
static void TranslateBoxes(int xshift, int yshift, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:83
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:302
std::string AddLigatures(const std::string &str, const PangoFontInfo *font) const
static LigatureTable * Get()
void set_resolution(const int resolution)
int DropUncoveredChars(std::string *utf8_text) const
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
bool ParseFontDescriptionName(const std::string &name)
std::string DescriptionName() const
bool CanRenderString(const char *utf8_word, int len, std::vector< std::string > *graphemes) const
static int FontScore(const std::unordered_map< char32, int64_t > &ch_map, const std::string &fontname, int *raw_score, std::vector< bool > *ch_flags)
static bool SelectFont(const char *utf8_word, const int utf8_len, std::string *font_name, std::vector< std::string > *graphemes)
static const std::vector< std::string > & ListAvailableFonts()
static std::string InsertWordJoiners(const std::string &text)
void SetWordUnderlineAttributes(const std::string &page_text)
bool GetClusterStrings(std::vector< std::string > *cluster_text)
void set_features(const char *features)
bool set_font(const std::string &desc)
int RenderToImage(const char *text, int text_length, Image *pix)
void set_underline_start_prob(const double frac)
static std::string ConvertBasicLatinToFullwidthLatin(const std::string &text)
int StripUnrenderableWords(std::string *utf8_text) const
int RenderAllFontsToImage(double min_coverage, const char *text, int text_length, std::string *font_used, Image *pix)
int RenderToBinaryImage(const char *text, int text_length, int threshold, Image *pix)
static std::string ConvertFullwidthLatinToBasicLatin(const std::string &text)
StringRenderer(const std::string &font_desc, int page_width, int page_height)
int FindFirstPageBreakOffset(const char *text, int text_length)
void CorrectBoxPositionsToLayout(std::vector< BoxChar * > *boxchars)
const std::vector< BoxChar * > & GetBoxes() const
void set_resolution(const int resolution)
void set_underline_continuation_prob(const double frac)
int RenderToGrayscaleImage(const char *text, int text_length, Image *pix)
cairo_surface_t * surface_
std::vector< BoxChar * > boxchars_
void WriteAllBoxes(const std::string &filename)
void RotatePageBoxes(float rotation)
std::unordered_map< char32, int64_t > char_map_
PangoUnderline underline_style_