22#include <allheaders.h>
31#include "pango/pango-font.h"
32#include "pango/pango-glyph-item.h"
33#include "unicode/uchar.h"
43#define DISABLE_HEAP_LEAK_CHECK
47static const int kDefaultOutputResolution = 300;
52static const char *kWordJoinerUTF8 =
"\u2060";
54static bool IsCombiner(
int ch) {
55 const int char_type = u_charType(
ch);
56 return ((char_type == U_NON_SPACING_MARK) || (char_type == U_ENCLOSING_MARK) ||
57 (char_type == U_COMBINING_SPACING_MARK));
60static std::string EncodeAsUTF8(
const char32 ch32) {
62 return std::string(uni_ch.utf8(), uni_ch.utf8_len());
66static bool RandBool(
const double prob, TRand *rand) {
73 return rand->UnsignedRand(1.0) < prob;
77static Image CairoARGB32ToPixFormat(cairo_surface_t *surface) {
78 if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
79 printf(
"Unexpected surface format %d\n", cairo_image_surface_get_format(surface));
82 const int width = cairo_image_surface_get_width(surface);
83 const int height = cairo_image_surface_get_height(surface);
84 Image pix = pixCreate(width, height, 32);
85 int byte_stride = cairo_image_surface_get_stride(surface);
87 for (
int i = 0;
i < height; ++
i) {
88 memcpy(
reinterpret_cast<unsigned char *
>(pixGetData(pix) +
i * pixGetWpl(pix)) + 1,
89 cairo_image_surface_get_data(surface) +
i * byte_stride,
90 byte_stride - ((
i == height - 1) ? 1 : 0));
97 , page_width_(page_width)
98 , page_height_(page_height)
101 , pen_color_{0.0, 0.0, 0.0}
104 , vertical_text_(false)
105 , gravity_hint_strong_(false)
106 , render_fullwidth_latin_(false)
107 , underline_start_prob_(0)
108 , underline_continuation_prob_(0)
109 , underline_style_(PANGO_UNDERLINE_SINGLE)
110 , drop_uncovered_chars_(true)
111 , strip_unrenderable_words_(false)
112 , add_ligatures_(false)
113 , output_word_boxes_(false)
120 , page_boxes_(nullptr)
162 PangoContext *context = pango_layout_get_context(
layout_);
163 pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);
165 pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);
167 pango_layout_context_changed(
layout_);
176 PangoFontDescription *desc = pango_font_description_from_string(font_desc.c_str());
178 pango_layout_set_font_description(
layout_, desc);
179 pango_font_description_free(desc);
184 tlog(3,
"max_width = %d, max_height = %d\n", max_width, max_height);
187 swap(max_width, max_height);
189 pango_layout_set_width(
layout_, max_width * PANGO_SCALE);
191 pango_layout_set_wrap(
layout_, PANGO_WRAP_WORD_CHAR);
194 PangoAttrList *attr_list = pango_attr_list_new();
196 PangoAttribute *spacing_attr = pango_attr_letter_spacing_new(
char_spacing_ * PANGO_SCALE);
197 spacing_attr->start_index = 0;
198 spacing_attr->end_index =
static_cast<guint
>(-1);
199 pango_attr_list_change(attr_list, spacing_attr);
204 PangoAttribute *feature_attr = pango_attr_font_features_new(
features_.c_str());
205 pango_attr_list_change(attr_list, feature_attr);
208 pango_layout_set_attributes(
layout_, attr_list);
209 pango_attr_list_unref(attr_list);
235 PangoAttrList *attr_list = pango_layout_get_attributes(
layout_);
237 const char *text = page_text.c_str();
240 bool started_underline =
false;
241 PangoAttribute *und_attr =
nullptr;
243 while (offset < page_text.length()) {
245 if (offset == page_text.length()) {
249 int word_start = offset;
252 if (started_underline) {
256 und_attr->end_index = word_start + word_len;
260 pango_attr_list_insert(attr_list, und_attr);
261 started_underline =
false;
268 und_attr->start_index = word_start;
269 und_attr->end_index = word_start + word_len;
270 started_underline =
true;
274 if (started_underline) {
275 und_attr->end_index = page_text.length();
276 pango_attr_list_insert(attr_list, und_attr);
287 const int max_layout_height =
vertical_text_ ? max_width : max_height;
291 const int kMaxUnicodeBufLength = 15000;
292 for (
int i = 0;
i < kMaxUnicodeBufLength && it != it_end; ++it, ++
i) {
296 tlog(1,
"len = %d buf_len = %d\n", text_length, buf_length);
297 pango_layout_set_text(
layout_, text, buf_length);
299 PangoLayoutIter *line_iter =
nullptr;
302 line_iter = pango_layout_get_iter(
layout_);
304 bool first_page =
true;
306 int offset = buf_length;
309 PangoRectangle line_ink_rect;
310 pango_layout_iter_get_line_extents(line_iter, &line_ink_rect,
nullptr);
311 pango_extents_to_pixels(&line_ink_rect,
nullptr);
312 PangoLayoutLine *line = pango_layout_iter_get_line_readonly(line_iter);
314 page_top = line_ink_rect.y;
317 int line_bottom = line_ink_rect.y + line_ink_rect.height;
318 if (line_bottom - page_top > max_layout_height) {
319 offset = line->start_index;
320 tlog(1,
"Found offset = %d\n", offset);
323 }
while (pango_layout_iter_next_line(line_iter));
324 pango_layout_iter_free(line_iter);
361 std::map<int, std::string> start_byte_to_text;
362 PangoLayoutIter *run_iter = pango_layout_get_iter(
layout_);
363 const char *full_text = pango_layout_get_text(
layout_);
365 PangoLayoutRun *run = pango_layout_iter_get_run_readonly(run_iter);
368 tlog(2,
"Found end of line marker\n");
371 PangoGlyphItemIter cluster_iter;
372 gboolean have_cluster;
373 for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter, run, full_text);
374 have_cluster; have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
375 const int start_byte_index = cluster_iter.start_index;
376 const int end_byte_index = cluster_iter.end_index;
378 std::string(full_text + start_byte_index, end_byte_index - start_byte_index);
380 tlog(2,
"Found whitespace\n");
383 tlog(2,
"start_byte=%d end_byte=%d : '%s'\n", start_byte_index, end_byte_index, text.c_str());
389 start_byte_to_text[start_byte_index] = text;
391 }
while (pango_layout_iter_next_run(run_iter));
392 pango_layout_iter_free(run_iter);
394 cluster_text->clear();
395 for (
auto it = start_byte_to_text.begin(); it != start_byte_to_text.end(); ++it) {
396 cluster_text->push_back(it->second);
398 return !cluster_text->empty();
411static void MergeBoxCharsToWords(std::vector<BoxChar *> *boxchars) {
412 std::vector<BoxChar *> result;
413 bool started_word =
false;
414 for (
auto &boxchar : *boxchars) {
415 if (boxchar->ch() ==
" " || boxchar->box() ==
nullptr) {
416 result.push_back(boxchar);
418 started_word =
false;
425 result.push_back(boxchar);
428 BoxChar *last_boxchar = result.back();
430 const Box *box = boxchar->box();
431 Box *last_box = last_boxchar->mutable_box();
432 int left = std::min(last_box->x, box->x);
433 int right = std::max(last_box->x + last_box->w, box->x + box->w);
434 int top = std::min(last_box->y, box->y);
435 int bottom = std::max(last_box->y + last_box->h, box->y + box->h);
439 if (right - left > last_box->w + 5 * box->w) {
440 tlog(1,
"Found line break after '%s'", last_boxchar->ch().c_str());
443 result.push_back(
new BoxChar(
" ", 1));
444 result.push_back(boxchar);
449 last_boxchar->mutable_ch()->append(boxchar->ch());
451 last_box->w = right - left;
453 last_box->h = bottom - top;
458 boxchars->swap(result);
462 const char *text = pango_layout_get_text(
layout_);
463 PangoLayoutIter *cluster_iter = pango_layout_get_iter(
layout_);
466 std::vector<int> cluster_start_indices;
468 cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));
469 tlog(3,
"Added %d\n", cluster_start_indices.back());
470 }
while (pango_layout_iter_next_cluster(cluster_iter));
471 pango_layout_iter_free(cluster_iter);
472 cluster_start_indices.push_back(strlen(text));
473 tlog(3,
"Added last index %d\n", cluster_start_indices.back());
475 std::sort(cluster_start_indices.begin(), cluster_start_indices.end());
476 std::map<int, int> cluster_start_to_end_index;
477 for (
size_t i = 0;
i + 1 < cluster_start_indices.size(); ++
i) {
478 cluster_start_to_end_index[cluster_start_indices[
i]] = cluster_start_indices[
i + 1];
483 cluster_iter = pango_layout_get_iter(
layout_);
485 std::map<int, BoxChar *> start_byte_to_box;
487 PangoRectangle cluster_rect;
488 pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect,
nullptr);
489 pango_extents_to_pixels(&cluster_rect,
nullptr);
490 const int start_byte_index = pango_layout_iter_get_index(cluster_iter);
491 const int end_byte_index = cluster_start_to_end_index[start_byte_index];
492 std::string cluster_text =
493 std::string(text + start_byte_index, end_byte_index - start_byte_index);
494 if (!cluster_text.empty() && cluster_text[0] ==
'\n') {
495 tlog(2,
"Skipping newlines at start of text.\n");
498 if (!cluster_rect.width || !cluster_rect.height ||
IsUTF8Whitespace(cluster_text.c_str())) {
499 tlog(2,
"Skipping whitespace with boxdim (%d,%d) '%s'\n", cluster_rect.width,
500 cluster_rect.height, cluster_text.c_str());
501 auto *boxchar =
new BoxChar(
" ", 1);
502 boxchar->set_page(
page_);
503 start_byte_to_box[start_byte_index] = boxchar;
507 tlog(2,
"[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n", cluster_rect.x, cluster_rect.y,
508 cluster_rect.width, cluster_rect.height, start_byte_index, end_byte_index,
509 cluster_text.c_str());
510 ASSERT_HOST_MSG(cluster_rect.width,
"cluster_text:%s start_byte_index:%d\n",
511 cluster_text.c_str(), start_byte_index);
512 ASSERT_HOST_MSG(cluster_rect.height,
"cluster_text:%s start_byte_index:%d\n",
513 cluster_text.c_str(), start_byte_index);
515 cluster_rect.x = std::max(0, cluster_rect.x -
box_padding_);
517 cluster_rect.y = std::max(0, cluster_rect.y -
box_padding_);
525 auto *boxchar =
new BoxChar(cluster_text.c_str(), cluster_text.size());
526 boxchar->set_page(
page_);
527 boxchar->AddBox(cluster_rect.x, cluster_rect.y, cluster_rect.width, cluster_rect.height);
528 start_byte_to_box[start_byte_index] = boxchar;
529 }
while (pango_layout_iter_next_cluster(cluster_iter));
530 pango_layout_iter_free(cluster_iter);
538 std::vector<std::string> cluster_text;
540 ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());
542 for (
auto it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it, ++ind) {
543 it->second->mutable_ch()->swap(cluster_text[ind]);
548 std::vector<BoxChar *> page_boxchars;
549 page_boxchars.reserve(start_byte_to_box.size());
551 for (
auto it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it) {
552 if (it->second->ch() == kWordJoinerUTF8) {
556 page_boxchars.push_back(it->second);
562 for (
auto &it : start_byte_to_box) {
565 it.second->mutable_ch()->swap(half);
571 MergeBoxCharsToWords(&page_boxchars);
577 Box *page_box =
nullptr;
578 Boxa *all_boxes =
nullptr;
579 for (
auto &page_boxchar : page_boxchars) {
580 if (page_boxchar->box() ==
nullptr) {
583 if (all_boxes ==
nullptr) {
584 all_boxes = boxaCreate(0);
586 boxaAddBox(all_boxes, page_boxchar->mutable_box(), L_CLONE);
588 if (all_boxes !=
nullptr) {
589 boxaGetExtent(all_boxes,
nullptr,
nullptr, &page_box);
590 boxaDestroy(&all_boxes);
600 const double rotation = -pango_gravity_to_rotation(
601 pango_context_get_base_gravity(pango_layout_get_context(
layout_)));
611 std::string output_text;
612 std::string unrenderable_words;
613 const char *text = utf8_text->c_str();
616 while (offset < utf8_text->length()) {
618 output_text.append(text + offset, space_len);
620 if (offset == utf8_text->length()) {
626 output_text.append(text + offset, word_len);
629 unrenderable_words.append(text + offset, word_len);
630 unrenderable_words.append(
" ");
634 utf8_text->swap(output_text);
636 if (num_dropped > 0) {
637 tprintf(
"Stripped %d unrenderable word(s): '%s'\n", num_dropped, unrenderable_words.c_str());
643 Image orig_pix =
nullptr;
646 *pix = pixConvertTo8(orig_pix,
false);
654 Image orig_pix =
nullptr;
657 Image gray_pix = pixConvertTo8(orig_pix,
false);
659 *pix = pixThresholdToBinary(gray_pix, threshold);
676 out_str.append(it.utf8_data(), it.utf8_len());
680 bool next_char_is_boundary = (next_it == it_end || *next_it ==
' ');
681 bool next_char_is_combiner = (next_it == it_end) ?
false : IsCombiner(*next_it);
682 if (*it !=
' ' && *it !=
'\n' && !next_char_is_boundary && !next_char_is_combiner) {
683 out_str += kWordJoinerUTF8;
691 std::string full_str;
698 char32 full_char = *it + 0xFEE0;
699 full_str.append(EncodeAsUTF8(full_char));
701 full_str.append(it.utf8_data(), it.utf8_len());
709 std::string half_str;
716 half_str.append(EncodeAsUTF8(half_char));
718 half_str.append(it.utf8_data(), it.utf8_len());
750 double rotation = -pango_gravity_to_rotation(
751 pango_context_get_base_gravity(pango_layout_get_context(
layout_)));
752 tlog(2,
"Rotating by %f radians\n", rotation);
753 cairo_rotate(
cr_, rotation);
756 std::string page_text(text, page_offset);
767 tprintf(
"WARNING: Dropped %d uncovered characters\n", num_dropped);
778 pango_layout_set_text(
layout_, page_text.c_str(), page_text.length());
782 cairo_set_source_rgb(
cr_, 1.0, 1.0, 1.0);
796 *pix = CairoARGB32ToPixFormat(
surface_);
827 std::string *font_used,
Image *image) {
830 const char kTitleTemplate[] =
"%s : %d hits = %.2f%%, raw = %d = %.2f%%";
831 std::string title_font;
833 tprintf(
"WARNING: Could not find a font to render image title with!\n");
834 title_font =
"Arial";
837 tlog(1,
"Selected title font: %s\n", title_font.c_str());
859 if (ok_chars > 0 && ok_chars >=
total_chars_ * min_coverage) {
863 const int kMaxTitleLength = 1024;
864 char title[kMaxTitleLength];
865 snprintf(title, kMaxTitleLength, kTitleTemplate, all_fonts[
i].c_str(), ok_chars,
873 *font_used = all_fonts[
i];
879 Image title_image =
nullptr;
881 *image |= title_image;
889 tprintf(
"Font %s failed with %d hits = %.2f%%\n", all_fonts[
i].c_str(), ok_chars,
#define ASSERT_HOST_MSG(x,...)
#define DISABLE_HEAP_LEAK_CHECK
unsigned int SpanUTF8Whitespace(const char *text)
void tprintf(const char *format,...)
bool IsInterchangeValid7BitAscii(const char32 ch)
char32 FullwidthToHalfwidth(const char32 ch)
unsigned int SpanUTF8NotWhitespace(const char *text)
bool IsUTF8Whitespace(const char *text)
static const_iterator begin(const char *utf8_str, int byte_length)
static const_iterator end(const char *utf8_str, int byte_length)
const char * utf8_data() const
static void WriteTesseractBoxFile(const std::string &name, int height, const std::vector< BoxChar * > &boxes)
static void PrepareToWrite(std::vector< BoxChar * > *boxes)
static std::string GetTesseractBoxStr(int height, const std::vector< BoxChar * > &boxes)
static void TranslateBoxes(int xshift, int yshift, std::vector< BoxChar * > *boxes)
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, std::vector< BoxChar * > *boxes)
std::string AddLigatures(const std::string &str, const PangoFontInfo *font) const
static LigatureTable * Get()
void set_resolution(const int resolution)
int DropUncoveredChars(std::string *utf8_text) const
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
bool ParseFontDescriptionName(const std::string &name)
std::string DescriptionName() const
bool CanRenderString(const char *utf8_word, int len, std::vector< std::string > *graphemes) const
static int FontScore(const std::unordered_map< char32, int64_t > &ch_map, const std::string &fontname, int *raw_score, std::vector< bool > *ch_flags)
static bool SelectFont(const char *utf8_word, const int utf8_len, std::string *font_name, std::vector< std::string > *graphemes)
static const std::vector< std::string > & ListAvailableFonts()
static std::string InsertWordJoiners(const std::string &text)
void SetWordUnderlineAttributes(const std::string &page_text)
double underline_continuation_prob_
double underline_start_prob_
bool GetClusterStrings(std::vector< std::string > *cluster_text)
void set_features(const char *features)
bool gravity_hint_strong_
bool set_font(const std::string &desc)
int RenderToImage(const char *text, int text_length, Image *pix)
void set_underline_start_prob(const double frac)
static std::string ConvertBasicLatinToFullwidthLatin(const std::string &text)
void SetLayoutProperties()
int StripUnrenderableWords(std::string *utf8_text) const
int RenderAllFontsToImage(double min_coverage, const char *text, int text_length, std::string *font_used, Image *pix)
int RenderToBinaryImage(const char *text, int text_length, int threshold, Image *pix)
static std::string ConvertFullwidthLatinToBasicLatin(const std::string &text)
StringRenderer(const std::string &font_desc, int page_width, int page_height)
int FindFirstPageBreakOffset(const char *text, int text_length)
void CorrectBoxPositionsToLayout(std::vector< BoxChar * > *boxchars)
std::string GetBoxesStr()
Boxa * GetPageBoxes() const
const std::vector< BoxChar * > & GetBoxes() const
void set_resolution(const int resolution)
bool render_fullwidth_latin_
void set_underline_continuation_prob(const double frac)
int RenderToGrayscaleImage(const char *text, int text_length, Image *pix)
cairo_surface_t * surface_
std::vector< BoxChar * > boxchars_
bool strip_unrenderable_words_
void ComputeClusterBoxes()
void WriteAllBoxes(const std::string &filename)
void RotatePageBoxes(float rotation)
bool drop_uncovered_chars_
std::unordered_map< char32, int64_t > char_map_
PangoUnderline underline_style_