20# include "config_auto.h"
26#include <allheaders.h>
// Scale factor shared by the font objects and the text stream: the font
// objects emit (1000 / kCharWidth) and each word's horizontal stretch (Tz)
// is multiplied by kCharWidth.
174static const int kCharWidth = 2;
// Size in bytes of the scratch buffer that CodepointToUtf16be formats the
// hex digits of one codepoint into (also used as the snprintf bound).
179static const int kMaxBytesPerCodepoint = 20;
187 textonly_ = textonly;
188 offsets_.push_back(0);
191void TessPDFRenderer::AppendPDFObjectDIY(
size_t objectsize) {
192 offsets_.push_back(objectsize + offsets_.back());
196void TessPDFRenderer::AppendPDFObject(
const char *data) {
197 AppendPDFObjectDIY(strlen(data));
204static double prec(
double x) {
205 double kPrecision = 1000.0;
206 double a = round(
x * kPrecision) / kPrecision;
/// Squared Euclidean distance between the points (x1, y1) and (x2, y2).
///
/// The differences and products are evaluated in `long` rather than `int`,
/// so that on platforms where `long` is wider than `int` large coordinate
/// differences no longer overflow before the result is widened.
///
/// @param x1 X coordinate of the first point.
/// @param y1 Y coordinate of the first point.
/// @param x2 X coordinate of the second point.
/// @param y2 Y coordinate of the second point.
/// @return (x2 - x1)^2 + (y2 - y1)^2.
static long dist2(int x1, int y1, int x2, int y2) {
  // Widen before subtracting: both the subtraction and the multiplication
  // can exceed the range of int for large inputs.
  const long dx = static_cast<long>(x2) - static_cast<long>(x1);
  const long dy = static_cast<long>(y2) - static_cast<long>(y1);
  return dx * dx + dy * dy;
}
225static void GetWordBaseline(
int writing_direction,
int ppi,
int height,
int word_x1,
int word_y1,
226 int word_x2,
int word_y2,
int line_x1,
int line_y1,
int line_x2,
227 int line_y2,
double *x0,
double *y0,
double *length) {
229 std::swap(word_x1, word_x2);
230 std::swap(word_y1, word_y2);
237 double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
242 double t = ((px - line_x2) * (line_x2 - line_x1) + (py - line_y2) * (line_y2 - line_y1)) / l2;
243 x = line_x2 + t * (line_x2 - line_x1);
244 y = line_y2 + t * (line_y2 - line_y1);
246 word_length = sqrt(
static_cast<double>(dist2(word_x1, word_y1, word_x2, word_y2)));
247 word_length = word_length * 72.0 / ppi;
249 y = height - (
y * 72.0 / ppi);
253 *length = word_length;
264static void AffineMatrix(
int writing_direction,
int line_x1,
int line_y1,
int line_x2,
int line_y2,
265 double *a,
double *b,
double *c,
double *d) {
267 atan2(
static_cast<double>(line_y1 - line_y2),
static_cast<double>(line_x2 - line_x1));
272 switch (writing_direction) {
292static void ClipBaseline(
int ppi,
int x1,
int y1,
int x2,
int y2,
int *line_x1,
int *line_y1,
293 int *line_x2,
int *line_y2) {
298 int rise = abs(y2 - y1) * 72;
299 int run = abs(x2 - x1) * 72;
300 if (rise < 2 * ppi && 2 * ppi < run) {
301 *line_y1 = *line_y2 = (y1 + y2) / 2;
305static bool CodepointToUtf16be(
int code,
char utf16[kMaxBytesPerCodepoint]) {
306 if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
307 tprintf(
"Dropping invalid codepoint %d\n", code);
310 if (code < 0x10000) {
311 snprintf(utf16, kMaxBytesPerCodepoint,
"%04X", code);
313 int a = code - 0x010000;
314 int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
315 int low_surrogate = (0x03FF & a) + 0xDC00;
316 snprintf(utf16, kMaxBytesPerCodepoint,
"%04X%04X", high_surrogate, low_surrogate);
321char *TessPDFRenderer::GetPDFTextObjects(
TessBaseAPI *api,
double width,
double height) {
322 double ppi = api->GetSourceYResolution();
325 double old_x = 0.0, old_y = 0.0;
326 int old_fontsize = 0;
328 bool new_block =
true;
335 std::stringstream pdf_str;
337 pdf_str.imbue(std::locale::classic());
339 pdf_str.precision(8);
344 pdf_str <<
"q " << prec(width) <<
" 0 0 " << prec(height) <<
" 0 0 cm";
346 pdf_str <<
" /Im1 Do";
355 const std::unique_ptr< ResultIterator> res_it(api->GetIterator());
357 if (res_it->IsAtBeginningOf(
RIL_BLOCK)) {
358 auto block_type = res_it->BlockType();
364 pdf_str <<
"BT\n3 Tr";
372 ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
386 res_it->Orientation(&orientation, &writing_direction, &textline_order, &deskew_angle);
388 switch (res_it->WordDirection()) {
396 writing_direction = old_writing_direction;
402 double x,
y, word_length;
404 int word_x1, word_y1, word_x2, word_y2;
405 res_it->Baseline(
RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
406 GetWordBaseline(writing_direction, ppi, height, word_x1, word_y1, word_x2, word_y2, line_x1,
407 line_y1, line_x2, line_y2, &
x, &
y, &word_length);
410 if (writing_direction != old_writing_direction || new_block) {
411 AffineMatrix(writing_direction, line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
412 pdf_str <<
" " << prec(a)
421 double dx =
x - old_x;
422 double dy =
y - old_y;
423 pdf_str <<
" " << prec(dx * a + dy * b) <<
" " << prec(dx * c + dy * d)
428 old_writing_direction = writing_direction;
435 bool bold, italic, underlined, monospace, serif, smallcaps;
437 res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps,
438 &fontsize, &font_id);
439 const int kDefaultFontsize = 8;
441 fontsize = kDefaultFontsize;
443 if (fontsize != old_fontsize) {
444 pdf_str <<
"/f-0-0 " << fontsize <<
" Tf ";
445 old_fontsize = fontsize;
451 std::string pdf_word;
452 int pdf_word_len = 0;
454 const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(
RIL_SYMBOL));
455 if (grapheme && grapheme[0] !=
'\0') {
457 char utf16[kMaxBytesPerCodepoint];
458 for (
char32 code : unicodes) {
459 if (CodepointToUtf16be(code, utf16)) {
467 if (res_it->IsAtBeginningOf(
RIL_WORD)) {
471 if (word_length > 0 && pdf_word_len > 0) {
472 double h_stretch = kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
473 pdf_str << h_stretch <<
" Tz"
474 <<
" [ <" << pdf_word
477 if (last_word_in_line) {
480 if (last_word_in_block) {
484 const std::string &text = pdf_str.str();
485 char *result =
new char[text.length() + 1];
486 strcpy(result, text.c_str());
491 AppendPDFObject(
"%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");
510 " /BaseFont /GlyphLessFont\n"
511 " /DescendantFonts [ 4 0 R ]\n"
512 " /Encoding /Identity-H\n"
514 " /ToUnicode 6 0 R\n"
520 std::stringstream stream;
522 stream.imbue(std::locale::classic());
523 stream <<
"4 0 obj\n"
525 " /BaseFont /GlyphLessFont\n"
526 " /CIDToGIDMap 5 0 R\n"
529 " /Ordering (Identity)\n"
530 " /Registry (Adobe)\n"
533 " /FontDescriptor 7 0 R\n"
534 " /Subtype /CIDFontType2\n"
537 << (1000 / kCharWidth)
541 AppendPDFObject(stream.str().c_str());
544 const int kCIDToGIDMapSize = 2 * (1 << 16);
545 const std::unique_ptr<unsigned char[]> cidtogidmap(
new unsigned char[kCIDToGIDMapSize]);
546 for (
int i = 0;
i < kCIDToGIDMapSize;
i++) {
547 cidtogidmap[
i] = (
i % 2) ? 1 : 0;
550 unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
552 stream <<
"5 0 obj\n"
556 <<
" /Filter /FlateDecode\n"
560 long objsize = stream.str().size();
561 AppendData(
reinterpret_cast<char *
>(comp), len);
564 const char *endstream_endobj =
568 objsize += strlen(endstream_endobj);
569 AppendPDFObjectDIY(objsize);
571 const char stream2[] =
572 "/CIDInit /ProcSet findresource begin\n"
577 " /Registry (Adobe)\n"
581 "/CMapName /Adobe-Identify-UCS def\n"
583 "1 begincodespacerange\n"
585 "endcodespacerange\n"
587 "<0000> <FFFF> <0000>\n"
590 "CMapName currentdict /CMap defineresource pop\n"
596 stream <<
"6 0 obj\n"
598 << (
sizeof(stream2) - 1)
604 AppendPDFObject(stream.str().c_str());
608 stream <<
"7 0 obj\n"
615 << (1000 / kCharWidth)
617 " /FontFile2 8 0 R\n"
618 " /FontName /GlyphLessFont\n"
621 " /Type /FontDescriptor\n"
624 AppendPDFObject(stream.str().c_str());
627 stream << datadir_.c_str() <<
"/pdf.ttf";
629 std::ifstream input(stream.str().c_str(), std::ios::in | std::ios::binary);
630 std::vector<unsigned char> buffer(std::istreambuf_iterator<char>(input), {});
631 auto size = buffer.size();
633 font = buffer.data();
636 tprintf(
"Cannot open file \"%s\"!\nUsing internal glyphless font.\n", stream.str().c_str());
639 size =
sizeof(pdf_ttf);
644 stream <<
"8 0 obj\n"
655 objsize = stream.str().size();
656 AppendData(
reinterpret_cast<const char *
>(font), size);
659 objsize += strlen(endstream_endobj);
660 AppendPDFObjectDIY(objsize);
664bool TessPDFRenderer::imageToPDFObj(Pix *pix,
const char *filename,
long int objnum,
665 char **pdf_object,
long int *pdf_object_size,
666 const int jpg_quality) {
667 if (!pdf_object_size || !pdf_object) {
670 *pdf_object =
nullptr;
671 *pdf_object_size = 0;
672 if (!filename && !pix) {
676 L_Compressed_Data *cid =
nullptr;
679 if (pixGetInputFormat(pix) == IFF_PNG) {
680 sad = pixGenerateCIData(pix, L_FLATE_ENCODE, 0, 0, &cid);
683 sad = l_generateCIDataForPdf(filename, pix, jpg_quality, &cid);
687 l_CIDataDestroy(&cid);
691 const char *group4 =
"";
695 filter =
"/FlateDecode";
698 filter =
"/DCTDecode";
701 filter =
"/CCITTFaxDecode";
705 filter =
"/JPXDecode";
708 l_CIDataDestroy(&cid);
715 std::stringstream colorspace;
717 colorspace.imbue(std::locale::classic());
718 if (cid->ncolors > 0) {
719 colorspace <<
" /ColorSpace [ /Indexed /DeviceRGB " << (cid->ncolors - 1) <<
" "
720 << cid->cmapdatahex <<
" ]\n";
724 if (cid->bps == 1 && pixGetInputFormat(pix) == IFF_PNG) {
726 " /ColorSpace /DeviceGray\n"
729 colorspace.str(
" /ColorSpace /DeviceGray\n");
733 colorspace.str(
" /ColorSpace /DeviceRGB\n");
736 l_CIDataDestroy(&cid);
741 int predictor = (cid->predictor) ? 14 : 1;
744 std::stringstream b1;
746 b1.imbue(std::locale::classic());
753 " /Subtype /Image\n";
755 std::stringstream b2;
757 b2.imbue(std::locale::classic());
758 b2 <<
" /Width " << cid->w
763 " /BitsPerComponent "
776 << group4 <<
" /Columns " << cid->w
778 " /BitsPerComponent "
789 size_t b1_len = b1.str().size();
790 size_t b2_len = b2.str().size();
791 size_t b3_len = strlen(b3);
792 size_t colorspace_len = colorspace.str().size();
794 *pdf_object_size = b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len;
795 *pdf_object =
new char[*pdf_object_size];
797 char *
p = *pdf_object;
798 memcpy(
p, b1.str().c_str(), b1_len);
800 memcpy(
p, colorspace.str().c_str(), colorspace_len);
802 memcpy(
p, b2.str().c_str(), b2_len);
804 memcpy(
p, cid->datacomp, cid->nbytescomp);
805 p += cid->nbytescomp;
806 memcpy(
p, b3, b3_len);
807 l_CIDataDestroy(&cid);
815 if (!pix || ppi <= 0) {
818 double width = pixGetWidth(pix) * 72.0 / ppi;
819 double height = pixGetHeight(pix) * 72.0 / ppi;
821 std::stringstream xobject;
823 xobject.imbue(std::locale::classic());
825 xobject <<
"/XObject << /Im1 " << (obj_ + 2) <<
" 0 R >>\n";
829 std::stringstream stream;
831 stream.imbue(std::locale::classic());
833 stream << std::fixed << obj_
839 << width <<
" " << height
848 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
849 " /Font << /f-0-0 3 0 R >>\n"
853 pages_.push_back(obj_);
854 AppendPDFObject(stream.str().c_str());
857 const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
858 const size_t pdftext_len = strlen(pdftext.get());
860 unsigned char *comp_pdftext =
861 zlibCompress(
reinterpret_cast<unsigned char *
>(pdftext.get()), pdftext_len, &len);
862 long comp_pdftext_len = len;
869 <<
" /Filter /FlateDecode\n"
873 long objsize = stream.str().size();
874 AppendData(
reinterpret_cast<char *
>(comp_pdftext), comp_pdftext_len);
875 objsize += comp_pdftext_len;
876 lept_free(comp_pdftext);
881 objsize += strlen(b2);
882 AppendPDFObjectDIY(objsize);
885 char *pdf_object =
nullptr;
888 if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize, jpg_quality)) {
892 AppendPDFObjectDIY(objsize);
906 const long int kPagesObjectNumber = 2;
907 offsets_[kPagesObjectNumber] = offsets_.back();
908 std::stringstream stream;
910 stream.imbue(std::locale::classic());
911 stream << kPagesObjectNumber <<
" 0 obj\n<<\n /Type /Pages\n /Kids [ ";
913 size_t pages_objsize = stream.str().size();
914 for (
const auto &page : pages_) {
916 stream << page <<
" 0 R ";
918 pages_objsize += stream.str().size();
921 stream <<
"]\n /Count " << pages_.size() <<
"\n>>\nendobj\n";
923 pages_objsize += stream.str().size();
924 offsets_.back() += pages_objsize;
927 std::string utf16_title =
"FEFF";
929 char utf16[kMaxBytesPerCodepoint];
930 for (
char32 code : unicodes) {
931 if (CodepointToUtf16be(code, utf16)) {
932 utf16_title += utf16;
936 char *datestr = l_getFormattedDate();
941 " /Producer (Tesseract "
948 << utf16_title.c_str()
953 AppendPDFObject(stream.str().c_str());
955 stream <<
"xref\n0 " << obj_ <<
"\n0000000000 65535 f \n";
957 for (
int i = 1;
i < obj_;
i++) {
961 stream << offsets_[
i] <<
" 00000 n \n";
965 stream <<
"trailer\n<<\n /Size " << obj_
972 << offsets_.back() <<
"\n%%EOF\n";
struct TessBaseAPI TessBaseAPI
void tprintf(const char *format,...)
@ WRITING_DIRECTION_TOP_TO_BOTTOM
@ WRITING_DIRECTION_LEFT_TO_RIGHT
@ WRITING_DIRECTION_RIGHT_TO_LEFT
bool PTIsTextType(PolyBlockType type)
const char * GetInputName()
bool GetIntVariable(const char *name, int *value) const
static const char * Version()
int GetSourceYResolution()
void AppendString(const char *s)
const char * title() const
void AppendData(const char *s, int len)
bool EndDocumentHandler() override
bool BeginDocumentHandler() override
TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly=false)
bool AddImageHandler(TessBaseAPI *api) override
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)