// File-scope includes and sizing constants.
// kBasicBufSize is the length of the stack char buffers that the snprintf
// calls further down format into. kCharWidth multiplies the per-word value
// computed from word_length, fontsize and pdf_word_len. kMaxBytesPerCodepoint
// is the length of the utf16 hex-text buffers.
20 #include "config_auto.h" 24 #include "allheaders.h" 169 static const int kBasicBufSize = 2048;
172 static const int kCharWidth = 2;
177 static const int kMaxBytesPerCodepoint = 20;
// Keep the caller's text-only choice. textonly_ is read later to pick between
// an empty string and the "/XObject << /Im1 ... >>" resource text.
// NOTE(review): presumably this sits in the TessPDFRenderer constructor; the
// enclosing header is not visible in this excerpt.
188 textonly_ = textonly;
// Takes the size in bytes of one PDF object. The body is not visible in this
// excerpt; callers below pass either strlen() of a formatted buffer or a
// running total accumulated while an object is written piece by piece.
192 void TessPDFRenderer::AppendPDFObjectDIY(
size_t objectsize) {
// Wrapper for NUL-terminated object text: reports strlen(data) to
// AppendPDFObjectDIY. Any other statements in the body are not visible here.
197 void TessPDFRenderer::AppendPDFObject(
const char *data) {
198 AppendPDFObjectDIY(strlen(data));
// Rounds x to three decimal places: scale by kPrecision (1000), round to the
// nearest integer, then scale back down.
206 double kPrecision = 1000.0;
207 double a = round(x * kPrecision) / kPrecision;
// Squared Euclidean distance between (x1, y1) and (x2, y2).
// NOTE(review): both products and their sum are evaluated in int and only
// then converted to the long return type, so very large coordinate
// differences could overflow before widening. Confirm inputs stay small
// enough, or widen the operands first.
213 long dist2(
int x1,
int y1,
int x2,
int y2) {
214 return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1);
// Parameter tail and body excerpts of the routine that derives a word's
// baseline origin and length. The parameter list matches GetWordBaseline:
// word and line endpoints come in as ints, results leave through x0, y0 and
// length. The first line of the signature is not visible in this excerpt.
226 int word_x1,
int word_y1,
int word_x2,
int word_y2,
227 int line_x1,
int line_y1,
int line_x2,
int line_y2,
228 double *x0,
double *y0,
double *length) {
// Exchange the word's two endpoints.
// NOTE(review): the condition guarding this swap, if any, is not visible.
230 Swap(&word_x1, &word_x2);
231 Swap(&word_y1, &word_y2);
// Squared length of the text-line segment; used as the divisor below.
238 double l2 =
dist2(line_x1, line_y1, line_x2, line_y2);
// t is the dot product of (p - line end 2) with (line end 2 - line end 1),
// divided by l2. (x, y) is line end 2 advanced by t along that direction.
243 double t = ((px - line_x2) * (line_x2 - line_x1) +
244 (py - line_y2) * (line_y2 - line_y1)) / l2;
245 x = line_x2 + t * (line_x2 - line_x1);
246 y = line_y2 + t * (line_y2 - line_y1);
// Word length: square root of a dist2() result over the word endpoints...
248 word_length = sqrt(static_cast<double>(
dist2(word_x1, word_y1,
// ...then rescaled by 72 / ppi.
250 word_length = word_length * 72.0 / ppi;
// y gets the same 72 / ppi scaling and is subtracted from height, which
// flips the direction of the vertical axis.
252 y = height - (y * 72.0 / ppi);
256 *length = word_length;
// Parameter tail and opening statements of the routine that fills the four
// matrix coefficients a, b, c, d from a text line's endpoints. The parameter
// list matches AffineMatrix; its first signature line is not visible here.
268 int line_x1,
int line_y1,
int line_x2,
int line_y2,
269 double *a,
double *b,
double *c,
double *d) {
// Angle of the line via atan2. The y operand is line_y1 - line_y2 while the x
// operand is line_x2 - line_x1 (opposite subtraction order) - presumably
// because image y grows downward; TODO confirm.
270 double theta = atan2(static_cast<double>(line_y1 - line_y2),
271 static_cast<double>(line_x2 - line_x1));
// Per-writing-direction handling; the case labels are not visible here.
276 switch(writing_direction) {
// Tail of the output-parameter list and the flattening rule of the baseline
// clipping routine (parameters match ClipBaseline; the first signature line
// and the initial assignments to the outputs are not visible here).
297 int *line_x1,
int *line_y1,
298 int *line_x2,
int *line_y2) {
// rise and run are the absolute vertical and horizontal extents scaled by
// 72 / ppi.
// NOTE(review): if x1, y1, x2, y2 and ppi are all int (as the ClipBaseline
// signature listed for this file indicates), the whole right-hand side is
// integer arithmetic and the division truncates before the value is stored
// in the double - confirm that is intended.
303 double rise = abs(y2 - y1) * 72 / ppi;
304 double run = abs(x2 - x1) * 72 / ppi;
// A baseline with rise below 2 and run above 2 is treated as horizontal: both
// y outputs are set to the integer midpoint of y1 and y2.
305 if (rise < 2.0 && 2.0 < run)
306 *line_y1 = *line_y2 = (y1 + y2) / 2;
// Body excerpts of the codepoint-to-hex-UTF-16 conversion (the header and the
// return statements are not visible in this excerpt).
// Reject values inside the surrogate range 0xD800-0xDFFF and values above
// 0x10FFFF, logging the rejected codepoint.
310 if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) {
311 tprintf(
"Dropping invalid codepoint %d\n", code);
// Codepoints below 0x10000 are written as a single 4-hex-digit unit.
314 if (code < 0x10000) {
315 snprintf(utf16, kMaxBytesPerCodepoint,
"%04X", code);
// Otherwise build a surrogate pair: subtract 0x10000, put the upper 10 bits
// on top of 0xD800 and the lower 10 bits on top of 0xDC00, and write both
// units as 4 hex digits each.
317 int a = code - 0x010000;
318 int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
319 int low_surrogate = (0x03FF & a) + 0xDC00;
320 snprintf(utf16, kMaxBytesPerCodepoint,
321 "%04X%04X", high_surrogate, low_surrogate);
// Excerpts of the routine that builds a page's text content stream in pdf_str
// and hands back a heap copy of it. NOTE(review): presumably this is
// GetPDFTextObjects (called elsewhere with api, width, height); the first
// signature line is not visible here.
327 double width,
double height) {
// State carried from one word to the next: previous position, previous font
// size, and whether the next word starts a new block.
332 double old_x = 0.0, old_y = 0.0;
333 int old_fontsize = 0;
336 bool new_block =
true;
// Literal operator text appended to the stream: " 0 0 cm" and " /Im1 Do"
// belong to the image placement, "BT\n3 Tr" opens a text object and sets text
// render mode 3 (operator meanings per the PDF reference). The conditions
// around these appends are not visible here.
350 pdf_str +=
" 0 0 cm";
352 pdf_str +=
" /Im1 Do";
364 pdf_str +=
"BT\n3 Tr";
// Baseline endpoints go through ClipBaseline, which writes line_x1..line_y2.
372 ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
386 res_it->
Orientation(&orientation, &writing_direction,
387 &textline_order, &deskew_angle);
397 writing_direction = old_writing_direction;
403 double x, y, word_length;
405 int word_x1, word_y1, word_x2, word_y2;
408 word_x1, word_y1, word_x2, word_y2,
409 line_x1, line_y1, line_x2, line_y2,
410 &x, &y, &word_length);
// The matrix coefficients are recomputed only when the writing direction
// changed or a new block began.
413 if (writing_direction != old_writing_direction || new_block) {
415 line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
// Offset of this word relative to the previously recorded position.
425 double dx = x - old_x;
426 double dy = y - old_y;
433 old_writing_direction = writing_direction;
440 bool bold, italic, underlined, monospace, serif, smallcaps;
443 &serif, &smallcaps, &fontsize, &font_id);
// Fallback font size of 8; the condition that triggers it is not visible.
444 const int kDefaultFontsize = 8;
446 fontsize = kDefaultFontsize;
// A font-selection operator is formatted only when the size differs from the
// previous word's size.
447 if (fontsize != old_fontsize) {
449 snprintf(textfont,
sizeof(textfont),
"/f-0-0 %d Tf ", fontsize);
451 old_fontsize = fontsize;
458 int pdf_word_len = 0;
460 const std::unique_ptr<const char[]> grapheme(
462 if (grapheme && grapheme[0] !=
'\0') {
464 char utf16[kMaxBytesPerCodepoint];
465 for (
char32 code : unicodes) {
// All three factors must be positive, which also keeps the divisor
// fontsize * pdf_word_len non-zero.
474 if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) {
476 kCharWidth *
prec(100.0 * word_length / (fontsize * pdf_word_len));
483 if (last_word_in_line) {
486 if (last_word_in_block) {
// Copy the accumulated stream into a new[] buffer sized length() + 1.
// NOTE(review): ret is presumably returned; the caller seen in this file
// wraps the result in std::unique_ptr<char[]>.
490 char *ret =
new char[pdf_str.
length() + 1];
491 strcpy(ret, pdf_str.
string());
// Excerpts of the routine that emits the fixed objects at the start of the
// document: font dictionaries naming /GlyphLessFont, a CIDToGIDMap stream, a
// CMap stream, a font descriptor and the embedded pdf.ttf font file.
// NOTE(review): presumably BeginDocumentHandler; the header is not visible.
// Every object is formatted into buf with snprintf; whenever the result does
// not fit (n >= sizeof(buf)) the routine bails out.
497 char buf[kBasicBufSize];
// Four byte values are passed to the first format; the format string itself
// is not visible in this excerpt.
500 n = snprintf(buf,
sizeof(buf),
503 0xDE, 0xAD, 0xBE, 0xEB);
504 if (n >=
sizeof(buf))
return false;
505 AppendPDFObject(buf);
508 n = snprintf(buf,
sizeof(buf),
516 if (n >=
sizeof(buf))
return false;
517 AppendPDFObject(buf);
525 n = snprintf(buf,
sizeof(buf),
528 " /BaseFont /GlyphLessFont\n" 529 " /DescendantFonts [ %ld 0 R ]\n" 530 " /Encoding /Identity-H\n" 532 " /ToUnicode %ld 0 R\n" 539 if (n >=
sizeof(buf))
return false;
540 AppendPDFObject(buf);
543 n = snprintf(buf,
sizeof(buf),
546 " /BaseFont /GlyphLessFont\n" 547 " /CIDToGIDMap %ld 0 R\n" 550 " /Ordering (Identity)\n" 551 " /Registry (Adobe)\n" 554 " /FontDescriptor %ld 0 R\n" 555 " /Subtype /CIDFontType2\n" 563 if (n >=
sizeof(buf))
return false;
564 AppendPDFObject(buf);
// CIDToGIDMap payload: 2 * 65536 bytes, 0 at even indices and 1 at odd
// indices, i.e. the byte pair 00 01 repeated. It is compressed before being
// written.
567 const int kCIDToGIDMapSize = 2 * (1 << 16);
568 const std::unique_ptr<unsigned char[]> cidtogidmap(
569 new unsigned char[kCIDToGIDMapSize]);
570 for (
int i = 0; i < kCIDToGIDMapSize; i++) {
571 cidtogidmap[i] = (i % 2) ? 1 : 0;
574 unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
575 n = snprintf(buf,
sizeof(buf),
578 " /Length %lu /Filter /FlateDecode\n" 582 if (n >=
sizeof(buf)) {
// The stream object is written in pieces, so its total size is accumulated in
// objsize and reported once through AppendPDFObjectDIY.
587 long objsize = strlen(buf);
588 AppendData(reinterpret_cast<char *>(comp), len);
591 const char *endstream_endobj =
595 objsize += strlen(endstream_endobj);
596 AppendPDFObjectDIY(objsize);
// NOTE(review): the CMap name below is spelled "Adobe-Identify-UCS" while
// other strings in this routine use "Identity". It is emitted text and is
// left untouched here - confirm whether the spelling is intentional.
599 "/CIDInit /ProcSet findresource begin\n" 604 " /Registry (Adobe)\n" 608 "/CMapName /Adobe-Identify-UCS def\n" 610 "1 begincodespacerange\n" 612 "endcodespacerange\n" 614 "<0000> <FFFF> <0000>\n" 617 "CMapName currentdict /CMap defineresource pop\n" 622 n = snprintf(buf,
sizeof(buf),
624 "<< /Length %lu >>\n" 628 "endobj\n", (
unsigned long) strlen(stream), stream);
629 if (n >=
sizeof(buf))
return false;
630 AppendPDFObject(buf);
633 n = snprintf(buf,
sizeof(buf),
640 " /FontBBox [ 0 0 %d %d ]\n" 641 " /FontFile2 %ld 0 R\n" 642 " /FontName /GlyphLessFont\n" 645 " /Type /FontDescriptor\n" 654 if (n >=
sizeof(buf))
return false;
655 AppendPDFObject(buf);
// The font file path is datadir_ followed by "/pdf.ttf"; the file is opened
// in binary read mode.
657 n = snprintf(buf,
sizeof(buf),
"%s/pdf.ttf", datadir_);
658 if (n >=
sizeof(buf))
return false;
659 FILE *fp = fopen(buf,
"rb");
661 tprintf(
"Can not open file \"%s\"!\n", buf);
// File size is taken by seeking to the end and reading the position, then
// the position is reset to the start before reading.
664 fseek(fp, 0, SEEK_END);
665 long int size = ftell(fp);
670 fseek(fp, 0, SEEK_SET);
671 const std::unique_ptr<char[]> buffer(
new char[size]);
// A short read (fewer than size bytes) takes the branch below; its body is
// not visible in this excerpt.
672 if (fread(buffer.get(), 1, size, fp) != static_cast<size_t>(size)) {
678 n = snprintf(buf,
sizeof(buf),
684 "stream\n", size, size);
685 if (n >=
sizeof(buf)) {
689 objsize = strlen(buf);
693 objsize += strlen(endstream_endobj);
694 AppendPDFObjectDIY(objsize);
// Excerpts of imageToPDFObj: encodes pix into compressed image data, formats
// the surrounding dictionary text, and assembles everything into a new[]
// buffer handed back through *pdf_object with its length in
// *pdf_object_size. Part of the parameter list is not visible here.
// l_CIDataDestroy(&cid) is called on each visible error branch and once at
// the end after the data has been copied out.
698 bool TessPDFRenderer::imageToPDFObj(Pix *pix,
702 long int *pdf_object_size) {
704 char b0[kBasicBufSize];
705 char b1[kBasicBufSize];
706 char b2[kBasicBufSize];
// Both output pointers are required; the size output starts at 0.
707 if (!pdf_object_size || !pdf_object)
710 *pdf_object_size = 0;
714 L_Compressed_Data *cid = NULL;
715 const int kJpegQuality = 85;
// A PNG with 4 samples per pixel is blended with pixAlphaBlendUniform against
// 0xffffff00 and flate-encoded. The other visible path calls
// l_generateCIDataForPdf with quality 85.
// NOTE(review): the release of p1 is not visible in this excerpt.
718 findFileFormat(filename, &format);
719 if (pixGetSpp(pix) == 4 && format == IFF_PNG) {
720 Pix *p1 = pixAlphaBlendUniform(pix, 0xffffff00);
721 sad = pixGenerateCIData(p1, L_FLATE_ENCODE, 0, 0, &cid);
724 sad = l_generateCIDataForPdf(filename, pix, kJpegQuality, &cid);
728 l_CIDataDestroy(&cid);
// One of four filter names is selected; the selecting conditions are not
// visible here.
732 const char *group4 =
"";
736 filter =
"/FlateDecode";
739 filter =
"/DCTDecode";
742 filter =
"/CCITTFaxDecode";
746 filter =
"/JPXDecode";
749 l_CIDataDestroy(&cid);
// Colour space text: an indexed entry built from the colormap when
// cid->ncolors > 0, otherwise one of the fixed DeviceGray / DeviceRGB strings.
756 const char *colorspace;
757 if (cid->ncolors > 0) {
758 n = snprintf(b0,
sizeof(b0),
759 " /ColorSpace [ /Indexed /DeviceRGB %d %s ]\n",
760 cid->ncolors - 1, cid->cmapdatahex);
761 if (n >=
sizeof(b0)) {
762 l_CIDataDestroy(&cid);
769 colorspace =
" /ColorSpace /DeviceGray\n";
772 colorspace =
" /ColorSpace /DeviceRGB\n";
775 l_CIDataDestroy(&cid);
// Predictor value written into the dictionary: 14 when cid->predictor is set,
// otherwise 1.
780 int predictor = (cid->predictor) ? 14 : 1;
783 n = snprintf(b1,
sizeof(b1),
787 " /Subtype /Image\n",
788 objnum, (
unsigned long) cid->nbytescomp);
789 if (n >=
sizeof(b1)) {
790 l_CIDataDestroy(&cid);
794 n = snprintf(b2,
sizeof(b2),
797 " /BitsPerComponent %d\n" 805 " /BitsPerComponent %d\n" 809 cid->w, cid->h, cid->bps, filter, predictor, cid->spp,
810 group4, cid->w, cid->bps);
811 if (n >=
sizeof(b2)) {
812 l_CIDataDestroy(&cid);
// Final layout: b1, colorspace, b2, the compressed bytes, then b3. The total
// is the sum of those five lengths.
820 size_t b1_len = strlen(b1);
821 size_t b2_len = strlen(b2);
822 size_t b3_len = strlen(b3);
823 size_t colorspace_len = strlen(colorspace);
826 b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len;
827 *pdf_object =
new char[*pdf_object_size];
829 char *p = *pdf_object;
830 memcpy(p, b1, b1_len);
832 memcpy(p, colorspace, colorspace_len);
834 memcpy(p, b2, b2_len);
836 memcpy(p, cid->datacomp, cid->nbytescomp);
837 p += cid->nbytescomp;
838 memcpy(p, b3, b3_len);
839 l_CIDataDestroy(&cid);
// Excerpts of the per-page routine: writes the page object, the compressed
// text content stream, and the image object.
// NOTE(review): presumably AddImageHandler; the header is not visible here.
845 char buf[kBasicBufSize];
846 char buf2[kBasicBufSize];
// A missing image or a non-positive resolution is rejected up front; the
// statement taken on rejection is not visible in this excerpt.
850 if (!pix || ppi <= 0)
// Page size: pixel dimensions scaled by 72 / ppi.
852 double width = pixGetWidth(pix) * 72.0 / ppi;
853 double height = pixGetHeight(pix) * 72.0 / ppi;
// The image XObject reference (object number obj_ + 2) is formatted
// unconditionally but replaced by an empty string when textonly_ is set.
855 snprintf(buf2,
sizeof(buf2),
"/XObject << /Im1 %ld 0 R >>\n", obj_ + 2);
856 const char *xobject = (textonly_) ?
"" : buf2;
859 n = snprintf(buf,
sizeof(buf),
864 " /MediaBox [0 0 %.2f %.2f]\n" 865 " /Contents %ld 0 R\n" 869 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n" 870 " /Font << /f-0-0 %ld 0 R >>\n" 880 if (n >=
sizeof(buf))
return false;
882 AppendPDFObject(buf);
// The text stream comes back as a heap buffer owned by pdftext, and is
// compressed with zlibCompress before being written.
885 const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
886 const size_t pdftext_len = strlen(pdftext.get());
888 unsigned char *comp_pdftext = zlibCompress(
889 reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
890 long comp_pdftext_len = len;
891 n = snprintf(buf,
sizeof(buf),
894 " /Length %ld /Filter /FlateDecode\n" 896 "stream\n", obj_, comp_pdftext_len);
// The compressed buffer is released with lept_free on the overflow branch and
// again on the normal path once its bytes have been appended.
897 if (n >=
sizeof(buf)) {
898 lept_free(comp_pdftext);
// The object is written in pieces, so its size is accumulated in objsize and
// reported once through AppendPDFObjectDIY.
902 long objsize = strlen(buf);
903 AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
904 objsize += comp_pdftext_len;
905 lept_free(comp_pdftext);
910 objsize += strlen(b2);
911 AppendPDFObjectDIY(objsize);
// Image object: imageToPDFObj fills pdf_object and objsize.
// NOTE(review): the release of pdf_object is not visible in this excerpt.
914 char *pdf_object =
nullptr;
915 if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
919 AppendPDFObjectDIY(objsize);
// Excerpts of the routine that closes the document: the pages object, an
// object carrying producer / creation date / title, and the offset table.
// NOTE(review): presumably EndDocumentHandler; the header is not visible.
// Each snprintf result is checked against sizeof(buf) and the routine returns
// false when the text did not fit.
928 char buf[kBasicBufSize];
// The pages object has the fixed number 2; its offset slot is set to the
// current last entry of offsets_.
937 const long int kPagesObjectNumber = 2;
938 offsets_[kPagesObjectNumber] = offsets_.
back();
939 n = snprintf(buf,
sizeof(buf),
943 " /Kids [ ", kPagesObjectNumber);
944 if (n >=
sizeof(buf))
return false;
// The object is emitted in pieces (opening text, one "N 0 R " reference per
// entry of pages_, closing text); pages_objsize keeps the running total.
946 size_t pages_objsize = strlen(buf);
948 n = snprintf(buf,
sizeof(buf),
949 "%ld 0 R ", pages_[i]);
950 if (n >=
sizeof(buf))
return false;
952 pages_objsize += strlen(buf);
954 n = snprintf(buf,
sizeof(buf),
958 "endobj\n", pages_.
size());
959 if (n >=
sizeof(buf))
return false;
961 pages_objsize += strlen(buf);
// Advance the last recorded offset by the size of the pages object.
962 offsets_.
back() += pages_objsize;
// The title is written as hex text: the literal "FEFF" followed by the utf16
// hex string produced for each codepoint.
965 STRING utf16_title =
"FEFF";
967 char utf16[kMaxBytesPerCodepoint];
968 for (
char32 code : unicodes) {
970 utf16_title += utf16;
// NOTE(review): datestr comes from l_getFormattedDate(); its release is not
// visible in this excerpt - verify it happens before the early return below.
974 char* datestr = l_getFormattedDate();
975 n = snprintf(buf,
sizeof(buf),
978 " /Producer (Tesseract %s)\n" 979 " /CreationDate (D:%s)\n" 984 datestr, utf16_title.
c_str());
986 if (n >=
sizeof(buf))
return false;
987 AppendPDFObject(buf);
// Offset table: a fixed "0000000000 65535 f " first entry, then one entry per
// object 1 .. obj_ - 1 with the offset zero-padded to 10 digits.
988 n = snprintf(buf,
sizeof(buf),
991 "0000000000 65535 f \n", obj_);
992 if (n >=
sizeof(buf))
return false;
994 for (
int i = 1; i < obj_; i++) {
995 n = snprintf(buf,
sizeof(buf),
"%010ld 00000 n \n", offsets_[i]);
996 if (n >=
sizeof(buf))
return false;
999 n = snprintf(buf,
sizeof(buf),
1013 if (n >=
sizeof(buf))
return false;
virtual bool AddImageHandler(TessBaseAPI *api)
void AffineMatrix(int writing_direction, int line_x1, int line_y1, int line_x2, int line_y2, double *a, double *b, double *c, double *d)
const char * WordFontAttributes(bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const
void Orientation(tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const
void add_str_double(const char *str, double number)
virtual bool Next(PageIteratorLevel level)
const char * title() const
virtual char * GetUTF8Text(PageIteratorLevel level) const
size_t unsigned_size() const
bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint])
const char * GetInputName()
TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly)
virtual bool BeginDocumentHandler()
void ClipBaseline(int ppi, int x1, int y1, int x2, int y2, int *line_x1, int *line_y1, int *line_x2, int *line_y2)
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
void AppendData(const char *s, int len)
int GetSourceYResolution()
static const char * Version()
bool Empty(PageIteratorLevel level) const
long dist2(int x1, int y1, int x2, int y2)
const char * c_str() const
const char * string() const
virtual bool EndDocumentHandler()
ResultIterator * GetIterator()
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
void GetWordBaseline(int writing_direction, int ppi, int height, int word_x1, int word_y1, int word_x2, int word_y2, int line_x1, int line_y1, int line_x2, int line_y2, double *x0, double *y0, double *length)
StrongScriptDirection WordDirection() const
virtual bool IsAtBeginningOf(PageIteratorLevel level) const
bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const
void AppendString(const char *s)