tesseract v5.3.3.20231005
tesseract::TessPDFRenderer Class Reference

#include <renderer.h>

Inheritance diagram for tesseract::TessPDFRenderer:
tesseract::TessResultRenderer

Public Member Functions

 TessPDFRenderer (const char *outputbase, const char *datadir, bool textonly=false)
 
- Public Member Functions inherited from tesseract::TessResultRenderer
virtual ~TessResultRenderer ()
 
void insert (TessResultRenderer *next)
 
TessResultRenderer * next ()
 
bool BeginDocument (const char *title)
 
bool AddImage (TessBaseAPI *api)
 
bool EndDocument ()
 
const char * file_extension () const
 
const char * title () const
 
bool happy () const
 
int imagenum () const
 

Protected Member Functions

bool BeginDocumentHandler () override
 
bool AddImageHandler (TessBaseAPI *api) override
 
bool EndDocumentHandler () override
 
- Protected Member Functions inherited from tesseract::TessResultRenderer
 TessResultRenderer (const char *outputbase, const char *extension)
 
virtual bool BeginDocumentHandler ()
 
virtual bool AddImageHandler (TessBaseAPI *api)=0
 
virtual bool EndDocumentHandler ()
 
void AppendString (const char *s)
 
void AppendData (const char *s, int len)
 

Detailed Description

Renders Tesseract output into a searchable PDF.

Definition at line 215 of file renderer.h.

Constructor & Destructor Documentation

◆ TessPDFRenderer()

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir,
bool  textonly = false 
)

Definition at line 184 of file pdfrenderer.cpp.

185 : TessResultRenderer(outputbase, "pdf"), datadir_(datadir) {
186 obj_ = 0;
187 textonly_ = textonly;
188 offsets_.push_back(0);
189}
TessResultRenderer(const char *outputbase, const char *extension)
Definition: renderer.cpp:33

Member Function Documentation

◆ AddImageHandler()

bool tesseract::TessPDFRenderer::AddImageHandler ( TessBaseAPI *  api)
override protected virtual

Implements tesseract::TessResultRenderer.

Definition at line 811 of file pdfrenderer.cpp.

811 {
812 Pix *pix = api->GetInputImage();
813 const char *filename = api->GetInputName();
814 int ppi = api->GetSourceYResolution();
815 if (!pix || ppi <= 0) {
816 return false;
817 }
818 double width = pixGetWidth(pix) * 72.0 / ppi;
819 double height = pixGetHeight(pix) * 72.0 / ppi;
820
821 std::stringstream xobject;
822 // Use "C" locale (needed for int values larger than 999).
823 xobject.imbue(std::locale::classic());
824 if (!textonly_) {
825 xobject << "/XObject << /Im1 " << (obj_ + 2) << " 0 R >>\n";
826 }
827
828 // PAGE
829 std::stringstream stream;
830 // Use "C" locale (needed for double values width and height).
831 stream.imbue(std::locale::classic());
832 stream.precision(2);
833 stream << std::fixed << obj_
834 << " 0 obj\n"
835 "<<\n"
836 " /Type /Page\n"
837 " /Parent 2 0 R\n" // Pages object
838 " /MediaBox [0 0 "
839 << width << " " << height
840 << "]\n"
841 " /Contents "
842 << (obj_ + 1)
843 << " 0 R\n" // Contents object
844 " /Resources\n"
845 " <<\n"
846 " "
847 << xobject.str() << // Image object
848 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
849 " /Font << /f-0-0 3 0 R >>\n" // Type0 Font
850 " >>\n"
851 ">>\n"
852 "endobj\n";
853 pages_.push_back(obj_);
854 AppendPDFObject(stream.str().c_str());
855
856 // CONTENTS
857 const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(api, width, height));
858 const size_t pdftext_len = strlen(pdftext.get());
859 size_t len;
860 unsigned char *comp_pdftext =
861 zlibCompress(reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
862 long comp_pdftext_len = len;
863 stream.str("");
864 stream << obj_
865 << " 0 obj\n"
866 "<<\n"
867 " /Length "
868 << comp_pdftext_len
869 << " /Filter /FlateDecode\n"
870 ">>\n"
871 "stream\n";
872 AppendString(stream.str().c_str());
873 long objsize = stream.str().size();
874 AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
875 objsize += comp_pdftext_len;
876 lept_free(comp_pdftext);
877 const char *b2 =
878 "endstream\n"
879 "endobj\n";
880 AppendString(b2);
881 objsize += strlen(b2);
882 AppendPDFObjectDIY(objsize);
883
884 if (!textonly_) {
885 char *pdf_object = nullptr;
886 int jpg_quality;
887 api->GetIntVariable("jpg_quality", &jpg_quality);
888 if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize, jpg_quality)) {
889 return false;
890 }
891 AppendData(pdf_object, objsize);
892 AppendPDFObjectDIY(objsize);
893 delete[] pdf_object;
894 }
895 return true;
896}
void AppendString(const char *s)
Definition: renderer.cpp:111
void AppendData(const char *s, int len)
Definition: renderer.cpp:118

◆ BeginDocumentHandler()

bool tesseract::TessPDFRenderer::BeginDocumentHandler ( )
override protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 490 of file pdfrenderer.cpp.

490 {
491 AppendPDFObject("%PDF-1.5\n%\xDE\xAD\xBE\xEB\n");
492
493 // CATALOG
494 AppendPDFObject(
495 "1 0 obj\n"
496 "<<\n"
497 " /Type /Catalog\n"
498 " /Pages 2 0 R\n"
499 ">>\nendobj\n");
500
501 // We are reserving object #2 for the /Pages
502 // object, which I am going to create and write
503 // at the end of the PDF file.
504 AppendPDFObject("");
505
506 // TYPE0 FONT
507 AppendPDFObject(
508 "3 0 obj\n"
509 "<<\n"
510 " /BaseFont /GlyphLessFont\n"
511 " /DescendantFonts [ 4 0 R ]\n" // CIDFontType2 font
512 " /Encoding /Identity-H\n"
513 " /Subtype /Type0\n"
514 " /ToUnicode 6 0 R\n" // ToUnicode
515 " /Type /Font\n"
516 ">>\n"
517 "endobj\n");
518
519 // CIDFONTTYPE2
520 std::stringstream stream;
521 // Use "C" locale (needed for int values larger than 999).
522 stream.imbue(std::locale::classic());
523 stream << "4 0 obj\n"
524 "<<\n"
525 " /BaseFont /GlyphLessFont\n"
526 " /CIDToGIDMap 5 0 R\n" // CIDToGIDMap
527 " /CIDSystemInfo\n"
528 " <<\n"
529 " /Ordering (Identity)\n"
530 " /Registry (Adobe)\n"
531 " /Supplement 0\n"
532 " >>\n"
533 " /FontDescriptor 7 0 R\n" // Font descriptor
534 " /Subtype /CIDFontType2\n"
535 " /Type /Font\n"
536 " /DW "
537 << (1000 / kCharWidth)
538 << "\n"
539 ">>\n"
540 "endobj\n";
541 AppendPDFObject(stream.str().c_str());
542
543 // CIDTOGIDMAP
544 const int kCIDToGIDMapSize = 2 * (1 << 16);
545 const std::unique_ptr<unsigned char[]> cidtogidmap(new unsigned char[kCIDToGIDMapSize]);
546 for (int i = 0; i < kCIDToGIDMapSize; i++) {
547 cidtogidmap[i] = (i % 2) ? 1 : 0;
548 }
549 size_t len;
550 unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
551 stream.str("");
552 stream << "5 0 obj\n"
553 "<<\n"
554 " /Length "
555 << len
556 << " /Filter /FlateDecode\n"
557 ">>\n"
558 "stream\n";
559 AppendString(stream.str().c_str());
560 long objsize = stream.str().size();
561 AppendData(reinterpret_cast<char *>(comp), len);
562 objsize += len;
563 lept_free(comp);
564 const char *endstream_endobj =
565 "endstream\n"
566 "endobj\n";
567 AppendString(endstream_endobj);
568 objsize += strlen(endstream_endobj);
569 AppendPDFObjectDIY(objsize);
570
571 const char stream2[] =
572 "/CIDInit /ProcSet findresource begin\n"
573 "12 dict begin\n"
574 "begincmap\n"
575 "/CIDSystemInfo\n"
576 "<<\n"
577 " /Registry (Adobe)\n"
578 " /Ordering (UCS)\n"
579 " /Supplement 0\n"
580 ">> def\n"
581 "/CMapName /Adobe-Identify-UCS def\n"
582 "/CMapType 2 def\n"
583 "1 begincodespacerange\n"
584 "<0000> <FFFF>\n"
585 "endcodespacerange\n"
586 "1 beginbfrange\n"
587 "<0000> <FFFF> <0000>\n"
588 "endbfrange\n"
589 "endcmap\n"
590 "CMapName currentdict /CMap defineresource pop\n"
591 "end\n"
592 "end\n";
593
594 // TOUNICODE
595 stream.str("");
596 stream << "6 0 obj\n"
597 "<< /Length "
598 << (sizeof(stream2) - 1)
599 << " >>\n"
600 "stream\n"
601 << stream2
602 << "endstream\n"
603 "endobj\n";
604 AppendPDFObject(stream.str().c_str());
605
606 // FONT DESCRIPTOR
607 stream.str("");
608 stream << "7 0 obj\n"
609 "<<\n"
610 " /Ascent 1000\n"
611 " /CapHeight 1000\n"
612 " /Descent -1\n" // Spec says must be negative
613 " /Flags 5\n" // FixedPitch + Symbolic
614 " /FontBBox [ 0 0 "
615 << (1000 / kCharWidth)
616 << " 1000 ]\n"
617 " /FontFile2 8 0 R\n"
618 " /FontName /GlyphLessFont\n"
619 " /ItalicAngle 0\n"
620 " /StemV 80\n"
621 " /Type /FontDescriptor\n"
622 ">>\n"
623 "endobj\n";
624 AppendPDFObject(stream.str().c_str());
625
626 stream.str("");
627 stream << datadir_.c_str() << "/pdf.ttf";
628 const uint8_t *font;
629 std::ifstream input(stream.str().c_str(), std::ios::in | std::ios::binary);
630 std::vector<unsigned char> buffer(std::istreambuf_iterator<char>(input), {});
631 auto size = buffer.size();
632 if (size) {
633 font = buffer.data();
634 } else {
635#if !defined(NDEBUG)
636 tprintf("Cannot open file \"%s\"!\nUsing internal glyphless font.\n", stream.str().c_str());
637#endif
638 font = pdf_ttf;
639 size = sizeof(pdf_ttf);
640 }
641
642 // FONTFILE2
643 stream.str("");
644 stream << "8 0 obj\n"
645 "<<\n"
646 " /Length "
647 << size
648 << "\n"
649 " /Length1 "
650 << size
651 << "\n"
652 ">>\n"
653 "stream\n";
654 AppendString(stream.str().c_str());
655 objsize = stream.str().size();
656 AppendData(reinterpret_cast<const char *>(font), size);
657 objsize += size;
658 AppendString(endstream_endobj);
659 objsize += strlen(endstream_endobj);
660 AppendPDFObjectDIY(objsize);
661 return true;
662}
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

◆ EndDocumentHandler()

bool tesseract::TessPDFRenderer::EndDocumentHandler ( )
override protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 898 of file pdfrenderer.cpp.

898 {
899 // We reserved the /Pages object number early, so that the /Page
900 // objects could refer to their parent. We finally have enough
901 // information to go fill it in. Using lower level calls to manipulate
902 // the offset record in two spots, because we are placing objects
903 // out of order in the file.
904
905 // PAGES
906 const long int kPagesObjectNumber = 2;
907 offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
908 std::stringstream stream;
909 // Use "C" locale (needed for int values larger than 999).
910 stream.imbue(std::locale::classic());
911 stream << kPagesObjectNumber << " 0 obj\n<<\n /Type /Pages\n /Kids [ ";
912 AppendString(stream.str().c_str());
913 size_t pages_objsize = stream.str().size();
914 for (const auto &page : pages_) {
915 stream.str("");
916 stream << page << " 0 R ";
917 AppendString(stream.str().c_str());
918 pages_objsize += stream.str().size();
919 }
920 stream.str("");
921 stream << "]\n /Count " << pages_.size() << "\n>>\nendobj\n";
922 AppendString(stream.str().c_str());
923 pages_objsize += stream.str().size();
924 offsets_.back() += pages_objsize; // manipulation #2
925
926 // INFO
927 std::string utf16_title = "FEFF"; // byte_order_marker
928 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(title());
929 char utf16[kMaxBytesPerCodepoint];
930 for (char32 code : unicodes) {
931 if (CodepointToUtf16be(code, utf16)) {
932 utf16_title += utf16;
933 }
934 }
935
936 char *datestr = l_getFormattedDate();
937 stream.str("");
938 stream << obj_
939 << " 0 obj\n"
940 "<<\n"
941 " /Producer (Tesseract "
942 << tesseract::TessBaseAPI::Version()
943 << ")\n"
944 " /CreationDate (D:"
945 << datestr
946 << ")\n"
947 " /Title <"
948 << utf16_title.c_str()
949 << ">\n"
950 ">>\n"
951 "endobj\n";
952 lept_free(datestr);
953 AppendPDFObject(stream.str().c_str());
954 stream.str("");
955 stream << "xref\n0 " << obj_ << "\n0000000000 65535 f \n";
956 AppendString(stream.str().c_str());
957 for (int i = 1; i < obj_; i++) {
958 stream.str("");
959 stream.width(10);
960 stream.fill('0');
961 stream << offsets_[i] << " 00000 n \n";
962 AppendString(stream.str().c_str());
963 }
964 stream.str("");
965 stream << "trailer\n<<\n /Size " << obj_
966 << "\n"
967 " /Root 1 0 R\n" // catalog
968 " /Info "
969 << (obj_ - 1)
970 << " 0 R\n" // info
971 ">>\nstartxref\n"
972 << offsets_.back() << "\n%%EOF\n";
973 AppendString(stream.str().c_str());
974 return true;
975}
signed int char32
static const char * Version()
Definition: baseapi.cpp:241
const char * title() const
Definition: renderer.h:87
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:220

The documentation for this class was generated from the following files: