#include <renderer.h>
Renders Tesseract output into a searchable PDF.
Definition at line 186 of file renderer.h.
◆ TessPDFRenderer()
tesseract::TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir, bool textonly)
Definition at line 183 of file pdfrenderer.cpp.
188 textonly_ = textonly;
TessResultRenderer(const char *outputbase, const char *extension)
◆ AddImageHandler()
bool tesseract::TessPDFRenderer::AddImageHandler(TessBaseAPI *api)
[protected, virtual]
Implements tesseract::TessResultRenderer.
Definition at line 839 of file pdfrenderer.cpp.
841 char buf[kBasicBufSize];
842 char buf2[kBasicBufSize];
843 Pix *pix =
api->GetInputImage();
845 int ppi =
api->GetSourceYResolution();
846 if (!pix || ppi <= 0)
848 double width = pixGetWidth(pix) * 72.0 / ppi;
849 double height = pixGetHeight(pix) * 72.0 / ppi;
851 snprintf(buf2,
sizeof(buf2),
"/XObject << /Im1 %ld 0 R >>\n", obj_ + 2);
852 const char *xobject = (textonly_) ?
"" : buf2;
855 n = snprintf(buf,
sizeof(buf),
860 " /MediaBox [0 0 %.2f %.2f]\n" 861 " /Contents %ld 0 R\n" 865 " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n" 866 " /Font << /f-0-0 %ld 0 R >>\n" 876 if (n >=
sizeof(buf))
return false;
878 AppendPDFObject(buf);
881 const std::unique_ptr<char[]> pdftext(GetPDFTextObjects(
api, width, height));
882 const size_t pdftext_len = strlen(pdftext.get());
884 unsigned char *comp_pdftext = zlibCompress(
885 reinterpret_cast<unsigned char *>(pdftext.get()), pdftext_len, &len);
886 long comp_pdftext_len = len;
887 n = snprintf(buf,
sizeof(buf),
890 " /Length %ld /Filter /FlateDecode\n" 892 "stream\n", obj_, comp_pdftext_len);
893 if (n >=
sizeof(buf)) {
894 lept_free(comp_pdftext);
898 long objsize = strlen(buf);
899 AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
900 objsize += comp_pdftext_len;
901 lept_free(comp_pdftext);
906 objsize += strlen(b2);
907 AppendPDFObjectDIY(objsize);
910 char *pdf_object =
nullptr;
911 if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
915 AppendPDFObjectDIY(objsize);
void AppendString(const char *s)
void AppendData(const char *s, int len)
◆ BeginDocumentHandler()
bool tesseract::TessPDFRenderer::BeginDocumentHandler()
[protected, virtual]
Reimplemented from tesseract::TessResultRenderer.
Definition at line 496 of file pdfrenderer.cpp.
497 char buf[kBasicBufSize];
500 n = snprintf(buf,
sizeof(buf),
503 0xDE, 0xAD, 0xBE, 0xEB);
504 if (n >=
sizeof(buf))
return false;
505 AppendPDFObject(buf);
508 n = snprintf(buf,
sizeof(buf),
516 if (n >=
sizeof(buf))
return false;
517 AppendPDFObject(buf);
525 n = snprintf(buf,
sizeof(buf),
528 " /BaseFont /GlyphLessFont\n" 529 " /DescendantFonts [ %ld 0 R ]\n" 530 " /Encoding /Identity-H\n" 532 " /ToUnicode %ld 0 R\n" 539 if (n >=
sizeof(buf))
return false;
540 AppendPDFObject(buf);
543 n = snprintf(buf,
sizeof(buf),
546 " /BaseFont /GlyphLessFont\n" 547 " /CIDToGIDMap %ld 0 R\n" 550 " /Ordering (Identity)\n" 551 " /Registry (Adobe)\n" 554 " /FontDescriptor %ld 0 R\n" 555 " /Subtype /CIDFontType2\n" 563 if (n >=
sizeof(buf))
return false;
564 AppendPDFObject(buf);
567 const int kCIDToGIDMapSize = 2 * (1 << 16);
568 const std::unique_ptr<unsigned char[]> cidtogidmap(
569 new unsigned char[kCIDToGIDMapSize]);
570 for (
int i = 0; i < kCIDToGIDMapSize; i++) {
571 cidtogidmap[i] = (i % 2) ? 1 : 0;
574 unsigned char *comp = zlibCompress(cidtogidmap.get(), kCIDToGIDMapSize, &len);
575 n = snprintf(buf,
sizeof(buf),
578 " /Length %lu /Filter /FlateDecode\n" 582 if (n >=
sizeof(buf)) {
587 long objsize = strlen(buf);
588 AppendData(reinterpret_cast<char *>(comp), len);
591 const char *endstream_endobj =
595 objsize += strlen(endstream_endobj);
596 AppendPDFObjectDIY(objsize);
599 "/CIDInit /ProcSet findresource begin\n" 604 " /Registry (Adobe)\n" 608 "/CMapName /Adobe-Identify-UCS def\n" 610 "1 begincodespacerange\n" 612 "endcodespacerange\n" 614 "<0000> <FFFF> <0000>\n" 617 "CMapName currentdict /CMap defineresource pop\n" 622 n = snprintf(buf,
sizeof(buf),
624 "<< /Length %lu >>\n" 628 "endobj\n", (
unsigned long) strlen(stream), stream);
629 if (n >=
sizeof(buf))
return false;
630 AppendPDFObject(buf);
633 n = snprintf(buf,
sizeof(buf),
640 " /FontBBox [ 0 0 %d %d ]\n" 641 " /FontFile2 %ld 0 R\n" 642 " /FontName /GlyphLessFont\n" 645 " /Type /FontDescriptor\n" 654 if (n >=
sizeof(buf))
return false;
655 AppendPDFObject(buf);
657 n = snprintf(buf,
sizeof(buf),
"%s/pdf.ttf", datadir_);
658 if (n >=
sizeof(buf))
return false;
659 FILE *fp = fopen(buf,
"rb");
661 tprintf(
"Can not open file \"%s\"!\n", buf);
664 fseek(fp, 0, SEEK_END);
665 long int size = ftell(fp);
666 fseek(fp, 0, SEEK_SET);
667 const std::unique_ptr<char[]> buffer(
new char[size]);
668 if (fread(buffer.get(), 1, size, fp) != static_cast<size_t>(size)) {
674 n = snprintf(buf,
sizeof(buf),
680 "stream\n", size, size);
681 if (n >=
sizeof(buf)) {
685 objsize = strlen(buf);
689 objsize += strlen(endstream_endobj);
690 AppendPDFObjectDIY(objsize);
void AppendString(const char *s)
void AppendData(const char *s, int len)
◆ EndDocumentHandler()
bool tesseract::TessPDFRenderer::EndDocumentHandler()
[protected, virtual]
Reimplemented from tesseract::TessResultRenderer.
Definition at line 922 of file pdfrenderer.cpp.
924 char buf[kBasicBufSize];
933 const long int kPagesObjectNumber = 2;
934 offsets_[kPagesObjectNumber] = offsets_.
back();
935 n = snprintf(buf,
sizeof(buf),
939 " /Kids [ ", kPagesObjectNumber);
940 if (n >=
sizeof(buf))
return false;
942 size_t pages_objsize = strlen(buf);
944 n = snprintf(buf,
sizeof(buf),
945 "%ld 0 R ", pages_[i]);
946 if (n >=
sizeof(buf))
return false;
948 pages_objsize += strlen(buf);
950 n = snprintf(buf,
sizeof(buf),
954 "endobj\n", pages_.
size());
955 if (n >=
sizeof(buf))
return false;
957 pages_objsize += strlen(buf);
958 offsets_.
back() += pages_objsize;
961 STRING utf16_title =
"FEFF";
963 char utf16[kMaxBytesPerCodepoint];
964 for (
char32 code : unicodes) {
966 utf16_title += utf16;
970 char* datestr = l_getFormattedDate();
971 n = snprintf(buf,
sizeof(buf),
974 " /Producer (Tesseract %s)\n" 975 " /CreationDate (D:%s)\n" 981 if (n >=
sizeof(buf))
return false;
982 AppendPDFObject(buf);
983 n = snprintf(buf,
sizeof(buf),
986 "0000000000 65535 f \n", obj_);
987 if (n >=
sizeof(buf))
return false;
989 for (
int i = 1; i < obj_; i++) {
990 n = snprintf(buf,
sizeof(buf),
"%010ld 00000 n \n", offsets_[i]);
991 if (n >=
sizeof(buf))
return false;
994 n = snprintf(buf,
sizeof(buf),
1008 if (n >=
sizeof(buf))
return false;
bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint])
size_t unsigned_size() const
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
const char * title() const
void AppendString(const char *s)
const char * c_str() const
#define TESSERACT_VERSION_STR
The documentation for this class was generated from the following files:
- /home/stweil/src/github/tesseract-ocr/tesseract/api/renderer.h
- /home/stweil/src/github/tesseract-ocr/tesseract/api/pdfrenderer.cpp