All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract::TessPDFRenderer Class Reference

#include <renderer.h>

Inheritance diagram for tesseract::TessPDFRenderer:
tesseract::TessResultRenderer

Public Member Functions

 TessPDFRenderer (const char *outputbase, const char *datadir)
 
- Public Member Functions inherited from tesseract::TessResultRenderer
virtual ~TessResultRenderer ()
 
void insert (TessResultRenderer *next)
 
TessResultRenderer * next ()
 
bool BeginDocument (const char *title)
 
bool AddImage (TessBaseAPI *api)
 
bool EndDocument ()
 
const char * file_extension () const
 
const char * title () const
 
int imagenum () const
 

Protected Member Functions

virtual bool BeginDocumentHandler ()
 
virtual bool AddImageHandler (TessBaseAPI *api)
 
virtual bool EndDocumentHandler ()
 
- Protected Member Functions inherited from tesseract::TessResultRenderer
 TessResultRenderer (const char *outputbase, const char *extension)
 
void AppendString (const char *s)
 
void AppendData (const char *s, int len)
 

Detailed Description

Renders Tesseract output into a searchable PDF.

Definition at line 168 of file renderer.h.

Constructor & Destructor Documentation

tesseract::TessPDFRenderer::TessPDFRenderer ( const char *  outputbase,
const char *  datadir 
)

Definition at line 164 of file pdfrenderer.cpp.

165  : TessResultRenderer(outputbase, "pdf") {
166  obj_ = 0;
167  datadir_ = datadir;
168  offsets_.push_back(0);
169 }
TessResultRenderer(const char *outputbase, const char *extension)
Definition: renderer.cpp:16
int push_back(T object)

Member Function Documentation

bool tesseract::TessPDFRenderer::AddImageHandler ( TessBaseAPI *  api )
protected virtual

Implements tesseract::TessResultRenderer.

Definition at line 807 of file pdfrenderer.cpp.

807  {
808  size_t n;
809  char buf[kBasicBufSize];
810  Pix *pix = api->GetInputImage();
811  char *filename = (char *)api->GetInputName();
812  int ppi = api->GetSourceYResolution();
813  if (!pix || ppi <= 0)
814  return false;
815  double width = pixGetWidth(pix) * 72.0 / ppi;
816  double height = pixGetHeight(pix) * 72.0 / ppi;
817 
818  // PAGE
819  n = snprintf(buf, sizeof(buf),
820  "%ld 0 obj\n"
821  "<<\n"
822  " /Type /Page\n"
823  " /Parent %ld 0 R\n"
824  " /MediaBox [0 0 %.2f %.2f]\n"
825  " /Contents %ld 0 R\n"
826  " /Resources\n"
827  " <<\n"
828  " /XObject << /Im1 %ld 0 R >>\n"
829  " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
830  " /Font << /f-0-0 %ld 0 R >>\n"
831  " >>\n"
832  ">>\n"
833  "endobj\n",
834  obj_,
835  2L, // Pages object
836  width,
837  height,
838  obj_ + 1, // Contents object
839  obj_ + 2, // Image object
840  3L); // Type0 Font
841  if (n >= sizeof(buf)) return false;
842  pages_.push_back(obj_);
843  AppendPDFObject(buf);
844 
845  // CONTENTS
846  char* pdftext = GetPDFTextObjects(api, width, height);
847  long pdftext_len = strlen(pdftext);
848  unsigned char *pdftext_casted = reinterpret_cast<unsigned char *>(pdftext);
849  size_t len;
850  unsigned char *comp_pdftext =
851  zlibCompress(pdftext_casted, pdftext_len, &len);
852  long comp_pdftext_len = len;
853  n = snprintf(buf, sizeof(buf),
854  "%ld 0 obj\n"
855  "<<\n"
856  " /Length %ld /Filter /FlateDecode\n"
857  ">>\n"
858  "stream\n", obj_, comp_pdftext_len);
859  if (n >= sizeof(buf)) {
860  delete[] pdftext;
861  lept_free(comp_pdftext);
862  return false;
863  }
864  AppendString(buf);
865  long objsize = strlen(buf);
866  AppendData(reinterpret_cast<char *>(comp_pdftext), comp_pdftext_len);
867  objsize += comp_pdftext_len;
868  lept_free(comp_pdftext);
869  delete[] pdftext;
870  const char *b2 =
871  "endstream\n"
872  "endobj\n";
873  AppendString(b2);
874  objsize += strlen(b2);
875  AppendPDFObjectDIY(objsize);
876 
877  char *pdf_object;
878  if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
879  return false;
880  }
881  AppendData(pdf_object, objsize);
882  AppendPDFObjectDIY(objsize);
883  delete[] pdf_object;
884  return true;
885 }
int push_back(T object)
void AppendString(const char *s)
Definition: renderer.cpp:83
const int kBasicBufSize
void AppendData(const char *s, int len)
Definition: renderer.cpp:87
bool tesseract::TessPDFRenderer::BeginDocumentHandler ( )
protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 452 of file pdfrenderer.cpp.

452  {
453  char buf[kBasicBufSize];
454  size_t n;
455 
456  n = snprintf(buf, sizeof(buf),
457  "%%PDF-1.5\n"
458  "%%%c%c%c%c\n",
459  0xDE, 0xAD, 0xBE, 0xEB);
460  if (n >= sizeof(buf)) return false;
461  AppendPDFObject(buf);
462 
463  // CATALOG
464  n = snprintf(buf, sizeof(buf),
465  "1 0 obj\n"
466  "<<\n"
467  " /Type /Catalog\n"
468  " /Pages %ld 0 R\n"
469  ">>\n"
470  "endobj\n",
471  2L);
472  if (n >= sizeof(buf)) return false;
473  AppendPDFObject(buf);
474 
475  // We are reserving object #2 for the /Pages
476  // object, which I am going to create and write
477  // at the end of the PDF file.
478  AppendPDFObject("");
479 
480  // TYPE0 FONT
481  n = snprintf(buf, sizeof(buf),
482  "3 0 obj\n"
483  "<<\n"
484  " /BaseFont /GlyphLessFont\n"
485  " /DescendantFonts [ %ld 0 R ]\n"
486  " /Encoding /Identity-H\n"
487  " /Subtype /Type0\n"
488  " /ToUnicode %ld 0 R\n"
489  " /Type /Font\n"
490  ">>\n"
491  "endobj\n",
492  4L, // CIDFontType2 font
493  6L // ToUnicode
494  );
495  if (n >= sizeof(buf)) return false;
496  AppendPDFObject(buf);
497 
498  // CIDFONTTYPE2
499  n = snprintf(buf, sizeof(buf),
500  "4 0 obj\n"
501  "<<\n"
502  " /BaseFont /GlyphLessFont\n"
503  " /CIDToGIDMap %ld 0 R\n"
504  " /CIDSystemInfo\n"
505  " <<\n"
506  " /Ordering (Identity)\n"
507  " /Registry (Adobe)\n"
508  " /Supplement 0\n"
509  " >>\n"
510  " /FontDescriptor %ld 0 R\n"
511  " /Subtype /CIDFontType2\n"
512  " /Type /Font\n"
513  " /DW %d\n"
514  ">>\n"
515  "endobj\n",
516  5L, // CIDToGIDMap
517  7L, // Font descriptor
518  1000 / kCharWidth);
519  if (n >= sizeof(buf)) return false;
520  AppendPDFObject(buf);
521 
522  // CIDTOGIDMAP
523  const int kCIDToGIDMapSize = 2 * (1 << 16);
524  unsigned char *cidtogidmap = new unsigned char[kCIDToGIDMapSize];
525  for (int i = 0; i < kCIDToGIDMapSize; i++) {
526  cidtogidmap[i] = (i % 2) ? 1 : 0;
527  }
528  size_t len;
529  unsigned char *comp =
530  zlibCompress(cidtogidmap, kCIDToGIDMapSize, &len);
531  delete[] cidtogidmap;
532  n = snprintf(buf, sizeof(buf),
533  "5 0 obj\n"
534  "<<\n"
535  " /Length %ld /Filter /FlateDecode\n"
536  ">>\n"
537  "stream\n", len);
538  if (n >= sizeof(buf)) {
539  lept_free(comp);
540  return false;
541  }
542  AppendString(buf);
543  long objsize = strlen(buf);
544  AppendData(reinterpret_cast<char *>(comp), len);
545  objsize += len;
546  lept_free(comp);
547  const char *endstream_endobj =
548  "endstream\n"
549  "endobj\n";
550  AppendString(endstream_endobj);
551  objsize += strlen(endstream_endobj);
552  AppendPDFObjectDIY(objsize);
553 
554  const char *stream =
555  "/CIDInit /ProcSet findresource begin\n"
556  "12 dict begin\n"
557  "begincmap\n"
558  "/CIDSystemInfo\n"
559  "<<\n"
560  " /Registry (Adobe)\n"
561  " /Ordering (UCS)\n"
562  " /Supplement 0\n"
563  ">> def\n"
564  "/CMapName /Adobe-Identify-UCS def\n"
565  "/CMapType 2 def\n"
566  "1 begincodespacerange\n"
567  "<0000> <FFFF>\n"
568  "endcodespacerange\n"
569  "1 beginbfrange\n"
570  "<0000> <FFFF> <0000>\n"
571  "endbfrange\n"
572  "endcmap\n"
573  "CMapName currentdict /CMap defineresource pop\n"
574  "end\n"
575  "end\n";
576 
577  // TOUNICODE
578  n = snprintf(buf, sizeof(buf),
579  "6 0 obj\n"
580  "<< /Length %lu >>\n"
581  "stream\n"
582  "%s"
583  "endstream\n"
584  "endobj\n", (unsigned long) strlen(stream), stream);
585  if (n >= sizeof(buf)) return false;
586  AppendPDFObject(buf);
587 
588  // FONT DESCRIPTOR
589  const int kCharHeight = 2; // Effect: highlights are half height
590  n = snprintf(buf, sizeof(buf),
591  "7 0 obj\n"
592  "<<\n"
593  " /Ascent %d\n"
594  " /CapHeight %d\n"
595  " /Descent -1\n" // Spec says must be negative
596  " /Flags 5\n" // FixedPitch + Symbolic
597  " /FontBBox [ 0 0 %d %d ]\n"
598  " /FontFile2 %ld 0 R\n"
599  " /FontName /GlyphLessFont\n"
600  " /ItalicAngle 0\n"
601  " /StemV 80\n"
602  " /Type /FontDescriptor\n"
603  ">>\n"
604  "endobj\n",
605  1000 / kCharHeight,
606  1000 / kCharHeight,
607  1000 / kCharWidth,
608  1000 / kCharHeight,
609  8L // Font data
610  );
611  if (n >= sizeof(buf)) return false;
612  AppendPDFObject(buf);
613 
614  n = snprintf(buf, sizeof(buf), "%s/pdf.ttf", datadir_);
615  if (n >= sizeof(buf)) return false;
616  FILE *fp = fopen(buf, "rb");
617  if (!fp) {
618  tprintf("Can not open file \"%s\"!\n", buf);
619  return false;
620  }
621  fseek(fp, 0, SEEK_END);
622  long int size = ftell(fp);
623  fseek(fp, 0, SEEK_SET);
624  char *buffer = new char[size];
625  if (fread(buffer, 1, size, fp) != size) {
626  fclose(fp);
627  delete[] buffer;
628  return false;
629  }
630  fclose(fp);
631  // FONTFILE2
632  n = snprintf(buf, sizeof(buf),
633  "8 0 obj\n"
634  "<<\n"
635  " /Length %ld\n"
636  " /Length1 %ld\n"
637  ">>\n"
638  "stream\n", size, size);
639  if (n >= sizeof(buf)) {
640  delete[] buffer;
641  return false;
642  }
643  AppendString(buf);
644  objsize = strlen(buf);
645  AppendData(buffer, size);
646  delete[] buffer;
647  objsize += size;
648  AppendString(endstream_endobj);
649  objsize += strlen(endstream_endobj);
650  AppendPDFObjectDIY(objsize);
651  return true;
652 }
const int kCharWidth
#define tprintf(...)
Definition: tprintf.h:31
void AppendString(const char *s)
Definition: renderer.cpp:83
const int kBasicBufSize
void AppendData(const char *s, int len)
Definition: renderer.cpp:87
bool tesseract::TessPDFRenderer::EndDocumentHandler ( )
protected virtual

Reimplemented from tesseract::TessResultRenderer.

Definition at line 888 of file pdfrenderer.cpp.

888  {
889  size_t n;
890  char buf[kBasicBufSize];
891 
892  // We reserved the /Pages object number early, so that the /Page
893  // objects could refer to their parent. We finally have enough
894  // information to go fill it in. Using lower level calls to manipulate
895  // the offset record in two spots, because we are placing objects
896  // out of order in the file.
897 
898  // PAGES
899  const long int kPagesObjectNumber = 2;
900  offsets_[kPagesObjectNumber] = offsets_.back(); // manipulation #1
901  n = snprintf(buf, sizeof(buf),
902  "%ld 0 obj\n"
903  "<<\n"
904  " /Type /Pages\n"
905  " /Kids [ ", kPagesObjectNumber);
906  if (n >= sizeof(buf)) return false;
907  AppendString(buf);
908  size_t pages_objsize = strlen(buf);
909  for (size_t i = 0; i < pages_.size(); i++) {
910  n = snprintf(buf, sizeof(buf),
911  "%ld 0 R ", pages_[i]);
912  if (n >= sizeof(buf)) return false;
913  AppendString(buf);
914  pages_objsize += strlen(buf);
915  }
916  n = snprintf(buf, sizeof(buf),
917  "]\n"
918  " /Count %d\n"
919  ">>\n"
920  "endobj\n", pages_.size());
921  if (n >= sizeof(buf)) return false;
922  AppendString(buf);
923  pages_objsize += strlen(buf);
924  offsets_.back() += pages_objsize; // manipulation #2
925 
926  // INFO
927  char* datestr = l_getFormattedDate();
928  n = snprintf(buf, sizeof(buf),
929  "%ld 0 obj\n"
930  "<<\n"
931  " /Producer (Tesseract %s)\n"
932  " /CreationDate (D:%s)\n"
933  " /Title (%s)"
934  ">>\n"
935  "endobj\n", obj_, TESSERACT_VERSION_STR, datestr, title());
936  lept_free(datestr);
937  if (n >= sizeof(buf)) return false;
938  AppendPDFObject(buf);
939  n = snprintf(buf, sizeof(buf),
940  "xref\n"
941  "0 %ld\n"
942  "0000000000 65535 f \n", obj_);
943  if (n >= sizeof(buf)) return false;
944  AppendString(buf);
945  for (int i = 1; i < obj_; i++) {
946  n = snprintf(buf, sizeof(buf), "%010ld 00000 n \n", offsets_[i]);
947  if (n >= sizeof(buf)) return false;
948  AppendString(buf);
949  }
950  n = snprintf(buf, sizeof(buf),
951  "trailer\n"
952  "<<\n"
953  " /Size %ld\n"
954  " /Root %ld 0 R\n"
955  " /Info %ld 0 R\n"
956  ">>\n"
957  "startxref\n"
958  "%ld\n"
959  "%%%%EOF\n",
960  obj_,
961  1L, // catalog
962  obj_ - 1, // info
963  offsets_.back());
964  if (n >= sizeof(buf)) return false;
965  AppendString(buf);
966  return true;
967 }
int size() const
Definition: genericvector.h:72
T & back() const
void AppendString(const char *s)
Definition: renderer.cpp:83
const char * title() const
Definition: renderer.h:80
#define TESSERACT_VERSION_STR
Definition: baseapi.h:23
const int kBasicBufSize

The documentation for this class was generated from the following files: