34 std::stringstream &alto_str) {
35 int left, top, right, bottom;
36 it->BoundingBox(level, &left, &top, &right, &bottom);
40 int height = bottom - top;
41 int width = right - left;
43 alto_str <<
" HPOS=\"" << hpos <<
"\"";
44 alto_str <<
" VPOS=\"" << vpos <<
"\"";
45 alto_str <<
" WIDTH=\"" << width <<
"\"";
46 alto_str <<
" HEIGHT=\"" << height <<
"\"";
50 alto_str <<
" WC=\"0." << wc <<
"\"";
61 begin_document =
true;
71 "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
72 "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" "
73 "xmlns:xlink=\"http://www.w3.org/1999/xlink\" "
74 "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" "
75 "xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# "
76 "http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n"
78 "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n"
79 "\t\t<sourceImageInformation>\n"
86 "\t\t</sourceImageInformation>\n"
87 "\t\t<OCRProcessing ID=\"OCR_0\">\n"
88 "\t\t\t<ocrProcessingStep>\n"
89 "\t\t\t\t<processingSoftware>\n"
90 "\t\t\t\t\t<softwareName>tesseract ");
94 "\t\t\t\t</processingSoftware>\n"
95 "\t\t\t</ocrProcessingStep>\n"
96 "\t\t</OCRProcessing>\n"
99 begin_document =
false;
103 if (text ==
nullptr) {
123 begin_document(false) {}
142 int lcnt = 0, tcnt = 0, bcnt = 0, wcnt = 0;
150 int str16_len = MultiByteToWideChar(CP_ACP, 0,
input_file_.c_str(), -1,
nullptr, 0);
151 wchar_t *uni16_str =
new WCHAR[str16_len];
152 str16_len = MultiByteToWideChar(CP_ACP, 0,
input_file_.c_str(), -1, uni16_str, str16_len);
154 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len,
nullptr, 0,
nullptr,
nullptr);
155 char *utf8_str =
new char[utf8_len];
156 WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len,
nullptr,
nullptr);
162 std::stringstream alto_str;
164 alto_str.imbue(std::locale::classic());
166 <<
"\" PHYSICAL_IMG_NR=\"" << page_number <<
"\""
167 <<
" ID=\"page_" << page_number <<
"\">\n"
168 <<
"\t\t\t<PrintSpace HPOS=\"0\" VPOS=\"0\""
179 int left, top, right, bottom;
182 switch (block_type) {
188 alto_str <<
"\t\t\t\t<Illustration ID=\"cblock_" << bcnt++ <<
"\"";
189 AddBoxToAlto(res_it,
RIL_BLOCK, alto_str);
190 alto_str <<
"</Illustration>\n";
197 alto_str <<
"\t\t\t\t<GraphicalElement ID=\"cblock_" << bcnt++ <<
"\"";
198 AddBoxToAlto(res_it,
RIL_BLOCK, alto_str);
199 alto_str <<
"</GraphicalElement >\n";
203 tprintf(
"TODO: Please report image which triggers the noise case.\n");
210 alto_str <<
"\t\t\t\t<ComposedBlock ID=\"cblock_" << bcnt <<
"\"";
211 AddBoxToAlto(res_it,
RIL_BLOCK, alto_str);
216 alto_str <<
"\t\t\t\t\t<TextBlock ID=\"block_" << tcnt <<
"\"";
217 AddBoxToAlto(res_it,
RIL_PARA, alto_str);
222 alto_str <<
"\t\t\t\t\t\t<TextLine ID=\"line_" << lcnt <<
"\"";
227 alto_str <<
"\t\t\t\t\t\t\t<String ID=\"string_" << wcnt <<
"\"";
228 AddBoxToAlto(res_it,
RIL_WORD, alto_str);
229 alto_str <<
" CONTENT=\"";
239 if (grapheme && grapheme[0] != 0) {
240 alto_str <<
HOcrEscape(grapheme.get()).c_str();
249 if (last_word_in_line) {
250 alto_str <<
"\n\t\t\t\t\t\t</TextLine>\n";
256 int width = left - hpos;
257 alto_str <<
"<SP WIDTH=\"" << width <<
"\" VPOS=\"" << vpos <<
"\" HPOS=\"" << hpos
261 if (last_word_in_tblock) {
262 alto_str <<
"\t\t\t\t\t</TextBlock>\n";
266 if (last_word_in_cblock) {
267 alto_str <<
"\t\t\t\t</ComposedBlock>\n";
272 alto_str <<
"\t\t\t</PrintSpace>\n"
274 const std::string &text = alto_str.str();
276 char *result =
new char[text.length() + 1];
277 strcpy(result, text.c_str());
void tprintf(const char *format,...)
std::string HOcrEscape(const char *text)
const char * GetInputName()
std::string input_file_
Name used by training code.
int Recognize(ETEXT_DESC *monitor)
PAGE_RES * page_res_
The page-level data.
Tesseract * tesseract_
The underlying data object.
static const char * Version()
ResultIterator * GetIterator()
char * GetAltoText(ETEXT_DESC *monitor, int page_number)
void SetInputName(const char *name)
PolyBlockType BlockType() const
bool Empty(PageIteratorLevel level) const
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
void AppendString(const char *s)
bool BeginDocumentHandler() override
TessAltoRenderer(const char *outputbase)
bool EndDocumentHandler() override
bool AddImageHandler(TessBaseAPI *api) override
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool IsAtBeginningOf(PageIteratorLevel level) const override
bool Next(PageIteratorLevel level) override