// Label text that is compared (via strcmp) against the first token of a box
// line; on a match the text following '#' on that line is used instead.
static const char *kMultiBlobLabelCode = "WordStr";
// Returns the box file name that corresponds to the given image file name.
//
// A trailing ".bin.png", ".nrm.png" or ".raw.png" is removed as a unit;
// otherwise everything from the last '.' onwards is removed. ".box" is then
// appended to whatever remains.
//
// image_filename: NUL-terminated image path. It is used to construct a
//                 std::string, so it must not be null.
//
// NOTE(review): the `else`, the closing braces and the final `return` were
// not present in the listing this block was recovered from; they are
// reconstructed from the gaps in the original line numbering. Confirm
// against the upstream file.
static std::string BoxFileName(const char *image_filename) {
  std::string box_filename = image_filename;
  size_t length = box_filename.length();
  // The name must be strictly longer than the 8-character suffix, so a name
  // that consists of nothing but such a suffix takes the generic branch.
  std::string last = (length > 8) ? box_filename.substr(length - 8) : "";
  if (last == ".bin.png" || last == ".nrm.png" || last == ".raw.png") {
    box_filename.resize(length - 8);
  } else {
    size_t lastdot = box_filename.find_last_of('.');
    // find_last_of yields npos when there is no '.', and npos is never
    // < length, so a name without a dot is left unchanged.
    if (lastdot < length) {
      box_filename.resize(lastdot);
    }
  }
  box_filename += ".box";
  return box_filename;
}
60 std::string filename = BoxFileName(fname);
61 FILE *box_file =
nullptr;
62 if (!(box_file = fopen(filename.c_str(),
"rb"))) {
64 tprintf(
"Can't open box file %s", filename.c_str());
76bool ReadAllBoxes(
int target_page,
bool skip_blanks,
const char *filename, std::vector<TBOX> *boxes,
77 std::vector<std::string> *texts, std::vector<std::string> *box_texts,
78 std::vector<int> *pages) {
79 std::ifstream input(BoxFileName(filename).c_str(), std::ios::in | std::ios::binary);
81 tprintf(
"Cannot read box data from '%s'.\n", BoxFileName(filename).c_str());
85 std::vector<char> box_data(std::istreambuf_iterator<char>(input), {});
86 if (box_data.empty()) {
87 tprintf(
"No box data found in '%s'.\n", BoxFileName(filename).c_str());
91 box_data.push_back(
'\0');
92 return ReadMemBoxes(target_page, skip_blanks, &box_data[0],
93 true, boxes, texts, box_texts, pages);
97bool ReadMemBoxes(
int target_page,
bool skip_blanks,
const char *box_data,
bool continue_on_failure,
98 std::vector<TBOX> *boxes, std::vector<std::string> *texts,
99 std::vector<std::string> *box_texts, std::vector<int> *pages) {
100 std::string box_str(box_data);
101 std::vector<std::string> lines =
split(box_str,
'\n');
106 for (
auto &line : lines) {
108 std::string utf8_str;
111 if (continue_on_failure) {
117 if (skip_blanks && (utf8_str ==
" " || utf8_str ==
"\t")) {
120 if (target_page >= 0 && page != target_page) {
123 if (boxes !=
nullptr) {
124 boxes->push_back(box);
126 if (texts !=
nullptr) {
127 texts->push_back(utf8_str);
129 if (box_texts !=
nullptr) {
130 std::string full_text;
132 box_texts->push_back(full_text);
134 if (pages !=
nullptr) {
135 pages->push_back(page);
139 return num_boxes > 0;
153bool ReadNextBox(
int *line_number, FILE *box_file, std::string &utf8_str,
TBOX *bounding_box) {
154 return ReadNextBox(-1, line_number, box_file, utf8_str, bounding_box);
160bool ReadNextBox(
int target_page,
int *line_number, FILE *box_file, std::string &utf8_str,
161 TBOX *bounding_box) {
164 char *buffptr = buff;
166 while (fgets(buff,
sizeof(buff) - 1, box_file)) {
170 const auto *ubuf =
reinterpret_cast<const unsigned char *
>(buffptr);
171 if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) {
175 if (*buffptr ==
'\n' || *buffptr ==
'\0') {
179 if (*buffptr ==
' ' || *buffptr ==
'\t') {
182 if (*buffptr !=
'\0') {
184 tprintf(
"Box file format error on line %i; ignored\n", *line_number);
187 if (target_page >= 0 && target_page != page) {
206 TBOX *bounding_box) {
207 *bounding_box =
TBOX();
210 const char *buffptr = boxfile_str;
217 const auto *ubuf =
reinterpret_cast<const unsigned char *
>(buffptr);
218 if (ubuf[0] == 0xef && ubuf[1] == 0xbb && ubuf[2] == 0xbf) {
223 if (*buffptr ==
'\0') {
227 uch[uch_len++] = *buffptr++;
228 }
while (*buffptr !=
'\0' && *buffptr !=
' ' && *buffptr !=
'\t' &&
231 if (*buffptr !=
'\0') {
239 std::stringstream stream(buffptr);
240 stream.imbue(std::locale::classic());
245 stream >> *page_number;
246 if (x_max < x_min || y_max < y_min) {
247 tprintf(
"Bad box coordinates in boxfile string! %s\n", ubuf);
251 if (strcmp(uch, kMultiBlobLabelCode) == 0 && (buffptr = strchr(buffptr,
'#')) !=
nullptr) {
255 uch_len = strlen(uch);
259 while (used < uch_len) {
261 int new_used =
ch.utf8_len();
263 tprintf(
"Bad UTF-8 str %s starts with 0x%02x at col %d\n", uch + used, uch[used], used + 1);
270 std::swap(x_min, x_max);
273 std::swap(y_min, y_max);
281 box_str = unichar_str;
282 box_str +=
" " + std::to_string(box.
left());
283 box_str +=
" " + std::to_string(box.
bottom());
284 box_str +=
" " + std::to_string(box.
right());
285 box_str +=
" " + std::to_string(box.
top());
286 box_str +=
" " + std::to_string(page_num);
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str, TBOX *bounding_box)
void tprintf(const char *format,...)
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str)
void chomp_string(char *str)
bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
const int kBoxReadBufSize
FILE * OpenBoxFile(const char *fname)
constexpr ERRCODE CANTOPENFILE("Can't open file")
const std::vector< std::string > split(const std::string &s, char c)
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box)
void set_to_given_coords(int x_min, int y_min, int x_max, int y_max)
TDimension bottom() const
void error(const char *caller, TessErrorLogCode action, const char *format,...) const __attribute__((format(gnu_printf