tesseract v5.3.3.20231005
boxread.h
Go to the documentation of this file.
1/**********************************************************************
2 * File: boxread.h
3 * Description: Read data from a box file.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 2007, Google Inc.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#ifndef TESSERACT_CCUTIL_BOXREAD_H_
20#define TESSERACT_CCUTIL_BOXREAD_H_
21
22#include <cstdio> // for FILE
23#include <string> // for std::string
24#include <vector> // for std::vector
25
26#include <tesseract/export.h> // for TESS_API
27
28namespace tesseract {
29
30class TBOX;
31
32// Size of buffer used to read a line from a box file.
33const int kBoxReadBufSize = 1024;
34
35// Opens the box file based on the given image filename.
36// Returns nullptr if the box file cannot be opened.
38FILE *OpenBoxFile(const char *filename);
39
40// Reads all boxes from the given filename.
41// Reads a specific target_page number if >= 0, or all pages otherwise.
42// Skips blanks if skip_blanks is true.
43// The UTF-8 label of the box is put in texts, and the full box definition as
44// a string is put in box_texts, with the corresponding page number in pages.
45// Each of the output vectors (boxes, texts, box_texts, pages) is optional (may be nullptr).
46// Returns false if no boxes are found.
47bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector<TBOX> *boxes,
48 std::vector<std::string> *texts, std::vector<std::string> *box_texts,
49 std::vector<int> *pages);
50
51// Reads all boxes from the string box_data. Otherwise behaves as ReadAllBoxes.
52// If continue_on_failure is true, reading continues even if an invalid box is
53// encountered, and the function returns true if it succeeds in reading some boxes.
54// Otherwise it gives up and returns false on encountering an invalid box.
56bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure,
57 std::vector<TBOX> *boxes, std::vector<std::string> *texts,
58 std::vector<std::string> *box_texts, std::vector<int> *pages);
59
60// ReadNextBox factors out the code to interpret a line of a box
61// file so that applybox and unicharset_extractor interpret it the same way.
62// This function returns the next valid box file utf8 string and coords
63// and returns true, or false on eof (and closes the file).
64// It ignores the utf8 file signature ByteOrderMark (U+FEFF=EF BB BF), checks
65// for valid utf-8 and allows space or tab between fields.
66// utf8_str is set with the unichar string, and bounding_box with the box.
67// If there are page numbers in the file, it reads them all.
69bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box);
70// As ReadNextBox above, but gets boxes for a specific (0-based) page number.
71// Use -1 to read any page number. Files without page numbers are all
72// read as if they are page 0.
74bool ReadNextBox(int target_page, int *line_number, FILE *box_file, std::string &utf8_str,
75 TBOX *bounding_box);
76
77// Parses the given box file string into a page_number, utf8_str, and
78// bounding_box. Returns true on a successful parse.
80bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str,
81 TBOX *bounding_box);
82
83// Creates a box file string from a unichar string, TBOX and page number.
85void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str);
86
87} // namespace tesseract
88
89#endif // TESSERACT_CCUTIL_BOXREAD_H_
@ TBOX
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
Definition: boxread.cpp:97
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:205
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str)
Definition: boxread.cpp:280
bool ReadAllBoxes(int target_page, bool skip_blanks, const char *filename, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
Definition: boxread.cpp:76
const int kBoxReadBufSize
Definition: boxread.h:33
FILE * OpenBoxFile(const char *fname)
Definition: boxread.cpp:59
bool ReadNextBox(int *line_number, FILE *box_file, std::string &utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:153
#define TESS_API
Definition: export.h:32