tesseract v5.3.3.20231005
tesseract::BoxChar Class Reference

#include <boxchar.h>

Public Member Functions

 BoxChar (const char *utf8_str, int len)
 
 ~BoxChar ()
 
const std::string & ch () const
 
const Box * box () const
 
const int & page () const
 
void set_rtl_index (int index)
 
const int & rtl_index () const
 
void AddBox (int x, int y, int width, int height)
 
void set_page (int page)
 
std::string * mutable_ch ()
 
Box * mutable_box ()
 
bool operator< (const BoxChar &other) const
 
void GetDirection (int *num_rtl, int *num_ltr) const
 
void ReverseUnicodesInBox ()
 

Static Public Member Functions

static void TranslateBoxes (int xshift, int yshift, std::vector< BoxChar * > *boxes)
 
static void PrepareToWrite (std::vector< BoxChar * > *boxes)
 
static void InsertNewlines (bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
 
static void InsertSpaces (bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
 
static void ReorderRTLText (std::vector< BoxChar * > *boxes)
 
static bool ContainsMostlyRTL (const std::vector< BoxChar * > &boxes)
 
static bool MostlyVertical (const std::vector< BoxChar * > &boxes)
 
static int TotalByteLength (const std::vector< BoxChar * > &boxes)
 
static void RotateBoxes (float rotation, int xcenter, int ycenter, int start_box, int end_box, std::vector< BoxChar * > *boxes)
 
static void WriteTesseractBoxFile (const std::string &name, int height, const std::vector< BoxChar * > &boxes)
 
static std::string GetTesseractBoxStr (int height, const std::vector< BoxChar * > &boxes)
 

Detailed Description

Definition at line 36 of file boxchar.h.

Constructor & Destructor Documentation

◆ BoxChar()

tesseract::BoxChar::BoxChar ( const char *  utf8_str,
int  len 
)

Definition at line 39 of file boxchar.cpp.

40 : ch_(utf8_str, len), box_(nullptr), page_(0), rtl_index_(-1) {}

◆ ~BoxChar()

tesseract::BoxChar::~BoxChar ( )

Definition at line 42 of file boxchar.cpp.

42 {
43 boxDestroy(&box_);
44}

Member Function Documentation

◆ AddBox()

void tesseract::BoxChar::AddBox ( int  x,
int  y,
int  width,
int  height 
)

Definition at line 46 of file boxchar.cpp.

46 {
47 box_ = boxCreate(x, y, width, height);
48}
const double y

◆ box()

const Box * tesseract::BoxChar::box ( ) const
inline

Definition at line 46 of file boxchar.h.

46 {
47 return box_;
48 }

◆ ch()

const std::string & tesseract::BoxChar::ch ( ) const
inline

Definition at line 43 of file boxchar.h.

43 {
44 return ch_;
45 }

◆ ContainsMostlyRTL()

bool tesseract::BoxChar::ContainsMostlyRTL ( const std::vector< BoxChar * > &  boxes)
static

Definition at line 263 of file boxchar.cpp.

263 {
264 int num_rtl = 0, num_ltr = 0;
265 for (auto boxe : boxes) {
266 boxe->GetDirection(&num_rtl, &num_ltr);
267 }
268 return num_rtl > num_ltr;
269}

◆ GetDirection()

void tesseract::BoxChar::GetDirection ( int *  num_rtl,
int *  num_ltr 
) const

Definition at line 52 of file boxchar.cpp.

52 {
53 // Convert the unichar to UTF32 representation
54 std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(ch_.c_str());
55 if (uni_vector.empty()) {
56 tprintf("Illegal utf8 in boxchar string:%s = ", ch_.c_str());
57 for (char c : ch_) {
58 tprintf(" 0x%x", c);
59 }
60 tprintf("\n");
61 return;
62 }
63 for (char32 ch : uni_vector) {
64 UCharDirection dir = u_charDirection(ch);
65 if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC || dir == U_RIGHT_TO_LEFT_ISOLATE) {
66 ++*num_rtl;
67 } else if ((dir == U_ARABIC_NUMBER) ||
68 (dir != U_DIR_NON_SPACING_MARK && dir != U_BOUNDARY_NEUTRAL)) {
69 ++*num_ltr;
70 }
71 }
72}
signed int char32
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:220
const std::string & ch() const
Definition: boxchar.h:43

◆ GetTesseractBoxStr()

std::string tesseract::BoxChar::GetTesseractBoxStr ( int  height,
const std::vector< BoxChar * > &  boxes 
)
static

Definition at line 331 of file boxchar.cpp.

331 {
332 std::string output;
333 char buffer[kMaxLineLength];
334 for (auto boxe : boxes) {
335 const Box *box = boxe->box_;
336 if (box == nullptr) {
337 tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n");
338 return "";
339 }
340 int nbytes = snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n", boxe->ch_.c_str(), box->x,
341 height - box->y - box->h, box->x + box->w, height - box->y, boxe->page_);
342 output.append(buffer, nbytes);
343 }
344 return output;
345}
const int kMaxLineLength
Definition: boxchar.cpp:322
const Box * box() const
Definition: boxchar.h:46

◆ InsertNewlines()

void tesseract::BoxChar::InsertNewlines ( bool  rtl_rules,
bool  vertical_rules,
std::vector< BoxChar * > *  boxes 
)
static

Definition at line 113 of file boxchar.cpp.

113 {
114 size_t prev_i = SIZE_MAX;
115 int max_shift = 0;
116 for (size_t i = 0; i < boxes->size(); ++i) {
117 Box *box = (*boxes)[i]->box_;
118 if (box == nullptr) {
119 if (prev_i == SIZE_MAX || prev_i + 1 < i || i + 1 == boxes->size()) {
120 // Erase null boxes at the start of a line and after another null box.
121 do {
122 delete (*boxes)[i];
123 boxes->erase(boxes->begin() + i);
124 if (i == 0) {
125 break;
126 }
127 } while (i-- == boxes->size() && (*boxes)[i]->box_ == nullptr);
128 }
129 continue;
130 }
131 if (prev_i != SIZE_MAX) {
132 Box *prev_box = (*boxes)[prev_i]->box_;
133 int shift = box->x - prev_box->x;
134 if (vertical_rules) {
135 shift = box->y - prev_box->y;
136 } else if (rtl_rules) {
137 shift = -shift;
138 }
139 if (-shift > max_shift) {
140 // This is a newline. Since nothing cares about the size of the box,
141 // except the out-of-bounds checker, minimize the chance of creating
142 // a box outside the image by making the width and height 1.
143 int width = 1;
144 int height = 1;
145 int x = prev_box->x + prev_box->w;
146 int y = prev_box->y;
147 if (vertical_rules) {
148 x = prev_box->x;
149 y = prev_box->y + prev_box->h;
150 } else if (rtl_rules) {
151 x = prev_box->x - width;
152 if (x < 0) {
153 tprintf("prev x = %d, width=%d\n", prev_box->x, width);
154 x = 0;
155 }
156 }
157 if (prev_i + 1 == i) {
158 // New character needed.
159 auto *new_box = new BoxChar("\t", 1);
160 new_box->AddBox(x, y, width, height);
161 new_box->page_ = (*boxes)[i]->page_;
162 boxes->insert(boxes->begin() + i, new_box);
163 ++i;
164 } else {
165 (*boxes)[i - 1]->AddBox(x, y, width, height);
166 (*boxes)[i - 1]->ch_ = "\t";
167 }
168 max_shift = 0;
169 } else if (shift > max_shift) {
170 max_shift = shift;
171 }
172 }
173 prev_i = i;
174 }
175}
BoxChar(const char *utf8_str, int len)
Definition: boxchar.cpp:39

◆ InsertSpaces()

void tesseract::BoxChar::InsertSpaces ( bool  rtl_rules,
bool  vertical_rules,
std::vector< BoxChar * > *  boxes 
)
static

Definition at line 179 of file boxchar.cpp.

179 {
180 // After InsertNewlines, any remaining null boxes are not newlines, and are
181 // singletons, so add a box to each remaining null box.
182 for (size_t i = 1; i + 1 < boxes->size(); ++i) {
183 Box *box = (*boxes)[i]->box_;
184 if (box == nullptr) {
185 Box *prev = (*boxes)[i - 1]->box_;
186 Box *next = (*boxes)[i + 1]->box_;
187 ASSERT_HOST(prev != nullptr && next != nullptr);
188 int top = std::min(prev->y, next->y);
189 int bottom = std::max(prev->y + prev->h, next->y + next->h);
190 int left = prev->x + prev->w;
191 int right = next->x;
192 if (vertical_rules) {
193 top = prev->y + prev->h;
194 bottom = next->y;
195 left = std::min(prev->x, next->x);
196 right = std::max(prev->x + prev->w, next->x + next->w);
197 } else if (rtl_rules) {
198 // With RTL we have to account for BiDi.
199 // Right becomes the min left of all prior boxes back to the first
200 // space or newline.
201 right = prev->x;
202 left = next->x + next->w;
203 for (int j = i - 2; j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t"; --j) {
204 prev = (*boxes)[j]->box_;
205 ASSERT_HOST(prev != nullptr);
206 if (prev->x < right) {
207 right = prev->x;
208 }
209 }
210 // Left becomes the max right of all next boxes forward to the first
211 // space or newline.
212 for (size_t j = i + 2;
213 j < boxes->size() && (*boxes)[j]->box_ != nullptr && (*boxes)[j]->ch_ != "\t"; ++j) {
214 next = (*boxes)[j]->box_;
215 if (next->x + next->w > left) {
216 left = next->x + next->w;
217 }
218 }
219 }
220 // Italic and stylized characters can produce negative spaces, which
221 // Leptonica doesn't like, so clip to a positive size.
222 if (right <= left) {
223 right = left + 1;
224 }
225 if (bottom <= top) {
226 bottom = top + 1;
227 }
228 (*boxes)[i]->AddBox(left, top, right - left, bottom - top);
229 (*boxes)[i]->ch_ = " ";
230 }
231 }
232}
#define ASSERT_HOST(x)
Definition: errcode.h:54
def next(obj)
Definition: ast.py:56

◆ MostlyVertical()

bool tesseract::BoxChar::MostlyVertical ( const std::vector< BoxChar * > &  boxes)
static

Definition at line 273 of file boxchar.cpp.

273 {
274 int64_t total_dx = 0, total_dy = 0;
275 for (size_t i = 1; i < boxes.size(); ++i) {
276 if (boxes[i - 1]->box_ != nullptr && boxes[i]->box_ != nullptr &&
277 boxes[i - 1]->page_ == boxes[i]->page_) {
278 int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x;
279 int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y;
280 if (abs(dx) > abs(dy) * kMinNewlineRatio || abs(dy) > abs(dx) * kMinNewlineRatio) {
281 total_dx += static_cast<int64_t>(dx) * dx;
282 total_dy += static_cast<int64_t>(dy) * dy;
283 }
284 }
285 }
286 return total_dy > total_dx;
287}
const int kMinNewlineRatio
Definition: boxchar.cpp:35

◆ mutable_box()

Box * tesseract::BoxChar::mutable_box ( )
inline

Definition at line 69 of file boxchar.h.

69 {
70 return box_;
71 }

◆ mutable_ch()

std::string * tesseract::BoxChar::mutable_ch ( )
inline

Definition at line 66 of file boxchar.h.

66 {
67 return &ch_;
68 }

◆ operator<()

bool tesseract::BoxChar::operator< ( const BoxChar &  other) const
inline

Definition at line 75 of file boxchar.h.

75 {
76 if (box_ == nullptr) {
77 return true;
78 }
79 if (other.box_ == nullptr) {
80 return false;
81 }
82 return box_->x < other.box_->x;
83 }

◆ page()

const int & tesseract::BoxChar::page ( ) const
inline

Definition at line 49 of file boxchar.h.

49 {
50 return page_;
51 }

◆ PrepareToWrite()

void tesseract::BoxChar::PrepareToWrite ( std::vector< BoxChar * > *  boxes)
static

Definition at line 96 of file boxchar.cpp.

96 {
97 bool rtl_rules = ContainsMostlyRTL(*boxes);
98 bool vertical_rules = MostlyVertical(*boxes);
99 InsertNewlines(rtl_rules, vertical_rules, boxes);
100 InsertSpaces(rtl_rules, vertical_rules, boxes);
101 for (size_t i = 0; i < boxes->size(); ++i) {
102 if ((*boxes)[i]->box_ == nullptr) {
103 tprintf("Null box at index %zu\n", i);
104 }
105 }
106 if (rtl_rules) {
107 ReorderRTLText(boxes);
108 }
109}
static bool MostlyVertical(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:273
static void InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:179
static void ReorderRTLText(std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:236
static bool ContainsMostlyRTL(const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:263
static void InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
Definition: boxchar.cpp:113

◆ ReorderRTLText()

void tesseract::BoxChar::ReorderRTLText ( std::vector< BoxChar * > *  boxes)
static

Definition at line 236 of file boxchar.cpp.

236 {
237 // Ideally we need the inverse of the algorithm used by ResultIterator.
238 // For now, let's try a sort that reverses original positions for RTL
239 // characters, otherwise by x-position. This should be much closer to
240 // correct than just sorting by x-position.
241 size_t num_boxes = boxes->size();
242 for (size_t i = 0; i < num_boxes; ++i) {
243 int num_rtl = 0, num_ltr = 0;
244 (*boxes)[i]->GetDirection(&num_rtl, &num_ltr);
245 if (num_rtl > num_ltr) {
246 (*boxes)[i]->set_rtl_index(i);
247 (*boxes)[i]->ReverseUnicodesInBox();
248 }
249 }
250 BoxCharPtrSort sorter;
251 size_t end = 0;
252 for (size_t start = 0; start < boxes->size(); start = end + 1) {
253 end = start + 1;
254 while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") {
255 ++end;
256 }
257 std::sort(boxes->begin() + start, boxes->begin() + end, sorter);
258 }
259}

◆ ReverseUnicodesInBox()

void tesseract::BoxChar::ReverseUnicodesInBox ( )

Definition at line 76 of file boxchar.cpp.

76 {
77 std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(ch_.c_str());
78 std::reverse(unicodes.begin(), unicodes.end());
79 ch_ = UNICHAR::UTF32ToUTF8(unicodes);
80}
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:237

◆ RotateBoxes()

void tesseract::BoxChar::RotateBoxes ( float  rotation,
int  xcenter,
int  ycenter,
int  start_box,
int  end_box,
std::vector< BoxChar * > *  boxes 
)
static

Definition at line 302 of file boxchar.cpp.

303 {
304 Boxa *orig = boxaCreate(0);
305 for (int i = start_box; i < end_box; ++i) {
306 Box *box = (*boxes)[i]->box_;
307 if (box) {
308 boxaAddBox(orig, box, L_CLONE);
309 }
310 }
311 Boxa *rotated = boxaRotate(orig, xcenter, ycenter, rotation);
312 boxaDestroy(&orig);
313 for (int i = start_box, box_ind = 0; i < end_box; ++i) {
314 if ((*boxes)[i]->box_) {
315 boxDestroy(&((*boxes)[i]->box_));
316 (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE);
317 }
318 }
319 boxaDestroy(&rotated);
320}

◆ rtl_index()

const int & tesseract::BoxChar::rtl_index ( ) const
inline

Definition at line 55 of file boxchar.h.

55 {
56 return rtl_index_;
57 }

◆ set_page()

void tesseract::BoxChar::set_page ( int  page)
inline

Definition at line 62 of file boxchar.h.

62 {
63 page_ = page;
64 }
const int & page() const
Definition: boxchar.h:49

◆ set_rtl_index()

void tesseract::BoxChar::set_rtl_index ( int  index)
inline

Definition at line 52 of file boxchar.h.

52 {
53 rtl_index_ = index;
54 }

◆ TotalByteLength()

int tesseract::BoxChar::TotalByteLength ( const std::vector< BoxChar * > &  boxes)
static

Definition at line 291 of file boxchar.cpp.

291 {
292 int total_length = 0;
293 for (auto boxe : boxes) {
294 total_length += boxe->ch_.size();
295 }
296 return total_length;
297}

◆ TranslateBoxes()

void tesseract::BoxChar::TranslateBoxes ( int  xshift,
int  yshift,
std::vector< BoxChar * > *  boxes 
)
static

Definition at line 83 of file boxchar.cpp.

83 {
84 for (auto &boxe : *boxes) {
85 Box *box = boxe->box_;
86 if (box != nullptr) {
87 box->x += xshift;
88 box->y += yshift;
89 }
90 }
91}

◆ WriteTesseractBoxFile()

void tesseract::BoxChar::WriteTesseractBoxFile ( const std::string &  name,
int  height,
const std::vector< BoxChar * > &  boxes 
)
static

Definition at line 324 of file boxchar.cpp.

325 {
326 std::string output = GetTesseractBoxStr(height, boxes);
327 File::WriteStringToFileOrDie(output, filename);
328}
static std::string GetTesseractBoxStr(int height, const std::vector< BoxChar * > &boxes)
Definition: boxchar.cpp:331
static void WriteStringToFileOrDie(const std::string &str, const std::string &filename)
Definition: fileio.cpp:54

The documentation for this class was generated from the following files: