tesseract  4.00.00dev
tesseract::BoxChar Class Reference

#include <boxchar.h>

Public Member Functions

 BoxChar (const char *utf8_str, int len)
 
 ~BoxChar ()
 
const string & ch () const
 
const Box * box () const
 
const int & page () const
 
void set_rtl_index (int index)
 
const int & rtl_index () const
 
void AddBox (int x, int y, int width, int height)
 
void set_page (int page)
 
string * mutable_ch ()
 
Box * mutable_box ()
 
bool operator< (const BoxChar &other) const
 
void GetDirection (int *num_rtl, int *num_ltr) const
 
void ReverseUnicodesInBox ()
 

Static Public Member Functions

static void TranslateBoxes (int xshift, int yshift, std::vector< BoxChar *> *boxes)
 
static void PrepareToWrite (std::vector< BoxChar *> *boxes)
 
static void InsertNewlines (bool rtl_rules, bool vertical_rules, std::vector< BoxChar *> *boxes)
 
static void InsertSpaces (bool rtl_rules, bool vertical_rules, std::vector< BoxChar *> *boxes)
 
static void ReorderRTLText (std::vector< BoxChar *> *boxes)
 
static bool ContainsMostlyRTL (const std::vector< BoxChar *> &boxes)
 
static bool MostlyVertical (const std::vector< BoxChar *> &boxes)
 
static int TotalByteLength (const std::vector< BoxChar *> &boxes)
 
static void RotateBoxes (float rotation, int xcenter, int ycenter, int start_box, int end_box, std::vector< BoxChar *> *boxes)
 
static void WriteTesseractBoxFile (const string &name, int height, const std::vector< BoxChar *> &boxes)
 
static string GetTesseractBoxStr (int height, const std::vector< BoxChar *> &boxes)
 

Detailed Description

Definition at line 36 of file boxchar.h.

Constructor & Destructor Documentation

◆ BoxChar()

tesseract::BoxChar::BoxChar ( const char *  utf8_str,
int  len 
)

Definition at line 42 of file boxchar.cpp.

43  : ch_(utf8_str, len), box_(nullptr), page_(0), rtl_index_(-1) {}

◆ ~BoxChar()

tesseract::BoxChar::~BoxChar ( )

Definition at line 45 of file boxchar.cpp.

45 { boxDestroy(&box_); }

Member Function Documentation

◆ AddBox()

void tesseract::BoxChar::AddBox ( int  x,
int  y,
int  width,
int  height 
)

Definition at line 47 of file boxchar.cpp.

47  {
48  box_ = boxCreate(x, y, width, height);
49 }

◆ box()

const Box* tesseract::BoxChar::box ( ) const
inline

Definition at line 44 of file boxchar.h.

44 { return box_; }

◆ ch()

const string& tesseract::BoxChar::ch ( ) const
inline

Definition at line 43 of file boxchar.h.

43 { return ch_; }

◆ ContainsMostlyRTL()

bool tesseract::BoxChar::ContainsMostlyRTL ( const std::vector< BoxChar *> &  boxes)
static

Definition at line 261 of file boxchar.cpp.

261  {
262  int num_rtl = 0, num_ltr = 0;
263  for (int i = 0; i < boxes.size(); ++i) {
264  boxes[i]->GetDirection(&num_rtl, &num_ltr);
265  }
266  return num_rtl > num_ltr;
267 }

◆ GetDirection()

void tesseract::BoxChar::GetDirection ( int *  num_rtl,
int *  num_ltr 
) const

Definition at line 53 of file boxchar.cpp.

53  {
54  // Convert the unichar to UTF32 representation
55  std::vector<char32> uni_vector = UNICHAR::UTF8ToUTF32(ch_.c_str());
56  if (uni_vector.empty()) {
57  tprintf("Illegal utf8 in boxchar string:%s = ", ch_.c_str());
58  for (int c = 0; c < ch_.size(); ++c) {
59  tprintf(" 0x%x", ch_[c]);
60  }
61  tprintf("\n");
62  return;
63  }
64  for (char32 ch : uni_vector) {
65  UCharDirection dir = u_charDirection(ch);
66  if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC ||
67  dir == U_ARABIC_NUMBER || dir == U_RIGHT_TO_LEFT_ISOLATE) {
68  ++*num_rtl;
69  } else if (dir != U_DIR_NON_SPACING_MARK && dir != U_BOUNDARY_NEUTRAL) {
70  ++*num_ltr;
71  }
72  }
73 }
const string & ch() const
Definition: boxchar.h:43
#define tprintf(...)
Definition: tprintf.h:31
signed int char32
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:213

◆ GetTesseractBoxStr()

string tesseract::BoxChar::GetTesseractBoxStr ( int  height,
const std::vector< BoxChar *> &  boxes 
)
static

Definition at line 328 of file boxchar.cpp.

329  {
330  string output;
331  char buffer[kMaxLineLength];
332  for (size_t i = 0; i < boxes.size(); ++i) {
333  const Box* box = boxes[i]->box_;
334  if (box == nullptr) {
335  tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n");
336  return "";
337  }
338  int nbytes =
339  snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n",
340  boxes[i]->ch_.c_str(), box->x, height - box->y - box->h,
341  box->x + box->w, height - box->y, boxes[i]->page_);
342  output.append(buffer, nbytes);
343  }
344  return output;
345 }
const int kMaxLineLength
Definition: boxchar.cpp:319
#define tprintf(...)
Definition: tprintf.h:31
const Box * box() const
Definition: boxchar.h:44

◆ InsertNewlines()

void tesseract::BoxChar::InsertNewlines ( bool  rtl_rules,
bool  vertical_rules,
std::vector< BoxChar *> *  boxes 
)
static

Definition at line 113 of file boxchar.cpp.

114  {
115  int prev_i = -1;
116  int max_shift = 0;
117  for (size_t i = 0; i < boxes->size(); ++i) {
118  Box* box = (*boxes)[i]->box_;
119  if (box == nullptr) {
120  if (prev_i < 0 || prev_i + 1 < i || i + 1 == boxes->size()) {
121  // Erase null boxes at the start of a line and after another null box.
122  do {
123  delete (*boxes)[i];
124  boxes->erase(boxes->begin() + i);
125  if (i == 0) break;
126  } while (i-- == boxes->size() && (*boxes)[i]->box_ == nullptr);
127  }
128  continue;
129  }
130  if (prev_i >= 0) {
131  Box* prev_box = (*boxes)[prev_i]->box_;
132  int shift = box->x - prev_box->x;
133  if (vertical_rules) {
134  shift = box->y - prev_box->y;
135  } else if (rtl_rules) {
136  shift = -shift;
137  }
138  if (-shift > max_shift) {
139  // This is a newline. Since nothing cares about the size of the box,
140  // except the out-of-bounds checker, minimize the chance of creating
141  // a box outside the image by making the width and height 1.
142  int width = 1;
143  int height = 1;
144  int x = prev_box->x + prev_box->w;
145  int y = prev_box->y;
146  if (vertical_rules) {
147  x = prev_box->x;
148  y = prev_box->y + prev_box->h;
149  } else if (rtl_rules) {
150  x = prev_box->x - width;
151  if (x < 0) {
152  tprintf("prev x = %d, width=%d\n", prev_box->x, width);
153  x = 0;
154  }
155  }
156  if (prev_i + 1 == i) {
157  // New character needed.
158  BoxChar* new_box = new BoxChar("\t", 1);
159  new_box->AddBox(x, y, width, height);
160  new_box->page_ = (*boxes)[i]->page_;
161  boxes->insert(boxes->begin() + i, new_box);
162  ++i;
163  } else {
164  (*boxes)[i - 1]->AddBox(x, y, width, height);
165  (*boxes)[i - 1]->ch_ = "\t";
166  }
167  max_shift = 0;
168  } else if (shift > max_shift) {
169  max_shift = shift;
170  }
171  }
172  prev_i = i;
173  }
174 }
BoxChar(const char *utf8_str, int len)
Definition: boxchar.cpp:42
#define tprintf(...)
Definition: tprintf.h:31
const Box * box() const
Definition: boxchar.h:44

◆ InsertSpaces()

void tesseract::BoxChar::InsertSpaces ( bool  rtl_rules,
bool  vertical_rules,
std::vector< BoxChar *> *  boxes 
)
static

Definition at line 178 of file boxchar.cpp.

179  {
180  // After InsertNewlines, any remaining null boxes are not newlines, and are
181  // singletons, so add a box to each remaining null box.
182  for (size_t i = 1; i + 1 < boxes->size(); ++i) {
183  Box* box = (*boxes)[i]->box_;
184  if (box == nullptr) {
185  Box* prev = (*boxes)[i - 1]->box_;
186  Box* next = (*boxes)[i + 1]->box_;
187  ASSERT_HOST(prev != nullptr && next != nullptr);
188  int top = MIN(prev->y, next->y);
189  int bottom = MAX(prev->y + prev->h, next->y + next->h);
190  int left = prev->x + prev->w;
191  int right = next->x;
192  if (vertical_rules) {
193  top = prev->y + prev->h;
194  bottom = next->y;
195  left = MIN(prev->x, next->x);
196  right = MAX(prev->x + prev->w, next->x + next->w);
197  } else if (rtl_rules) {
198  // With RTL we have to account for BiDi.
199  // Right becomes the min left of all prior boxes back to the first
200  // space or newline.
201  right = prev->x;
202  left = next->x + next->w;
203  for (int j = i - 2;
204  j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t";
205  --j) {
206  prev = (*boxes)[j]->box_;
207  ASSERT_HOST(prev != nullptr);
208  if (prev->x < right) {
209  right = prev->x;
210  }
211  }
212  // Left becomes the max right of all next boxes forward to the first
213  // space or newline.
214  for (size_t j = i + 2;
215  j < boxes->size() && (*boxes)[j]->box_ != nullptr &&
216  (*boxes)[j]->ch_ != "\t";
217  ++j) {
218  next = (*boxes)[j]->box_;
219  if (next->x + next->w > left) {
220  left = next->x + next->w;
221  }
222  }
223  }
224  // Italic and stylized characters can produce negative spaces, which
225  // Leptonica doesn't like, so clip to a positive size.
226  if (right <= left) right = left + 1;
227  if (bottom <= top) bottom = top + 1;
228  (*boxes)[i]->AddBox(left, top, right - left, bottom - top);
229  (*boxes)[i]->ch_ = " ";
230  }
231  }
232 }
#define MIN(x, y)
Definition: ndminx.h:28
#define MAX(x, y)
Definition: ndminx.h:24
#define ASSERT_HOST(x)
Definition: errcode.h:84
const Box * box() const
Definition: boxchar.h:44

◆ MostlyVertical()

bool tesseract::BoxChar::MostlyVertical ( const std::vector< BoxChar *> &  boxes)
static

Definition at line 271 of file boxchar.cpp.

271  {
272  inT64 total_dx = 0, total_dy = 0;
273  for (size_t i = 1; i < boxes.size(); ++i) {
274  if (boxes[i - 1]->box_ != nullptr && boxes[i]->box_ != nullptr &&
275  boxes[i - 1]->page_ == boxes[i]->page_) {
276  int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x;
277  int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y;
278  if (abs(dx) > abs(dy) * kMinNewlineRatio ||
279  abs(dy) > abs(dx) * kMinNewlineRatio) {
280  total_dx += dx * dx;
281  total_dy += dy * dy;
282  }
283  }
284  }
285  return total_dy > total_dx;
286 }
const int kMinNewlineRatio
Definition: boxchar.cpp:38
int64_t inT64
Definition: host.h:40

◆ mutable_box()

Box* tesseract::BoxChar::mutable_box ( )
inline

Definition at line 55 of file boxchar.h.

55 { return box_; }

◆ mutable_ch()

string* tesseract::BoxChar::mutable_ch ( )
inline

Definition at line 54 of file boxchar.h.

54 { return &ch_; }

◆ operator<()

bool tesseract::BoxChar::operator< ( const BoxChar &  other) const
inline

Definition at line 59 of file boxchar.h.

59  {
60  if (box_ == nullptr) return true;
61  if (other.box_ == nullptr) return false;
62  return box_->x < other.box_->x;
63  }

◆ page()

const int& tesseract::BoxChar::page ( ) const
inline

Definition at line 45 of file boxchar.h.

45 { return page_; }

◆ PrepareToWrite()

void tesseract::BoxChar::PrepareToWrite ( std::vector< BoxChar *> *  boxes)
static

Definition at line 98 of file boxchar.cpp.

98  {
99  bool rtl_rules = ContainsMostlyRTL(*boxes);
100  bool vertical_rules = MostlyVertical(*boxes);
101  InsertNewlines(rtl_rules, vertical_rules, boxes);
102  InsertSpaces(rtl_rules, vertical_rules, boxes);
103  for (unsigned int i = 0; i < boxes->size(); ++i) {
104  if ((*boxes)[i]->box_ == nullptr) tprintf("Null box at index %u\n", i);
105  }
106  if (rtl_rules) {
107  ReorderRTLText(boxes);
108  }
109 }
static bool ContainsMostlyRTL(const std::vector< BoxChar *> &boxes)
Definition: boxchar.cpp:261
static void InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector< BoxChar *> *boxes)
Definition: boxchar.cpp:178
static bool MostlyVertical(const std::vector< BoxChar *> &boxes)
Definition: boxchar.cpp:271
#define tprintf(...)
Definition: tprintf.h:31
static void ReorderRTLText(std::vector< BoxChar *> *boxes)
Definition: boxchar.cpp:236
static void InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector< BoxChar *> *boxes)
Definition: boxchar.cpp:113

◆ ReorderRTLText()

void tesseract::BoxChar::ReorderRTLText ( std::vector< BoxChar *> *  boxes)
static

Definition at line 236 of file boxchar.cpp.

236  {
237  // Ideally we need the inverse of the algorithm used by ResultIterator.
238  // For now, let's try a sort that reverses original positions for RTL
239  // characters, otherwise by x-position. This should be much closer to
240  // correct than just sorting by x-position.
241  int num_boxes = boxes->size();
242  for (int i = 0; i < num_boxes; ++i) {
243  int num_rtl = 0, num_ltr = 0;
244  (*boxes)[i]->GetDirection(&num_rtl, &num_ltr);
245  if (num_rtl > num_ltr) {
246  (*boxes)[i]->set_rtl_index(i);
247  (*boxes)[i]->ReverseUnicodesInBox();
248  }
249  }
250  BoxCharPtrSort sorter;
251  size_t end = 0;
252  for (size_t start = 0; start < boxes->size(); start = end + 1) {
253  end = start + 1;
254  while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") ++end;
255  std::sort(boxes->begin() + start, boxes->begin() + end, sorter);
256  }
257 }

◆ ReverseUnicodesInBox()

void tesseract::BoxChar::ReverseUnicodesInBox ( )

Definition at line 77 of file boxchar.cpp.

77  {
78  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(ch_.c_str());
79  std::reverse(unicodes.begin(), unicodes.end());
80  ch_ = UNICHAR::UTF32ToUTF8(unicodes);
81 }
LIST reverse(LIST list)
Definition: oldlist.cpp:319
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:213
static string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:230

◆ RotateBoxes()

void tesseract::BoxChar::RotateBoxes ( float  rotation,
int  xcenter,
int  ycenter,
int  start_box,
int  end_box,
std::vector< BoxChar *> *  boxes 
)
static

Definition at line 300 of file boxchar.cpp.

302  {
303  Boxa* orig = boxaCreate(0);
304  for (int i = start_box; i < end_box; ++i) {
305  BOX* box = (*boxes)[i]->box_;
306  if (box) boxaAddBox(orig, box, L_CLONE);
307  }
308  Boxa* rotated = boxaRotate(orig, xcenter, ycenter, rotation);
309  boxaDestroy(&orig);
310  for (int i = start_box, box_ind = 0; i < end_box; ++i) {
311  if ((*boxes)[i]->box_) {
312  boxDestroy(&((*boxes)[i]->box_));
313  (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE);
314  }
315  }
316  boxaDestroy(&rotated);
317 }
const Box * box() const
Definition: boxchar.h:44

◆ rtl_index()

const int& tesseract::BoxChar::rtl_index ( ) const
inline

Definition at line 47 of file boxchar.h.

47 { return rtl_index_; }

◆ set_page()

void tesseract::BoxChar::set_page ( int  page)
inline

Definition at line 52 of file boxchar.h.

52 { page_ = page; }
const int & page() const
Definition: boxchar.h:45

◆ set_rtl_index()

void tesseract::BoxChar::set_rtl_index ( int  index)
inline

Definition at line 46 of file boxchar.h.

46 { rtl_index_ = index; }

◆ TotalByteLength()

int tesseract::BoxChar::TotalByteLength ( const std::vector< BoxChar *> &  boxes)
static

Definition at line 290 of file boxchar.cpp.

290  {
291  int total_length = 0;
292  for (size_t i = 0; i < boxes.size(); ++i)
293  total_length += boxes[i]->ch_.size();
294  return total_length;
295 }

◆ TranslateBoxes()

void tesseract::BoxChar::TranslateBoxes ( int  xshift,
int  yshift,
std::vector< BoxChar *> *  boxes 
)
static

Definition at line 84 of file boxchar.cpp.

85  {
86  for (size_t i = 0; i < boxes->size(); ++i) {
87  BOX* box = (*boxes)[i]->box_;
88  if (box != nullptr) {
89  box->x += xshift;
90  box->y += yshift;
91  }
92  }
93 }
const Box * box() const
Definition: boxchar.h:44

◆ WriteTesseractBoxFile()

void tesseract::BoxChar::WriteTesseractBoxFile ( const string &  name,
int  height,
const std::vector< BoxChar *> &  boxes 
)
static

Definition at line 321 of file boxchar.cpp.

322  {
323  string output = GetTesseractBoxStr(height, boxes);
324  File::WriteStringToFileOrDie(output, name);
325 }
static string GetTesseractBoxStr(int height, const std::vector< BoxChar *> &boxes)
Definition: boxchar.cpp:328
static void WriteStringToFileOrDie(const string &str, const string &filename)
Definition: fileio.cpp:52

The documentation for this class was generated from the following files: