35 #include "unicode/uchar.h" 43 : ch_(utf8_str, len), box_(nullptr), page_(0), rtl_index_(-1) {}
48 box_ = boxCreate(x, y, width, height);
56 if (uni_vector.empty()) {
57 tprintf(
"Illegal utf8 in boxchar string:%s = ", ch_.c_str());
58 for (
int c = 0; c < ch_.size(); ++c) {
65 UCharDirection dir = u_charDirection(
ch);
66 if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC ||
67 dir == U_ARABIC_NUMBER || dir == U_RIGHT_TO_LEFT_ISOLATE) {
69 }
else if (dir != U_DIR_NON_SPACING_MARK && dir != U_BOUNDARY_NEUTRAL) {
85 std::vector<BoxChar*>* boxes) {
86 for (
size_t i = 0; i < boxes->size(); ++i) {
87 BOX*
box = (*boxes)[i]->box_;
103 for (
unsigned int i = 0; i < boxes->size(); ++i) {
104 if ((*boxes)[i]->box_ ==
nullptr)
tprintf(
"Null box at index %u\n", i);
114 std::vector<BoxChar*>* boxes) {
117 for (
size_t i = 0; i < boxes->size(); ++i) {
118 Box*
box = (*boxes)[i]->box_;
119 if (box ==
nullptr) {
120 if (prev_i < 0 || prev_i + 1 < i || i + 1 == boxes->size()) {
124 boxes->erase(boxes->begin() + i);
126 }
while (i-- == boxes->size() && (*boxes)[i]->box_ ==
nullptr);
131 Box* prev_box = (*boxes)[prev_i]->box_;
132 int shift = box->x - prev_box->x;
133 if (vertical_rules) {
134 shift = box->y - prev_box->y;
135 }
else if (rtl_rules) {
138 if (-shift > max_shift) {
144 int x = prev_box->x + prev_box->w;
146 if (vertical_rules) {
148 y = prev_box->y + prev_box->h;
149 }
else if (rtl_rules) {
150 x = prev_box->x - width;
152 tprintf(
"prev x = %d, width=%d\n", prev_box->x, width);
156 if (prev_i + 1 == i) {
159 new_box->
AddBox(x, y, width, height);
160 new_box->page_ = (*boxes)[i]->page_;
161 boxes->insert(boxes->begin() + i, new_box);
164 (*boxes)[i - 1]->AddBox(x, y, width, height);
165 (*boxes)[i - 1]->ch_ =
"\t";
168 }
else if (shift > max_shift) {
179 std::vector<BoxChar*>* boxes) {
182 for (
size_t i = 1; i + 1 < boxes->size(); ++i) {
183 Box*
box = (*boxes)[i]->box_;
184 if (box ==
nullptr) {
185 Box* prev = (*boxes)[i - 1]->box_;
186 Box* next = (*boxes)[i + 1]->box_;
188 int top =
MIN(prev->y, next->y);
189 int bottom =
MAX(prev->y + prev->h, next->y + next->h);
190 int left = prev->x + prev->w;
192 if (vertical_rules) {
193 top = prev->y + prev->h;
195 left =
MIN(prev->x, next->x);
196 right =
MAX(prev->x + prev->w, next->x + next->w);
197 }
else if (rtl_rules) {
202 left = next->x + next->w;
204 j >= 0 && (*boxes)[j]->ch_ !=
" " && (*boxes)[j]->ch_ !=
"\t";
206 prev = (*boxes)[j]->box_;
208 if (prev->x < right) {
214 for (
size_t j = i + 2;
215 j < boxes->size() && (*boxes)[j]->box_ !=
nullptr &&
216 (*boxes)[j]->ch_ !=
"\t";
218 next = (*boxes)[j]->box_;
219 if (next->x + next->w > left) {
220 left = next->x + next->w;
226 if (right <= left) right = left + 1;
227 if (bottom <= top) bottom = top + 1;
228 (*boxes)[i]->AddBox(left, top, right - left, bottom - top);
229 (*boxes)[i]->ch_ =
" ";
241 int num_boxes = boxes->size();
242 for (
int i = 0; i < num_boxes; ++i) {
243 int num_rtl = 0, num_ltr = 0;
244 (*boxes)[i]->GetDirection(&num_rtl, &num_ltr);
245 if (num_rtl > num_ltr) {
246 (*boxes)[i]->set_rtl_index(i);
247 (*boxes)[i]->ReverseUnicodesInBox();
252 for (
size_t start = 0; start < boxes->size(); start = end + 1) {
254 while (end < boxes->size() && (*boxes)[end]->ch_ !=
"\t") ++end;
255 std::sort(boxes->begin() + start, boxes->begin() + end, sorter);
262 int num_rtl = 0, num_ltr = 0;
263 for (
int i = 0; i < boxes.size(); ++i) {
264 boxes[i]->GetDirection(&num_rtl, &num_ltr);
266 return num_rtl > num_ltr;
272 inT64 total_dx = 0, total_dy = 0;
273 for (
size_t i = 1; i < boxes.size(); ++i) {
274 if (boxes[i - 1]->box_ !=
nullptr && boxes[i]->box_ !=
nullptr &&
275 boxes[i - 1]->page_ == boxes[i]->page_) {
276 int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x;
277 int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y;
285 return total_dy > total_dx;
291 int total_length = 0;
292 for (
size_t i = 0; i < boxes.size(); ++i)
293 total_length += boxes[i]->ch_.size();
301 int start_box,
int end_box,
302 std::vector<BoxChar*>* boxes) {
303 Boxa* orig = boxaCreate(0);
304 for (
int i = start_box; i < end_box; ++i) {
305 BOX*
box = (*boxes)[i]->box_;
306 if (box) boxaAddBox(orig, box, L_CLONE);
308 Boxa* rotated = boxaRotate(orig, xcenter, ycenter, rotation);
310 for (
int i = start_box, box_ind = 0; i < end_box; ++i) {
311 if ((*boxes)[i]->box_) {
312 boxDestroy(&((*boxes)[i]->box_));
313 (*boxes)[i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE);
316 boxaDestroy(&rotated);
322 const std::vector<BoxChar*>& boxes) {
329 const std::vector<BoxChar*>& boxes) {
332 for (
size_t i = 0; i < boxes.size(); ++i) {
333 const Box*
box = boxes[i]->box_;
334 if (box ==
nullptr) {
335 tprintf(
"Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n");
339 snprintf(buffer, kMaxLineLength,
"%s %d %d %d %d %d\n",
340 boxes[i]->ch_.c_str(), box->x, height - box->y - box->h,
341 box->x + box->w, height - box->y, boxes[i]->page_);
342 output.append(buffer, nbytes);
static bool ContainsMostlyRTL(const std::vector< BoxChar *> &boxes)
const string & ch() const
static string GetTesseractBoxStr(int height, const std::vector< BoxChar *> &boxes)
static void InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector< BoxChar *> *boxes)
BoxChar(const char *utf8_str, int len)
void ReverseUnicodesInBox()
static bool MostlyVertical(const std::vector< BoxChar *> &boxes)
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, std::vector< BoxChar *> *boxes)
void GetDirection(int *num_rtl, int *num_ltr) const
static void ReorderRTLText(std::vector< BoxChar *> *boxes)
void AddBox(int x, int y, int width, int height)
static void TranslateBoxes(int xshift, int yshift, std::vector< BoxChar *> *boxes)
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
static void InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector< BoxChar *> *boxes)
static string UTF32ToUTF8(const std::vector< char32 > &str32)
static int TotalByteLength(const std::vector< BoxChar *> &boxes)
static void WriteTesseractBoxFile(const string &name, int height, const std::vector< BoxChar *> &boxes)
const int kMinNewlineRatio
static void PrepareToWrite(std::vector< BoxChar *> *boxes)
static void WriteStringToFileOrDie(const string &str, const string &filename)