28#include "unicode/uchar.h"
40 : ch_(utf8_str, len), box_(nullptr), page_(0), rtl_index_(-1) {}
47 box_ = boxCreate(
x,
y, width, height);
55 if (uni_vector.empty()) {
56 tprintf(
"Illegal utf8 in boxchar string:%s = ", ch_.c_str());
64 UCharDirection dir = u_charDirection(
ch);
65 if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC || dir == U_RIGHT_TO_LEFT_ISOLATE) {
67 }
else if ((dir == U_ARABIC_NUMBER) ||
68 (dir != U_DIR_NON_SPACING_MARK && dir != U_BOUNDARY_NEUTRAL)) {
78 std::reverse(unicodes.begin(), unicodes.end());
84 for (
auto &boxe : *boxes) {
85 Box *
box = boxe->box_;
101 for (
size_t i = 0;
i < boxes->size(); ++
i) {
102 if ((*boxes)[
i]->box_ ==
nullptr) {
103 tprintf(
"Null box at index %zu\n",
i);
114 size_t prev_i = SIZE_MAX;
116 for (
size_t i = 0;
i < boxes->size(); ++
i) {
117 Box *
box = (*boxes)[
i]->box_;
118 if (
box ==
nullptr) {
119 if (prev_i == SIZE_MAX || prev_i + 1 <
i ||
i + 1 == boxes->size()) {
123 boxes->erase(boxes->begin() +
i);
127 }
while (
i-- == boxes->size() && (*boxes)[
i]->box_ ==
nullptr);
131 if (prev_i != SIZE_MAX) {
132 Box *prev_box = (*boxes)[prev_i]->box_;
133 int shift =
box->x - prev_box->x;
134 if (vertical_rules) {
135 shift =
box->y - prev_box->y;
136 }
else if (rtl_rules) {
139 if (-shift > max_shift) {
145 int x = prev_box->x + prev_box->w;
147 if (vertical_rules) {
149 y = prev_box->y + prev_box->h;
150 }
else if (rtl_rules) {
151 x = prev_box->x - width;
153 tprintf(
"prev x = %d, width=%d\n", prev_box->x, width);
157 if (prev_i + 1 ==
i) {
159 auto *new_box =
new BoxChar(
"\t", 1);
160 new_box->AddBox(
x,
y, width, height);
161 new_box->page_ = (*boxes)[
i]->page_;
162 boxes->insert(boxes->begin() +
i, new_box);
165 (*boxes)[
i - 1]->AddBox(
x,
y, width, height);
166 (*boxes)[
i - 1]->ch_ =
"\t";
169 }
else if (shift > max_shift) {
182 for (
size_t i = 1;
i + 1 < boxes->size(); ++
i) {
183 Box *
box = (*boxes)[
i]->box_;
184 if (
box ==
nullptr) {
185 Box *prev = (*boxes)[
i - 1]->box_;
186 Box *
next = (*boxes)[
i + 1]->box_;
188 int top = std::min(prev->y,
next->y);
189 int bottom = std::max(prev->y + prev->h,
next->y +
next->h);
190 int left = prev->x + prev->w;
192 if (vertical_rules) {
193 top = prev->y + prev->h;
195 left = std::min(prev->x,
next->x);
196 right = std::max(prev->x + prev->w,
next->x +
next->w);
197 }
else if (rtl_rules) {
203 for (
int j =
i - 2; j >= 0 && (*boxes)[j]->ch_ !=
" " && (*boxes)[j]->ch_ !=
"\t"; --j) {
204 prev = (*boxes)[j]->box_;
206 if (prev->x < right) {
212 for (
size_t j =
i + 2;
213 j < boxes->size() && (*boxes)[j]->box_ !=
nullptr && (*boxes)[j]->ch_ !=
"\t"; ++j) {
214 next = (*boxes)[j]->box_;
228 (*boxes)[
i]->AddBox(left, top, right - left, bottom - top);
229 (*boxes)[
i]->ch_ =
" ";
241 size_t num_boxes = boxes->size();
242 for (
size_t i = 0;
i < num_boxes; ++
i) {
243 int num_rtl = 0, num_ltr = 0;
244 (*boxes)[
i]->GetDirection(&num_rtl, &num_ltr);
245 if (num_rtl > num_ltr) {
246 (*boxes)[
i]->set_rtl_index(
i);
247 (*boxes)[
i]->ReverseUnicodesInBox();
252 for (
size_t start = 0; start < boxes->size(); start = end + 1) {
254 while (end < boxes->size() && (*boxes)[end]->ch_ !=
"\t") {
257 std::sort(boxes->begin() + start, boxes->begin() + end, sorter);
264 int num_rtl = 0, num_ltr = 0;
265 for (
auto boxe : boxes) {
266 boxe->GetDirection(&num_rtl, &num_ltr);
268 return num_rtl > num_ltr;
274 int64_t total_dx = 0, total_dy = 0;
275 for (
size_t i = 1;
i < boxes.size(); ++
i) {
276 if (boxes[
i - 1]->box_ !=
nullptr && boxes[
i]->box_ !=
nullptr &&
277 boxes[
i - 1]->page_ == boxes[
i]->page_) {
278 int dx = boxes[
i]->box_->x - boxes[
i - 1]->box_->x;
279 int dy = boxes[
i]->box_->y - boxes[
i - 1]->box_->y;
281 total_dx +=
static_cast<int64_t
>(dx) * dx;
282 total_dy +=
static_cast<int64_t
>(dy) * dy;
286 return total_dy > total_dx;
292 int total_length = 0;
293 for (
auto boxe : boxes) {
294 total_length += boxe->ch_.size();
303 std::vector<BoxChar *> *boxes) {
304 Boxa *orig = boxaCreate(0);
305 for (
int i = start_box;
i < end_box; ++
i) {
306 Box *
box = (*boxes)[
i]->box_;
308 boxaAddBox(orig,
box, L_CLONE);
311 Boxa *rotated = boxaRotate(orig, xcenter, ycenter, rotation);
313 for (
int i = start_box, box_ind = 0;
i < end_box; ++
i) {
314 if ((*boxes)[
i]->box_) {
315 boxDestroy(&((*boxes)[
i]->box_));
316 (*boxes)[
i]->box_ = boxaGetBox(rotated, box_ind++, L_CLONE);
319 boxaDestroy(&rotated);
325 const std::vector<BoxChar *> &boxes) {
334 for (
auto boxe : boxes) {
335 const Box *
box = boxe->box_;
336 if (
box ==
nullptr) {
337 tprintf(
"Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n");
340 int nbytes = snprintf(buffer,
kMaxLineLength,
"%s %d %d %d %d %d\n", boxe->ch_.c_str(),
box->x,
341 height -
box->y -
box->h,
box->x +
box->w, height -
box->y, boxe->page_);
342 output.append(buffer, nbytes);
const int kMinNewlineRatio
void tprintf(const char *format,...)
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
static bool MostlyVertical(const std::vector< BoxChar * > &boxes)
static void WriteTesseractBoxFile(const std::string &name, int height, const std::vector< BoxChar * > &boxes)
static void PrepareToWrite(std::vector< BoxChar * > *boxes)
static void InsertSpaces(bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
BoxChar(const char *utf8_str, int len)
void ReverseUnicodesInBox()
void GetDirection(int *num_rtl, int *num_ltr) const
static void ReorderRTLText(std::vector< BoxChar * > *boxes)
static int TotalByteLength(const std::vector< BoxChar * > &boxes)
static std::string GetTesseractBoxStr(int height, const std::vector< BoxChar * > &boxes)
static void TranslateBoxes(int xshift, int yshift, std::vector< BoxChar * > *boxes)
void AddBox(int x, int y, int width, int height)
static void RotateBoxes(float rotation, int xcenter, int ycenter, int start_box, int end_box, std::vector< BoxChar * > *boxes)
const std::string & ch() const
static bool ContainsMostlyRTL(const std::vector< BoxChar * > &boxes)
static void InsertNewlines(bool rtl_rules, bool vertical_rules, std::vector< BoxChar * > *boxes)
static void WriteStringToFileOrDie(const std::string &str, const std::string &filename)