27#include <allheaders.h>
32static const char *
const kLRM =
"\u200E";
33static const char *
const kRLM =
"\u200F";
38 in_minor_direction_ =
false;
39 at_beginning_of_minor_run_ =
false;
40 preserve_interword_spaces_ =
false;
42 auto *
p = ParamUtils::FindParam<BoolParam>(
45 preserve_interword_spaces_ = (bool)(*
p);
48 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
49 MoveToLogicalStartOfTextline();
57 return current_paragraph_is_ltr_;
60bool ResultIterator::CurrentParagraphIsLtr()
const {
65 it.RestartParagraph();
91 num_rtl = leftmost_rtl ? 1 : 0;
98 num_ltr += rightmost_ltr ? 1 : 0;
114 return num_ltr >= num_rtl;
121void ResultIterator::CalculateBlobOrder(std::vector<int> *blob_indices)
const {
122 bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
123 blob_indices->clear();
130 blob_indices->push_back(
i);
147 std::vector<int> letter_types;
154 if (letter_types[
i] == U_EURO_NUM && letter_types[
i + 2] == U_EURO_NUM &&
155 (letter_types[
i + 1] == U_EURO_NUM_SEP || letter_types[
i + 1] == U_COMMON_NUM_SEP)) {
156 letter_types[
i + 1] = U_EURO_NUM;
162 if (letter_types[
i] == U_EURO_NUM_TERM) {
164 while (j <
word_length_ && letter_types[j] == U_EURO_NUM_TERM) {
167 if (j <
word_length_ && letter_types[j] == U_EURO_NUM) {
169 for (
int k =
i; k < j; k++) {
170 letter_types[k] = U_EURO_NUM;
174 while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) {
177 if (j > -1 && letter_types[j] == U_EURO_NUM) {
179 for (
int k = j; k <=
i; k++) {
180 letter_types[k] = U_EURO_NUM;
189 int ti = letter_types[
i];
190 if (ti == U_LTR || ti == U_EURO_NUM) {
194 int tj = letter_types[j];
195 if (tj == U_LTR || tj == U_EURO_NUM) {
197 }
else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
204 for (
int k =
i; k <= last_good; k++) {
205 letter_types[k] = U_LTR;
209 letter_types[
i] = U_RTL;
216 if (letter_types[
i] == U_RTL) {
217 blob_indices->push_back(
i);
222 for (; j >= 0 && letter_types[j] != U_RTL; j--) {
225 for (
int k = j + 1; k <=
i; k++) {
226 blob_indices->push_back(k);
234static void PrintScriptDirs(
const std::vector<StrongScriptDirection> &dirs) {
235 for (
auto dir : dirs) {
258 std::vector<int> *word_indices)
const {
259 std::vector<StrongScriptDirection> directions;
264 std::vector<StrongScriptDirection> *dirs_arg,
265 std::vector<int> *word_indices)
const {
266 std::vector<StrongScriptDirection> dirs;
267 std::vector<StrongScriptDirection> *directions;
268 directions = (dirs_arg !=
nullptr) ? dirs_arg : &dirs;
278 directions->push_back(ltr_it.WordDirection());
281 word_indices->clear();
286 const std::vector<StrongScriptDirection> &word_dirs,
287 std::vector<int> *reading_order) {
288 reading_order->clear();
289 if (word_dirs.empty()) {
295 int minor_direction, major_direction, major_step, start, end;
296 if (paragraph_is_ltr) {
298 end = word_dirs.size();
303 start = word_dirs.size() - 1;
312 int neutral_end = start;
313 while (neutral_end > 0 && word_dirs[neutral_end] ==
DIR_NEUTRAL) {
319 int left = neutral_end;
326 for (
unsigned i = left;
i < word_dirs.size();
i++) {
327 reading_order->push_back(
i);
337 for (
int i = start;
i != end;) {
338 if (word_dirs[
i] == minor_direction) {
340 while (j != end && word_dirs[j] != major_direction) {
346 while (j !=
i && word_dirs[j] != minor_direction) {
351 for (
int k = j; k !=
i; k -= major_step) {
352 reading_order->push_back(k);
354 reading_order->push_back(
i);
358 reading_order->push_back(
i);
367int ResultIterator::LTRWordIndex()
const {
368 int this_word_index = 0;
370 textline.RestartRow();
371 while (!textline.PositionedAtSameWord(
it_)) {
375 return this_word_index;
378void ResultIterator::MoveToLogicalStartOfWord() {
383 std::vector<int> blob_order;
384 CalculateBlobOrder(&blob_order);
385 if (blob_order.empty() || blob_order[0] == 0) {
391bool ResultIterator::IsAtFinalSymbolOfWord()
const {
395 std::vector<int> blob_order;
396 CalculateBlobOrder(&blob_order);
397 return blob_order.empty() || blob_order.back() ==
blob_index_;
400bool ResultIterator::IsAtFirstSymbolOfWord()
const {
404 std::vector<int> blob_order;
405 CalculateBlobOrder(&blob_order);
406 return blob_order.empty() || blob_order[0] ==
blob_index_;
409void ResultIterator::AppendSuffixMarks(std::string *text)
const {
413 bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
419 std::vector<int> textline_order;
421 int this_word_index = LTRWordIndex();
423 for (
const auto word_index : textline_order) {
424 if (word_index == this_word_index) {
429 if (
i == textline_order.size()) {
433 int last_non_word_mark = 0;
434 for (
i++;
i < textline_order.size() && textline_order[
i] < 0;
i++) {
435 last_non_word_mark = textline_order[
i];
438 *text += reading_direction_is_ltr ? kLRM : kRLM;
440 if (current_paragraph_is_ltr_) {
448void ResultIterator::MoveToLogicalStartOfTextline() {
449 std::vector<int> word_indices;
454 for (;
i < word_indices.size() && word_indices[
i] < 0;
i++) {
456 in_minor_direction_ =
true;
458 in_minor_direction_ =
false;
461 if (in_minor_direction_) {
462 at_beginning_of_minor_run_ =
true;
464 if (
i >= word_indices.size()) {
467 int first_word_index = word_indices[
i];
468 for (
int j = 0; j < first_word_index; j++) {
471 MoveToLogicalStartOfWord();
476 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
477 in_minor_direction_ =
false;
478 at_beginning_of_minor_run_ =
false;
479 MoveToLogicalStartOfTextline();
496 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
498 in_minor_direction_ =
false;
499 MoveToLogicalStartOfTextline();
502 std::vector<int> blob_order;
503 CalculateBlobOrder(&blob_order);
504 unsigned next_blob = 0;
505 while (next_blob < blob_order.size() &&
blob_index_ != blob_order[next_blob]) {
509 if (next_blob < blob_order.size()) {
512 at_beginning_of_minor_run_ =
false;
523 std::vector<int> word_indices;
524 int this_word_index = LTRWordIndex();
526 int final_real_index = word_indices.size() - 1;
527 while (final_real_index > 0 && word_indices[final_real_index] < 0) {
530 for (
int i = 0;
i < final_real_index;
i++) {
531 if (word_indices[
i] == this_word_index) {
533 for (; j < final_real_index && word_indices[j] < 0; j++) {
535 in_minor_direction_ =
true;
538 in_minor_direction_ =
false;
541 at_beginning_of_minor_run_ = (word_indices[j - 1] ==
kMinorRunStart);
544 tprintf(
"Next(RIL_WORD): %d -> %d\n", this_word_index, word_indices[j]);
547 for (
int k = 0; k < word_indices[j]; k++) {
550 MoveToLogicalStartOfWord();
555 tprintf(
"Next(RIL_WORD): %d -> EOL\n", this_word_index);
576 bool at_word_start = IsAtFirstSymbolOfWord();
578 return at_word_start;
583 line_start.MoveToLogicalStartOfTextline();
585 bool at_textline_start = at_word_start && *line_start.
it_ == *
it_;
587 return at_textline_start;
592 bool at_block_start =
595 return at_block_start;
599 at_block_start || (at_textline_start && line_start.
it_->
row()->
row->
para() !=
602 return at_para_start;
615 if (
Empty(element)) {
626 if (
next.Empty(element)) {
629 while (element > level) {
631 if (!
next.IsAtBeginningOf(element)) {
640 if (CurrentParagraphIsLtr()) {
659 pp.AppendUTF8ParagraphText(&text);
663 AppendUTF8ParagraphText(&text);
667 it.MoveToLogicalStartOfTextline();
668 it.IterateAndAppendUTF8TextlineText(&text);
671 AppendUTF8WordText(&text);
674 bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
675 if (at_beginning_of_minor_run_) {
676 text += reading_direction_is_ltr ? kLRM : kRLM;
679 if (IsAtFinalSymbolOfWord()) {
680 AppendSuffixMarks(&text);
684 int length = text.length() + 1;
685 char *result =
new char[length];
686 strncpy(result, text.c_str(), length);
689std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
707void ResultIterator::AppendUTF8WordText(std::string *text)
const {
712 bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
713 if (at_beginning_of_minor_run_) {
714 *text += reading_direction_is_ltr ? kLRM : kRLM;
717 std::vector<int> blob_order;
718 CalculateBlobOrder(&blob_order);
719 for (
int i : blob_order) {
722 AppendSuffixMarks(text);
725void ResultIterator::IterateAndAppendUTF8TextlineText(std::string *text) {
731 std::vector<int> textline_order;
732 std::vector<StrongScriptDirection> dirs;
734 tprintf(
"Strong Script dirs [%p/P=%s]: ",
735 static_cast<void *
>(
it_->
row()),
736 current_paragraph_is_ltr_ ?
"ltr" :
"rtl");
737 PrintScriptDirs(dirs);
738 tprintf(
"Logical textline order [%p/P=%s]: ",
739 static_cast<void *
>(
it_->
row()),
740 current_paragraph_is_ltr_ ?
"ltr" :
"rtl");
741 for (
int i : textline_order) {
747 int words_appended = 0;
749 int numSpaces = preserve_interword_spaces_ ?
it_->
word()->
word->
space() : (words_appended > 0);
750 for (
int i = 0;
i < numSpaces; ++
i) {
753 AppendUTF8WordText(text);
756 tprintf(
"Num spaces=%d, text=%s\n", numSpaces, text->c_str());
760 tprintf(
"%d words printed\n", words_appended);
769void ResultIterator::AppendUTF8ParagraphText(std::string *text)
const {
771 it.RestartParagraph();
772 it.MoveToLogicalStartOfTextline();
777 it.IterateAndAppendUTF8TextlineText(text);
778 }
while (it.it_->block() !=
nullptr && !it.IsAtBeginningOf(
RIL_PARA));
781bool ResultIterator::BidiDebug(
int min_level)
const {
783 auto *
p = ParamUtils::FindParam<IntParam>(
"bidi_debug",
GlobalParams()->int_params,
786 debug_level = (int32_t)(*
p);
788 return debug_level >= min_level;
void tprintf(const char *format,...)
tesseract::ParamsVectors * GlobalParams()
const char * paragraph_separator_
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
int BlanksBeforeWord() const
const char * line_separator_
virtual void RestartRow()
virtual bool Next(PageIteratorLevel level)
bool IsWithinFirstTextlineOfParagraph() const
bool Empty(PageIteratorLevel level) const
void BeginWord(int offset)
static void CalculateTextlineOrder(bool paragraph_is_ltr, const std::vector< StrongScriptDirection > &word_dirs, std::vector< int > *reading_order)
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
static const int kMinorRunEnd
bool ParagraphIsLtr() const
static const int kMinorRunStart
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool IsAtBeginningOf(PageIteratorLevel level) const override
virtual std::vector< std::vector< std::pair< const char *, float > > > * GetBestLSTMSymbolChoices() const
int BlanksBeforeWord() const
bool Next(PageIteratorLevel level) override
virtual std::vector< std::vector< std::vector< std::pair< const char *, float > > > > * GetRawLSTMTimesteps() const
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
ResultIterator(const LTRResultIterator &resit)
static const int kComplexWord
WERD_CHOICE * best_choice
const char * BestUTF8(unsigned blob_index, bool in_rtl_context) const
bool UnicharsInReadingOrder() const
std::vector< std::vector< std::pair< const char *, float > > > CTC_symbol_choices
UNICHARSET::Direction SymbolDirection(unsigned blob_index) const
std::vector< std::vector< std::vector< std::pair< const char *, float > > > > segmented_timesteps
BLOCK_RES * block() const
BLOCK_RES * prev_block() const
ROW_RES * prev_row() const
std::vector< BoolParam * > bool_params
std::vector< IntParam * > int_params
@ U_EUROPEAN_NUMBER_TERMINATOR
@ U_COMMON_NUMBER_SEPARATOR
@ U_EUROPEAN_NUMBER_SEPARATOR