tesseract v5.3.3.20231005
resultiterator.cpp
Go to the documentation of this file.
1
2// File: resultiterator.cpp
3// Description: Iterator for tesseract results that is capable of
4// iterating in proper reading order over Bi Directional
5// (e.g. mixed Hebrew and English) text.
6// Author: David Eger
7//
8// (C) Copyright 2011, Google Inc.
9// Licensed under the Apache License, Version 2.0 (the "License");
10// you may not use this file except in compliance with the License.
11// You may obtain a copy of the License at
12// http://www.apache.org/licenses/LICENSE-2.0
13// Unless required by applicable law or agreed to in writing, software
14// distributed under the License is distributed on an "AS IS" BASIS,
15// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16// See the License for the specific language governing permissions and
17// limitations under the License.
18//
20
22
23#include "pageres.h"
24#include "tesseractclass.h"
25#include "unicharset.h"
26
27#include <allheaders.h>
28
29#include <set>
30#include <vector>
31
// Unicode directional marks appended to UTF-8 output text.
// They are written as explicit UTF-8 byte sequences (rather than "\u200E" /
// "\u200F") so that the bytes stored in the binary do not depend on the
// compiler's execution character set.
static const char *const kLRM = "\xE2\x80\x8E"; // U+200E Left-to-Right Mark
static const char *const kRLM = "\xE2\x80\x8F"; // U+200F Right-to-Left Mark
34
35namespace tesseract {
36
38 in_minor_direction_ = false;
39 at_beginning_of_minor_run_ = false;
40 preserve_interword_spaces_ = false;
41
42 auto *p = ParamUtils::FindParam<BoolParam>(
43 "preserve_interword_spaces", GlobalParams()->bool_params, tesseract_->params()->bool_params);
44 if (p != nullptr) {
45 preserve_interword_spaces_ = (bool)(*p);
46 }
47
48 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
49 MoveToLogicalStartOfTextline();
50}
51
53 return new ResultIterator(resit);
54}
55
57 return current_paragraph_is_ltr_;
58}
59
60bool ResultIterator::CurrentParagraphIsLtr() const {
61 if (!it_->word()) {
62 return true; // doesn't matter.
63 }
64 LTRResultIterator it(*this);
65 it.RestartParagraph();
66 // Try to figure out the ltr-ness of the paragraph. The rules below
67 // make more sense in the context of a difficult paragraph example.
68 // Here we denote {ltr characters, RTL CHARACTERS}:
69 //
70 // "don't go in there!" DAIS EH
71 // EHT OTNI DEPMUJ FELSMIH NEHT DNA
72 // .GNIDLIUB GNINRUB
73 //
74 // On the first line, the left-most word is LTR and the rightmost word
75 // is RTL. Thus, we are better off taking the majority direction for
76 // the whole paragraph contents. So instead of "the leftmost word is LTR"
77 // indicating an LTR paragraph, we use a heuristic about what RTL paragraphs
78 // would not do: Typically an RTL paragraph would *not* start with an LTR
79 // word. So our heuristics are as follows:
80 //
81 // (1) If the first text line has an RTL word in the left-most position
82 // it is RTL.
83 // (2) If the first text line has an LTR word in the right-most position
84 // it is LTR.
85 // (3) If neither of the above is true, take the majority count for the
86 // paragraph -- if there are more rtl words, it is RTL. If there
87 // are more LTR words, it's LTR.
88 bool leftmost_rtl = it.WordDirection() == DIR_RIGHT_TO_LEFT;
89 bool rightmost_ltr = it.WordDirection() == DIR_LEFT_TO_RIGHT;
90 int num_ltr, num_rtl;
91 num_rtl = leftmost_rtl ? 1 : 0;
92 num_ltr = (it.WordDirection() == DIR_LEFT_TO_RIGHT) ? 1 : 0;
93 for (it.Next(RIL_WORD); !it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_TEXTLINE);
94 it.Next(RIL_WORD)) {
95 StrongScriptDirection dir = it.WordDirection();
96 rightmost_ltr = (dir == DIR_LEFT_TO_RIGHT);
97 num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
98 num_ltr += rightmost_ltr ? 1 : 0;
99 }
100 if (leftmost_rtl) {
101 return false;
102 }
103 if (rightmost_ltr) {
104 return true;
105 }
106 // First line is ambiguous. Take statistics on the whole paragraph.
107 if (!it.Empty(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA)) {
108 do {
109 StrongScriptDirection dir = it.WordDirection();
110 num_rtl += (dir == DIR_RIGHT_TO_LEFT) ? 1 : 0;
111 num_ltr += (dir == DIR_LEFT_TO_RIGHT) ? 1 : 0;
112 } while (it.Next(RIL_WORD) && !it.IsAtBeginningOf(RIL_PARA));
113 }
114 return num_ltr >= num_rtl;
115}
116
117const int ResultIterator::kMinorRunStart = -1;
118const int ResultIterator::kMinorRunEnd = -2;
119const int ResultIterator::kComplexWord = -3;
120
121void ResultIterator::CalculateBlobOrder(std::vector<int> *blob_indices) const {
122 bool context_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
123 blob_indices->clear();
124 if (Empty(RIL_WORD)) {
125 return;
126 }
127 if (context_is_ltr || it_->word()->UnicharsInReadingOrder()) {
128 // Easy! just return the blobs in order;
129 for (int i = 0; i < word_length_; i++) {
130 blob_indices->push_back(i);
131 }
132 return;
133 }
134
135 // The blobs are in left-to-right order, but the current reading context
136 // is right-to-left.
137 const int U_LTR = UNICHARSET::U_LEFT_TO_RIGHT;
138 const int U_RTL = UNICHARSET::U_RIGHT_TO_LEFT;
139 const int U_EURO_NUM = UNICHARSET::U_EUROPEAN_NUMBER;
140 const int U_EURO_NUM_SEP = UNICHARSET::U_EUROPEAN_NUMBER_SEPARATOR;
141 const int U_EURO_NUM_TERM = UNICHARSET::U_EUROPEAN_NUMBER_TERMINATOR;
142 const int U_COMMON_NUM_SEP = UNICHARSET::U_COMMON_NUMBER_SEPARATOR;
143 const int U_OTHER_NEUTRAL = UNICHARSET::U_OTHER_NEUTRAL;
144
145 // Step 1: Scan for and mark European Number sequences
146 // [:ET:]*[:EN:]+(([:ES:]|[:CS:])?[:EN:]+)*[:ET:]*
147 std::vector<int> letter_types;
148 letter_types.reserve(word_length_);
149 for (int i = 0; i < word_length_; i++) {
150 letter_types.push_back(it_->word()->SymbolDirection(i));
151 }
152 // Convert a single separator sandwiched between two ENs into an EN.
153 for (int i = 0; i + 2 < word_length_; i++) {
154 if (letter_types[i] == U_EURO_NUM && letter_types[i + 2] == U_EURO_NUM &&
155 (letter_types[i + 1] == U_EURO_NUM_SEP || letter_types[i + 1] == U_COMMON_NUM_SEP)) {
156 letter_types[i + 1] = U_EURO_NUM;
157 }
158 }
159 // Scan for sequences of European Number Terminators around ENs and convert
160 // them to ENs.
161 for (int i = 0; i < word_length_; i++) {
162 if (letter_types[i] == U_EURO_NUM_TERM) {
163 int j = i + 1;
164 while (j < word_length_ && letter_types[j] == U_EURO_NUM_TERM) {
165 j++;
166 }
167 if (j < word_length_ && letter_types[j] == U_EURO_NUM) {
168 // The sequence [i..j] should be converted to all European Numbers.
169 for (int k = i; k < j; k++) {
170 letter_types[k] = U_EURO_NUM;
171 }
172 }
173 j = i - 1;
174 while (j > -1 && letter_types[j] == U_EURO_NUM_TERM) {
175 j--;
176 }
177 if (j > -1 && letter_types[j] == U_EURO_NUM) {
178 // The sequence [j..i] should be converted to all European Numbers.
179 for (int k = j; k <= i; k++) {
180 letter_types[k] = U_EURO_NUM;
181 }
182 }
183 }
184 }
185 // Step 2: Convert all remaining types to either L or R.
186 // Sequences ([:L:]|[:EN:])+ (([:CS:]|[:ON:])+ ([:L:]|[:EN:])+)* -> L.
187 // All other are R.
188 for (int i = 0; i < word_length_;) {
189 int ti = letter_types[i];
190 if (ti == U_LTR || ti == U_EURO_NUM) {
191 // Left to right sequence; scan to the end of it.
192 int last_good = i;
193 for (int j = i + 1; j < word_length_; j++) {
194 int tj = letter_types[j];
195 if (tj == U_LTR || tj == U_EURO_NUM) {
196 last_good = j;
197 } else if (tj == U_COMMON_NUM_SEP || tj == U_OTHER_NEUTRAL) {
198 // do nothing.
199 } else {
200 break;
201 }
202 }
203 // [i..last_good] is the L sequence
204 for (int k = i; k <= last_good; k++) {
205 letter_types[k] = U_LTR;
206 }
207 i = last_good + 1;
208 } else {
209 letter_types[i] = U_RTL;
210 i++;
211 }
212 }
213
214 // At this point, letter_types is entirely U_LTR or U_RTL.
215 for (int i = word_length_ - 1; i >= 0;) {
216 if (letter_types[i] == U_RTL) {
217 blob_indices->push_back(i);
218 i--;
219 } else {
220 // left to right sequence. scan to the beginning.
221 int j = i - 1;
222 for (; j >= 0 && letter_types[j] != U_RTL; j--) {
223 } // pass
224 // Now (j, i] is LTR
225 for (int k = j + 1; k <= i; k++) {
226 blob_indices->push_back(k);
227 }
228 i = j;
229 }
230 }
231 ASSERT_HOST(blob_indices->size() == static_cast<size_t>(word_length_));
232}
233
234static void PrintScriptDirs(const std::vector<StrongScriptDirection> &dirs) {
235 for (auto dir : dirs) {
236 switch (dir) {
237 case DIR_NEUTRAL:
238 tprintf("N ");
239 break;
241 tprintf("L ");
242 break;
244 tprintf("R ");
245 break;
246 case DIR_MIX:
247 tprintf("Z ");
248 break;
249 default:
250 tprintf("? ");
251 break;
252 }
253 }
254 tprintf("\n");
255}
256
257void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
258 std::vector<int> *word_indices) const {
259 std::vector<StrongScriptDirection> directions;
260 CalculateTextlineOrder(paragraph_is_ltr, resit, &directions, word_indices);
261}
262
263void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr, const LTRResultIterator &resit,
264 std::vector<StrongScriptDirection> *dirs_arg,
265 std::vector<int> *word_indices) const {
266 std::vector<StrongScriptDirection> dirs;
267 std::vector<StrongScriptDirection> *directions;
268 directions = (dirs_arg != nullptr) ? dirs_arg : &dirs;
269 directions->clear();
270
271 // A LTRResultIterator goes strictly left-to-right word order.
272 LTRResultIterator ltr_it(resit);
273 ltr_it.RestartRow();
274 if (ltr_it.Empty(RIL_WORD)) {
275 return;
276 }
277 do {
278 directions->push_back(ltr_it.WordDirection());
279 } while (ltr_it.Next(RIL_WORD) && !ltr_it.IsAtBeginningOf(RIL_TEXTLINE));
280
281 word_indices->clear();
282 CalculateTextlineOrder(paragraph_is_ltr, *directions, word_indices);
283}
284
285void ResultIterator::CalculateTextlineOrder(bool paragraph_is_ltr,
286 const std::vector<StrongScriptDirection> &word_dirs,
287 std::vector<int> *reading_order) {
288 reading_order->clear();
289 if (word_dirs.empty()) {
290 return;
291 }
292
293 // Take all of the runs of minor direction words and insert them
294 // in reverse order.
295 int minor_direction, major_direction, major_step, start, end;
296 if (paragraph_is_ltr) {
297 start = 0;
298 end = word_dirs.size();
299 major_step = 1;
300 major_direction = DIR_LEFT_TO_RIGHT;
301 minor_direction = DIR_RIGHT_TO_LEFT;
302 } else {
303 start = word_dirs.size() - 1;
304 end = -1;
305 major_step = -1;
306 major_direction = DIR_RIGHT_TO_LEFT;
307 minor_direction = DIR_LEFT_TO_RIGHT;
308 // Special rule: if there are neutral words at the right most side
309 // of a line adjacent to a left-to-right word in the middle of the
310 // line, we interpret the end of the line as a single LTR sequence.
311 if (word_dirs[start] == DIR_NEUTRAL) {
312 int neutral_end = start;
313 while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
314 neutral_end--;
315 }
316 if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
317 // LTR followed by neutrals.
318 // Scan for the beginning of the minor left-to-right run.
319 int left = neutral_end;
320 for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
321 if (word_dirs[i] == DIR_LEFT_TO_RIGHT) {
322 left = i;
323 }
324 }
325 reading_order->push_back(kMinorRunStart);
326 for (unsigned i = left; i < word_dirs.size(); i++) {
327 reading_order->push_back(i);
328 if (word_dirs[i] == DIR_MIX) {
329 reading_order->push_back(kComplexWord);
330 }
331 }
332 reading_order->push_back(kMinorRunEnd);
333 start = left - 1;
334 }
335 }
336 }
337 for (int i = start; i != end;) {
338 if (word_dirs[i] == minor_direction) {
339 int j = i;
340 while (j != end && word_dirs[j] != major_direction) {
341 j += major_step;
342 }
343 if (j == end) {
344 j -= major_step;
345 }
346 while (j != i && word_dirs[j] != minor_direction) {
347 j -= major_step;
348 }
349 // [j..i] is a minor direction run.
350 reading_order->push_back(kMinorRunStart);
351 for (int k = j; k != i; k -= major_step) {
352 reading_order->push_back(k);
353 }
354 reading_order->push_back(i);
355 reading_order->push_back(kMinorRunEnd);
356 i = j + major_step;
357 } else {
358 reading_order->push_back(i);
359 if (word_dirs[i] == DIR_MIX) {
360 reading_order->push_back(kComplexWord);
361 }
362 i += major_step;
363 }
364 }
365}
366
367int ResultIterator::LTRWordIndex() const {
368 int this_word_index = 0;
369 LTRResultIterator textline(*this);
370 textline.RestartRow();
371 while (!textline.PositionedAtSameWord(it_)) {
372 this_word_index++;
373 textline.Next(RIL_WORD);
374 }
375 return this_word_index;
376}
377
378void ResultIterator::MoveToLogicalStartOfWord() {
379 if (word_length_ == 0) {
380 BeginWord(0);
381 return;
382 }
383 std::vector<int> blob_order;
384 CalculateBlobOrder(&blob_order);
385 if (blob_order.empty() || blob_order[0] == 0) {
386 return;
387 }
388 BeginWord(blob_order[0]);
389}
390
391bool ResultIterator::IsAtFinalSymbolOfWord() const {
392 if (!it_->word()) {
393 return true;
394 }
395 std::vector<int> blob_order;
396 CalculateBlobOrder(&blob_order);
397 return blob_order.empty() || blob_order.back() == blob_index_;
398}
399
400bool ResultIterator::IsAtFirstSymbolOfWord() const {
401 if (!it_->word()) {
402 return true;
403 }
404 std::vector<int> blob_order;
405 CalculateBlobOrder(&blob_order);
406 return blob_order.empty() || blob_order[0] == blob_index_;
407}
408
409void ResultIterator::AppendSuffixMarks(std::string *text) const {
410 if (!it_->word()) {
411 return;
412 }
413 bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
414 // scan forward to see what meta-information the word ordering algorithm
415 // left us.
416 // If this word is at the *end* of a minor run, insert the other
417 // direction's mark; else if this was a complex word, insert the
418 // current reading order's mark.
419 std::vector<int> textline_order;
420 CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &textline_order);
421 int this_word_index = LTRWordIndex();
422 size_t i = 0;
423 for (const auto word_index : textline_order) {
424 if (word_index == this_word_index) {
425 break;
426 }
427 i++;
428 }
429 if (i == textline_order.size()) {
430 return;
431 }
432
433 int last_non_word_mark = 0;
434 for (i++; i < textline_order.size() && textline_order[i] < 0; i++) {
435 last_non_word_mark = textline_order[i];
436 }
437 if (last_non_word_mark == kComplexWord) {
438 *text += reading_direction_is_ltr ? kLRM : kRLM;
439 } else if (last_non_word_mark == kMinorRunEnd) {
440 if (current_paragraph_is_ltr_) {
441 *text += kLRM;
442 } else {
443 *text += kRLM;
444 }
445 }
446}
447
448void ResultIterator::MoveToLogicalStartOfTextline() {
449 std::vector<int> word_indices;
450 RestartRow();
451 CalculateTextlineOrder(current_paragraph_is_ltr_, dynamic_cast<const LTRResultIterator &>(*this),
452 &word_indices);
453 unsigned i = 0;
454 for (; i < word_indices.size() && word_indices[i] < 0; i++) {
455 if (word_indices[i] == kMinorRunStart) {
456 in_minor_direction_ = true;
457 } else if (word_indices[i] == kMinorRunEnd) {
458 in_minor_direction_ = false;
459 }
460 }
461 if (in_minor_direction_) {
462 at_beginning_of_minor_run_ = true;
463 }
464 if (i >= word_indices.size()) {
465 return;
466 }
467 int first_word_index = word_indices[i];
468 for (int j = 0; j < first_word_index; j++) {
470 }
471 MoveToLogicalStartOfWord();
472}
473
476 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
477 in_minor_direction_ = false;
478 at_beginning_of_minor_run_ = false;
479 MoveToLogicalStartOfTextline();
480}
481
483 if (it_->block() == nullptr) {
484 return false; // already at end!
485 }
486 switch (level) {
487 case RIL_BLOCK: // explicit fall-through
488 case RIL_PARA: // explicit fall-through
489 case RIL_TEXTLINE:
490 if (!PageIterator::Next(level)) {
491 return false;
492 }
494 // if we've advanced to a new paragraph,
495 // recalculate current_paragraph_is_ltr_
496 current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
497 }
498 in_minor_direction_ = false;
499 MoveToLogicalStartOfTextline();
500 return it_->block() != nullptr;
501 case RIL_SYMBOL: {
502 std::vector<int> blob_order;
503 CalculateBlobOrder(&blob_order);
504 unsigned next_blob = 0;
505 while (next_blob < blob_order.size() && blob_index_ != blob_order[next_blob]) {
506 next_blob++;
507 }
508 next_blob++;
509 if (next_blob < blob_order.size()) {
510 // we're in the same word; simply advance one blob.
511 BeginWord(blob_order[next_blob]);
512 at_beginning_of_minor_run_ = false;
513 return true;
514 }
515 level = RIL_WORD; // we've fallen through to the next word.
516 }
517 // Fall through.
518 case RIL_WORD: // explicit fall-through.
519 {
520 if (it_->word() == nullptr) {
521 return Next(RIL_BLOCK);
522 }
523 std::vector<int> word_indices;
524 int this_word_index = LTRWordIndex();
525 CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &word_indices);
526 int final_real_index = word_indices.size() - 1;
527 while (final_real_index > 0 && word_indices[final_real_index] < 0) {
528 final_real_index--;
529 }
530 for (int i = 0; i < final_real_index; i++) {
531 if (word_indices[i] == this_word_index) {
532 int j = i + 1;
533 for (; j < final_real_index && word_indices[j] < 0; j++) {
534 if (word_indices[j] == kMinorRunStart) {
535 in_minor_direction_ = true;
536 }
537 if (word_indices[j] == kMinorRunEnd) {
538 in_minor_direction_ = false;
539 }
540 }
541 at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
542 // awesome, we move to word_indices[j]
543 if (BidiDebug(3)) {
544 tprintf("Next(RIL_WORD): %d -> %d\n", this_word_index, word_indices[j]);
545 }
547 for (int k = 0; k < word_indices[j]; k++) {
549 }
550 MoveToLogicalStartOfWord();
551 return true;
552 }
553 }
554 if (BidiDebug(3)) {
555 tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
556 }
557 // we're going off the end of the text line.
558 return Next(RIL_TEXTLINE);
559 }
560 }
561 ASSERT_HOST(false); // shouldn't happen.
562 return false;
563}
564
566 if (it_->block() == nullptr) {
567 return false; // Already at the end!
568 }
569 if (it_->word() == nullptr) {
570 return true; // In an image block.
571 }
572 if (level == RIL_SYMBOL) {
573 return true; // Always at beginning of a symbol.
574 }
575
576 bool at_word_start = IsAtFirstSymbolOfWord();
577 if (level == RIL_WORD) {
578 return at_word_start;
579 }
580
581 ResultIterator line_start(*this);
582 // move to the first word in the line...
583 line_start.MoveToLogicalStartOfTextline();
584
585 bool at_textline_start = at_word_start && *line_start.it_ == *it_;
586 if (level == RIL_TEXTLINE) {
587 return at_textline_start;
588 }
589
590 // now we move to the left-most word...
591 line_start.RestartRow();
592 bool at_block_start =
593 at_textline_start && line_start.it_->block() != line_start.it_->prev_block();
594 if (level == RIL_BLOCK) {
595 return at_block_start;
596 }
597
598 bool at_para_start =
599 at_block_start || (at_textline_start && line_start.it_->row()->row->para() !=
600 line_start.it_->prev_row()->row->para());
601 if (level == RIL_PARA) {
602 return at_para_start;
603 }
604
605 ASSERT_HOST(false); // shouldn't happen.
606 return false;
607}
608
615 if (Empty(element)) {
616 return true; // Already at the end!
617 }
618 // The result is true if we step forward by element and find we are
619 // at the end of the page or at beginning of *all* levels in:
620 // [level, element).
621 // When there is more than one level difference between element and level,
622 // we could for instance move forward one symbol and still be at the first
623 // word on a line, so we also have to be at the first symbol in a word.
624 ResultIterator next(*this);
625 next.Next(element);
626 if (next.Empty(element)) {
627 return true; // Reached the end of the page.
628 }
629 while (element > level) {
630 element = static_cast<PageIteratorLevel>(element - 1);
631 if (!next.IsAtBeginningOf(element)) {
632 return false;
633 }
634 }
635 return true;
636}
637
638// Returns the number of blanks before the current word.
640 if (CurrentParagraphIsLtr()) {
642 }
643 return IsAtBeginningOf(RIL_TEXTLINE) ? 0 : 1;
644}
645
651 if (it_->word() == nullptr) {
652 return nullptr; // Already at the end!
653 }
654 std::string text;
655 switch (level) {
656 case RIL_BLOCK: {
657 ResultIterator pp(*this);
658 do {
659 pp.AppendUTF8ParagraphText(&text);
660 } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
661 } break;
662 case RIL_PARA:
663 AppendUTF8ParagraphText(&text);
664 break;
665 case RIL_TEXTLINE: {
666 ResultIterator it(*this);
667 it.MoveToLogicalStartOfTextline();
668 it.IterateAndAppendUTF8TextlineText(&text);
669 } break;
670 case RIL_WORD:
671 AppendUTF8WordText(&text);
672 break;
673 case RIL_SYMBOL: {
674 bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
675 if (at_beginning_of_minor_run_) {
676 text += reading_direction_is_ltr ? kLRM : kRLM;
677 }
678 text = it_->word()->BestUTF8(blob_index_, false);
679 if (IsAtFinalSymbolOfWord()) {
680 AppendSuffixMarks(&text);
681 }
682 } break;
683 }
684 int length = text.length() + 1;
685 char *result = new char[length];
686 strncpy(result, text.c_str(), length);
687 return result;
688}
689std::vector<std::vector<std::vector<std::pair<const char *, float>>>>
691 if (it_->word() != nullptr) {
692 return &it_->word()->segmented_timesteps;
693 } else {
694 return nullptr;
695 }
696}
697
698std::vector<std::vector<std::pair<const char *, float>>> *ResultIterator::GetBestLSTMSymbolChoices()
699 const {
700 if (it_->word() != nullptr) {
701 return &it_->word()->CTC_symbol_choices;
702 } else {
703 return nullptr;
704 }
705}
706
707void ResultIterator::AppendUTF8WordText(std::string *text) const {
708 if (!it_->word()) {
709 return;
710 }
711 ASSERT_HOST(it_->word()->best_choice != nullptr);
712 bool reading_direction_is_ltr = current_paragraph_is_ltr_ ^ in_minor_direction_;
713 if (at_beginning_of_minor_run_) {
714 *text += reading_direction_is_ltr ? kLRM : kRLM;
715 }
716
717 std::vector<int> blob_order;
718 CalculateBlobOrder(&blob_order);
719 for (int i : blob_order) {
720 *text += it_->word()->BestUTF8(i, false);
721 }
722 AppendSuffixMarks(text);
723}
724
725void ResultIterator::IterateAndAppendUTF8TextlineText(std::string *text) {
726 if (Empty(RIL_WORD)) {
727 Next(RIL_WORD);
728 return;
729 }
730 if (BidiDebug(1)) {
731 std::vector<int> textline_order;
732 std::vector<StrongScriptDirection> dirs;
733 CalculateTextlineOrder(current_paragraph_is_ltr_, *this, &dirs, &textline_order);
734 tprintf("Strong Script dirs [%p/P=%s]: ",
735 static_cast<void *>(it_->row()),
736 current_paragraph_is_ltr_ ? "ltr" : "rtl");
737 PrintScriptDirs(dirs);
738 tprintf("Logical textline order [%p/P=%s]: ",
739 static_cast<void *>(it_->row()),
740 current_paragraph_is_ltr_ ? "ltr" : "rtl");
741 for (int i : textline_order) {
742 tprintf("%d ", i);
743 }
744 tprintf("\n");
745 }
746
747 int words_appended = 0;
748 do {
749 int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space() : (words_appended > 0);
750 for (int i = 0; i < numSpaces; ++i) {
751 *text += " ";
752 }
753 AppendUTF8WordText(text);
754 words_appended++;
755 if (BidiDebug(2)) {
756 tprintf("Num spaces=%d, text=%s\n", numSpaces, text->c_str());
757 }
759 if (BidiDebug(1)) {
760 tprintf("%d words printed\n", words_appended);
761 }
762 *text += line_separator_;
763 // If we just finished a paragraph, add an extra newline.
765 *text += paragraph_separator_;
766 }
767}
768
769void ResultIterator::AppendUTF8ParagraphText(std::string *text) const {
770 ResultIterator it(*this);
771 it.RestartParagraph();
772 it.MoveToLogicalStartOfTextline();
773 if (it.Empty(RIL_WORD)) {
774 return;
775 }
776 do {
777 it.IterateAndAppendUTF8TextlineText(text);
778 } while (it.it_->block() != nullptr && !it.IsAtBeginningOf(RIL_PARA));
779}
780
781bool ResultIterator::BidiDebug(int min_level) const {
782 int debug_level = 1;
783 auto *p = ParamUtils::FindParam<IntParam>("bidi_debug", GlobalParams()->int_params,
785 if (p != nullptr) {
786 debug_level = (int32_t)(*p);
787 }
788 return debug_level >= min_level;
789}
790
791} // namespace tesseract.
#define ASSERT_HOST(x)
Definition: errcode.h:54
const char * p
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
StrongScriptDirection
Definition: unichar.h:41
@ DIR_MIX
Definition: unichar.h:45
@ DIR_LEFT_TO_RIGHT
Definition: unichar.h:43
@ DIR_RIGHT_TO_LEFT
Definition: unichar.h:44
@ DIR_NEUTRAL
Definition: unichar.h:42
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:36
def next(obj)
Definition: ast.py:56
LTRResultIterator(PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)
virtual void RestartRow()
virtual bool Next(PageIteratorLevel level)
bool IsWithinFirstTextlineOfParagraph() const
bool Empty(PageIteratorLevel level) const
void BeginWord(int offset)
static void CalculateTextlineOrder(bool paragraph_is_ltr, const std::vector< StrongScriptDirection > &word_dirs, std::vector< int > *reading_order)
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
static const int kMinorRunEnd
static const int kMinorRunStart
virtual char * GetUTF8Text(PageIteratorLevel level) const
bool IsAtBeginningOf(PageIteratorLevel level) const override
virtual std::vector< std::vector< std::pair< const char *, float > > > * GetBestLSTMSymbolChoices() const
bool Next(PageIteratorLevel level) override
virtual std::vector< std::vector< std::vector< std::pair< const char *, float > > > > * GetRawLSTMTimesteps() const
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
ResultIterator(const LTRResultIterator &resit)
static const int kComplexWord
PARA * para() const
Definition: ocrrow.h:120
WERD_CHOICE * best_choice
Definition: pageres.h:239
const char * BestUTF8(unsigned blob_index, bool in_rtl_context) const
Definition: pageres.h:361
bool UnicharsInReadingOrder() const
Definition: pageres.h:435
std::vector< std::vector< std::pair< const char *, float > > > CTC_symbol_choices
Definition: pageres.h:224
UNICHARSET::Direction SymbolDirection(unsigned blob_index) const
Definition: pageres.h:387
std::vector< std::vector< std::vector< std::pair< const char *, float > > > > segmented_timesteps
Definition: pageres.h:222
BLOCK_RES * block() const
Definition: pageres.h:769
BLOCK_RES * prev_block() const
Definition: pageres.h:760
WERD_RES * word() const
Definition: pageres.h:763
ROW_RES * prev_row() const
Definition: pageres.h:757
ROW_RES * row() const
Definition: pageres.h:766
uint8_t space() const
Definition: werd.h:100
ParamsVectors * params()
Definition: ccutil.h:53
std::vector< BoolParam * > bool_params
Definition: params.h:48
std::vector< IntParam * > int_params
Definition: params.h:47