tesseract v5.3.3.20231005
paragraphs.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: paragraphs.cpp
3 * Description: Paragraph detection for tesseract.
4 * Author: David Eger
5 *
6 * (C) Copyright 2011, Google Inc.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#include "paragraphs.h"
20
21#include "helpers.h" // for UpdateRange, ClipToRange
22#include "host.h" // for NearlyEqual
23#include "mutableiterator.h" // for MutableIterator
24#include "ocrblock.h" // for BLOCK
25#include "ocrpara.h" // for ParagraphModel, PARA, PARA_IT, PARA...
26#include "ocrrow.h" // for ROW
27#include "pageres.h" // for PAGE_RES_IT, WERD_RES, ROW_RES, BLO...
28#include "paragraphs_internal.h" // for RowScratchRegisters, SetOfModels
29#include "pdblock.h" // for PDBLK
30#include "polyblk.h" // for POLY_BLOCK
31#include "ratngs.h" // for WERD_CHOICE
32#include "rect.h" // for TBOX
33#include "statistc.h" // for STATS
34#include "tprintf.h" // for tprintf
35#include "unicharset.h" // for UNICHARSET
36#include "werd.h" // for WERD, W_REP_CHAR
37
38#include <tesseract/pageiterator.h> // for PageIterator
39#include <tesseract/publictypes.h> // for JUSTIFICATION_LEFT, JUSTIFICATION_R...
40#include <tesseract/unichar.h> // for UNICHAR, UNICHAR_ID
41
42#include <algorithm> // for max
43#include <cctype> // for isspace
44#include <cmath> // for abs
45#include <cstdio> // for snprintf
46#include <cstdlib> // for abs
47#include <cstring> // for strchr, strlen
48#include <memory> // for unique_ptr
49
50static const char *const kRLE = "\u202A"; // Right-to-Left Embedding
51static const char *const kPDF = "\u202C"; // Pop Directional Formatting
52
53namespace tesseract {
54
55// Special "weak" ParagraphModels.
57 reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD111F));
59 reinterpret_cast<ParagraphModel *>(static_cast<uintptr_t>(0xDEAD888F));
60
61// Do the text and geometry of two rows support a paragraph break between them?
62static bool LikelyParagraphStart(const RowScratchRegisters &before,
65
66// Given the width of a typical space between words, what is the threshold
67// by which by which we think left and right alignments for paragraphs
68// can vary and still be aligned.
69static int Epsilon(int space_pix) {
70 return space_pix * 4 / 5;
71}
72
73static bool AcceptableRowArgs(int debug_level, int min_num_rows, const char *function_name,
74 const std::vector<RowScratchRegisters> *rows, int row_start,
75 int row_end) {
76 if (row_start < 0 || static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {
77 tprintf("Invalid arguments rows[%d, %d) while rows is of size %zu.\n", row_start, row_end,
78 rows->size());
79 return false;
80 }
81 if (row_end - row_start < min_num_rows) {
82 if (debug_level > 1) {
83 tprintf("# Too few rows[%d, %d) for %s.\n", row_start, row_end, function_name);
84 }
85 return false;
86 }
87 return true;
88}
89
90// =============================== Debug Code ================================
91
92// Given a row-major matrix of unicode text and a column separator, print
93// a formatted table. For ASCII, we get good column alignment.
94static void PrintTable(const std::vector<std::vector<std::string>> &rows, const char *colsep) {
95 std::vector<int> max_col_widths;
96 for (const auto &row : rows) {
97 auto num_columns = row.size();
98 for (size_t c = 0; c < num_columns; c++) {
99 int num_unicodes = 0;
100 for (char i : row[c]) {
101 if ((i & 0xC0) != 0x80) {
102 num_unicodes++;
103 }
104 }
105 if (c >= max_col_widths.size()) {
106 max_col_widths.push_back(num_unicodes);
107 } else {
108 if (num_unicodes > max_col_widths[c]) {
109 max_col_widths[c] = num_unicodes;
110 }
111 }
112 }
113 }
114
115 std::vector<std::string> col_width_patterns;
116 col_width_patterns.reserve(max_col_widths.size());
117 for (int max_col_width : max_col_widths) {
118 col_width_patterns.push_back(std::string("%-") + std::to_string(max_col_width) + "s");
119 }
120
121 for (const auto &row : rows) {
122 for (unsigned c = 0; c < row.size(); c++) {
123 if (c > 0) {
124 tprintf("%s", colsep);
125 }
126 tprintf(col_width_patterns[c].c_str(), row[c].c_str());
127 }
128 tprintf("\n");
129 }
130}
131
132static std::string RtlEmbed(const std::string &word, bool rtlify) {
133 if (rtlify) {
134 return std::string(kRLE) + word + std::string(kPDF);
135 }
136 return word;
137}
138
139// Print the current thoughts of the paragraph detector.
140static void PrintDetectorState(const ParagraphTheory &theory,
141 const std::vector<RowScratchRegisters> &rows) {
142 std::vector<std::vector<std::string>> output;
143 output.emplace_back();
144 output.back().push_back("#row");
145 output.back().push_back("space");
146 output.back().push_back("..");
147 output.back().push_back("lword[widthSEL]");
148 output.back().push_back("rword[widthSEL]");
150 output.back().push_back("text");
151
152 for (unsigned i = 0; i < rows.size(); i++) {
153 output.emplace_back();
154 std::vector<std::string> &row = output.back();
155 const RowInfo &ri = *rows[i].ri_;
156 row.push_back(std::to_string(i));
157 row.push_back(std::to_string(ri.average_interword_space));
158 row.emplace_back(ri.has_leaders ? ".." : " ");
159 row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) + "[" + std::to_string(ri.lword_box.width()) +
160 (ri.lword_likely_starts_idea ? "S" : "s") +
161 (ri.lword_likely_ends_idea ? "E" : "e") +
162 (ri.lword_indicates_list_item ? "L" : "l") + "]");
163 row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) + "[" + std::to_string(ri.rword_box.width()) +
164 (ri.rword_likely_starts_idea ? "S" : "s") +
165 (ri.rword_likely_ends_idea ? "E" : "e") +
166 (ri.rword_indicates_list_item ? "L" : "l") + "]");
167 rows[i].AppendDebugInfo(theory, row);
168 row.push_back(RtlEmbed(ri.text, !ri.ltr));
169 }
170 PrintTable(output, " ");
171
172 tprintf("Active Paragraph Models:\n");
173 unsigned m = 0;
174 for (const auto &model : theory.models()) {
175 tprintf(" %d: %s\n", ++m, model->ToString().c_str());
176 }
177}
178
179static void DebugDump(bool should_print, const char *phase, const ParagraphTheory &theory,
180 const std::vector<RowScratchRegisters> &rows) {
181 if (!should_print) {
182 return;
183 }
184 tprintf("# %s\n", phase);
185 PrintDetectorState(theory, rows);
186}
187
188// Print out the text for rows[row_start, row_end)
189static void PrintRowRange(const std::vector<RowScratchRegisters> &rows, int row_start,
190 int row_end) {
191 tprintf("======================================\n");
192 for (int row = row_start; row < row_end; row++) {
193 tprintf("%s\n", rows[row].ri_->text.c_str());
194 }
195 tprintf("======================================\n");
196}
197
198// ============= Brain Dead Language Model (ASCII Version) ===================
199
200static bool IsLatinLetter(int ch) {
201 return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
202}
203
204static bool IsDigitLike(int ch) {
205 return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
206}
207
208static bool IsOpeningPunct(int ch) {
209 return strchr("'\"({[", ch) != nullptr;
210}
211
212static bool IsTerminalPunct(int ch) {
213 return strchr(":'\".?!]})", ch) != nullptr;
214}
215
216// Return a pointer after consuming as much text as qualifies as roman numeral.
217static const char *SkipChars(const char *str, const char *toskip) {
218 while (*str != '\0' && strchr(toskip, *str)) {
219 str++;
220 }
221 return str;
222}
223
224static const char *SkipChars(const char *str, bool (*skip)(int)) {
225 while (*str != '\0' && skip(*str)) {
226 str++;
227 }
228 return str;
229}
230
231static const char *SkipOne(const char *str, const char *toskip) {
232 if (*str != '\0' && strchr(toskip, *str)) {
233 return str + 1;
234 }
235 return str;
236}
237
238// Return whether it is very likely that this is a numeral marker that could
239// start a list item. Some examples include:
240// A I iii. VI (2) 3.5. [C-4]
241static bool LikelyListNumeral(const std::string &word) {
242 const char *kRomans = "ivxlmdIVXLMD";
243 const char *kDigits = "012345789";
244 const char *kOpen = "[{(";
245 const char *kSep = ":;-.,";
246 const char *kClose = "]})";
247
248 int num_segments = 0;
249 const char *pos = word.c_str();
250 while (*pos != '\0' && num_segments < 3) {
251 // skip up to two open parens.
252 const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
253 const char *numeral_end = SkipChars(numeral_start, kRomans);
254 if (numeral_end != numeral_start) {
255 // Got Roman Numeral. Great.
256 } else {
257 numeral_end = SkipChars(numeral_start, kDigits);
258 if (numeral_end == numeral_start) {
259 // If there's a single latin letter, we can use that.
260 numeral_end = SkipChars(numeral_start, IsLatinLetter);
261 if (numeral_end - numeral_start != 1) {
262 break;
263 }
264 }
265 }
266 // We got some sort of numeral.
267 num_segments++;
268 // Skip any trailing parens or punctuation.
269 pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
270 if (pos == numeral_end) {
271 break;
272 }
273 }
274 return *pos == '\0';
275}
276
277static bool LikelyListMark(const std::string &word) {
278 const char *kListMarks = "0Oo*.,+.";
279 return word.size() == 1 && strchr(kListMarks, word[0]) != nullptr;
280}
281
282bool AsciiLikelyListItem(const std::string &word) {
283 return LikelyListMark(word) || LikelyListNumeral(word);
284}
285
286// ========== Brain Dead Language Model (Tesseract Version) ================
287
288// Return the first Unicode Codepoint from werd[pos].
289static int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, unsigned pos) {
290 if (!u || !werd || pos > werd->length()) {
291 return 0;
292 }
293 return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
294}
295
296// A useful helper class for finding the first j >= i so that word[j]
297// does not have given character type.
299public:
300 UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
301 : u_(unicharset), word_(word), wordlen_(word->length()) {
302 }
303
304 // Given an input position, return the first position >= pos not punc.
305 unsigned SkipPunc(unsigned pos);
306 // Given an input position, return the first position >= pos not digit.
307 unsigned SkipDigits(unsigned pos);
308 // Given an input position, return the first position >= pos not roman.
309 unsigned SkipRomans(unsigned pos);
310 // Given an input position, return the first position >= pos not alpha.
311 unsigned SkipAlpha(unsigned pos);
312
313private:
314 const UNICHARSET *u_;
315 const WERD_CHOICE *word_;
316 unsigned wordlen_;
317};
318
319unsigned UnicodeSpanSkipper::SkipPunc(unsigned pos) {
320 while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) {
321 pos++;
322 }
323 return pos;
324}
325
326unsigned UnicodeSpanSkipper::SkipDigits(unsigned pos) {
327 while (pos < wordlen_ &&
328 (u_->get_isdigit(word_->unichar_id(pos)) || IsDigitLike(UnicodeFor(u_, word_, pos)))) {
329 pos++;
330 }
331 return pos;
332}
333
334unsigned UnicodeSpanSkipper::SkipRomans(unsigned pos) {
335 const char *kRomans = "ivxlmdIVXLMD";
336 while (pos < wordlen_) {
337 int ch = UnicodeFor(u_, word_, pos);
338 if (ch >= 0xF0 || strchr(kRomans, ch) == nullptr) {
339 break;
340 }
341 pos++;
342 }
343 return pos;
344}
345
346unsigned UnicodeSpanSkipper::SkipAlpha(unsigned pos) {
347 while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) {
348 pos++;
349 }
350 return pos;
351}
352
353static bool LikelyListMarkUnicode(int ch) {
354 if (ch < 0x80) {
355 std::string single_ch;
356 single_ch += ch;
357 return LikelyListMark(single_ch);
358 }
359 switch (ch) {
360 // TODO(eger) expand this list of unicodes as needed.
361 case 0x00B0: // degree sign
362 case 0x2022: // bullet
363 case 0x25E6: // white bullet
364 case 0x00B7: // middle dot
365 case 0x25A1: // white square
366 case 0x25A0: // black square
367 case 0x25AA: // black small square
368 case 0x2B1D: // black very small square
369 case 0x25BA: // black right-pointing pointer
370 case 0x25CF: // black circle
371 case 0x25CB: // white circle
372 return true;
373 default:
374 break; // fall through
375 }
376 return false;
377}
378
379// Return whether it is very likely that this is a numeral marker that could
380// start a list item. Some examples include:
381// A I iii. VI (2) 3.5. [C-4]
382static bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
383 if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) {
384 return true;
385 }
386
387 UnicodeSpanSkipper m(u, werd);
388 int num_segments = 0;
389 unsigned pos = 0;
390 while (pos < werd->length() && num_segments < 3) {
391 auto numeral_start = m.SkipPunc(pos);
392 if (numeral_start > pos + 1) {
393 break;
394 }
395 auto numeral_end = m.SkipRomans(numeral_start);
396 if (numeral_end == numeral_start) {
397 numeral_end = m.SkipDigits(numeral_start);
398 if (numeral_end == numeral_start) {
399 // If there's a single latin letter, we can use that.
400 numeral_end = m.SkipAlpha(numeral_start);
401 if (numeral_end - numeral_start != 1) {
402 break;
403 }
404 }
405 }
406 // We got some sort of numeral.
407 num_segments++;
408 // Skip any trailing punctuation.
409 pos = m.SkipPunc(numeral_end);
410 if (pos == numeral_end) {
411 break;
412 }
413 }
414 return pos == werd->length();
415}
416
417template<class T>
418void push_back_new(std::vector<T> &vector, const T &data) {
419 if (std::find(vector.begin(), vector.end(), data) == vector.end()) {
420 vector.push_back(data);
421 }
422}
423
424// ========= Brain Dead Language Model (combined entry points) ================
425
426// Given the leftmost word of a line either as a Tesseract unicharset + werd
427// or a utf8 string, set the following attributes for it:
428// is_list - this word might be a list number or bullet.
429// starts_idea - this word is likely to start a sentence.
430// ends_idea - this word is likely to end a sentence.
431void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
432 bool *is_list, bool *starts_idea, bool *ends_idea) {
433 *is_list = false;
434 *starts_idea = false;
435 *ends_idea = false;
436 if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty
437 *ends_idea = true;
438 return;
439 }
440
441 if (unicharset && werd) { // We have a proper werd and unicharset so use it.
442 if (UniLikelyListItem(unicharset, werd)) {
443 *is_list = true;
444 *starts_idea = true;
445 *ends_idea = true;
446 }
447 if (unicharset->get_isupper(werd->unichar_id(0))) {
448 *starts_idea = true;
449 }
450 if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
451 *starts_idea = true;
452 *ends_idea = true;
453 }
454 } else { // Assume utf8 is mostly ASCII
455 if (AsciiLikelyListItem(utf8)) {
456 *is_list = true;
457 *starts_idea = true;
458 }
459 int start_letter = utf8[0];
460 if (IsOpeningPunct(start_letter)) {
461 *starts_idea = true;
462 }
463 if (IsTerminalPunct(start_letter)) {
464 *ends_idea = true;
465 }
466 if (start_letter >= 'A' && start_letter <= 'Z') {
467 *starts_idea = true;
468 }
469 }
470}
471
472// Given the rightmost word of a line either as a Tesseract unicharset + werd
473// or a utf8 string, set the following attributes for it:
474// is_list - this word might be a list number or bullet.
475// starts_idea - this word is likely to start a sentence.
476// ends_idea - this word is likely to end a sentence.
477void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8,
478 bool *is_list, bool *starts_idea, bool *ends_idea) {
479 *is_list = false;
480 *starts_idea = false;
481 *ends_idea = false;
482 if (utf8.empty() || (werd != nullptr && werd->empty())) { // Empty
483 *ends_idea = true;
484 return;
485 }
486
487 if (unicharset && werd) { // We have a proper werd and unicharset so use it.
488 if (UniLikelyListItem(unicharset, werd)) {
489 *is_list = true;
490 *starts_idea = true;
491 }
492 UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
493 if (unicharset->get_ispunctuation(last_letter)) {
494 *ends_idea = true;
495 }
496 } else { // Assume utf8 is mostly ASCII
497 if (AsciiLikelyListItem(utf8)) {
498 *is_list = true;
499 *starts_idea = true;
500 }
501 int last_letter = utf8[utf8.size() - 1];
502 if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
503 *ends_idea = true;
504 }
505 }
506}
507
508// =============== Implementation of RowScratchRegisters =====================
509/* static */
510void RowScratchRegisters::AppendDebugHeaderFields(std::vector<std::string> &header) {
511 header.emplace_back("[lmarg,lind;rind,rmarg]");
512 header.emplace_back("model");
513}
514
516 std::vector<std::string> &dbg) const {
517 char s[60];
518 // The largest (positive and negative) numbers are reported for lindent & rindent.
519 // While the column header has widths 5,4,4,5, it is therefore opportune to slightly
520 // offset the widths in the format string here to allow ample space for lindent & rindent
521 // while keeping the final table output nicely readable: 4,5,5,4.
522 snprintf(s, sizeof(s), "[%4d,%5d;%5d,%4d]", lmargin_, lindent_, rindent_, rmargin_);
523 dbg.emplace_back(s);
524 std::string model_string;
525 model_string += static_cast<char>(GetLineType());
526 model_string += ":";
527
528 int model_numbers = 0;
529 for (const auto &hypothese : hypotheses_) {
530 if (hypothese.model == nullptr) {
531 continue;
532 }
533 if (model_numbers > 0) {
534 model_string += ",";
535 }
536 if (StrongModel(hypothese.model)) {
537 model_string += std::to_string(1 + theory.IndexOf(hypothese.model));
538 } else if (hypothese.model == kCrownLeft) {
539 model_string += "CrL";
540 } else if (hypothese.model == kCrownRight) {
541 model_string += "CrR";
542 }
543 model_numbers++;
544 }
545 if (model_numbers == 0) {
546 model_string += "0";
547 }
548
549 dbg.push_back(model_string);
550}
551
553 ri_ = &row;
554 lmargin_ = 0;
556 rmargin_ = 0;
558}
559
561 if (hypotheses_.empty()) {
562 return LT_UNKNOWN;
563 }
564 bool has_start = false;
565 bool has_body = false;
566 for (const auto &hypothese : hypotheses_) {
567 switch (hypothese.ty) {
568 case LT_START:
569 has_start = true;
570 break;
571 case LT_BODY:
572 has_body = true;
573 break;
574 default:
575 tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty);
576 break;
577 }
578 }
579 if (has_start && has_body) {
580 return LT_MULTIPLE;
581 }
582 return has_start ? LT_START : LT_BODY;
583}
584
586 if (hypotheses_.empty()) {
587 return LT_UNKNOWN;
588 }
589 bool has_start = false;
590 bool has_body = false;
591 for (const auto &hypothese : hypotheses_) {
592 if (hypothese.model != model) {
593 continue;
594 }
595 switch (hypothese.ty) {
596 case LT_START:
597 has_start = true;
598 break;
599 case LT_BODY:
600 has_body = true;
601 break;
602 default:
603 tprintf("Encountered bad value in hypothesis list: %c\n", hypothese.ty);
604 break;
605 }
606 }
607 if (has_start && has_body) {
608 return LT_MULTIPLE;
609 }
610 return has_start ? LT_START : LT_BODY;
611}
612
614 LineType current_lt = GetLineType();
615 if (current_lt != LT_UNKNOWN && current_lt != LT_START) {
616 tprintf("Trying to set a line to be START when it's already BODY.\n");
617 }
618 if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
619 push_back_new(hypotheses_, LineHypothesis(LT_START, nullptr));
620 }
621}
622
624 LineType current_lt = GetLineType();
625 if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {
626 tprintf("Trying to set a line to be BODY when it's already START.\n");
627 }
628 if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
629 push_back_new(hypotheses_, LineHypothesis(LT_BODY, nullptr));
630 }
631}
632
634 push_back_new(hypotheses_, LineHypothesis(LT_START, model));
635 auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_START, nullptr));
636 if (found != hypotheses_.end()) {
637 hypotheses_.erase(found);
638 }
639}
640
642 push_back_new(hypotheses_, LineHypothesis(LT_BODY, model));
643 auto found = std::find(hypotheses_.begin(), hypotheses_.end(), LineHypothesis(LT_BODY, nullptr));
644 if (found != hypotheses_.end()) {
645 hypotheses_.erase(found);
646 }
647}
648
650 for (const auto &hypothese : hypotheses_) {
651 if (hypothese.ty == LT_START && StrongModel(hypothese.model)) {
652 push_back_new(*models, hypothese.model);
653 }
654 }
655}
656
658 for (const auto &hypothese : hypotheses_) {
659 if (StrongModel(hypothese.model)) {
660 push_back_new(*models, hypothese.model);
661 }
662 }
663}
664
666 for (const auto &hypothese : hypotheses_) {
667 if (hypothese.model != nullptr) {
668 push_back_new(*models, hypothese.model);
669 }
670 }
671}
672
674 if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START) {
675 return nullptr;
676 }
677 return hypotheses_[0].model;
678}
679
681 if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY) {
682 return nullptr;
683 }
684 return hypotheses_[0].model;
685}
686
687// Discard any hypotheses whose model is not in the given list.
689 if (models.empty()) {
690 return;
691 }
692 for (int h = hypotheses_.size() - 1; h >= 0; h--) {
693 if (!contains(models, hypotheses_[h].model)) {
694 hypotheses_.erase(hypotheses_.begin() + h);
695 }
696 }
697}
698
699// ============ Geometry based Paragraph Detection Algorithm =================
700
701struct Cluster {
702 Cluster() : center(0), count(0) {}
703 Cluster(int cen, int num) : center(cen), count(num) {}
704
705 int center; // The center of the cluster.
706 int count; // The number of entries within the cluster.
707};
708
710public:
711 explicit SimpleClusterer(int max_cluster_width) : max_cluster_width_(max_cluster_width) {}
712 void Add(int value) {
713 values_.push_back(value);
714 }
715 size_t size() const {
716 return values_.size();
717 }
718 void GetClusters(std::vector<Cluster> *clusters);
719
720private:
721 int max_cluster_width_;
722 std::vector<int> values_;
723};
724
725// Return the index of the cluster closest to value.
726static int ClosestCluster(const std::vector<Cluster> &clusters, int value) {
727 unsigned best_index = 0;
728 for (unsigned i = 0; i < clusters.size(); i++) {
729 if (abs(value - clusters[i].center) < abs(value - clusters[best_index].center)) {
730 best_index = i;
731 }
732 }
733 return best_index;
734}
735
736void SimpleClusterer::GetClusters(std::vector<Cluster> *clusters) {
737 clusters->clear();
738 std::sort(values_.begin(), values_.end());
739 for (unsigned i = 0; i < values_.size();) {
740 int orig_i = i;
741 int lo = values_[i];
742 int hi = lo;
743 while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
744 hi = values_[i];
745 }
746 clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
747 }
748}
749
750// Calculate left- and right-indent tab stop values seen in
751// rows[row_start, row_end) given a tolerance of tolerance.
752static void CalculateTabStops(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
753 int tolerance, std::vector<Cluster> *left_tabs,
754 std::vector<Cluster> *right_tabs) {
755 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) {
756 return;
757 }
758 // First pass: toss all left and right indents into clusterers.
759 SimpleClusterer initial_lefts(tolerance);
760 SimpleClusterer initial_rights(tolerance);
761 std::vector<Cluster> initial_left_tabs;
762 std::vector<Cluster> initial_right_tabs;
763 for (int i = row_start; i < row_end; i++) {
764 initial_lefts.Add((*rows)[i].lindent_);
765 initial_rights.Add((*rows)[i].rindent_);
766 }
767 initial_lefts.GetClusters(&initial_left_tabs);
768 initial_rights.GetClusters(&initial_right_tabs);
769
770 // Second pass: cluster only lines that are not "stray"
771 // An example of a stray line is a page number -- a line whose start
772 // and end tab-stops are far outside the typical start and end tab-stops
773 // for the block.
774 // Put another way, we only cluster data from lines whose start or end
775 // tab stop is frequent.
776 SimpleClusterer lefts(tolerance);
777 SimpleClusterer rights(tolerance);
778
779 // Outlier elimination. We might want to switch this to test outlier-ness
780 // based on how strange a position an outlier is in instead of or in addition
781 // to how rare it is. These outliers get re-added if we end up having too
782 // few tab stops, to work with, however.
783 int infrequent_enough_to_ignore = 0;
784 if (row_end - row_start >= 8) {
785 infrequent_enough_to_ignore = 1;
786 }
787 if (row_end - row_start >= 20) {
788 infrequent_enough_to_ignore = 2;
789 }
790
791 for (int i = row_start; i < row_end; i++) {
792 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
793 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
794 if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
795 initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
796 lefts.Add((*rows)[i].lindent_);
797 rights.Add((*rows)[i].rindent_);
798 }
799 }
800 lefts.GetClusters(left_tabs);
801 rights.GetClusters(right_tabs);
802
803 if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
804 (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
805 // One side is really ragged, and the other only has one tab stop,
806 // so those "insignificant outliers" are probably important, actually.
807 // This often happens on a page of an index. Add back in the ones
808 // we omitted in the first pass.
809 for (int i = row_start; i < row_end; i++) {
810 int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
811 int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
812 if (!(initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
813 initial_right_tabs[ridx].count > infrequent_enough_to_ignore)) {
814 lefts.Add((*rows)[i].lindent_);
815 rights.Add((*rows)[i].rindent_);
816 }
817 }
818 }
819 lefts.GetClusters(left_tabs);
820 rights.GetClusters(right_tabs);
821
822 // If one side is almost a two-indent aligned side, and the other clearly
823 // isn't, try to prune out the least frequent tab stop from that side.
824 if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
825 int to_prune = -1;
826 for (int i = left_tabs->size() - 1; i >= 0; i--) {
827 if (to_prune < 0 || (*left_tabs)[i].count < (*left_tabs)[to_prune].count) {
828 to_prune = i;
829 }
830 }
831 if (to_prune >= 0 && (*left_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
832 left_tabs->erase(left_tabs->begin() + to_prune);
833 }
834 }
835 if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
836 int to_prune = -1;
837 for (int i = right_tabs->size() - 1; i >= 0; i--) {
838 if (to_prune < 0 || (*right_tabs)[i].count < (*right_tabs)[to_prune].count) {
839 to_prune = i;
840 }
841 }
842 if (to_prune >= 0 && (*right_tabs)[to_prune].count <= infrequent_enough_to_ignore) {
843 right_tabs->erase(right_tabs->begin() + to_prune);
844 }
845 }
846}
847
848// Given a paragraph model mark rows[row_start, row_end) as said model
849// start or body lines.
850//
851// Case 1: model->first_indent_ != model->body_indent_
852// Differentiating the paragraph start lines from the paragraph body lines in
853// this case is easy, we just see how far each line is indented.
854//
855// Case 2: model->first_indent_ == model->body_indent_
856// Here, we find end-of-paragraph lines by looking for "short lines."
857// What constitutes a "short line" changes depending on whether the text
858// ragged-right[left] or fully justified (aligned left and right).
859//
860// Case 2a: Ragged Right (or Left) text. (eop_threshold == 0)
861// We have a new paragraph it the first word would have at the end
862// of the previous line.
863//
864// Case 2b: Fully Justified. (eop_threshold > 0)
865// We mark a line as short (end of paragraph) if the offside indent
866// is greater than eop_threshold.
867static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows, int row_start, int row_end,
868 const ParagraphModel *model, bool ltr, int eop_threshold) {
869 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
870 return;
871 }
872 for (int row = row_start; row < row_end; row++) {
873 bool valid_first = ValidFirstLine(rows, row, model);
874 bool valid_body = ValidBodyLine(rows, row, model);
875 if (valid_first && !valid_body) {
876 (*rows)[row].AddStartLine(model);
877 } else if (valid_body && !valid_first) {
878 (*rows)[row].AddBodyLine(model);
879 } else if (valid_body && valid_first) {
880 bool after_eop = (row == row_start);
881 if (row > row_start) {
882 if (eop_threshold > 0) {
883 if (model->justification() == JUSTIFICATION_LEFT) {
884 after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
885 } else {
886 after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
887 }
888 } else {
889 after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row], model->justification());
890 }
891 }
892 if (after_eop) {
893 (*rows)[row].AddStartLine(model);
894 } else {
895 (*rows)[row].AddBodyLine(model);
896 }
897 } else {
898 // Do nothing. Stray row.
899 }
900 }
901}
902
903// GeometricClassifierState holds all of the information we'll use while
904// trying to determine a paragraph model for the text lines in a block of
905// text:
906// + the rows under consideration [row_start, row_end)
907// + the common left- and right-indent tab stops
908// + does the block start out left-to-right or right-to-left
909// Further, this struct holds the data we amass for the (single) ParagraphModel
910// we'll assign to the text lines (assuming we get that far).
912 GeometricClassifierState(int dbg_level, std::vector<RowScratchRegisters> *r, int r_start,
913 int r_end)
914 : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end) {
915 tolerance = InterwordSpace(*r, r_start, r_end);
916 CalculateTabStops(r, r_start, r_end, tolerance, &left_tabs, &right_tabs);
917 if (debug_level >= 3) {
918 tprintf(
919 "Geometry: TabStop cluster tolerance = %d; "
920 "%zu left tabs; %zu right tabs\n",
921 tolerance, left_tabs.size(), right_tabs.size());
922 }
923 ltr = (*r)[r_start].ri_->ltr;
924 }
925
928 margin = (*rows)[row_start].lmargin_;
929 }
930
933 margin = (*rows)[row_start].rmargin_;
934 }
935
936 // Align tabs are the tab stops the text is aligned to.
937 const std::vector<Cluster> &AlignTabs() const {
939 return right_tabs;
940 }
941 return left_tabs;
942 }
943
944 // Offside tabs are the tab stops opposite the tabs used to align the text.
945 //
946 // Note that for a left-to-right text which is aligned to the right such as
947 // this function comment, the offside tabs are the horizontal tab stops
948 // marking the beginning of ("Note", "this" and "marking").
949 const std::vector<Cluster> &OffsideTabs() const {
951 return left_tabs;
952 }
953 return right_tabs;
954 }
955
956 // Return whether the i'th row extends from the leftmost left tab stop
957 // to the right most right tab stop.
958 bool IsFullRow(int i) const {
959 return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
960 ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
961 }
962
963 int AlignsideTabIndex(int row_idx) const {
964 return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
965 }
966
967 // Given what we know about the paragraph justification (just), would the
968 // first word of row_b have fit at the end of row_a?
969 bool FirstWordWouldHaveFit(int row_a, int row_b) {
971 }
972
973 void PrintRows() const {
974 PrintRowRange(*rows, row_start, row_end);
975 }
976
977 void Fail(int min_debug_level, const char *why) const {
978 if (debug_level < min_debug_level) {
979 return;
980 }
981 tprintf("# %s\n", why);
982 PrintRows();
983 }
984
987 }
988
989 // We print out messages with a debug level at least as great as debug_level.
990 int debug_level = 0;
991
992 // The Geometric Classifier was asked to find a single paragraph model
993 // to fit the text rows (*rows)[row_start, row_end)
994 std::vector<RowScratchRegisters> *rows;
995 int row_start = 0;
996 int row_end = 0;
997
998 // The amount by which we expect the text edge can vary and still be aligned.
999 int tolerance = 0;
1000
1001 // Is the script in this text block left-to-right?
1002 // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve
1003 bool ltr = false;
1004
1005 // These left and right tab stops were determined to be the common tab
1006 // stops for the given text.
1007 std::vector<Cluster> left_tabs;
1008 std::vector<Cluster> right_tabs;
1009
1010 // These are parameters we must determine to create a ParagraphModel.
1012 int margin = 0;
1015
1016 // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel()
1018};
1019
1020// Given a section of text where strong textual clues did not help identifying
1021// paragraph breaks, and for which the left and right indents have exactly
1022// three tab stops between them, attempt to find the paragraph breaks based
1023// solely on the outline of the text and whether the script is left-to-right.
1024//
1025// Algorithm Detail:
1026// The selected rows are in the form of a rectangle except
1027// for some number of "short lines" of the same length:
1028//
1029// (A1) xxxxxxxxxxxxx (B1) xxxxxxxxxxxx
1030// xxxxxxxxxxx xxxxxxxxxx # A "short" line.
1031// xxxxxxxxxxxxx xxxxxxxxxxxx
1032// xxxxxxxxxxxxx xxxxxxxxxxxx
1033//
1034// We have a slightly different situation if the only short
1035// line is at the end of the excerpt.
1036//
1037// (A2) xxxxxxxxxxxxx (B2) xxxxxxxxxxxx
1038// xxxxxxxxxxxxx xxxxxxxxxxxx
1039// xxxxxxxxxxxxx xxxxxxxxxxxx
1040// xxxxxxxxxxx xxxxxxxxxx # A "short" line.
1041//
1042// We'll interpret these as follows based on the reasoning in the comment for
1043// GeometricClassify():
1044// [script direction: first indent, body indent]
1045// (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0
1046// (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0
1047static void GeometricClassifyThreeTabStopTextBlock(int debug_level, GeometricClassifierState &s,
1048 ParagraphTheory *theory) {
1049 int num_rows = s.row_end - s.row_start;
1050 int num_full_rows = 0;
1051 int last_row_full = 0;
1052 for (int i = s.row_start; i < s.row_end; i++) {
1053 if (s.IsFullRow(i)) {
1054 num_full_rows++;
1055 if (i == s.row_end - 1) {
1056 last_row_full++;
1057 }
1058 }
1059 }
1060
1061 if (num_full_rows < 0.7 * num_rows) {
1062 s.Fail(1, "Not enough full lines to know which lines start paras.");
1063 return;
1064 }
1065
1066 // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
1067 s.eop_threshold = 0;
1068
1069 if (s.ltr) {
1071 } else {
1073 }
1074
1075 if (debug_level > 0) {
1076 tprintf(
1077 "# Not enough variety for clear outline classification. "
1078 "Guessing these are %s aligned based on script.\n",
1079 s.ltr ? "left" : "right");
1080 s.PrintRows();
1081 }
1082
1083 if (s.AlignTabs().size() == 2) { // case A1 or A2
1084 s.first_indent = s.AlignTabs()[1].center;
1085 s.body_indent = s.AlignTabs()[0].center;
1086 } else { // case B1 or B2
1087 if (num_rows - 1 == num_full_rows - last_row_full) {
1088 // case B2
1089 const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
1090 (*s.rows)[s.row_start].AddStartLine(model);
1091 for (int i = s.row_start + 1; i < s.row_end; i++) {
1092 (*s.rows)[i].AddBodyLine(model);
1093 }
1094 return;
1095 } else {
1096 // case B1
1097 s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1098 s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1099 }
1100 }
1101 const ParagraphModel *model = theory->AddModel(s.Model());
1102 MarkRowsWithModel(s.rows, s.row_start, s.row_end, model, s.ltr, s.eop_threshold);
1103 return;
1104}
1105
1106// This function is called if strong textual clues were not available, but
1107// the caller hopes that the paragraph breaks will be super obvious just
1108// by the outline of the text.
1109//
1110// The particularly difficult case is figuring out what's going on if you
1111// don't have enough short paragraph end lines to tell us what's going on.
1112//
1113// For instance, let's say you have the following outline:
1114//
1115// (A1) xxxxxxxxxxxxxxxxxxxxxx
1116// xxxxxxxxxxxxxxxxxxxx
1117// xxxxxxxxxxxxxxxxxxxxxx
1118// xxxxxxxxxxxxxxxxxxxxxx
1119//
1120// Even if we know that the text is left-to-right and so will probably be
1121// left-aligned, both of the following are possible texts:
1122//
1123// (A1a) 1. Here our list item
1124// with two full lines.
1125// 2. Here a second item.
1126// 3. Here our third one.
1127//
1128// (A1b) so ends paragraph one.
1129// Here starts another
1130// paragraph we want to
1131// read. This continues
1132//
1133// These examples are obvious from the text and should have been caught
1134// by the StrongEvidenceClassify pass. However, for languages where we don't
1135// have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
1136// it's worth guessing that (A1b) is the correct interpretation if there are
1137// far more "full" lines than "short" lines.
1138static void GeometricClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
1139 int row_start, int row_end, ParagraphTheory *theory) {
1140 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) {
1141 return;
1142 }
1143 if (debug_level > 1) {
1144 tprintf("###############################################\n");
1145 tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n", row_start, row_end);
1146 tprintf("###############################################\n");
1147 }
1148 RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
1149
1150 GeometricClassifierState s(debug_level, rows, row_start, row_end);
1151 if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
1152 s.Fail(2, "Too much variety for simple outline classification.");
1153 return;
1154 }
1155 if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
1156 s.Fail(1, "Not enough variety for simple outline classification.");
1157 return;
1158 }
1159 if (s.left_tabs.size() + s.right_tabs.size() == 3) {
1160 GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1161 return;
1162 }
1163
1164 // At this point, we know that one side has at least two tab stops, and the
1165 // other side has one or two tab stops.
1166 // Left to determine:
1167 // (1) Which is the body indent and which is the first line indent?
1168 // (2) Is the text fully justified?
1169
1170 // If one side happens to have three or more tab stops, assume that side
1171 // is opposite of the aligned side.
1172 if (s.right_tabs.size() > 2) {
1174 } else if (s.left_tabs.size() > 2) {
1176 } else if (s.ltr) { // guess based on script direction
1178 } else {
1180 }
1181
1182 if (s.AlignTabs().size() == 2) {
1183 // For each tab stop on the aligned side, how many of them appear
1184 // to be paragraph start lines? [first lines]
1185 int firsts[2] = {0, 0};
1186 // Count the first line as a likely paragraph start line.
1187 firsts[s.AlignsideTabIndex(s.row_start)]++;
1188 // For each line, if the first word would have fit on the previous
1189 // line count it as a likely paragraph start line.
1190 bool jam_packed = true;
1191 for (int i = s.row_start + 1; i < s.row_end; i++) {
1192 if (s.FirstWordWouldHaveFit(i - 1, i)) {
1193 firsts[s.AlignsideTabIndex(i)]++;
1194 jam_packed = false;
1195 }
1196 }
1197 // Make an extra accounting for the last line of the paragraph just
1198 // in case it's the only short line in the block. That is, take its
1199 // first word as typical and see if this looks like the *last* line
1200 // of a paragraph. If so, mark the *other* indent as probably a first.
1201 if (jam_packed && s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
1202 firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
1203 }
1204
1205 int percent0firsts, percent1firsts;
1206 percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
1207 percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
1208
1209 // TODO(eger): Tune these constants if necessary.
1210 if ((percent0firsts < 20 && 30 < percent1firsts) || percent0firsts + 30 < percent1firsts) {
1211 s.first_indent = s.AlignTabs()[1].center;
1212 s.body_indent = s.AlignTabs()[0].center;
1213 } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1214 percent1firsts + 30 < percent0firsts) {
1215 s.first_indent = s.AlignTabs()[0].center;
1216 s.body_indent = s.AlignTabs()[1].center;
1217 } else {
1218 // Ambiguous! Probably lineated (poetry)
1219 if (debug_level > 1) {
1220 tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
1221 s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
1222 tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1223 s.AlignTabs()[0].center, percent0firsts);
1224 tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1225 s.AlignTabs()[1].center, percent1firsts);
1226 s.PrintRows();
1227 }
1228 return;
1229 }
1230 } else {
1231 // There's only one tab stop for the "aligned to" side.
1232 s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1233 }
1234
1235 // At this point, we have our model.
1236 const ParagraphModel *model = theory->AddModel(s.Model());
1237
1238 // Now all we have to do is figure out if the text is fully justified or not.
1239 // eop_threshold: default to fully justified unless we see evidence below.
1240 // See description on MarkRowsWithModel()
1241 s.eop_threshold = (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1242 // If the text is not fully justified, re-set the eop_threshold to 0.
1243 if (s.AlignTabs().size() == 2) {
1244 // Paragraphs with a paragraph-start indent.
1245 for (int i = s.row_start; i < s.row_end - 1; i++) {
1246 if (ValidFirstLine(s.rows, i + 1, model) &&
1247 !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
1248 s.tolerance)) {
1249 // We found a non-end-of-paragraph short line: not fully justified.
1250 s.eop_threshold = 0;
1251 break;
1252 }
1253 }
1254 } else {
1255 // Paragraphs with no paragraph-start indent.
1256 for (int i = s.row_start; i < s.row_end - 1; i++) {
1257 if (!s.FirstWordWouldHaveFit(i, i + 1) &&
1258 !NearlyEqual(s.OffsideTabs()[0].center, (*s.rows)[i].OffsideIndent(s.just),
1259 s.tolerance)) {
1260 // We found a non-end-of-paragraph short line: not fully justified.
1261 s.eop_threshold = 0;
1262 break;
1263 }
1264 }
1265 }
1266 MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
1267}
1268
1269// =============== Implementation of ParagraphTheory =====================
1270
1272 for (const auto &m : *models_) {
1273 if (m->Comparable(model)) {
1274 return m;
1275 }
1276 }
1277 auto *m = new ParagraphModel(model);
1278 models_->push_back(m);
1279 push_back_new(models_we_added_, m);
1280 return m;
1281}
1282
1284 size_t w = 0;
1285 for (size_t r = 0; r < models_->size(); r++) {
1286 ParagraphModel *m = (*models_)[r];
1287 if (!contains(used_models, static_cast<const ParagraphModel *>(m)) && contains(models_we_added_, m)) {
1288 delete m;
1289 } else {
1290 if (r > w) {
1291 (*models_)[w] = m;
1292 }
1293 w++;
1294 }
1295 }
1296 models_->resize(w);
1297}
1298
1299// Examine rows[start, end) and try to determine if an existing non-centered
1300// paragraph model would fit them perfectly. If so, return a pointer to it.
1301// If not, return nullptr.
1302const ParagraphModel *ParagraphTheory::Fits(const std::vector<RowScratchRegisters> *rows,
1303 int start, int end) const {
1304 for (const auto *model : *models_) {
1305 if (model->justification() != JUSTIFICATION_CENTER && RowsFitModel(rows, start, end, model)) {
1306 return model;
1307 }
1308 }
1309 return nullptr;
1310}
1311
1313 for (const auto *model : *models_) {
1314 if (model->justification() != JUSTIFICATION_CENTER) {
1315 push_back_new(*models, model);
1316 }
1317 }
1318}
1319
1321 int i = 0;
1322 for (const auto *m : *models_) {
1323 if (m == model) {
1324 return i;
1325 }
1326 i++;
1327 }
1328 return -1;
1329}
1330
1331bool ValidFirstLine(const std::vector<RowScratchRegisters> *rows, int row,
1332 const ParagraphModel *model) {
1333 if (!StrongModel(model)) {
1334 tprintf("ValidFirstLine() should only be called with strong models!\n");
1335 }
1336 return StrongModel(model) && model->ValidFirstLine((*rows)[row].lmargin_, (*rows)[row].lindent_,
1337 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1338}
1339
1340bool ValidBodyLine(const std::vector<RowScratchRegisters> *rows, int row,
1341 const ParagraphModel *model) {
1342 if (!StrongModel(model)) {
1343 tprintf("ValidBodyLine() should only be called with strong models!\n");
1344 }
1345 return StrongModel(model) && model->ValidBodyLine((*rows)[row].lmargin_, (*rows)[row].lindent_,
1346 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1347}
1348
1349bool CrownCompatible(const std::vector<RowScratchRegisters> *rows, int a, int b,
1350 const ParagraphModel *model) {
1351 if (model != kCrownRight && model != kCrownLeft) {
1352 tprintf("CrownCompatible() should only be called with crown models!\n");
1353 return false;
1354 }
1355 auto &row_a = (*rows)[a];
1356 auto &row_b = (*rows)[b];
1357 if (model == kCrownRight) {
1358 return NearlyEqual(row_a.rindent_ + row_a.rmargin_, row_b.rindent_ + row_b.rmargin_,
1359 Epsilon(row_a.ri_->average_interword_space));
1360 }
1361 return NearlyEqual(row_a.lindent_ + row_a.lmargin_, row_b.lindent_ + row_b.lmargin_,
1362 Epsilon(row_a.ri_->average_interword_space));
1363}
1364
1365// =============== Implementation of ParagraphModelSmearer ====================
1366
1367ParagraphModelSmearer::ParagraphModelSmearer(std::vector<RowScratchRegisters> *rows,
1368 int row_start, int row_end, ParagraphTheory *theory)
1369 : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {
1370 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1371 row_start_ = 0;
1372 row_end_ = 0;
1373 return;
1374 }
1375 open_models_.resize(open_models_.size() + row_end - row_start + 2);
1376}
1377
1378// see paragraphs_internal.h
1379void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
1380 SetOfModels no_models;
1381 if (row_start < row_start_) {
1382 row_start = row_start_;
1383 }
1384 if (row_end > row_end_) {
1385 row_end = row_end_;
1386 }
1387
1388 for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; row++) {
1389 if ((*rows_)[row].ri_->num_words == 0) {
1390 OpenModels(row + 1) = no_models;
1391 } else {
1392 SetOfModels &opened = OpenModels(row);
1393 (*rows_)[row].StartHypotheses(&opened);
1394
1395 // Which models survive the transition from row to row + 1?
1396 SetOfModels still_open;
1397 for (auto &m : opened) {
1398 if (ValidFirstLine(rows_, row, m) || ValidBodyLine(rows_, row, m)) {
1399 // This is basic filtering; we check likely paragraph starty-ness down
1400 // below in Smear() -- you know, whether the first word would have fit
1401 // and such.
1402 push_back_new(still_open, m);
1403 }
1404 }
1405 OpenModels(row + 1) = still_open;
1406 }
1407 }
1408}
1409
1410// see paragraphs_internal.h
1412 CalculateOpenModels(row_start_, row_end_);
1413
1414 // For each row which we're unsure about (that is, it is LT_UNKNOWN or
1415 // we have multiple LT_START hypotheses), see if there's a model that
1416 // was recently used (an "open" model) which might model it well.
1417 for (int i = row_start_; i < row_end_; i++) {
1418 RowScratchRegisters &row = (*rows_)[i];
1419 if (row.ri_->num_words == 0) {
1420 continue;
1421 }
1422
1423 // Step One:
1424 // Figure out if there are "open" models which are left-alined or
1425 // right-aligned. This is important for determining whether the
1426 // "first" word in a row would fit at the "end" of the previous row.
1427 bool left_align_open = false;
1428 bool right_align_open = false;
1429 for (auto &m : OpenModels(i)) {
1430 switch (m->justification()) {
1431 case JUSTIFICATION_LEFT:
1432 left_align_open = true;
1433 break;
1435 right_align_open = true;
1436 break;
1437 default:
1438 left_align_open = right_align_open = true;
1439 }
1440 }
1441 // Step Two:
1442 // Use that knowledge to figure out if this row is likely to
1443 // start a paragraph.
1444 bool likely_start;
1445 if (i == 0) {
1446 likely_start = true;
1447 } else {
1448 if ((left_align_open && right_align_open) || (!left_align_open && !right_align_open)) {
1449 likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT) ||
1450 LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
1451 } else if (left_align_open) {
1452 likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_LEFT);
1453 } else {
1454 likely_start = LikelyParagraphStart((*rows_)[i - 1], row, JUSTIFICATION_RIGHT);
1455 }
1456 }
1457
1458 // Step Three:
1459 // If this text line seems like an obvious first line of an
1460 // open model, or an obvious continuation of an existing
1461 // modelled paragraph, mark it up.
1462 if (likely_start) {
1463 // Add Start Hypotheses for all Open models that fit.
1464 for (unsigned m = 0; m < OpenModels(i).size(); m++) {
1465 if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
1466 row.AddStartLine(OpenModels(i)[m]);
1467 }
1468 }
1469 } else {
1470 // Add relevant body line hypotheses.
1471 SetOfModels last_line_models;
1472 if (i > 0) {
1473 (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1474 } else {
1475 theory_->NonCenteredModels(&last_line_models);
1476 }
1477 for (auto model : last_line_models) {
1478 if (ValidBodyLine(rows_, i, model)) {
1479 row.AddBodyLine(model);
1480 }
1481 }
1482 }
1483
1484 // Step Four:
1485 // If we're still quite unsure about this line, go through all
1486 // models in our theory and see if this row could be the start
1487 // of any of our models.
1488 if (row.GetLineType() == LT_UNKNOWN ||
1489 (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
1490 SetOfModels all_models;
1491 theory_->NonCenteredModels(&all_models);
1492 for (auto &all_model : all_models) {
1493 if (ValidFirstLine(rows_, i, all_model)) {
1494 row.AddStartLine(all_model);
1495 }
1496 }
1497 }
1498 // Step Five:
1499 // Since we may have updated the hypotheses about this row, we need
1500 // to recalculate the Open models for the rest of rows[i + 1, row_end)
1501 if (row.GetLineType() != LT_UNKNOWN) {
1502 CalculateOpenModels(i + 1, row_end_);
1503 }
1504 }
1505}
1506
1507// ================ Main Paragraph Detection Algorithm =======================
1508
1509// Find out what ParagraphModels are actually used, and discard any
1510// that are not.
1511static void DiscardUnusedModels(const std::vector<RowScratchRegisters> &rows,
1512 ParagraphTheory *theory) {
1513 SetOfModels used_models;
1514 for (const auto &row : rows) {
1515 row.StrongHypotheses(&used_models);
1516 }
1517 theory->DiscardUnusedModels(used_models);
1518}
1519
1520// DowngradeWeakestToCrowns:
1521// Forget any flush-{left, right} models unless we see two or more
1522// of them in sequence.
1523//
1524// In pass 3, we start to classify even flush-left paragraphs (paragraphs
1525// where the first line and body indent are the same) as having proper Models.
1526// This is generally dangerous, since if you start imagining that flush-left
1527// is a typical paragraph model when it is not, it will lead you to chop normal
1528// indented paragraphs in the middle whenever a sentence happens to start on a
1529// new line (see "This" above). What to do?
1530// What we do is to take any paragraph which is flush left and is not
1531// preceded by another paragraph of the same model and convert it to a "Crown"
1532// paragraph. This is a weak pseudo-ParagraphModel which is a placeholder
1533// for later. It means that the paragraph is flush, but it would be desirable
1534// to mark it as the same model as following text if it fits. This downgrade
1535// FlushLeft -> CrownLeft -> Model of following paragraph. Means that we
1536// avoid making flush left Paragraph Models whenever we see a top-of-the-page
1537// half-of-a-paragraph. and instead we mark it the same as normal body text.
1538//
1539// Implementation:
1540//
1541// Comb backwards through the row scratch registers, and turn any
1542// sequences of body lines of equivalent type abutted against the beginning
1543// or a body or start line of a different type into a crown paragraph.
1544static void DowngradeWeakestToCrowns(int debug_level, ParagraphTheory *theory,
1545 std::vector<RowScratchRegisters> *rows) {
1546 int start;
1547 for (int end = rows->size(); end > 0; end = start) {
1548 // Search back for a body line of a unique type.
1549 const ParagraphModel *model = nullptr;
1550 while (end > 0 && (model = (*rows)[end - 1].UniqueBodyHypothesis()) == nullptr) {
1551 end--;
1552 }
1553 if (end == 0) {
1554 break;
1555 }
1556 start = end - 1;
1557 while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1558 start--; // walk back to the first line that is not the same body type.
1559 }
1560 if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model && StrongModel(model) &&
1561 NearlyEqual(model->first_indent(), model->body_indent(), model->tolerance())) {
1562 start--;
1563 }
1564 start++;
1565 // Now rows[start, end) is a sequence of unique body hypotheses of model.
1566 if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER) {
1567 continue;
1568 }
1569 if (!StrongModel(model)) {
1570 while (start > 0 && CrownCompatible(rows, start - 1, start, model)) {
1571 start--;
1572 }
1573 }
1574 if (start == 0 || (!StrongModel(model)) ||
1575 (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
1576 // crownify rows[start, end)
1577 const ParagraphModel *crown_model = model;
1578 if (StrongModel(model)) {
1579 if (model->justification() == JUSTIFICATION_LEFT) {
1580 crown_model = kCrownLeft;
1581 } else {
1582 crown_model = kCrownRight;
1583 }
1584 }
1585 (*rows)[start].SetUnknown();
1586 (*rows)[start].AddStartLine(crown_model);
1587 for (int row = start + 1; row < end; row++) {
1588 (*rows)[row].SetUnknown();
1589 (*rows)[row].AddBodyLine(crown_model);
1590 }
1591 }
1592 }
1593 DiscardUnusedModels(*rows, theory);
1594}
1595
1596// Clear all hypotheses about lines [start, end) and reset margins.
1597//
1598// The empty space between the left of a row and the block boundary (and
1599// similarly for the right) is split into two pieces: margin and indent.
1600// In initial processing, we assume the block is tight and the margin for
1601// all lines is set to zero. However, if our first pass does not yield
1602// models for everything, it may be due to an inset paragraph like a
1603// block-quote. In that case, we make a second pass over that unmarked
1604// section of the page and reset the "margin" portion of the empty space
1605// to the common amount of space at the ends of the lines under consid-
1606// eration. This would be equivalent to percentile set to 0. However,
1607// sometimes we have a single character sticking out in the right margin
1608// of a text block (like the 'r' in 'for' on line 3 above), and we can
1609// really just ignore it as an outlier. To express this, we allow the
1610// user to specify the percentile (0..100) of indent values to use as
1611// the common margin for each row in the run of rows[start, end).
1612void RecomputeMarginsAndClearHypotheses(std::vector<RowScratchRegisters> *rows, int start,
1613 int end, int percentile) {
1614 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) {
1615 return;
1616 }
1617
1618 int lmin, lmax, rmin, rmax;
1619 lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1620 rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1621 for (int i = start; i < end; i++) {
1622 RowScratchRegisters &sr = (*rows)[i];
1623 sr.SetUnknown();
1624 if (sr.ri_->num_words == 0) {
1625 continue;
1626 }
1627 UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
1628 UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
1629 }
1630 STATS lefts(lmin, lmax);
1631 STATS rights(rmin, rmax);
1632 for (int i = start; i < end; i++) {
1633 RowScratchRegisters &sr = (*rows)[i];
1634 if (sr.ri_->num_words == 0) {
1635 continue;
1636 }
1637 lefts.add(sr.lmargin_ + sr.lindent_, 1);
1638 rights.add(sr.rmargin_ + sr.rindent_, 1);
1639 }
1640 int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
1641 int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
1642 for (int i = start; i < end; i++) {
1643 RowScratchRegisters &sr = (*rows)[i];
1644 int ldelta = ignorable_left - sr.lmargin_;
1645 sr.lmargin_ += ldelta;
1646 sr.lindent_ -= ldelta;
1647 int rdelta = ignorable_right - sr.rmargin_;
1648 sr.rmargin_ += rdelta;
1649 sr.rindent_ -= rdelta;
1650 }
1651}
1652
1653// Return the median inter-word space in rows[row_start, row_end).
1654int InterwordSpace(const std::vector<RowScratchRegisters> &rows, int row_start, int row_end) {
1655 if (row_end < row_start + 1) {
1656 return 1;
1657 }
1658 int word_height =
1659 (rows[row_start].ri_->lword_box.height() + rows[row_end - 1].ri_->lword_box.height()) / 2;
1660 int word_width =
1661 (rows[row_start].ri_->lword_box.width() + rows[row_end - 1].ri_->lword_box.width()) / 2;
1662 STATS spacing_widths(0, 4 + word_width);
1663 for (int i = row_start; i < row_end; i++) {
1664 if (rows[i].ri_->num_words > 1) {
1665 spacing_widths.add(rows[i].ri_->average_interword_space, 1);
1666 }
1667 }
1668 int minimum_reasonable_space = word_height / 3;
1669 if (minimum_reasonable_space < 2) {
1670 minimum_reasonable_space = 2;
1671 }
1672 int median = spacing_widths.median();
1673 return (median > minimum_reasonable_space) ? median : minimum_reasonable_space;
1674}
1675
1676// Return whether the first word on the after line can fit in the space at
1677// the end of the before line (knowing which way the text is aligned and read).
1679 tesseract::ParagraphJustification justification) {
1680 if (before.ri_->num_words == 0 || after.ri_->num_words == 0) {
1681 return true;
1682 }
1683
1684 if (justification == JUSTIFICATION_UNKNOWN) {
1685 tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1686 }
1687 int available_space;
1688 if (justification == JUSTIFICATION_CENTER) {
1689 available_space = before.lindent_ + before.rindent_;
1690 } else {
1691 available_space = before.OffsideIndent(justification);
1692 }
1693 available_space -= before.ri_->average_interword_space;
1694
1695 if (before.ri_->ltr) {
1696 return after.ri_->lword_box.width() < available_space;
1697 }
1698 return after.ri_->rword_box.width() < available_space;
1699}
1700
1701// Return whether the first word on the after line can fit in the space at
1702// the end of the before line (not knowing which way the text goes) in a left
1703// or right alignment.
1705 if (before.ri_->num_words == 0 || after.ri_->num_words == 0) {
1706 return true;
1707 }
1708
1709 int available_space = before.lindent_;
1710 if (before.rindent_ > available_space) {
1711 available_space = before.rindent_;
1712 }
1713 available_space -= before.ri_->average_interword_space;
1714
1715 if (before.ri_->ltr) {
1716 return after.ri_->lword_box.width() < available_space;
1717 }
1718 return after.ri_->rword_box.width() < available_space;
1719}
1720
1721static bool TextSupportsBreak(const RowScratchRegisters &before, const RowScratchRegisters &after) {
1722 if (before.ri_->ltr) {
1723 return before.ri_->rword_likely_ends_idea && after.ri_->lword_likely_starts_idea;
1724 } else {
1725 return before.ri_->lword_likely_ends_idea && after.ri_->rword_likely_starts_idea;
1726 }
1727}
1728
1729static bool LikelyParagraphStart(const RowScratchRegisters &before,
1730 const RowScratchRegisters &after,
1732 return before.ri_->num_words == 0 ||
1733 (FirstWordWouldHaveFit(before, after, j) && TextSupportsBreak(before, after));
1734}
1735
1736// Examine rows[start, end) and try to determine what sort of ParagraphModel
1737// would fit them as a single paragraph.
1738// If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN.
1739// If the rows given could be a consistent start to a paragraph, set *consistent
1740// true.
1741static ParagraphModel InternalParagraphModelByOutline(
1742 const std::vector<RowScratchRegisters> *rows, int start, int end, int tolerance,
1743 bool *consistent) {
1744 int ltr_line_count = 0;
1745 for (int i = start; i < end; i++) {
1746 ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
1747 }
1748 bool ltr = (ltr_line_count >= (end - start) / 2);
1749
1750 *consistent = true;
1751 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) {
1752 return ParagraphModel();
1753 }
1754
1755 // Ensure the caller only passed us a region with a common rmargin and
1756 // lmargin.
1757 int lmargin = (*rows)[start].lmargin_;
1758 int rmargin = (*rows)[start].rmargin_;
1759 int lmin, lmax, rmin, rmax, cmin, cmax;
1760 lmin = lmax = (*rows)[start + 1].lindent_;
1761 rmin = rmax = (*rows)[start + 1].rindent_;
1762 cmin = cmax = 0;
1763 for (int i = start + 1; i < end; i++) {
1764 if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1765 tprintf("Margins don't match! Software error.\n");
1766 *consistent = false;
1767 return ParagraphModel();
1768 }
1769 UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
1770 UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
1771 UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1772 }
1773 int ldiff = lmax - lmin;
1774 int rdiff = rmax - rmin;
1775 int cdiff = cmax - cmin;
1776 if (rdiff > tolerance && ldiff > tolerance) {
1777 if (cdiff < tolerance * 2) {
1778 if (end - start < 3) {
1779 return ParagraphModel();
1780 }
1781 return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
1782 }
1783 *consistent = false;
1784 return ParagraphModel();
1785 }
1786 if (end - start < 3) { // Don't return a model for two line paras.
1787 return ParagraphModel();
1788 }
1789
1790 // These booleans keep us from saying something is aligned left when the body
1791 // left variance is too large.
1792 bool body_admits_left_alignment = ldiff < tolerance;
1793 bool body_admits_right_alignment = rdiff < tolerance;
1794
1795 ParagraphModel left_model = ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
1796 (lmin + lmax) / 2, tolerance);
1797 ParagraphModel right_model = ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
1798 (rmin + rmax) / 2, tolerance);
1799
1800 // These booleans keep us from having an indent on the "wrong side" for the
1801 // first line.
1802 bool text_admits_left_alignment = ltr || left_model.is_flush();
1803 bool text_admits_right_alignment = !ltr || right_model.is_flush();
1804
1805 // At least one of the edges is less than tolerance in variance.
1806 // If the other is obviously ragged, it can't be the one aligned to.
1807 // [Note the last line is included in this raggedness.]
1808 if (tolerance < rdiff) {
1809 if (body_admits_left_alignment && text_admits_left_alignment) {
1810 return left_model;
1811 }
1812 *consistent = false;
1813 return ParagraphModel();
1814 }
1815 if (tolerance < ldiff) {
1816 if (body_admits_right_alignment && text_admits_right_alignment) {
1817 return right_model;
1818 }
1819 *consistent = false;
1820 return ParagraphModel();
1821 }
1822
1823 // At this point, we know the body text doesn't vary much on either side.
1824
1825 // If the first line juts out oddly in one direction or the other,
1826 // that likely indicates the side aligned to.
1827 int first_left = (*rows)[start].lindent_;
1828 int first_right = (*rows)[start].rindent_;
1829
1830 if (ltr && body_admits_left_alignment && (first_left < lmin || first_left > lmax)) {
1831 return left_model;
1832 }
1833 if (!ltr && body_admits_right_alignment && (first_right < rmin || first_right > rmax)) {
1834 return right_model;
1835 }
1836
1837 *consistent = false;
1838 return ParagraphModel();
1839}
1840
1841// Examine rows[start, end) and try to determine what sort of ParagraphModel
1842// would fit them as a single paragraph. If nothing fits,
1843// justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
1844// output if we're debugging.
1845static ParagraphModel ParagraphModelByOutline(int debug_level,
1846 const std::vector<RowScratchRegisters> *rows,
1847 int start, int end, int tolerance) {
1848 bool unused_consistent;
1849 ParagraphModel retval =
1850 InternalParagraphModelByOutline(rows, start, end, tolerance, &unused_consistent);
1851 if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
1852 tprintf("Could not determine a model for this paragraph:\n");
1853 PrintRowRange(*rows, start, end);
1854 }
1855 return retval;
1856}
1857
1858// Do rows[start, end) form a single instance of the given paragraph model?
1859bool RowsFitModel(const std::vector<RowScratchRegisters> *rows, int start, int end,
1860 const ParagraphModel *model) {
1861 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) {
1862 return false;
1863 }
1864 if (!ValidFirstLine(rows, start, model)) {
1865 return false;
1866 }
1867 for (int i = start + 1; i < end; i++) {
1868 if (!ValidBodyLine(rows, i, model)) {
1869 return false;
1870 }
1871 }
1872 return true;
1873}
1874
1875// Examine rows[row_start, row_end) as an independent section of text,
1876// and mark rows that are exceptionally clear as start-of-paragraph
1877// and paragraph-body lines.
1878//
1879// We presume that any lines surrounding rows[row_start, row_end) may
1880// have wildly different paragraph models, so we don't key any data off
1881// of those lines.
1882//
1883// We only take the very strongest signals, as we don't want to get
1884// confused and marking up centered text, poetry, or source code as
1885// clearly part of a typical paragraph.
1886static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows, int row_start,
1887 int row_end) {
1888 // Record patently obvious body text.
1889 for (int i = row_start + 1; i < row_end; i++) {
1890 const RowScratchRegisters &prev = (*rows)[i - 1];
1891 RowScratchRegisters &curr = (*rows)[i];
1892 tesseract::ParagraphJustification typical_justification =
1893 prev.ri_->ltr ? JUSTIFICATION_LEFT : JUSTIFICATION_RIGHT;
1894 if (!curr.ri_->rword_likely_starts_idea && !curr.ri_->lword_likely_starts_idea &&
1895 !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
1896 curr.SetBodyLine();
1897 }
1898 }
1899
1900 // Record patently obvious start paragraph lines.
1901 //
1902 // It's an extremely good signal of the start of a paragraph that
1903 // the first word would have fit on the end of the previous line.
1904 // However, applying just that signal would have us mark random
1905 // start lines of lineated text (poetry and source code) and some
1906 // centered headings as paragraph start lines. Therefore, we use
1907 // a second qualification for a paragraph start: Not only should
1908 // the first word of this line have fit on the previous line,
1909 // but also, this line should go full to the right of the block,
1910 // disallowing a subsequent word from having fit on this line.
1911
1912 // First row:
1913 {
1914 RowScratchRegisters &curr = (*rows)[row_start];
1915 RowScratchRegisters &next = (*rows)[row_start + 1];
1917 if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
1918 (curr.ri_->lword_likely_starts_idea || curr.ri_->rword_likely_starts_idea)) {
1919 curr.SetStartLine();
1920 }
1921 }
1922 // Middle rows
1923 for (int i = row_start + 1; i < row_end - 1; i++) {
1924 RowScratchRegisters &prev = (*rows)[i - 1];
1925 RowScratchRegisters &curr = (*rows)[i];
1926 RowScratchRegisters &next = (*rows)[i + 1];
1928 if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, next, j) &&
1929 LikelyParagraphStart(prev, curr, j)) {
1930 curr.SetStartLine();
1931 }
1932 }
1933 // Last row
1934 { // the short circuit at the top means we have at least two lines.
1935 RowScratchRegisters &prev = (*rows)[row_end - 2];
1936 RowScratchRegisters &curr = (*rows)[row_end - 1];
1938 if (curr.GetLineType() == LT_UNKNOWN && !FirstWordWouldHaveFit(curr, curr, j) &&
1939 LikelyParagraphStart(prev, curr, j)) {
1940 curr.SetStartLine();
1941 }
1942 }
1943}
1944
1945// Look for sequences of a start line followed by some body lines in
1946// rows[row_start, row_end) and create ParagraphModels for them if
1947// they seem coherent.
1948static void ModelStrongEvidence(int debug_level, std::vector<RowScratchRegisters> *rows,
1949 int row_start, int row_end, bool allow_flush_models,
1950 ParagraphTheory *theory) {
1951 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
1952 return;
1953 }
1954
1955 int start = row_start;
1956 while (start < row_end) {
1957 while (start < row_end && (*rows)[start].GetLineType() != LT_START) {
1958 start++;
1959 }
1960 if (start >= row_end - 1) {
1961 break;
1962 }
1963
1964 int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1965 int end = start;
1966 ParagraphModel last_model;
1967 bool next_consistent;
1968 do {
1969 ++end;
1970 // rows[row, end) was consistent.
1971 // If rows[row, end + 1) is not consistent,
1972 // just model rows[row, end)
1973 if (end < row_end - 1) {
1974 RowScratchRegisters &next = (*rows)[end];
1975 LineType lt = next.GetLineType();
1976 next_consistent = lt == LT_BODY || (lt == LT_UNKNOWN &&
1977 !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
1978 } else {
1979 next_consistent = false;
1980 }
1981 if (next_consistent) {
1982 ParagraphModel next_model =
1983 InternalParagraphModelByOutline(rows, start, end + 1, tolerance, &next_consistent);
1984 if (((*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_LEFT &&
1985 next_model.justification() != JUSTIFICATION_LEFT) ||
1986 (!(*rows)[start].ri_->ltr && last_model.justification() == JUSTIFICATION_RIGHT &&
1987 next_model.justification() != JUSTIFICATION_RIGHT)) {
1988 next_consistent = false;
1989 }
1990 last_model = next_model;
1991 } else {
1992 next_consistent = false;
1993 }
1994 } while (next_consistent && end < row_end);
1995 // At this point, rows[start, end) looked like it could have been a
1996 // single paragraph. If we can make a good ParagraphModel for it,
1997 // do so and mark this sequence with that model.
1998 if (end > start + 1) {
1999 // emit a new paragraph if we have more than one line.
2000 const ParagraphModel *model = nullptr;
2001 ParagraphModel new_model = ParagraphModelByOutline(
2002 debug_level, rows, start, end, Epsilon(InterwordSpace(*rows, start, end)));
2003 if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
2004 // couldn't create a good model, oh well.
2005 } else if (new_model.is_flush()) {
2006 if (end == start + 2) {
2007 // It's very likely we just got two paragraph starts in a row.
2008 end = start + 1;
2009 } else if (start == row_start) {
2010 // Mark this as a Crown.
2011 if (new_model.justification() == JUSTIFICATION_LEFT) {
2012 model = kCrownLeft;
2013 } else {
2014 model = kCrownRight;
2015 }
2016 } else if (allow_flush_models) {
2017 model = theory->AddModel(new_model);
2018 }
2019 } else {
2020 model = theory->AddModel(new_model);
2021 }
2022 if (model) {
2023 (*rows)[start].AddStartLine(model);
2024 for (int i = start + 1; i < end; i++) {
2025 (*rows)[i].AddBodyLine(model);
2026 }
2027 }
2028 }
2029 start = end;
2030 }
2031}
2032
2033// We examine rows[row_start, row_end) and do the following:
2034// (1) Clear all existing hypotheses for the rows being considered.
2035// (2) Mark up any rows as exceptionally likely to be paragraph starts
2036// or paragraph body lines as such using both geometric and textual
2037// clues.
2038// (3) Form models for any sequence of start + continuation lines.
2039// (4) Smear the paragraph models to cover surrounding text.
2040static void StrongEvidenceClassify(int debug_level, std::vector<RowScratchRegisters> *rows,
2041 int row_start, int row_end, ParagraphTheory *theory) {
2042 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
2043 return;
2044 }
2045
2046 if (debug_level > 1) {
2047 tprintf("#############################################\n");
2048 tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2049 tprintf("#############################################\n");
2050 }
2051
2052 RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
2053 MarkStrongEvidence(rows, row_start, row_end);
2054
2055 DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
2056
2057 // Create paragraph models.
2058 ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
2059
2060 DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);
2061
2062 // At this point, some rows are marked up as paragraphs with model numbers,
2063 // and some rows are marked up as either LT_START or LT_BODY. Now let's
2064 // smear any good paragraph hypotheses forward and backward.
2065 ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2066 smearer.Smear();
2067}
2068
2069static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows, int row_start,
2070 int row_end, ParagraphTheory *theory) {
2071 for (int i = row_start + 1; i < row_end - 1; i++) {
2072 if ((*rows)[i - 1].ri_->has_leaders && (*rows)[i].ri_->has_leaders &&
2073 (*rows)[i + 1].ri_->has_leaders) {
2074 const ParagraphModel *model =
2075 theory->AddModel(ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
2076 (*rows)[i].AddStartLine(model);
2077 }
2078 }
2079}
2080
2081// Collect sequences of unique hypotheses in row registers and create proper
2082// paragraphs for them, referencing the paragraphs in row_owners.
2083static void ConvertHypothesizedModelRunsToParagraphs(int debug_level,
2084 std::vector<RowScratchRegisters> &rows,
2085 std::vector<PARA *> *row_owners,
2086 ParagraphTheory *theory) {
2087 int end = rows.size();
2088 int start;
2089 for (; end > 0; end = start) {
2090 start = end - 1;
2091 const ParagraphModel *model = nullptr;
2092 // TODO(eger): Be smarter about dealing with multiple hypotheses.
2093 bool single_line_paragraph = false;
2094 SetOfModels models;
2095 rows[start].NonNullHypotheses(&models);
2096 if (!models.empty()) {
2097 model = models[0];
2098 if (rows[start].GetLineType(model) != LT_BODY) {
2099 single_line_paragraph = true;
2100 }
2101 }
2102 if (model && !single_line_paragraph) {
2103 // walk back looking for more body lines and then a start line.
2104 while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
2105 // do nothing
2106 }
2107 if (start < 0 || rows[start].GetLineType(model) != LT_START) {
2108 model = nullptr;
2109 }
2110 }
2111 if (model == nullptr) {
2112 continue;
2113 }
2114 // rows[start, end) should be a paragraph.
2115 PARA *p = new PARA();
2116 if (model == kCrownLeft || model == kCrownRight) {
2117 p->is_very_first_or_continuation = true;
2118 // Crown paragraph.
2119 // If we can find an existing ParagraphModel that fits, use it,
2120 // else create a new one.
2121 for (unsigned row = end; row < rows.size(); row++) {
2122 if ((*row_owners)[row] &&
2123 (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
2124 (start == 0 || ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
2125 model = (*row_owners)[row]->model;
2126 break;
2127 }
2128 }
2129 if (model == kCrownLeft) {
2130 // No subsequent model fits, so cons one up.
2131 model = theory->AddModel(ParagraphModel(JUSTIFICATION_LEFT,
2132 rows[start].lmargin_ + rows[start].lindent_, 0, 0,
2133 Epsilon(rows[start].ri_->average_interword_space)));
2134 } else if (model == kCrownRight) {
2135 // No subsequent model fits, so cons one up.
2136 model = theory->AddModel(ParagraphModel(JUSTIFICATION_RIGHT,
2137 rows[start].rmargin_ + rows[start].rmargin_, 0, 0,
2138 Epsilon(rows[start].ri_->average_interword_space)));
2139 }
2140 }
2141 rows[start].SetUnknown();
2142 rows[start].AddStartLine(model);
2143 for (int i = start + 1; i < end; i++) {
2144 rows[i].SetUnknown();
2145 rows[i].AddBodyLine(model);
2146 }
2147 p->model = model;
2148 p->has_drop_cap = rows[start].ri_->has_drop_cap;
2149 p->is_list_item = model->justification() == JUSTIFICATION_RIGHT
2150 ? rows[start].ri_->rword_indicates_list_item
2151 : rows[start].ri_->lword_indicates_list_item;
2152 for (int row = start; row < end; row++) {
2153 if ((*row_owners)[row] != nullptr) {
2154 tprintf(
2155 "Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
2156 "more than once!\n");
2157 delete (*row_owners)[row];
2158 }
2159 (*row_owners)[row] = p;
2160 }
2161 }
2162}
2163
2164struct Interval {
2165 Interval() : begin(0), end(0) {}
2166 Interval(int b, int e) : begin(b), end(e) {}
2167
2169 int end;
2170};
2171
2172// Return whether rows[row] appears to be stranded, meaning that the evidence
2173// for this row is very weak due to context. For instance, two lines of source
2174// code may happen to be indented at the same tab vector as body text starts,
2175// leading us to think they are two start-of-paragraph lines. This is not
2176// optimal. However, we also don't want to mark a sequence of short dialog
2177// as "weak," so our heuristic is:
2178// (1) If a line is surrounded by lines of unknown type, it's weak.
2179// (2) If two lines in a row are start lines for a given paragraph type, but
2180// after that the same paragraph type does not continue, they're weak.
2181static bool RowIsStranded(const std::vector<RowScratchRegisters> &rows, int row) {
2182 SetOfModels row_models;
2183 rows[row].StrongHypotheses(&row_models);
2184
2185 for (auto &row_model : row_models) {
2186 bool all_starts = rows[row].GetLineType();
2187 int run_length = 1;
2188 bool continues = true;
2189 for (int i = row - 1; i >= 0 && continues; i--) {
2190 SetOfModels models;
2191 rows[i].NonNullHypotheses(&models);
2192 switch (rows[i].GetLineType(row_model)) {
2193 case LT_START:
2194 run_length++;
2195 break;
2196 case LT_MULTIPLE: // explicit fall-through
2197 case LT_BODY:
2198 run_length++;
2199 all_starts = false;
2200 break;
2201 case LT_UNKNOWN: // explicit fall-through
2202 default:
2203 continues = false;
2204 }
2205 }
2206 continues = true;
2207 for (unsigned i = row + 1; i < rows.size() && continues; i++) {
2208 SetOfModels models;
2209 rows[i].NonNullHypotheses(&models);
2210 switch (rows[i].GetLineType(row_model)) {
2211 case LT_START:
2212 run_length++;
2213 break;
2214 case LT_MULTIPLE: // explicit fall-through
2215 case LT_BODY:
2216 run_length++;
2217 all_starts = false;
2218 break;
2219 case LT_UNKNOWN: // explicit fall-through
2220 default:
2221 continues = false;
2222 }
2223 }
2224 if (run_length > 2 || (!all_starts && run_length > 1)) {
2225 return false;
2226 }
2227 }
2228 return true;
2229}
2230
2231// Go through rows[row_start, row_end) and gather up sequences that need better
2232// classification.
2233// + Sequences of non-empty rows without hypotheses.
2234// + Crown paragraphs not immediately followed by a strongly modeled line.
2235// + Single line paragraphs surrounded by text that doesn't match the
2236// model.
2237static void LeftoverSegments(const std::vector<RowScratchRegisters> &rows,
2238 std::vector<Interval> *to_fix, int row_start, int row_end) {
2239 to_fix->clear();
2240 for (int i = row_start; i < row_end; i++) {
2241 bool needs_fixing = false;
2242
2243 SetOfModels models;
2244 SetOfModels models_w_crowns;
2245 rows[i].StrongHypotheses(&models);
2246 rows[i].NonNullHypotheses(&models_w_crowns);
2247 if (models.empty() && !models_w_crowns.empty()) {
2248 // Crown paragraph. Is it followed by a modeled line?
2249 for (unsigned end = i + 1; end < rows.size(); end++) {
2250 SetOfModels end_models;
2251 SetOfModels strong_end_models;
2252 rows[end].NonNullHypotheses(&end_models);
2253 rows[end].StrongHypotheses(&strong_end_models);
2254 if (end_models.empty()) {
2255 needs_fixing = true;
2256 break;
2257 } else if (!strong_end_models.empty()) {
2258 needs_fixing = false;
2259 break;
2260 }
2261 }
2262 } else if (models.empty() && rows[i].ri_->num_words > 0) {
2263 // No models at all.
2264 needs_fixing = true;
2265 }
2266
2267 if (!needs_fixing && !models.empty()) {
2268 needs_fixing = RowIsStranded(rows, i);
2269 }
2270
2271 if (needs_fixing) {
2272 if (!to_fix->empty() && to_fix->back().end == i - 1) {
2273 to_fix->back().end = i;
2274 } else {
2275 to_fix->push_back(Interval(i, i));
2276 }
2277 }
2278 }
2279 // Convert inclusive intervals to half-open intervals.
2280 for (auto &i : *to_fix) {
2281 i.end = i.end + 1;
2282 }
2283}
2284
2285// Given a set of row_owners pointing to PARAs or nullptr (no paragraph known),
2286// normalize each row_owner to point to an actual PARA, and output the
2287// paragraphs in order onto paragraphs.
2288void CanonicalizeDetectionResults(std::vector<PARA *> *row_owners, PARA_LIST *paragraphs) {
2289 std::vector<PARA *> &rows = *row_owners;
2290 paragraphs->clear();
2291 PARA_IT out(paragraphs);
2292 PARA *formerly_null = nullptr;
2293 for (unsigned i = 0; i < rows.size(); i++) {
2294 if (rows[i] == nullptr) {
2295 if (i == 0 || rows[i - 1] != formerly_null) {
2296 rows[i] = formerly_null = new PARA();
2297 } else {
2298 rows[i] = formerly_null;
2299 continue;
2300 }
2301 } else if (i > 0 && rows[i - 1] == rows[i]) {
2302 continue;
2303 }
2304 out.add_after_then_move(rows[i]);
2305 }
2306}
2307
2308// Main entry point for Paragraph Detection Algorithm.
2309//
2310// Given a set of equally spaced textlines (described by row_infos),
2311// Split them into paragraphs.
2312//
2313// Output:
2314// row_owners - one pointer for each row, to the paragraph it belongs to.
2315// paragraphs - this is the actual list of PARA objects.
2316// models - the list of paragraph models referenced by the PARA objects.
2317// caller is responsible for deleting the models.
2318void DetectParagraphs(int debug_level, std::vector<RowInfo> *row_infos,
2319 std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
2320 std::vector<ParagraphModel *> *models) {
2321 ParagraphTheory theory(models);
2322
2323 // Initialize row_owners to be a bunch of nullptr pointers.
2324 row_owners->clear();
2325 row_owners->resize(row_infos->size());
2326
2327 // Set up row scratch registers for the main algorithm.
2328 std::vector<RowScratchRegisters> rows(row_infos->size());
2329 for (unsigned i = 0; i < row_infos->size(); i++) {
2330 rows[i].Init((*row_infos)[i]);
2331 }
2332
2333 // Pass 1:
2334 // Detect sequences of lines that all contain leader dots (.....)
2335 // These are likely Tables of Contents. If there are three text lines in
2336 // a row with leader dots, it's pretty safe to say the middle one should
2337 // be a paragraph of its own.
2338 SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
2339
2340 DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
2341
2342 std::vector<Interval> leftovers;
2343 LeftoverSegments(rows, &leftovers, 0, rows.size());
2344 for (auto &leftover : leftovers) {
2345 // Pass 2a:
2346 // Find any strongly evidenced start-of-paragraph lines. If they're
2347 // followed by two lines that look like body lines, make a paragraph
2348 // model for that and see if that model applies throughout the text
2349 // (that is, "smear" it).
2350 StrongEvidenceClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2351
2352 // Pass 2b:
2353 // If we had any luck in pass 2a, we got part of the page and didn't
2354 // know how to classify a few runs of rows. Take the segments that
2355 // didn't find a model and reprocess them individually.
2356 std::vector<Interval> leftovers2;
2357 LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
2358 bool pass2a_was_useful =
2359 leftovers2.size() > 1 ||
2360 (leftovers2.size() == 1 && (leftovers2[0].begin != 0 || static_cast<size_t>(leftovers2[0].end) != rows.size()));
2361 if (pass2a_was_useful) {
2362 for (auto &leftover2 : leftovers2) {
2363 StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
2364 }
2365 }
2366 }
2367
2368 DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
2369
2370 // Pass 3:
2371 // These are the dregs for which we didn't have enough strong textual
2372 // and geometric clues to form matching models for. Let's see if
2373 // the geometric clues are simple enough that we could just use those.
2374 LeftoverSegments(rows, &leftovers, 0, rows.size());
2375 for (auto &leftover : leftovers) {
2376 GeometricClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2377 }
2378
2379 // Undo any flush models for which there's little evidence.
2380 DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2381
2382 DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
2383
2384 // Pass 4:
2385 // Take everything that's still not marked up well and clear all markings.
2386 LeftoverSegments(rows, &leftovers, 0, rows.size());
2387 for (auto &leftover : leftovers) {
2388 for (int j = leftover.begin; j < leftover.end; j++) {
2389 rows[j].SetUnknown();
2390 }
2391 }
2392
2393 DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
2394
2395 // Convert all of the unique hypothesis runs to PARAs.
2396 ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, &theory);
2397
2398 DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
2399
2400 // Finally, clean up any dangling nullptr row paragraph parents.
2401 CanonicalizeDetectionResults(row_owners, paragraphs);
2402}
2403
2404// ============ Code interfacing with the rest of Tesseract ==================
2405
2406static void InitializeTextAndBoxesPreRecognition(const MutableIterator &it, RowInfo *info) {
2407 // Set up text, lword_text, and rword_text (mostly for debug printing).
2408 std::string fake_text;
2409 PageIterator pit(static_cast<const PageIterator &>(it));
2410 bool first_word = true;
2411 if (!pit.Empty(RIL_WORD)) {
2412 do {
2413 fake_text += "x";
2414 if (first_word) {
2415 info->lword_text += "x";
2416 }
2417 info->rword_text += "x";
2418 if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
2419 !pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL)) {
2420 fake_text += " ";
2421 info->rword_text = "";
2422 first_word = false;
2423 }
2424 } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) && pit.Next(RIL_SYMBOL));
2425 }
2426 if (fake_text.empty()) {
2427 return;
2428 }
2429
2430 int lspaces = info->pix_ldistance / info->average_interword_space;
2431 for (int i = 0; i < lspaces; i++) {
2432 info->text += ' ';
2433 }
2434 info->text += fake_text;
2435
2436 // Set up lword_box, rword_box, and num_words.
2437 PAGE_RES_IT page_res_it = *it.PageResIt();
2438 WERD_RES *word_res = page_res_it.restart_row();
2439 ROW_RES *this_row = page_res_it.row();
2440
2441 WERD_RES *lword = nullptr;
2442 WERD_RES *rword = nullptr;
2443 info->num_words = 0;
2444 do {
2445 if (word_res) {
2446 if (!lword) {
2447 lword = word_res;
2448 }
2449 if (rword != word_res) {
2450 info->num_words++;
2451 }
2452 rword = word_res;
2453 }
2454 word_res = page_res_it.forward();
2455 } while (page_res_it.row() == this_row);
2456
2457 if (lword) {
2458 info->lword_box = lword->word->bounding_box();
2459 }
2460 if (rword) {
2461 info->rword_box = rword->word->bounding_box();
2462 }
2463}
2464
2465// Given a Tesseract Iterator pointing to a text line, fill in the paragraph
2466// detector RowInfo with all relevant information from the row.
2467static void InitializeRowInfo(bool after_recognition, const MutableIterator &it, RowInfo *info) {
2468 if (it.PageResIt()->row() != nullptr) {
2469 ROW *row = it.PageResIt()->row()->row;
2470 info->pix_ldistance = row->lmargin();
2471 info->pix_rdistance = row->rmargin();
2472 info->average_interword_space =
2473 row->space() > 0 ? row->space() : std::max(static_cast<int>(row->x_height()), 1);
2474 info->pix_xheight = row->x_height();
2475 info->has_leaders = false;
2476 info->has_drop_cap = row->has_drop_cap();
2477 info->ltr = true; // set below depending on word scripts
2478 } else {
2479 info->pix_ldistance = info->pix_rdistance = 0;
2480 info->average_interword_space = 1;
2481 info->pix_xheight = 1.0;
2482 info->has_leaders = false;
2483 info->has_drop_cap = false;
2484 info->ltr = true;
2485 }
2486
2487 info->num_words = 0;
2488 info->lword_indicates_list_item = false;
2489 info->lword_likely_starts_idea = false;
2490 info->lword_likely_ends_idea = false;
2491 info->rword_indicates_list_item = false;
2492 info->rword_likely_starts_idea = false;
2493 info->rword_likely_ends_idea = false;
2494 info->has_leaders = false;
2495 info->ltr = true;
2496
2497 if (!after_recognition) {
2498 InitializeTextAndBoxesPreRecognition(it, info);
2499 return;
2500 }
2501 info->text = "";
2502 const std::unique_ptr<const char[]> text(it.GetUTF8Text(RIL_TEXTLINE));
2503 int trailing_ws_idx = strlen(text.get()); // strip trailing space
2504 while (trailing_ws_idx > 0 &&
2505 // isspace() only takes ASCII
2506 isascii(text[trailing_ws_idx - 1]) && isspace(text[trailing_ws_idx - 1])) {
2507 trailing_ws_idx--;
2508 }
2509 if (trailing_ws_idx > 0) {
2510 int lspaces = info->pix_ldistance / info->average_interword_space;
2511 for (int i = 0; i < lspaces; i++) {
2512 info->text += ' ';
2513 }
2514 for (int i = 0; i < trailing_ws_idx; i++) {
2515 info->text += text[i];
2516 }
2517 }
2518
2519 if (info->text.empty()) {
2520 return;
2521 }
2522
2523 PAGE_RES_IT page_res_it = *it.PageResIt();
2524 std::vector<WERD_RES *> werds;
2525 WERD_RES *word_res = page_res_it.restart_row();
2526 ROW_RES *this_row = page_res_it.row();
2527 int num_leaders = 0;
2528 int ltr = 0;
2529 int rtl = 0;
2530 do {
2531 if (word_res && word_res->best_choice->unichar_string().length() > 0) {
2532 werds.push_back(word_res);
2533 ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
2534 rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
2535 if (word_res->word->flag(W_REP_CHAR)) {
2536 num_leaders++;
2537 }
2538 }
2539 word_res = page_res_it.forward();
2540 } while (page_res_it.row() == this_row);
2541 info->ltr = ltr >= rtl;
2542 info->has_leaders = num_leaders > 3;
2543 info->num_words = werds.size();
2544 if (!werds.empty()) {
2545 WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
2546 info->lword_text = lword->best_choice->unichar_string().c_str();
2547 info->rword_text = rword->best_choice->unichar_string().c_str();
2548 info->lword_box = lword->word->bounding_box();
2549 info->rword_box = rword->word->bounding_box();
2550 LeftWordAttributes(lword->uch_set, lword->best_choice, info->lword_text,
2551 &info->lword_indicates_list_item, &info->lword_likely_starts_idea,
2552 &info->lword_likely_ends_idea);
2553 RightWordAttributes(rword->uch_set, rword->best_choice, info->rword_text,
2554 &info->rword_indicates_list_item, &info->rword_likely_starts_idea,
2555 &info->rword_likely_ends_idea);
2556 }
2557}
2558
2559// This is called after rows have been identified and words are recognized.
2560// Much of this could be implemented before word recognition, but text helps
2561// to identify bulleted lists and gives good signals for sentence boundaries.
2562void DetectParagraphs(int debug_level, bool after_text_recognition,
2563 const MutableIterator *block_start, std::vector<ParagraphModel *> *models) {
2564 // Clear out any preconceived notions.
2565 if (block_start->Empty(RIL_TEXTLINE)) {
2566 return;
2567 }
2568 BLOCK *block = block_start->PageResIt()->block()->block;
2569 block->para_list()->clear();
2570 bool is_image_block = block->pdblk.poly_block() && !block->pdblk.poly_block()->IsText();
2571
2572 // Convert the Tesseract structures to RowInfos
2573 // for the paragraph detection algorithm.
2574 MutableIterator row(*block_start);
2575 if (row.Empty(RIL_TEXTLINE)) {
2576 return; // end of input already.
2577 }
2578
2579 std::vector<RowInfo> row_infos;
2580 do {
2581 if (!row.PageResIt()->row()) {
2582 continue; // empty row.
2583 }
2584 row.PageResIt()->row()->row->set_para(nullptr);
2585 row_infos.emplace_back();
2586 RowInfo &ri = row_infos.back();
2587 InitializeRowInfo(after_text_recognition, row, &ri);
2588 } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) && row.Next(RIL_TEXTLINE));
2589
2590 // If we're called before text recognition, we might not have
2591 // tight block bounding boxes, so trim by the minimum on each side.
2592 if (!row_infos.empty()) {
2593 int min_lmargin = row_infos[0].pix_ldistance;
2594 int min_rmargin = row_infos[0].pix_rdistance;
2595 for (unsigned i = 1; i < row_infos.size(); i++) {
2596 if (row_infos[i].pix_ldistance < min_lmargin) {
2597 min_lmargin = row_infos[i].pix_ldistance;
2598 }
2599 if (row_infos[i].pix_rdistance < min_rmargin) {
2600 min_rmargin = row_infos[i].pix_rdistance;
2601 }
2602 }
2603 if (min_lmargin > 0 || min_rmargin > 0) {
2604 for (auto &row_info : row_infos) {
2605 row_info.pix_ldistance -= min_lmargin;
2606 row_info.pix_rdistance -= min_rmargin;
2607 }
2608 }
2609 }
2610
2611 // Run the paragraph detection algorithm.
2612 std::vector<PARA *> row_owners;
2613 std::vector<PARA *> the_paragraphs;
2614 if (!is_image_block) {
2615 DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(), models);
2616 } else {
2617 row_owners.resize(row_infos.size());
2618 CanonicalizeDetectionResults(&row_owners, block->para_list());
2619 }
2620
2621 // Now stitch in the row_owners into the rows.
2622 row = *block_start;
2623 for (auto &row_owner : row_owners) {
2624 while (!row.PageResIt()->row()) {
2625 row.Next(RIL_TEXTLINE);
2626 }
2627 row.PageResIt()->row()->row->set_para(row_owner);
2628 row.Next(RIL_TEXTLINE);
2629 }
2630}
2631
2632} // namespace tesseract
int value
const char * p
int * count
IntAfterTypedTestSuiteP after
IntBeforeRegisterTypedTestSuiteP before
@ W_REP_CHAR
repeated character
Definition: werd.h:40
bool NearlyEqual(T x, T y, T tolerance)
Definition: host.h:51
bool StrongModel(const ParagraphModel *model)
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
ParagraphJustification
Definition: publictypes.h:246
@ JUSTIFICATION_LEFT
Definition: publictypes.h:248
@ JUSTIFICATION_UNKNOWN
Definition: publictypes.h:247
@ JUSTIFICATION_RIGHT
Definition: publictypes.h:250
@ JUSTIFICATION_CENTER
Definition: publictypes.h:249
std::vector< const ParagraphModel * > SetOfModels
int InterwordSpace(const std::vector< RowScratchRegisters > &rows, int row_start, int row_end)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
bool RowsFitModel(const std::vector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
void RecomputeMarginsAndClearHypotheses(std::vector< RowScratchRegisters > *rows, int start, int end, int percentile)
bool ValidBodyLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:477
void CanonicalizeDetectionResults(std::vector< PARA * > *row_owners, PARA_LIST *paragraphs)
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:105
int UNICHAR_ID
Definition: unichar.h:34
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:56
void push_back_new(std::vector< T > &vector, const T &data)
Definition: paragraphs.cpp:418
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:58
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:117
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:431
bool CrownCompatible(const std::vector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
bool ValidFirstLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool AsciiLikelyListItem(const std::string &word)
Definition: paragraphs.cpp:282
void DetectParagraphs(int debug_level, std::vector< RowInfo > *row_infos, std::vector< PARA * > *row_owners, PARA_LIST *paragraphs, std::vector< ParagraphModel * > *models)
bool contains(const std::vector< T > &data, const T &value)
Definition: helpers.h:39
def next(obj)
Definition: ast.py:56
bool Empty(PageIteratorLevel level) const
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
bool Next(PageIteratorLevel level) override
const PAGE_RES_IT * PageResIt() const
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
Definition: paragraphs.cpp:300
unsigned SkipDigits(unsigned pos)
Definition: paragraphs.cpp:326
unsigned SkipRomans(unsigned pos)
Definition: paragraphs.cpp:334
unsigned SkipPunc(unsigned pos)
Definition: paragraphs.cpp:319
unsigned SkipAlpha(unsigned pos)
Definition: paragraphs.cpp:346
Cluster(int cen, int num)
Definition: paragraphs.cpp:703
void GetClusters(std::vector< Cluster > *clusters)
Definition: paragraphs.cpp:736
SimpleClusterer(int max_cluster_width)
Definition: paragraphs.cpp:711
std::vector< Cluster > right_tabs
void Fail(int min_debug_level, const char *why) const
Definition: paragraphs.cpp:977
bool FirstWordWouldHaveFit(int row_a, int row_b)
Definition: paragraphs.cpp:969
std::vector< RowScratchRegisters > * rows
Definition: paragraphs.cpp:994
GeometricClassifierState(int dbg_level, std::vector< RowScratchRegisters > *r, int r_start, int r_end)
Definition: paragraphs.cpp:912
ParagraphModel Model() const
Definition: paragraphs.cpp:985
tesseract::ParagraphJustification just
const std::vector< Cluster > & AlignTabs() const
Definition: paragraphs.cpp:937
const std::vector< Cluster > & OffsideTabs() const
Definition: paragraphs.cpp:949
int AlignsideTabIndex(int row_idx) const
Definition: paragraphs.cpp:963
std::vector< Cluster > left_tabs
Interval(int b, int e)
void StartHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:649
const ParagraphModel * UniqueStartHypothesis() const
Definition: paragraphs.cpp:673
void NonNullHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:665
void AddBodyLine(const ParagraphModel *model)
Definition: paragraphs.cpp:641
void StrongHypotheses(SetOfModels *models) const
Definition: paragraphs.cpp:657
static void AppendDebugHeaderFields(std::vector< std::string > &header)
Definition: paragraphs.cpp:510
void AppendDebugInfo(const ParagraphTheory &theory, std::vector< std::string > &dbg) const
Definition: paragraphs.cpp:515
void DiscardNonMatchingHypotheses(const SetOfModels &models)
Definition: paragraphs.cpp:688
void AddStartLine(const ParagraphModel *model)
Definition: paragraphs.cpp:633
const ParagraphModel * UniqueBodyHypothesis() const
Definition: paragraphs.cpp:680
void Init(const RowInfo &row)
Definition: paragraphs.cpp:552
void NonCenteredModels(SetOfModels *models)
std::vector< ParagraphModel * > & models()
const ParagraphModel * Fits(const std::vector< RowScratchRegisters > *rows, int start, int end) const
void DiscardUnusedModels(const SetOfModels &used_models)
int IndexOf(const ParagraphModel *model) const
const ParagraphModel * AddModel(const ParagraphModel &model)
ParagraphModelSmearer(std::vector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:185
PARA_LIST * para_list()
Definition: ocrblock.h:119
bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const
Definition: ocrpara.cpp:45
bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const
Definition: ocrpara.cpp:59
void set_para(PARA *p)
Definition: ocrrow.h:117
BLOCK_RES * block() const
Definition: pageres.h:769
ROW_RES * row() const
Definition: pageres.h:766
POLY_BLOCK * poly_block() const
Definition: pdblock.h:59
bool IsText() const
Definition: polyblk.h:52
UNICHAR_ID unichar_id(unsigned index) const
Definition: ratngs.h:299
bool empty() const
Definition: ratngs.h:284
unsigned length() const
Definition: ratngs.h:287
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
double median() const
Definition: statistc.cpp:241
double ile(double frac) const
Definition: statistc.cpp:172
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:515
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:524
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:533