// Unicode bidirectional formatting characters used by RtlEmbed() to wrap
// right-to-left words.
// U+202B RIGHT-TO-LEFT EMBEDDING. (U+202A is LEFT-TO-RIGHT EMBEDDING and does
// not match this constant's name or its use for right-to-left text.)
static const char *const kRLE = "\u202B";
// U+202C POP DIRECTIONAL FORMATTING; terminates the embedding opened by kRLE.
static const char *const kPDF = "\u202C";
57 reinterpret_cast<ParagraphModel *
>(
static_cast<uintptr_t
>(0xDEAD111F));
59 reinterpret_cast<ParagraphModel *
>(
static_cast<uintptr_t
>(0xDEAD888F));
// Returns four fifths of space_pix using integer arithmetic (the division
// truncates toward zero). Elsewhere in this file the result is used as a
// pixel tolerance derived from a row's average inter-word space.
static int Epsilon(int space_pix) {
  return space_pix * 4 / 5;
}
// Validates the half-open row range [row_start, row_end) against *rows and
// checks that it spans at least min_num_rows rows. An invalid range is
// reported through tprintf; the "too few rows" message is emitted only when
// debug_level > 1 and names the caller through function_name.
// NOTE(review): the row_end parameter declaration and the return statements
// are not visible in this extract, so the returned values are not described.
73static bool AcceptableRowArgs(
int debug_level,
int min_num_rows,
const char *function_name,
74 const std::vector<RowScratchRegisters> *rows,
int row_start,
// A negative row_end converts to a very large size_t, so it is caught by the
// same comparison that rejects row_end > rows->size().
76 if (row_start < 0 ||
static_cast<size_t>(row_end) > rows->size() || row_start > row_end) {
77 tprintf(
"Invalid arguments rows[%d, %d) while rows is of size %zu.\n", row_start, row_end,
81 if (row_end - row_start < min_num_rows) {
82 if (debug_level > 1) {
83 tprintf(
"# Too few rows[%d, %d) for %s.\n", row_start, row_end, function_name);
// Prints a table of strings (one inner vector per row) through tprintf,
// left-aligning each cell with a "%-<width>s" pattern built per column.
// NOTE(review): how colsep is emitted is not visible in this extract.
94static void PrintTable(
const std::vector<std::vector<std::string>> &rows,
const char *colsep) {
// Pass 1: find the widest cell of every column.
95 std::vector<int> max_col_widths;
96 for (
const auto &row : rows) {
97 auto num_columns = row.size();
98 for (
size_t c = 0; c < num_columns; c++) {
// A byte whose top two bits are not 10 is not a UTF-8 continuation byte, so
// this test selects the first byte of each encoded code point (presumably
// counted into num_unicodes; the increment is not visible here).
100 for (
char i : row[c]) {
101 if ((
i & 0xC0) != 0x80) {
105 if (c >= max_col_widths.size()) {
106 max_col_widths.push_back(num_unicodes);
108 if (num_unicodes > max_col_widths[c]) {
109 max_col_widths[c] = num_unicodes;
// Pass 2: build one left-aligned "%-Ns" format string per column.
// NOTE(review): N is counted in code points, while printf-style %s padding
// counts bytes; cells with multi-byte characters may not line up - confirm
// whether tprintf follows printf semantics.
115 std::vector<std::string> col_width_patterns;
116 col_width_patterns.reserve(max_col_widths.size());
117 for (
int max_col_width : max_col_widths) {
118 col_width_patterns.push_back(std::string(
"%-") + std::to_string(max_col_width) +
"s");
// Pass 3: print every cell with its column's pattern.
121 for (
const auto &row : rows) {
122 for (
unsigned c = 0; c < row.size(); c++) {
126 tprintf(col_width_patterns[c].c_str(), row[c].c_str());
// Returns word wrapped between the kRLE and kPDF control strings.
// NOTE(review): the test on rtlify and the non-wrapping return path are not
// visible in this extract; presumably the wrapping happens only when rtlify
// is true.
132static std::string RtlEmbed(
const std::string &word,
bool rtlify) {
134 return std::string(kRLE) + word + std::string(kPDF);
140static void PrintDetectorState(
const ParagraphTheory &theory,
141 const std::vector<RowScratchRegisters> &rows) {
142 std::vector<std::vector<std::string>>
output;
144 output.back().push_back(
"#row");
145 output.back().push_back(
"space");
146 output.back().push_back(
"..");
147 output.back().push_back(
"lword[widthSEL]");
148 output.back().push_back(
"rword[widthSEL]");
150 output.back().push_back(
"text");
152 for (
unsigned i = 0;
i < rows.size();
i++) {
154 std::vector<std::string> &row =
output.back();
155 const RowInfo &ri = *rows[
i].ri_;
156 row.push_back(std::to_string(
i));
157 row.push_back(std::to_string(ri.average_interword_space));
158 row.emplace_back(ri.has_leaders ?
".." :
" ");
159 row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
"[" + std::to_string(ri.lword_box.width()) +
160 (ri.lword_likely_starts_idea ?
"S" :
"s") +
161 (ri.lword_likely_ends_idea ?
"E" :
"e") +
162 (ri.lword_indicates_list_item ?
"L" :
"l") +
"]");
163 row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
"[" + std::to_string(ri.rword_box.width()) +
164 (ri.rword_likely_starts_idea ?
"S" :
"s") +
165 (ri.rword_likely_ends_idea ?
"E" :
"e") +
166 (ri.rword_indicates_list_item ?
"L" :
"l") +
"]");
167 rows[
i].AppendDebugInfo(theory, row);
168 row.push_back(RtlEmbed(ri.text, !ri.ltr));
172 tprintf(
"Active Paragraph Models:\n");
174 for (
const auto &model : theory.models()) {
175 tprintf(
" %d: %s\n", ++m, model->ToString().c_str());
179static void DebugDump(
bool should_print,
const char *phase,
const ParagraphTheory &theory,
180 const std::vector<RowScratchRegisters> &rows) {
185 PrintDetectorState(theory, rows);
189static void PrintRowRange(
const std::vector<RowScratchRegisters> &rows,
int row_start,
191 tprintf(
"======================================\n");
192 for (
int row = row_start; row < row_end; row++) {
193 tprintf(
"%s\n", rows[row].ri_->text.c_str());
195 tprintf(
"======================================\n");
// Returns true when ch is an unaccented ASCII letter, 'a'-'z' or 'A'-'Z'.
static bool IsLatinLetter(int ch) {
  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}
// Returns true when ch is one of the letters 'o', 'O', 'l' or 'I'
// (presumably because they are easily confused with the digits 0 and 1).
static bool IsDigitLike(int ch) {
  return ch == 'o' || ch == 'O' || ch == 'l' || ch == 'I';
}
// Returns true when ch is one of the ASCII opening punctuation marks
// ' " ( { [.
// strchr() treats the terminating NUL as part of the searched string and
// converts its int argument to char, so ch == 0 and values outside the ASCII
// range are rejected explicitly instead of being matched by accident.
static bool IsOpeningPunct(int ch) {
  return ch > 0 && ch < 0x80 && strchr("'\"({[", ch) != nullptr;
}
// Returns true when ch is one of the ASCII closing/terminal punctuation
// marks : ' " . ? ! ] } ).
// strchr() treats the terminating NUL as part of the searched string and
// converts its int argument to char, so ch == 0 and values outside the ASCII
// range are rejected explicitly instead of being matched by accident.
static bool IsTerminalPunct(int ch) {
  return ch > 0 && ch < 0x80 && strchr(":'\".?!]})", ch) != nullptr;
}
// Scans str while the current character is not NUL and occurs in toskip.
// The explicit NUL test is required because strchr() would otherwise match
// the terminator of toskip and run past the end of str.
// NOTE(review): the loop body and the return statement are not visible in
// this extract; callers use the result as the position after the skipped run.
217static const char *SkipChars(
const char *str,
const char *toskip) {
218 while (*str !=
'\0' && strchr(toskip, *str)) {
// Overload of SkipChars driven by a predicate: scans str while the current
// character is not NUL and skip(character) returns true.
// NOTE(review): the loop body and the return statement are not visible in
// this extract.
224static const char *SkipChars(
const char *str,
bool (*skip)(
int)) {
225 while (*str !=
'\0' && skip(*str)) {
// Tests whether the first character of str is not NUL and occurs in toskip.
// NOTE(review): both return statements are not visible in this extract;
// judging by the name and the callers, at most one character is skipped.
231static const char *SkipOne(
const char *str,
const char *toskip) {
232 if (*str !=
'\0' && strchr(toskip, *str)) {
// Examines an ASCII word for the shape of a list numeral. The visible logic
// processes at most three segments; each segment is: up to two opening
// brackets, then a run of roman-numeral letters, otherwise a run of decimal
// digits, otherwise a run of Latin letters that must be exactly one letter
// long, followed by closing brackets and separators.
// NOTE(review): this extract omits several listing lines (else-branches,
// break statements, the segment counter update and the final return), so
// only the visible statements are reproduced here.
241static bool LikelyListNumeral(
const std::string &word) {
242 const char *kRomans =
"ivxlmdIVXLMD";
// All ten decimal digits. The literal used to be "012345789", which omitted
// '6', so a digit run stopped at every 6.
243 const char *kDigits =
"0123456789";
244 const char *kOpen =
"[{(";
245 const char *kSep =
":;-.,";
246 const char *kClose =
"]})";
248 int num_segments = 0;
249 const char *pos = word.c_str();
250 while (*pos !=
'\0' && num_segments < 3) {
// Skip up to two opening brackets before the numeral itself.
252 const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
253 const char *numeral_end = SkipChars(numeral_start, kRomans);
254 if (numeral_end != numeral_start) {
257 numeral_end = SkipChars(numeral_start, kDigits);
258 if (numeral_end == numeral_start) {
// Neither roman letters nor digits: accept exactly one Latin letter.
260 numeral_end = SkipChars(numeral_start, IsLatinLetter);
261 if (numeral_end - numeral_start != 1) {
// Consume closing brackets, then separators, after the numeral.
269 pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
270 if (pos == numeral_end) {
// Returns true when word is exactly one character long and that character
// is one of the bullet-like marks in kListMarks.
static bool LikelyListMark(const std::string &word) {
  const char *kListMarks = "0Oo*.,+.";
  // strchr() also matches the terminating NUL of kListMarks, so a string
  // holding a single NUL character is excluded explicitly.
  return word.size() == 1 && word[0] != '\0' && strchr(kListMarks, word[0]) != nullptr;
}
283 return LikelyListMark(word) || LikelyListNumeral(word);
// Looks up the unichar at index pos of werd in unicharset u and returns its
// first Unicode code point.
// NOTE(review): the value returned by the guard branch is not visible in
// this extract.
289static int UnicodeFor(
const UNICHARSET *u,
const WERD_CHOICE *werd,
unsigned pos) {
// Other loops in this file index the word only while pos < length(), so
// pos == length() is already out of range and must take the guard branch
// (the comparison used to be '>').
290 if (!u || !werd || pos >= werd->length()) {
293 return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
301 : u_(unicharset), word_(word), wordlen_(word->length()) {
320 while (pos < wordlen_ && u_->get_ispunctuation(word_->
unichar_id(pos))) {
327 while (pos < wordlen_ &&
335 const char *kRomans =
"ivxlmdIVXLMD";
336 while (pos < wordlen_) {
337 int ch = UnicodeFor(u_, word_, pos);
338 if (
ch >= 0xF0 || strchr(kRomans,
ch) ==
nullptr) {
347 while (pos < wordlen_ && u_->get_isalpha(word_->
unichar_id(pos))) {
353static bool LikelyListMarkUnicode(
int ch) {
355 std::string single_ch;
357 return LikelyListMark(single_ch);
// Unicode-aware check of whether werd looks like a list item marker.
// A one-unichar word is tested with LikelyListMarkUnicode(); otherwise the
// word is scanned in at most three segments of punctuation, a numeral (roman
// letters, else digits, else exactly one alphabetic unichar) and trailing
// punctuation. The final result is true only if the scan consumed the whole
// word.
// NOTE(review): branch bodies (returns, breaks, the segment counter update
// and the declaration of pos) are not visible in this extract.
382static bool UniLikelyListItem(
const UNICHARSET *u,
const WERD_CHOICE *werd) {
383 if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0))) {
387 UnicodeSpanSkipper m(u, werd);
388 int num_segments = 0;
390 while (pos < werd->length() && num_segments < 3) {
391 auto numeral_start = m.SkipPunc(pos);
// More than one leading punctuation mark was skipped.
392 if (numeral_start > pos + 1) {
395 auto numeral_end = m.SkipRomans(numeral_start);
396 if (numeral_end == numeral_start) {
397 numeral_end = m.SkipDigits(numeral_start);
398 if (numeral_end == numeral_start) {
// Neither roman letters nor digits: only a single alphabetic unichar counts.
400 numeral_end = m.SkipAlpha(numeral_start);
401 if (numeral_end - numeral_start != 1) {
409 pos = m.SkipPunc(numeral_end);
410 if (pos == numeral_end) {
414 return pos == werd->length();
419 if (std::find(vector.begin(), vector.end(), data) == vector.end()) {
420 vector.push_back(data);
432 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
434 *starts_idea =
false;
436 if (utf8.empty() || (werd !=
nullptr && werd->
empty())) {
441 if (unicharset && werd) {
442 if (UniLikelyListItem(unicharset, werd)) {
459 int start_letter = utf8[0];
460 if (IsOpeningPunct(start_letter)) {
463 if (IsTerminalPunct(start_letter)) {
466 if (start_letter >=
'A' && start_letter <=
'Z') {
478 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
480 *starts_idea =
false;
482 if (utf8.empty() || (werd !=
nullptr && werd->
empty())) {
487 if (unicharset && werd) {
488 if (UniLikelyListItem(unicharset, werd)) {
501 int last_letter = utf8[utf8.size() - 1];
502 if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
511 header.emplace_back(
"[lmarg,lind;rind,rmarg]");
512 header.emplace_back(
"model");
516 std::vector<std::string> &dbg)
const {
524 std::string model_string;
528 int model_numbers = 0;
529 for (
const auto &hypothese : hypotheses_) {
530 if (hypothese.model ==
nullptr) {
533 if (model_numbers > 0) {
537 model_string += std::to_string(1 + theory.
IndexOf(hypothese.model));
539 model_string +=
"CrL";
541 model_string +=
"CrR";
545 if (model_numbers == 0) {
549 dbg.push_back(model_string);
561 if (hypotheses_.empty()) {
564 bool has_start =
false;
565 bool has_body =
false;
566 for (
const auto &hypothese : hypotheses_) {
567 switch (hypothese.ty) {
575 tprintf(
"Encountered bad value in hypothesis list: %c\n", hypothese.ty);
579 if (has_start && has_body) {
586 if (hypotheses_.empty()) {
589 bool has_start =
false;
590 bool has_body =
false;
591 for (
const auto &hypothese : hypotheses_) {
592 if (hypothese.model != model) {
595 switch (hypothese.ty) {
603 tprintf(
"Encountered bad value in hypothesis list: %c\n", hypothese.ty);
607 if (has_start && has_body) {
616 tprintf(
"Trying to set a line to be START when it's already BODY.\n");
626 tprintf(
"Trying to set a line to be BODY when it's already START.\n");
636 if (found != hypotheses_.end()) {
637 hypotheses_.erase(found);
644 if (found != hypotheses_.end()) {
645 hypotheses_.erase(found);
650 for (
const auto &hypothese : hypotheses_) {
658 for (
const auto &hypothese : hypotheses_) {
666 for (
const auto &hypothese : hypotheses_) {
667 if (hypothese.model !=
nullptr) {
674 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_START) {
677 return hypotheses_[0].model;
681 if (hypotheses_.size() != 1 || hypotheses_[0].ty !=
LT_BODY) {
684 return hypotheses_[0].model;
689 if (models.empty()) {
692 for (
int h = hypotheses_.size() - 1; h >= 0; h--) {
693 if (!
contains(models, hypotheses_[h].model)) {
694 hypotheses_.erase(hypotheses_.begin() + h);
711 explicit SimpleClusterer(
int max_cluster_width) : max_cluster_width_(max_cluster_width) {}
713 values_.push_back(
value);
716 return values_.size();
721 int max_cluster_width_;
722 std::vector<int> values_;
// Searches clusters for the entry whose center is nearest to value.
// The strict '<' comparison keeps the earliest cluster when distances tie,
// and best_index stays 0 when no cluster is closer than the first one.
// NOTE(review): the update of best_index and the return statement are not
// visible in this extract; with an empty vector best_index would remain 0,
// which is not a valid index - confirm callers never pass an empty vector.
726static int ClosestCluster(
const std::vector<Cluster> &clusters,
int value) {
727 unsigned best_index = 0;
728 for (
unsigned i = 0;
i < clusters.size();
i++) {
729 if (abs(
value - clusters[
i].center) < abs(
value - clusters[best_index].center)) {
738 std::sort(values_.begin(), values_.end());
739 for (
unsigned i = 0;
i < values_.size();) {
743 while (++
i < values_.size() && values_[
i] <= lo + max_cluster_width_) {
746 clusters->push_back(
Cluster((hi + lo) / 2,
i - orig_i));
752static void CalculateTabStops(std::vector<RowScratchRegisters> *rows,
int row_start,
int row_end,
753 int tolerance, std::vector<Cluster> *left_tabs,
754 std::vector<Cluster> *right_tabs) {
755 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end)) {
759 SimpleClusterer initial_lefts(tolerance);
760 SimpleClusterer initial_rights(tolerance);
761 std::vector<Cluster> initial_left_tabs;
762 std::vector<Cluster> initial_right_tabs;
763 for (
int i = row_start;
i < row_end;
i++) {
764 initial_lefts.Add((*rows)[
i].lindent_);
765 initial_rights.Add((*rows)[
i].rindent_);
767 initial_lefts.GetClusters(&initial_left_tabs);
768 initial_rights.GetClusters(&initial_right_tabs);
776 SimpleClusterer lefts(tolerance);
777 SimpleClusterer rights(tolerance);
783 int infrequent_enough_to_ignore = 0;
784 if (row_end - row_start >= 8) {
785 infrequent_enough_to_ignore = 1;
787 if (row_end - row_start >= 20) {
788 infrequent_enough_to_ignore = 2;
791 for (
int i = row_start;
i < row_end;
i++) {
792 int lidx = ClosestCluster(initial_left_tabs, (*rows)[
i].lindent_);
793 int ridx = ClosestCluster(initial_right_tabs, (*rows)[
i].rindent_);
794 if (initial_left_tabs[lidx].
count > infrequent_enough_to_ignore ||
795 initial_right_tabs[ridx].
count > infrequent_enough_to_ignore) {
796 lefts.Add((*rows)[
i].lindent_);
797 rights.Add((*rows)[
i].rindent_);
800 lefts.GetClusters(left_tabs);
801 rights.GetClusters(right_tabs);
803 if ((left_tabs->size() == 1 && right_tabs->size() >= 4) ||
804 (right_tabs->size() == 1 && left_tabs->size() >= 4)) {
809 for (
int i = row_start;
i < row_end;
i++) {
810 int lidx = ClosestCluster(initial_left_tabs, (*rows)[
i].lindent_);
811 int ridx = ClosestCluster(initial_right_tabs, (*rows)[
i].rindent_);
812 if (!(initial_left_tabs[lidx].
count > infrequent_enough_to_ignore ||
813 initial_right_tabs[ridx].
count > infrequent_enough_to_ignore)) {
814 lefts.Add((*rows)[
i].lindent_);
815 rights.Add((*rows)[
i].rindent_);
819 lefts.GetClusters(left_tabs);
820 rights.GetClusters(right_tabs);
824 if (left_tabs->size() == 3 && right_tabs->size() >= 4) {
826 for (
int i = left_tabs->size() - 1;
i >= 0;
i--) {
827 if (to_prune < 0 || (*left_tabs)[
i].
count < (*left_tabs)[to_prune].
count) {
831 if (to_prune >= 0 && (*left_tabs)[to_prune].
count <= infrequent_enough_to_ignore) {
832 left_tabs->erase(left_tabs->begin() + to_prune);
835 if (right_tabs->size() == 3 && left_tabs->size() >= 4) {
837 for (
int i = right_tabs->size() - 1;
i >= 0;
i--) {
838 if (to_prune < 0 || (*right_tabs)[
i].
count < (*right_tabs)[to_prune].
count) {
842 if (to_prune >= 0 && (*right_tabs)[to_prune].
count <= infrequent_enough_to_ignore) {
843 right_tabs->erase(right_tabs->begin() + to_prune);
867static void MarkRowsWithModel(std::vector<RowScratchRegisters> *rows,
int row_start,
int row_end,
868 const ParagraphModel *model,
bool ltr,
int eop_threshold) {
869 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
872 for (
int row = row_start; row < row_end; row++) {
875 if (valid_first && !valid_body) {
876 (*rows)[row].AddStartLine(model);
877 }
else if (valid_body && !valid_first) {
878 (*rows)[row].AddBodyLine(model);
879 }
else if (valid_body && valid_first) {
880 bool after_eop = (row == row_start);
881 if (row > row_start) {
882 if (eop_threshold > 0) {
884 after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
886 after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
893 (*rows)[row].AddStartLine(model);
895 (*rows)[row].AddBodyLine(model);
919 "Geometry: TabStop cluster tolerance = %d; "
920 "%zu left tabs; %zu right tabs\n",
923 ltr = (*r)[r_start].ri_->ltr;
977 void Fail(
int min_debug_level,
const char *why)
const {
994 std::vector<RowScratchRegisters> *
rows;
1050 int num_full_rows = 0;
1051 int last_row_full = 0;
1061 if (num_full_rows < 0.7 * num_rows) {
1062 s.
Fail(1,
"Not enough full lines to know which lines start paras.");
1075 if (debug_level > 0) {
1077 "# Not enough variety for clear outline classification. "
1078 "Guessing these are %s aligned based on script.\n",
1079 s.
ltr ?
"left" :
"right");
1087 if (num_rows - 1 == num_full_rows - last_row_full) {
1092 (*s.
rows)[
i].AddBodyLine(model);
1138static void GeometricClassify(
int debug_level, std::vector<RowScratchRegisters> *rows,
1139 int row_start,
int row_end, ParagraphTheory *theory) {
1140 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end)) {
1143 if (debug_level > 1) {
1144 tprintf(
"###############################################\n");
1145 tprintf(
"##### GeometricClassify( rows[%d:%d) ) ####\n", row_start, row_end);
1146 tprintf(
"###############################################\n");
1150 GeometricClassifierState s(debug_level, rows, row_start, row_end);
1152 s.
Fail(2,
"Too much variety for simple outline classification.");
1156 s.
Fail(1,
"Not enough variety for simple outline classification.");
1160 GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1185 int firsts[2] = {0, 0};
1190 bool jam_packed =
true;
1205 int percent0firsts, percent1firsts;
1206 percent0firsts = (100 * firsts[0]) / s.
AlignTabs()[0].count;
1207 percent1firsts = (100 * firsts[1]) / s.
AlignTabs()[1].count;
1210 if ((percent0firsts < 20 && 30 < percent1firsts) || percent0firsts + 30 < percent1firsts) {
1213 }
else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1214 percent1firsts + 30 < percent0firsts) {
1219 if (debug_level > 1) {
1220 tprintf(
"# Cannot determine %s indent likely to start paragraphs.\n",
1222 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1223 s.
AlignTabs()[0].center, percent0firsts);
1224 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1225 s.
AlignTabs()[1].center, percent1firsts);
1236 const ParagraphModel *model = theory->AddModel(s.
Model());
1266 MarkRowsWithModel(rows, row_start, row_end, model, s.
ltr, s.
eop_threshold);
1272 for (
const auto &m : *models_) {
1273 if (m->Comparable(model)) {
1278 models_->push_back(m);
1285 for (
size_t r = 0; r < models_->size(); r++) {
1303 int start,
int end)
const {
1304 for (
const auto *model : *models_) {
1313 for (
const auto *model : *models_) {
1322 for (
const auto *m : *models_) {
1334 tprintf(
"ValidFirstLine() should only be called with strong models!\n");
1337 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1343 tprintf(
"ValidBodyLine() should only be called with strong models!\n");
1346 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1352 tprintf(
"CrownCompatible() should only be called with crown models!\n");
1355 auto &row_a = (*rows)[a];
1356 auto &row_b = (*rows)[b];
1358 return NearlyEqual(row_a.rindent_ + row_a.rmargin_, row_b.rindent_ + row_b.rmargin_,
1359 Epsilon(row_a.ri_->average_interword_space));
1361 return NearlyEqual(row_a.lindent_ + row_a.lmargin_, row_b.lindent_ + row_b.lmargin_,
1362 Epsilon(row_a.ri_->average_interword_space));
1369 : theory_(theory), rows_(rows), row_start_(row_start), row_end_(row_end) {
1370 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1375 open_models_.resize(open_models_.size() + row_end - row_start + 2);
1379void ParagraphModelSmearer::CalculateOpenModels(
int row_start,
int row_end) {
1381 if (row_start < row_start_) {
1382 row_start = row_start_;
1384 if (row_end > row_end_) {
1388 for (
int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end; row++) {
1389 if ((*rows_)[row].ri_->num_words == 0) {
1390 OpenModels(row + 1) = no_models;
1393 (*rows_)[row].StartHypotheses(&opened);
1397 for (
auto &m : opened) {
1405 OpenModels(row + 1) = still_open;
1412 CalculateOpenModels(row_start_, row_end_);
1417 for (
int i = row_start_;
i < row_end_;
i++) {
1427 bool left_align_open =
false;
1428 bool right_align_open =
false;
1429 for (
auto &m : OpenModels(
i)) {
1430 switch (m->justification()) {
1432 left_align_open =
true;
1435 right_align_open =
true;
1438 left_align_open = right_align_open =
true;
1446 likely_start =
true;
1448 if ((left_align_open && right_align_open) || (!left_align_open && !right_align_open)) {
1451 }
else if (left_align_open) {
1464 for (
unsigned m = 0; m < OpenModels(
i).size(); m++) {
1473 (*rows_)[
i - 1].StrongHypotheses(&last_line_models);
1477 for (
auto model : last_line_models) {
1492 for (
auto &all_model : all_models) {
1502 CalculateOpenModels(
i + 1, row_end_);
1511static void DiscardUnusedModels(
const std::vector<RowScratchRegisters> &rows,
1514 for (
const auto &row : rows) {
1515 row.StrongHypotheses(&used_models);
1544static void DowngradeWeakestToCrowns(
int debug_level, ParagraphTheory *theory,
1545 std::vector<RowScratchRegisters> *rows) {
1547 for (
int end = rows->size(); end > 0; end = start) {
1549 const ParagraphModel *model =
nullptr;
1550 while (end > 0 && (model = (*rows)[end - 1].UniqueBodyHypothesis()) ==
nullptr) {
1557 while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1560 if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
StrongModel(model) &&
1561 NearlyEqual(model->first_indent(), model->body_indent(), model->tolerance())) {
1577 const ParagraphModel *crown_model = model;
1585 (*rows)[start].SetUnknown();
1586 (*rows)[start].AddStartLine(crown_model);
1587 for (
int row = start + 1; row < end; row++) {
1588 (*rows)[row].SetUnknown();
1589 (*rows)[row].AddBodyLine(crown_model);
1593 DiscardUnusedModels(*rows, theory);
1613 int end,
int percentile) {
1614 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end)) {
1618 int lmin, lmax, rmin, rmax;
1619 lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1620 rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1621 for (
int i = start;
i < end;
i++) {
1630 STATS lefts(lmin, lmax);
1631 STATS rights(rmin, rmax);
1632 for (
int i = start;
i < end;
i++) {
1640 int ignorable_left = lefts.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1641 int ignorable_right = rights.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1642 for (
int i = start;
i < end;
i++) {
1644 int ldelta = ignorable_left - sr.
lmargin_;
1647 int rdelta = ignorable_right - sr.
rmargin_;
// Estimates the inter-word space for rows[row_start, row_end).
// Returns the median of average_interword_space over the rows that have more
// than one word, but never less than a floor of max(2, word_height / 3).
// NOTE(review): the value returned for an empty range (row_end <= row_start)
// and the declarations of word_height / word_width are not visible in this
// extract.
1654int InterwordSpace(
const std::vector<RowScratchRegisters> &rows,
int row_start,
int row_end) {
1655 if (row_end < row_start + 1) {
// Average the left-word box height and width of the first and last row of
// the range.
1659 (rows[row_start].ri_->lword_box.height() + rows[row_end - 1].ri_->lword_box.height()) / 2;
1661 (rows[row_start].ri_->lword_box.width() + rows[row_end - 1].ri_->lword_box.width()) / 2;
1662 STATS spacing_widths(0, 4 + word_width);
1663 for (
int i = row_start;
i < row_end;
i++) {
// Rows with a single word have no inter-word gap to contribute.
1664 if (rows[
i].ri_->num_words > 1) {
1665 spacing_widths.
add(rows[
i].ri_->average_interword_space, 1);
1668 int minimum_reasonable_space = word_height / 3;
1669 if (minimum_reasonable_space < 2) {
1670 minimum_reasonable_space = 2;
1672 int median = spacing_widths.
median();
1673 return (median > minimum_reasonable_space) ? median : minimum_reasonable_space;
1680 if (
before.ri_->num_words == 0 ||
after.ri_->num_words == 0) {
1685 tprintf(
"Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1687 int available_space;
1691 available_space =
before.OffsideIndent(justification);
1693 available_space -=
before.ri_->average_interword_space;
1696 return after.ri_->lword_box.width() < available_space;
1698 return after.ri_->rword_box.width() < available_space;
1705 if (
before.ri_->num_words == 0 ||
after.ri_->num_words == 0) {
1709 int available_space =
before.lindent_;
1710 if (
before.rindent_ > available_space) {
1711 available_space =
before.rindent_;
1713 available_space -=
before.ri_->average_interword_space;
1716 return after.ri_->lword_box.width() < available_space;
1718 return after.ri_->rword_box.width() < available_space;
// Reports whether the text at a row boundary is compatible with a paragraph
// break: the word ending the row before must be flagged as likely ending an
// idea and the word starting the row after as likely starting one.
// Two variants exist: one pairs before's right word with after's left word,
// the other pairs before's left word with after's right word.
// NOTE(review): the condition choosing between the two returns is not
// visible in this extract (presumably the text direction).
1721static bool TextSupportsBreak(
const RowScratchRegisters &
before,
const RowScratchRegisters &
after) {
1723 return before.ri_->rword_likely_ends_idea &&
after.ri_->lword_likely_starts_idea;
1725 return before.ri_->lword_likely_ends_idea &&
after.ri_->rword_likely_starts_idea;
1729static bool LikelyParagraphStart(
const RowScratchRegisters &
before,
1730 const RowScratchRegisters &
after,
1732 return before.ri_->num_words == 0 ||
// Tries to derive a ParagraphModel from the outline (indents and margins) of
// rows[start, end), treating row `start` as the first line and the remaining
// rows as body lines. Failure is signalled by returning a default
// ParagraphModel(); on several failure paths *consistent is also set to
// false.
// NOTE(review): the declaration of the `consistent` out-parameter and the
// successful return statements are not visible in this extract.
1741static ParagraphModel InternalParagraphModelByOutline(
1742 const std::vector<RowScratchRegisters> *rows,
int start,
int end,
int tolerance,
// Majority vote on text direction: ltr when at least half of the rows are
// left-to-right.
// NOTE(review): this loop indexes (*rows)[i] before the range is validated
// by AcceptableRowArgs() below.
1744 int ltr_line_count = 0;
1745 for (
int i = start;
i < end;
i++) {
1746 ltr_line_count +=
static_cast<int>((*rows)[
i].ri_->ltr);
1748 bool ltr = (ltr_line_count >= (end - start) / 2);
// At least two rows are required (a first line plus one body line).
1751 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end)) {
1752 return ParagraphModel();
1757 int lmargin = (*rows)[start].lmargin_;
1758 int rmargin = (*rows)[start].rmargin_;
// Ranges of left indent, right indent and (rindent - lindent) over the body
// rows start+1 .. end-1.
// NOTE(review): cmin/cmax are declared without initializers; their first
// assignment is not visible in this extract - verify they are initialized
// before UpdateRange() reads them.
1759 int lmin, lmax, rmin, rmax, cmin, cmax;
1760 lmin = lmax = (*rows)[start + 1].lindent_;
1761 rmin = rmax = (*rows)[start + 1].rindent_;
1763 for (
int i = start + 1;
i < end;
i++) {
// Every row is expected to share the first row's margins.
1764 if ((*rows)[
i].lmargin_ != lmargin || (*rows)[
i].rmargin_ != rmargin) {
1765 tprintf(
"Margins don't match! Software error.\n");
1766 *consistent =
false;
1767 return ParagraphModel();
1771 UpdateRange((*rows)[
i].rindent_ - (*rows)[
i].lindent_, &cmin, &cmax);
1773 int ldiff = lmax - lmin;
1774 int rdiff = rmax - rmin;
1775 int cdiff = cmax - cmin;
// Both body edges are ragged beyond the tolerance.
1776 if (rdiff > tolerance && ldiff > tolerance) {
// A nearly constant (rindent - lindent) difference is handled separately;
// it needs at least three rows.
1777 if (cdiff < tolerance * 2) {
1778 if (end - start < 3) {
1779 return ParagraphModel();
1783 *consistent =
false;
1784 return ParagraphModel();
1786 if (end - start < 3) {
1787 return ParagraphModel();
// An edge admits alignment when the body indents on that side vary by less
// than the tolerance.
1792 bool body_admits_left_alignment = ldiff < tolerance;
1793 bool body_admits_right_alignment = rdiff < tolerance;
// Candidate models: first-line indent from row `start`, body indent from the
// midpoint of the observed body range.
1795 ParagraphModel left_model = ParagraphModel(
JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
1796 (lmin + lmax) / 2, tolerance);
1797 ParagraphModel right_model = ParagraphModel(
JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
1798 (rmin + rmax) / 2, tolerance);
// The text direction admits an alignment when it matches, or when the
// candidate model is flush.
1802 bool text_admits_left_alignment = ltr || left_model.is_flush();
1803 bool text_admits_right_alignment = !ltr || right_model.is_flush();
// Right edge ragged: only a left-aligned reading is possible.
1808 if (tolerance < rdiff) {
1809 if (body_admits_left_alignment && text_admits_left_alignment) {
1812 *consistent =
false;
1813 return ParagraphModel();
// Left edge ragged: only a right-aligned reading is possible.
1815 if (tolerance < ldiff) {
1816 if (body_admits_right_alignment && text_admits_right_alignment) {
1819 *consistent =
false;
1820 return ParagraphModel();
// Both edges are within tolerance: decide from whether the first line's
// indent falls outside the range of body indents on the side matching the
// text direction.
1827 int first_left = (*rows)[start].lindent_;
1828 int first_right = (*rows)[start].rindent_;
1830 if (ltr && body_admits_left_alignment && (first_left < lmin || first_left > lmax)) {
1833 if (!ltr && body_admits_right_alignment && (first_right < rmin || first_right > rmax)) {
1837 *consistent =
false;
1838 return ParagraphModel();
1845static ParagraphModel ParagraphModelByOutline(
int debug_level,
1846 const std::vector<RowScratchRegisters> *rows,
1847 int start,
int end,
int tolerance) {
1848 bool unused_consistent;
1849 ParagraphModel retval =
1850 InternalParagraphModelByOutline(rows, start, end, tolerance, &unused_consistent);
1852 tprintf(
"Could not determine a model for this paragraph:\n");
1853 PrintRowRange(*rows, start, end);
1859bool RowsFitModel(
const std::vector<RowScratchRegisters> *rows,
int start,
int end,
1861 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end)) {
1867 for (
int i = start + 1;
i < end;
i++) {
1886static void MarkStrongEvidence(std::vector<RowScratchRegisters> *rows,
int row_start,
1889 for (
int i = row_start + 1;
i < row_end;
i++) {
1890 const RowScratchRegisters &prev = (*rows)[
i - 1];
1891 RowScratchRegisters &curr = (*rows)[
i];
1894 if (!curr.ri_->rword_likely_starts_idea && !curr.ri_->lword_likely_starts_idea &&
1914 RowScratchRegisters &curr = (*rows)[row_start];
1915 RowScratchRegisters &
next = (*rows)[row_start + 1];
1918 (curr.ri_->lword_likely_starts_idea || curr.ri_->rword_likely_starts_idea)) {
1919 curr.SetStartLine();
1923 for (
int i = row_start + 1;
i < row_end - 1;
i++) {
1924 RowScratchRegisters &prev = (*rows)[
i - 1];
1925 RowScratchRegisters &curr = (*rows)[
i];
1926 RowScratchRegisters &
next = (*rows)[
i + 1];
1929 LikelyParagraphStart(prev, curr, j)) {
1930 curr.SetStartLine();
1935 RowScratchRegisters &prev = (*rows)[row_end - 2];
1936 RowScratchRegisters &curr = (*rows)[row_end - 1];
1939 LikelyParagraphStart(prev, curr, j)) {
1940 curr.SetStartLine();
1948static void ModelStrongEvidence(
int debug_level, std::vector<RowScratchRegisters> *rows,
1949 int row_start,
int row_end,
bool allow_flush_models,
1950 ParagraphTheory *theory) {
1951 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
1955 int start = row_start;
1956 while (start < row_end) {
1957 while (start < row_end && (*rows)[start].GetLineType() !=
LT_START) {
1960 if (start >= row_end - 1) {
1964 int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1966 ParagraphModel last_model;
1967 bool next_consistent;
1973 if (end < row_end - 1) {
1974 RowScratchRegisters &
next = (*rows)[end];
1979 next_consistent =
false;
1981 if (next_consistent) {
1982 ParagraphModel next_model =
1983 InternalParagraphModelByOutline(rows, start, end + 1, tolerance, &next_consistent);
1988 next_consistent =
false;
1990 last_model = next_model;
1992 next_consistent =
false;
1994 }
while (next_consistent && end < row_end);
1998 if (end > start + 1) {
2000 const ParagraphModel *model =
nullptr;
2001 ParagraphModel new_model = ParagraphModelByOutline(
2002 debug_level, rows, start, end, Epsilon(
InterwordSpace(*rows, start, end)));
2005 }
else if (new_model.is_flush()) {
2006 if (end == start + 2) {
2009 }
else if (start == row_start) {
2016 }
else if (allow_flush_models) {
2017 model = theory->AddModel(new_model);
2020 model = theory->AddModel(new_model);
2023 (*rows)[start].AddStartLine(model);
2024 for (
int i = start + 1;
i < end;
i++) {
2025 (*rows)[
i].AddBodyLine(model);
2040static void StrongEvidenceClassify(
int debug_level, std::vector<RowScratchRegisters> *rows,
2041 int row_start,
int row_end, ParagraphTheory *theory) {
2042 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end)) {
2046 if (debug_level > 1) {
2047 tprintf(
"#############################################\n");
2048 tprintf(
"# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
2049 tprintf(
"#############################################\n");
2053 MarkStrongEvidence(rows, row_start, row_end);
2055 DebugDump(debug_level > 2,
"Initial strong signals.", *theory, *rows);
2058 ModelStrongEvidence(debug_level, rows, row_start, row_end,
false, theory);
2060 DebugDump(debug_level > 2,
"Unsmeared hypotheses.s.", *theory, *rows);
2065 ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
2069static void SeparateSimpleLeaderLines(std::vector<RowScratchRegisters> *rows,
int row_start,
2070 int row_end, ParagraphTheory *theory) {
2071 for (
int i = row_start + 1;
i < row_end - 1;
i++) {
2072 if ((*rows)[
i - 1].ri_->has_leaders && (*rows)[
i].ri_->has_leaders &&
2073 (*rows)[
i + 1].ri_->has_leaders) {
2074 const ParagraphModel *model =
2076 (*rows)[
i].AddStartLine(model);
// Walks the rows from the last one backwards, groups them into runs that
// share a paragraph model, creates one PARA per run and records it in
// (*row_owners)[row] for every row of the run. Runs without a usable model
// receive a fallback model added to theory.
// NOTE(review): many listing lines of this function (declarations of start
// and models, else-branches, the AddModel() call heads) are not visible in
// this extract; only the visible statements are reproduced.
2083static void ConvertHypothesizedModelRunsToParagraphs(
int debug_level,
2084 std::vector<RowScratchRegisters> &rows,
2085 std::vector<PARA *> *row_owners,
2086 ParagraphTheory *theory) {
2087 int end = rows.size();
// Each iteration handles rows [start, end) and continues above `start`.
2089 for (; end > 0; end = start) {
2091 const ParagraphModel *model =
nullptr;
2093 bool single_line_paragraph =
false;
2095 rows[start].NonNullHypotheses(&models);
2096 if (!models.empty()) {
// A last row that is not a body line of the model forms a one-line
// paragraph.
2098 if (rows[start].GetLineType(model) !=
LT_BODY) {
2099 single_line_paragraph =
true;
2102 if (model && !single_line_paragraph) {
// Extend the run upwards across body lines of the same model.
2104 while (--start > 0 && rows[start].GetLineType(model) ==
LT_BODY) {
// The run must be headed by a start line of the same model.
2107 if (start < 0 || rows[start].GetLineType(model) !=
LT_START) {
2111 if (model ==
nullptr) {
// NOTE(review): ownership of the PARA allocated here passes to row_owners;
// its eventual release is not visible in this extract.
2115 PARA *
p =
new PARA();
2117 p->is_very_first_or_continuation =
true;
// Look for a model to borrow from a paragraph already assigned to a row
// below this run.
2121 for (
unsigned row = end; row < rows.size(); row++) {
2122 if ((*row_owners)[row] &&
2124 (start == 0 ||
ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
2125 model = (*row_owners)[row]->model;
// Fallback model anchored at the row's left edge: margin plus indent.
2132 rows[start].lmargin_ + rows[start].lindent_, 0, 0,
2133 Epsilon(rows[start].ri_->average_interword_space)));
// Fallback model anchored at the row's right edge: margin plus indent,
// mirroring the left-hand case (this used to add rmargin_ to itself).
2137 rows[start].rmargin_ + rows[start].rindent_, 0, 0,
2138 Epsilon(rows[start].ri_->average_interword_space)));
// Reset the run's hypotheses to: one start line followed by body lines.
2141 rows[start].SetUnknown();
2142 rows[start].AddStartLine(model);
2143 for (
int i = start + 1;
i < end;
i++) {
2144 rows[
i].SetUnknown();
2145 rows[
i].AddBodyLine(model);
2148 p->has_drop_cap = rows[start].ri_->has_drop_cap;
2150 ? rows[start].ri_->rword_indicates_list_item
2151 : rows[start].ri_->lword_indicates_list_item;
2152 for (
int row = start; row < end; row++) {
// NOTE(review): every row of a run is given the same PARA pointer below, so
// if this path is ever taken for a previously assigned run, the delete would
// run once per row on the same pointer - verify.
2153 if ((*row_owners)[row] !=
nullptr) {
2155 "Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
2156 "more than once!\n");
2157 delete (*row_owners)[row];
2159 (*row_owners)[row] =
p;
// For each strong model hypothesized for rows[row], measures the run of
// neighbouring rows (scanning upwards, then downwards) that continue the same
// model, and compares the run length against a threshold: more than 2 rows,
// or more than 1 row when the run is not made up solely of start lines.
// NOTE(review): the switch bodies, the declarations of row_models / models /
// run_length and the return statements are not visible in this extract.
2181static bool RowIsStranded(
const std::vector<RowScratchRegisters> &rows,
int row) {
2183 rows[row].StrongHypotheses(&row_models);
2185 for (
auto &row_model : row_models) {
// NOTE(review): GetLineType() is compared against LT_* constants elsewhere
// in this file, so this initialization converts an enum-like value to bool;
// a comparison with LT_START may have been intended - confirm.
2186 bool all_starts = rows[row].GetLineType();
2188 bool continues =
true;
// Scan the rows above `row` while the run continues.
2189 for (
int i = row - 1;
i >= 0 && continues;
i--) {
2191 rows[
i].NonNullHypotheses(&models);
2192 switch (rows[
i].GetLineType(row_model)) {
// Scan the rows below `row` while the run continues.
2207 for (
unsigned i = row + 1;
i < rows.size() && continues;
i++) {
2209 rows[
i].NonNullHypotheses(&models);
2210 switch (rows[
i].GetLineType(row_model)) {
2224 if (run_length > 2 || (!all_starts && run_length > 1)) {
2237static void LeftoverSegments(
const std::vector<RowScratchRegisters> &rows,
2238 std::vector<Interval> *to_fix,
int row_start,
int row_end) {
2240 for (
int i = row_start;
i < row_end;
i++) {
2241 bool needs_fixing =
false;
2245 rows[
i].StrongHypotheses(&models);
2246 rows[
i].NonNullHypotheses(&models_w_crowns);
2247 if (models.empty() && !models_w_crowns.empty()) {
2249 for (
unsigned end =
i + 1; end < rows.size(); end++) {
2252 rows[end].NonNullHypotheses(&end_models);
2253 rows[end].StrongHypotheses(&strong_end_models);
2254 if (end_models.empty()) {
2255 needs_fixing =
true;
2257 }
else if (!strong_end_models.empty()) {
2258 needs_fixing =
false;
2262 }
else if (models.empty() && rows[
i].ri_->num_words > 0) {
2264 needs_fixing =
true;
2267 if (!needs_fixing && !models.empty()) {
2268 needs_fixing = RowIsStranded(rows,
i);
2272 if (!to_fix->empty() && to_fix->back().end ==
i - 1) {
2273 to_fix->back().end =
i;
2275 to_fix->push_back(Interval(
i,
i));
2280 for (
auto &
i : *to_fix) {
2289 std::vector<PARA *> &rows = *row_owners;
2290 paragraphs->clear();
2291 PARA_IT out(paragraphs);
2292 PARA *formerly_null =
nullptr;
2293 for (
unsigned i = 0;
i < rows.size();
i++) {
2294 if (rows[
i] ==
nullptr) {
2295 if (
i == 0 || rows[
i - 1] != formerly_null) {
2296 rows[
i] = formerly_null =
new PARA();
2298 rows[
i] = formerly_null;
2301 }
else if (
i > 0 && rows[
i - 1] == rows[
i]) {
2304 out.add_after_then_move(rows[
i]);
2319 std::vector<PARA *> *row_owners, PARA_LIST *paragraphs,
2320 std::vector<ParagraphModel *> *models) {
2324 row_owners->clear();
2325 row_owners->resize(row_infos->size());
2328 std::vector<RowScratchRegisters> rows(row_infos->size());
2329 for (
unsigned i = 0;
i < row_infos->size();
i++) {
2330 rows[
i].Init((*row_infos)[
i]);
2338 SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
2340 DebugDump(debug_level > 1,
"End of Pass 1", theory, rows);
2342 std::vector<Interval> leftovers;
2343 LeftoverSegments(rows, &leftovers, 0, rows.size());
2344 for (
auto &leftover : leftovers) {
2350 StrongEvidenceClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2356 std::vector<Interval> leftovers2;
2357 LeftoverSegments(rows, &leftovers2, leftover.begin, leftover.end);
2358 bool pass2a_was_useful =
2359 leftovers2.size() > 1 ||
2360 (leftovers2.size() == 1 && (leftovers2[0].begin != 0 ||
static_cast<size_t>(leftovers2[0].end) != rows.size()));
2361 if (pass2a_was_useful) {
2362 for (
auto &leftover2 : leftovers2) {
2363 StrongEvidenceClassify(debug_level, &rows, leftover2.begin, leftover2.end, &theory);
2368 DebugDump(debug_level > 1,
"End of Pass 2", theory, rows);
2374 LeftoverSegments(rows, &leftovers, 0, rows.size());
2375 for (
auto &leftover : leftovers) {
2376 GeometricClassify(debug_level, &rows, leftover.begin, leftover.end, &theory);
2380 DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2382 DebugDump(debug_level > 1,
"End of Pass 3", theory, rows);
2386 LeftoverSegments(rows, &leftovers, 0, rows.size());
2387 for (
auto &leftover : leftovers) {
2388 for (
int j = leftover.begin; j < leftover.end; j++) {
2389 rows[j].SetUnknown();
2393 DebugDump(debug_level > 1,
"End of Pass 4", theory, rows);
2396 ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners, &theory);
2398 DebugDump(debug_level > 0,
"Final Paragraph Segmentation", theory, rows);
2406static void InitializeTextAndBoxesPreRecognition(
const MutableIterator &it, RowInfo *info) {
2408 std::string fake_text;
2409 PageIterator pit(
static_cast<const PageIterator &
>(it));
2410 bool first_word =
true;
2415 info->lword_text +=
"x";
2417 info->rword_text +=
"x";
2421 info->rword_text =
"";
2426 if (fake_text.empty()) {
2430 int lspaces = info->pix_ldistance / info->average_interword_space;
2431 for (
int i = 0;
i < lspaces;
i++) {
2434 info->text += fake_text;
2437 PAGE_RES_IT page_res_it = *it.PageResIt();
2438 WERD_RES *word_res = page_res_it.restart_row();
2439 ROW_RES *this_row = page_res_it.row();
2441 WERD_RES *lword =
nullptr;
2442 WERD_RES *rword =
nullptr;
2443 info->num_words = 0;
2449 if (rword != word_res) {
2454 word_res = page_res_it.forward();
2455 }
while (page_res_it.row() == this_row);
2458 info->lword_box = lword->word->bounding_box();
2461 info->rword_box = rword->word->bounding_box();
2467static void InitializeRowInfo(
bool after_recognition,
const MutableIterator &it, RowInfo *info) {
2468 if (it.PageResIt()->row() !=
nullptr) {
2469 ROW *row = it.PageResIt()->row()->row;
2470 info->pix_ldistance = row->lmargin();
2471 info->pix_rdistance = row->rmargin();
2472 info->average_interword_space =
2473 row->space() > 0 ? row->space() : std::max(
static_cast<int>(row->x_height()), 1);
2474 info->pix_xheight = row->x_height();
2475 info->has_leaders =
false;
2476 info->has_drop_cap = row->has_drop_cap();
2479 info->pix_ldistance = info->pix_rdistance = 0;
2480 info->average_interword_space = 1;
2481 info->pix_xheight = 1.0;
2482 info->has_leaders =
false;
2483 info->has_drop_cap =
false;
2487 info->num_words = 0;
2488 info->lword_indicates_list_item =
false;
2489 info->lword_likely_starts_idea =
false;
2490 info->lword_likely_ends_idea =
false;
2491 info->rword_indicates_list_item =
false;
2492 info->rword_likely_starts_idea =
false;
2493 info->rword_likely_ends_idea =
false;
2494 info->has_leaders =
false;
2497 if (!after_recognition) {
2498 InitializeTextAndBoxesPreRecognition(it, info);
2502 const std::unique_ptr<const char[]> text(it.GetUTF8Text(
RIL_TEXTLINE));
2503 int trailing_ws_idx = strlen(text.get());
2504 while (trailing_ws_idx > 0 &&
2506 isascii(text[trailing_ws_idx - 1]) && isspace(text[trailing_ws_idx - 1])) {
2509 if (trailing_ws_idx > 0) {
2510 int lspaces = info->pix_ldistance / info->average_interword_space;
2511 for (
int i = 0;
i < lspaces;
i++) {
2514 for (
int i = 0;
i < trailing_ws_idx;
i++) {
2515 info->text += text[
i];
2519 if (info->text.empty()) {
2523 PAGE_RES_IT page_res_it = *it.PageResIt();
2524 std::vector<WERD_RES *> werds;
2525 WERD_RES *word_res = page_res_it.restart_row();
2526 ROW_RES *this_row = page_res_it.row();
2527 int num_leaders = 0;
2531 if (word_res && word_res->best_choice->unichar_string().length() > 0) {
2532 werds.push_back(word_res);
2533 ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
2534 rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
2539 word_res = page_res_it.forward();
2540 }
while (page_res_it.row() == this_row);
2541 info->ltr = ltr >= rtl;
2542 info->has_leaders = num_leaders > 3;
2543 info->num_words = werds.size();
2544 if (!werds.empty()) {
2545 WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
2546 info->lword_text = lword->best_choice->unichar_string().c_str();
2547 info->rword_text = rword->best_choice->unichar_string().c_str();
2548 info->lword_box = lword->word->bounding_box();
2549 info->rword_box = rword->word->bounding_box();
2551 &info->lword_indicates_list_item, &info->lword_likely_starts_idea,
2552 &info->lword_likely_ends_idea);
2554 &info->rword_indicates_list_item, &info->rword_likely_starts_idea,
2555 &info->rword_likely_ends_idea);
2563 const MutableIterator *block_start, std::vector<ParagraphModel *> *models) {
2579 std::vector<RowInfo> row_infos;
2585 row_infos.emplace_back();
2586 RowInfo &ri = row_infos.back();
2587 InitializeRowInfo(after_text_recognition, row, &ri);
2592 if (!row_infos.empty()) {
2593 int min_lmargin = row_infos[0].pix_ldistance;
2594 int min_rmargin = row_infos[0].pix_rdistance;
2595 for (
unsigned i = 1;
i < row_infos.size();
i++) {
2596 if (row_infos[
i].pix_ldistance < min_lmargin) {
2597 min_lmargin = row_infos[
i].pix_ldistance;
2599 if (row_infos[
i].pix_rdistance < min_rmargin) {
2600 min_rmargin = row_infos[
i].pix_rdistance;
2603 if (min_lmargin > 0 || min_rmargin > 0) {
2604 for (
auto &row_info : row_infos) {
2605 row_info.pix_ldistance -= min_lmargin;
2606 row_info.pix_rdistance -= min_rmargin;
2612 std::vector<PARA *> row_owners;
2613 std::vector<PARA *> the_paragraphs;
2614 if (!is_image_block) {
2617 row_owners.resize(row_infos.size());
2623 for (
auto &row_owner : row_owners) {
IntAfterTypedTestSuiteP after
IntBeforeRegisterTypedTestSuiteP before
@ W_REP_CHAR
repeated character
bool NearlyEqual(T x, T y, T tolerance)
bool StrongModel(const ParagraphModel *model)
void tprintf(const char *format,...)
std::vector< const ParagraphModel * > SetOfModels
int InterwordSpace(const std::vector< RowScratchRegisters > &rows, int row_start, int row_end)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
bool RowsFitModel(const std::vector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
void RecomputeMarginsAndClearHypotheses(std::vector< RowScratchRegisters > *rows, int start, int end, int percentile)
bool ValidBodyLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
void CanonicalizeDetectionResults(std::vector< PARA * > *row_owners, PARA_LIST *paragraphs)
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after)
const ParagraphModel * kCrownLeft
void push_back_new(std::vector< T > &vector, const T &data)
const ParagraphModel * kCrownRight
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
bool CrownCompatible(const std::vector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
bool ValidFirstLine(const std::vector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
bool AsciiLikelyListItem(const std::string &word)
void DetectParagraphs(int debug_level, std::vector< RowInfo > *row_infos, std::vector< PARA * > *row_owners, PARA_LIST *paragraphs, std::vector< ParagraphModel * > *models)
bool contains(const std::vector< T > &data, const T &value)
bool Empty(PageIteratorLevel level) const
bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const override
bool Next(PageIteratorLevel level) override
const PAGE_RES_IT * PageResIt() const
UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
unsigned SkipDigits(unsigned pos)
unsigned SkipRomans(unsigned pos)
unsigned SkipPunc(unsigned pos)
unsigned SkipAlpha(unsigned pos)
Cluster(int cen, int num)
void GetClusters(std::vector< Cluster > *clusters)
SimpleClusterer(int max_cluster_width)
bool IsFullRow(int i) const
std::vector< Cluster > right_tabs
void Fail(int min_debug_level, const char *why) const
bool FirstWordWouldHaveFit(int row_a, int row_b)
std::vector< RowScratchRegisters > * rows
GeometricClassifierState(int dbg_level, std::vector< RowScratchRegisters > *r, int r_start, int r_end)
ParagraphModel Model() const
void AssumeLeftJustification()
tesseract::ParagraphJustification just
const std::vector< Cluster > & AlignTabs() const
void AssumeRightJustification()
const std::vector< Cluster > & OffsideTabs() const
int AlignsideTabIndex(int row_idx) const
std::vector< Cluster > left_tabs
void StartHypotheses(SetOfModels *models) const
const ParagraphModel * UniqueStartHypothesis() const
void NonNullHypotheses(SetOfModels *models) const
void AddBodyLine(const ParagraphModel *model)
void StrongHypotheses(SetOfModels *models) const
LineType GetLineType() const
static void AppendDebugHeaderFields(std::vector< std::string > &header)
void AppendDebugInfo(const ParagraphTheory &theory, std::vector< std::string > &dbg) const
void DiscardNonMatchingHypotheses(const SetOfModels &models)
void AddStartLine(const ParagraphModel *model)
const ParagraphModel * UniqueBodyHypothesis() const
void Init(const RowInfo &row)
void NonCenteredModels(SetOfModels *models)
std::vector< ParagraphModel * > & models()
const ParagraphModel * Fits(const std::vector< RowScratchRegisters > *rows, int start, int end) const
void DiscardUnusedModels(const SetOfModels &used_models)
int IndexOf(const ParagraphModel *model) const
const ParagraphModel * AddModel(const ParagraphModel &model)
ParagraphModelSmearer(std::vector< RowScratchRegisters > *rows, int row_start, int row_end, ParagraphTheory *theory)
PDBLK pdblk
Page Description Block.
bool ValidFirstLine(int lmargin, int lindent, int rindent, int rmargin) const
bool ValidBodyLine(int lmargin, int lindent, int rindent, int rmargin) const
BLOCK_RES * block() const
POLY_BLOCK * poly_block() const
UNICHAR_ID unichar_id(unsigned index) const
void add(int32_t value, int32_t count)
double ile(double frac) const
bool get_isupper(UNICHAR_ID unichar_id) const
bool get_isdigit(UNICHAR_ID unichar_id) const
bool get_ispunctuation(UNICHAR_ID unichar_id) const