50 const int kCharWidth = 10;
51 const int kLineSpace = 30;
53 info->
has_leaders = strstr(text,
"...") !=
nullptr || strstr(text,
". . .") !=
nullptr;
61 std::vector<std::string> words =
split(text,
' ');
68 info->
rword_text = words[words.size() - 1].c_str();
70 while (lspace < info->text.size() && text[lspace] ==
' ') {
74 while (rspace < info->text.size() && text[info->
text.size() - rspace - 1] ==
' ') {
78 int top = -kLineSpace * row_number;
79 int bottom = top - kLineSpace;
80 int row_right = kCharWidth * info->
text.size();
81 int lword_width = kCharWidth * info->
lword_text.size();
82 int rword_width = kCharWidth * info->
rword_text.size();
97 for (
int i = 0;
i < n;
i++) {
106 const std::vector<PARA *> &detector_output) {
107 int incorrect_breaks = 0;
108 int missed_breaks = 0;
109 int poorly_matched_models = 0;
111 int bad_list_items = 0;
113 for (
int i = 1;
i < n;
i++) {
115 bool detected_break = (detector_output[
i - 1] != detector_output[
i]);
116 if (has_break && !detected_break) {
119 if (detected_break && !has_break) {
123 if (correct[
i].model_type ==
PNONE) {
124 if (detector_output[
i]->model !=
nullptr) {
125 poorly_matched_models++;
128 if (correct[
i].model.justification() !=
kUnknown &&
129 (detector_output[
i]->model ==
nullptr ||
131 poorly_matched_models++;
134 if (correct[
i].is_very_first_or_continuation ^
135 detector_output[
i]->is_very_first_or_continuation) {
138 if (correct[
i].is_list_item ^ detector_output[
i]->is_list_item) {
148 if (incorrect_breaks || missed_breaks || poorly_matched_models || bad_list_items || bad_crowns) {
149 std::vector<std::string> dbg_lines;
150 dbg_lines.emplace_back(
"# ==========================");
151 dbg_lines.emplace_back(
"# Correct paragraph breaks:");
152 dbg_lines.emplace_back(
"# ==========================");
153 for (
int i = 0;
i < n;
i++) {
154 if (correct[
i].model_type !=
PCONT) {
155 std::string s = std::string(correct[
i].ascii) +
" # " +
158 (correct[
i].is_list_item ?
" li" :
"");
159 dbg_lines.push_back(s);
161 dbg_lines.emplace_back(correct[
i].ascii);
164 dbg_lines.emplace_back(
"");
165 dbg_lines.emplace_back(
"# ==========================");
166 dbg_lines.emplace_back(
"# Paragraph detector output:");
167 dbg_lines.emplace_back(
"# ==========================");
168 for (
int i = 0;
i < n;
i++) {
169 std::string annotation;
170 if (
i == 0 || (detector_output[
i - 1] != detector_output[
i])) {
171 if (detector_output[
i] && detector_output[
i]->model) {
173 " # " + detector_output[
i]->model->ToString() +
174 (detector_output[
i]->is_very_first_or_continuation ?
" crown" :
"") +
175 (detector_output[
i]->is_list_item ?
" li" :
"");
177 annotation =
" # Unmodeled paragraph.";
180 std::string s = correct[
i].
ascii + annotation;
181 dbg_lines.push_back(s);
184 for (
auto &dbg_line : dbg_lines) {
185 s += dbg_line +
"\n";
187 LOG(
INFO) <<
"Discrepancy!\n" << s;
192 std::vector<RowInfo> row_infos;
193 std::vector<PARA *> row_owners;
194 PARA_LIST paragraphs;
195 std::vector<ParagraphModel *> models;
201 for (
auto *model : models) {
206TEST(ParagraphsTest, ListItemsIdentified) {
232 {
" Look here, I have a paragraph.",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
233 {
"This paragraph starts at the top",
PCONT,
PModel(),
false,
false},
234 {
"of the page and takes 3 lines. ",
PCONT,
PModel(),
false,
false},
235 {
" Here I have a second paragraph",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
236 {
"which indicates that the first ",
PCONT,
PModel(),
false,
false},
237 {
"paragraph is not a continuation ",
PCONT,
PModel(),
false,
false},
238 {
"from a previous page, as it is ",
PCONT,
PModel(),
false,
false},
239 {
"indented just like this second ",
PCONT,
PModel(),
false,
false},
243TEST(ParagraphsTest, TestSimpleParagraphDetection) {
248 {
"This paragraph starts at the top",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
true,
false},
249 {
"of the page and takes two lines.",
PCONT,
PModel(),
false,
false},
250 {
" Here I have a second paragraph",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
251 {
"which indicates that the first ",
PCONT,
PModel(),
false,
false},
252 {
"paragraph is a continuation from",
PCONT,
PModel(),
false,
false},
253 {
"a previous page, as it is ",
PCONT,
PModel(),
false,
false},
254 {
"indented just like this second ",
PCONT,
PModel(),
false,
false},
258TEST(ParagraphsTest, TestFewCluesWithCrown) {
263 {
"The first paragraph on a page is",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
true,
false},
264 {
"often not indented as the rest ",
PCONT,
PModel(),
false,
false},
265 {
"of the paragraphs are. Nonethe-",
PCONT,
PModel(),
false,
false},
266 {
"less it should be counted as the",
PCONT,
PModel(),
false,
false},
267 {
"same type of paragraph. ",
PCONT,
PModel(),
false,
false},
268 {
" The second and third para- ",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
269 {
"graphs are both indented two ",
PCONT,
PModel(),
false,
false},
271 {
" The first paragraph has what ",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
272 {
"fmt refers to as a 'crown.' ",
PCONT,
PModel(),
false,
false},
275TEST(ParagraphsTest, TestCrownParagraphDetection) {
280 {
"It is sometimes the case that",
PSTART,
PModel(
kLeft, 0, 0, 0, 0),
false,
false},
281 {
"flush left paragraphs (those",
PCONT,
PModel(),
false,
false},
282 {
"with no body indent) are not",
PCONT,
PModel(),
false,
false},
283 {
"actually crowns. ",
PCONT,
PModel(),
false,
false},
284 {
"Instead, further paragraphs are",
PSTART,
PModel(
kLeft, 0, 0, 0, 0),
false,
false},
285 {
"also flush left aligned. Usual-",
PCONT,
PModel(),
false,
false},
286 {
"ly, these paragraphs are set",
PCONT,
PModel(),
false,
false},
287 {
"apart vertically by some white-",
PCONT,
PModel(),
false,
false},
288 {
"space, but you can also detect",
PCONT,
PModel(),
false,
false},
289 {
"them by observing the big empty",
PCONT,
PModel(),
false,
false},
290 {
"space at the ends of the para-",
PCONT,
PModel(),
false,
false},
294TEST(ParagraphsText, TestRealFlushLeftParagraphs) {
299 {
"sometimes a page is one giant",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
true,
false},
300 {
"continuation. It flows from",
PCONT,
PModel(),
false,
false},
301 {
"line to line, using the full",
PCONT,
PModel(),
false,
false},
302 {
"column width with no clear",
PCONT,
PModel(),
false,
false},
303 {
"paragraph break, because it",
PCONT,
PModel(),
false,
false},
304 {
"actually doesn't have one. It",
PCONT,
PModel(),
false,
false},
305 {
"is the middle of one monster",
PCONT,
PModel(),
false,
false},
306 {
"paragraph continued from the",
PCONT,
PModel(),
false,
false},
307 {
"previous page and continuing",
PCONT,
PModel(),
false,
false},
308 {
"onto the next page. There-",
PCONT,
PModel(),
false,
false},
309 {
"fore, it ends up getting",
PCONT,
PModel(),
false,
false},
310 {
"marked as a crown and then",
PCONT,
PModel(),
false,
false},
311 {
"getting re-marked as any ex-",
PCONT,
PModel(),
false,
false},
312 {
"isting model. Not great, but",
PCONT,
PModel(),
false,
false},
315TEST(ParagraphsTest, TestSingleFullPageContinuation) {
318 std::vector<RowInfo> row_infos;
319 std::vector<PARA *> row_owners;
320 PARA_LIST paragraphs;
321 std::vector<ParagraphModel *> models;
326 for (
auto *model : models) {
333 {
" uncommon in Left-to-Right",
PCONT,
PModel(),
false,
false},
334 {
" languages, but they do",
PCONT,
PModel(),
false,
false},
336 {
" Mostly, however, they're",
PSTART,
PModel(
kRight, 0, 0, 0, 0),
false,
false},
337 {
" horribly tiny paragraphs in",
PCONT,
PModel(),
false,
false},
338 {
" tables on which we have no",
PCONT,
PModel(),
false,
false},
339 {
" chance anyways.",
PCONT,
PModel(),
false,
false},
342TEST(ParagraphsTest, TestRightAlignedParagraph) {
347 {
" Occasionally, interspersed with",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
348 {
"obvious paragraph text, you might",
PCONT,
PModel(),
false,
false},
349 {
"find short exchanges of dialogue ",
PCONT,
PModel(),
false,
false},
350 {
"between characters. ",
PCONT,
PModel(),
false,
false},
354 {
" One naive approach would be to ",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
355 {
"mark a new paragraph whenever one",
PCONT,
PModel(),
false,
false},
356 {
"of the statistics (left, right or",
PCONT,
PModel(),
false,
false},
357 {
"center) changes from one text-",
PCONT,
PModel(),
false,
false},
358 {
"line to the next. Such an",
PCONT,
PModel(),
false,
false},
359 {
"approach would misclassify the",
PCONT,
PModel(),
false,
false},
360 {
"tiny paragraphs above as a single",
PCONT,
PModel(),
false,
false},
364TEST(ParagraphsTest, TestTinyParagraphs) {
370 {
" Centered Title ",
PCONT,
PModel(),
false,
false},
371 {
" Paragraph Detection ",
PCONT,
PModel(),
false,
false},
373 {
" 10 November 2010 ",
PCONT,
PModel(),
false,
false},
375 {
" Look here, I have a paragraph.",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
376 {
"This paragraph starts at the top",
PCONT,
PModel(),
false,
false},
377 {
"of the page and takes 3 lines. ",
PCONT,
PModel(),
false,
false},
378 {
" Here I have a second paragraph",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
379 {
"which indicates that the first ",
PCONT,
PModel(),
false,
false},
380 {
"paragraph is not a continuation ",
PCONT,
PModel(),
false,
false},
381 {
"from a previous page, as it is ",
PCONT,
PModel(),
false,
false},
382 {
"indented just like this second ",
PCONT,
PModel(),
false,
false},
384 {
" Here is a block quote. It ",
PSTART,
PModel(
kLeft, 30, 0, 0, 0),
true,
false},
385 {
" looks like the prior text ",
PCONT,
PModel(),
false,
false},
386 {
" but it is indented more ",
PCONT,
PModel(),
false,
false},
387 {
" and is fully justified. ",
PCONT,
PModel(),
false,
false},
388 {
" So how does one deal with ",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
389 {
"centered text, block quotes, ",
PCONT,
PModel(),
false,
false},
390 {
"normal paragraphs, and lists ",
PCONT,
PModel(),
false,
false},
391 {
"like what follows? ",
PCONT,
PModel(),
false,
false},
393 {
"2. Use a heuristic, for example,",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
true},
394 {
" looking for lines where the ",
PCONT,
PModel(),
false,
false},
395 {
" first word of the next line ",
PCONT,
PModel(),
false,
false},
396 {
" would fit on the previous ",
PCONT,
PModel(),
false,
false},
398 {
"8. Try to implement the plan in ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
true},
399 {
" Python and try it out. ",
PCONT,
PModel(),
false,
false},
400 {
"4. Determine how to fix the ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
true},
403 {
" For extra painful penalty work",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
404 {
"you can try to identify source ",
PCONT,
PModel(),
false,
false},
408TEST(ParagraphsTest, TestComplexPage1) {
415 {
" Centered Title ",
PCONT,
PModel(),
false,
false},
416 {
" Paragraph Detection ",
PCONT,
PModel(),
false,
false},
418 {
" 10 November 2010 ",
PCONT,
PModel(),
false,
false},
420 {
" Look here, I have a paragraph. ",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
421 {
"This paragraph starts at the top of",
PCONT,
PModel(),
false,
false},
422 {
"the page and takes 3 lines. ",
PCONT,
PModel(),
false,
false},
423 {
" Here I have a second paragraph ",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
424 {
"which indicates that the first ",
PCONT,
PModel(),
false,
false},
425 {
"paragraph is not a continuation ",
PCONT,
PModel(),
false,
false},
426 {
"from a previous page, as it is in- ",
PCONT,
PModel(),
false,
false},
427 {
"dented just like this second para- ",
PCONT,
PModel(),
false,
false},
429 {
" Here is a block quote. It ",
PSTART,
PModel(
kLeft, 30, 0, 0, 0),
true,
false},
430 {
" looks like the prior text ",
PCONT,
PModel(),
false,
false},
431 {
" but it is indented more ",
PCONT,
PModel(),
false,
false},
432 {
" and is fully justified. ",
PCONT,
PModel(),
false,
false},
433 {
" So how does one deal with center-",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
434 {
"ed text, block quotes, normal para-",
PCONT,
PModel(),
false,
false},
435 {
"graphs, and lists like what follow?",
PCONT,
PModel(),
false,
false},
436 {
"1. Make a plan. ",
PCONT,
PModel(),
false,
false},
437 {
"2. Use a heuristic, for example, ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
true},
438 {
" looking for lines where the ",
PCONT,
PModel(),
false,
false},
439 {
" first word of the next line ",
PCONT,
PModel(),
false,
false},
440 {
" would fit on the previous line. ",
PCONT,
PModel(),
false,
false},
441 {
"8. Try to implement the plan in ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
true},
442 {
" Python and try it out. ",
PCONT,
PModel(),
false,
false},
443 {
"4. Determine how to fix the ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
true},
446 {
" For extra painful penalty work ",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
447 {
"you can try to identify source ",
PCONT,
PModel(),
false,
false},
451TEST(ParagraphsTest, TestComplexPage2) {
456 {
"The first paragraph on a page is",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
true,
false},
457 {
"often not indented as the rest ",
PCONT,
PModel(),
false,
false},
458 {
"of the paragraphs are. Nonethe-",
PCONT,
PModel(),
false,
false},
459 {
"less it should be counted as the",
PCONT,
PModel(),
false,
false},
460 {
"same type of paragraph. ",
PCONT,
PModel(),
false,
false},
461 {
" Even a short second paragraph ",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
false,
false},
462 {
"should suffice. ",
PCONT,
PModel(),
false,
false},
466TEST(ParagraphsTest, TestSubtleCrown) {
470TEST(ParagraphsTest, TestStrayLineInBlock) {
475 {
" Defined contribution plans cover employees in Australia, New",
PSTART,
477 {
"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. ",
PCONT,
PModel(),
false,
479 {
"In addition, employees in the U.S. are eligible to participate in ",
PCONT,
PModel(),
481 {
"defined contribution plans (Employee Savings Plans) by contribut-",
PCONT,
PModel(),
false,
483 {
"ing a portion of their compensation. The Company matches com- ",
PCONT,
PModel(),
false,
485 {
"pensation, depending on Company profit levels. Contributions ",
PCONT,
PModel(),
false,
487 {
"charged to income for defined contribution plans were $92 in ",
PCONT,
PModel(),
false,
489 {
"1993, $98 in 1992 and $89 in 1991. ",
PCONT,
PModel(),
false,
491 {
" In addition to providing pension benefits, the Company pro- ",
PSTART,
493 {
"vides certain health care and life insurance benefits to retired ",
PCONT,
PModel(),
false,
495 {
"employees. As discussed in Note A, the Company adopted FASB ",
PCONT,
PModel(),
false,
497 {
"Statement No. 106 effective January 1, 1992. Previously, the ",
PCONT,
PModel(),
false,
499 {
"Company recognized the cost of providing these benefits as the ",
PCONT,
PModel(),
false,
501 {
"benefits were paid. These pretax costs amounted to $53 in 1991. ",
PCONT,
PModel(),
false,
503 {
"The Company continues to fund most of the cost of these medical ",
PCONT,
PModel(),
false,
505 {
"and life insurance benefits in the year incurred. ",
PCONT,
PModel(),
false,
507 {
" The U.S. plan covering the parent company is the largest plan.",
PSTART,
509 {
"It provides medical and life insurance benefits including hospital, ",
PCONT,
PModel(),
false,
511 {
"physicians’ services and major medical expense benefits and life ",
PCONT,
PModel(),
false,
513 {
"insurance benefits. The plan provides benefits supplemental to ",
PCONT,
PModel(),
false,
515 {
"Medicare after retirees are eligible for these benefits. The cost of ",
PCONT,
PModel(),
517 {
"these benefits are shared by the Company and the retiree, with the ",
PCONT,
PModel(),
false,
519 {
"Company portion increasing as the retiree has increased years of ",
PCONT,
PModel(),
false,
521 {
"credited service. The Company has the ability to change these ",
PCONT,
PModel(),
false,
525 {
" Effective October 1993, the Company amended its health ",
PSTART,
527 {
"benefits plan in the U.S. to cap the cost absorbed by the Company ",
PCONT,
PModel(),
false,
529 {
"at approximately twice the 1993 cost per person for employees who",
PCONT,
PModel(),
false,
531 {
"retire after December 31, 1993. The effect of this amendment was ",
PCONT,
PModel(),
false,
533 {
"to reduce the December 31, 1993 accumulated postretirement ",
PCONT,
PModel(),
false,
535 {
"benefit obligation by $327. It also reduced the net periodic postre- ",
PCONT,
PModel(),
false,
537 {
"tirement cost by $21 for 1993 and is estimated to reduce this cost ",
PCONT,
PModel(),
false,
539 {
"for 1994 by approximately $83. ",
PCONT,
PModel(),
false,
543TEST(ParagraphsTest, TestUnlvInsurance) {
568TEST(ParagraphsTest, TestSplitsOutLeaderLines) {
573 {
" A typical page of a programming book may contain",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
575 {
"examples of source code to exemplify an algorithm ",
PCONT,
PModel(),
false,
false},
576 {
"being described in prose. Such examples should be",
PCONT,
PModel(),
false,
false},
577 {
"rendered as lineated text, meaning text with ",
PCONT,
PModel(),
false,
false},
578 {
"explicit line breaks but without extra inter-line ",
PCONT,
PModel(),
false,
false},
579 {
"spacing. Accidentally finding stray paragraphs in",
PCONT,
PModel(),
false,
false},
580 {
"source code would lead to a bad reading experience",
PCONT,
PModel(),
false,
false},
581 {
"when the text is re-flowed. ",
PCONT,
PModel(),
false,
false},
582 {
" Let's show this by describing the function fact-",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
584 {
"orial. Factorial is a simple recursive function ",
PCONT,
PModel(),
false,
false},
585 {
"which grows very quickly. So quickly, in fact, ",
PCONT,
PModel(),
false,
false},
586 {
"that the typical C implementation will only work ",
PCONT,
PModel(),
false,
false},
587 {
"for values less than about 12: ",
PCONT,
PModel(),
false,
false},
589 {
" # Naive implementation in C ",
PCONT,
PModel(),
false,
false},
590 {
" int factorial(int n) { ",
PCONT,
PModel(),
false,
false},
593 {
" return n * factorial(n - 1); ",
PCONT,
PModel(),
false,
false},
596 {
" The C programming language does not have built- ",
PSTART,
PModel(
kLeft, 0, 20, 0, 0),
598 {
"in support for detecting integer overflow, so this",
PCONT,
PModel(),
false,
false},
599 {
"naive implementation simply returns random values ",
PCONT,
PModel(),
false,
false},
600 {
"if even a moderate sized n is provided. ",
PCONT,
PModel(),
false,
false},
603TEST(ParagraphsTest, NotDistractedBySourceCode) {
608 {
"royal palm which are called guano and in it there was a bed, a",
PSTART,
610 {
"table, one chair, and a place on the dirt floor to cook with charcoal.",
PCONT,
PModel(),
612 {
"On the brown walls of the flattened, overlapping leaves of the",
PCONT,
PModel(),
614 {
"sturdy fibered guano there was a picture in color of the Sacred",
PCONT,
PModel(),
616 {
"Heart of Jesus and another of the Virgin of Cobre. These were",
PCONT,
PModel(),
618 {
"relics of his wife. Once there had been a tinted photograph of his",
PCONT,
PModel(),
620 {
"wife on the wall but he had taken it down because it made him too",
PCONT,
PModel(),
622 {
"lonely to see it and it was on the shelf in the corner under his clean",
PCONT,
PModel(),
626 {
" \"What do you have to eat?\" the boy asked. ",
PSTART,
628 {
" \"A pot of yellow rice with fish. Do you want some?\" ",
PSTART,
630 {
" \"No. I will eat at home. Do you want me to make the fire?\" ",
PSTART,
632 {
" \"No. I will make it later on. Or I may eat the rice cold.\" ",
PSTART,
634 {
" \"May I take the cast net?\" ",
PSTART,
636 {
" \"Of course.\" ",
PSTART,
638 {
" There was no cast net and the boy remembered when they had",
PSTART,
640 {
"sold it. But they went through this fiction every day. There was no",
PCONT,
PModel(),
642 {
"pot of yellow rice and fish and the boy knew this too. "
645 {
" \"Eighty-five is a lucky number,\" the old man said. \"How",
PSTART,
647 {
"would you like to see me bring one in that dressed out over a "
653 {
" \"I'll get the cast net and go for sardines. Will you sit in the "
656 {
"in the doorway?\" "
659 {
" \"Yes. I have yesterday's paper and I will read the baseball.\" ",
PSTART,
661 {
" The boy did not know whether yesterday's paper was a fiction",
PSTART,
663 {
"too. But the old man brought it out from under the bed. ",
PCONT,
PModel(),
665 {
" \"Pedrico gave it to me at the bodega,\" he explained. "
668 {
" \"I'll be back when I have the sardines. I'll keep yours and mine",
PSTART,
670 {
"together on ice and we can share them in the morning. When I",
PCONT,
PModel(),
672 {
"come back you can tell me about the baseball.\" ",
PCONT,
PModel(),
674 {
" \"The Yankees cannot lose.\" ",
PSTART,
676 {
" \"But I fear the Indians of Cleveland.\" ",
PSTART,
678 {
" \"Have faith in the Yankees my son. Think of the great Di-",
PSTART,
682 {
" \"I fear both the Tigers of Detroit and the Indians of Cleve-",
PSTART,
687TEST(ParagraphsTest, NotOverlyAggressiveWithBlockQuotes) {
692 {
"Oats, 51 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
693 {
"O'Brien, Gregory, 175 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
694 {
"Occupational composition, 110,",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
696 {
"OECD rankings, 155, 172 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
697 {
"Okiato (original capital), 47 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
698 {
"Oil shock: 1974, xxx, 143; 1979,",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
700 {
"Old Age Pensions, xxii, 89-90 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
701 {
"Old World evils, 77 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
702 {
"Oliver, W. H., 39, 77, 89 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
703 {
"Olssen, Erik, 45, 64, 84 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
704 {
"Olympic Games, 1924, 111, 144 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
705 {
"Once on Chunuk Bair, 149 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
706 {
"Once Were Warriors, xxxiii, 170",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
707 {
"On—shore whaling, xvi ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
708 {
"Opotiki, xix ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
709 {
"Orakau battle of, xviii, 57 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
710 {
"O’Regan, Tipene, 170, 198-99 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
711 {
"Organic agriculture, 177 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
712 {
"Orwell, George, 151 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
713 {
"Otago, xvii, 45, 49-50, 70 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
714 {
"Otago block, xvii ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
715 {
"Otago Daily Times, 67 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
716 {
"Otago Girls’ High School, xix, 61,",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
718 {
"Otago gold rushes, 61-63 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
719 {
"Otago Peninsula, xx ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
720 {
"Otago Provincial Council, 68 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
721 {
"Otaki, 33 ",
PSTART,
PModel(
kLeft, 0, 0, 30, 0),
false,
false},
724TEST(ParagraphsTest, IndexPageTest) {
#define ASSERT_EQ(val1, val2)
#define EXPECT_EQ(val1, val2)
#define EXPECT_TRUE(condition)
#define EXPECT_FALSE(condition)
const ParagraphJustification kRight
const TextAndModel kComplexPage2[]
const TextAndModel kFlushLeftParagraphs[]
const TextAndModel kTextWithSourceCode[]
const TextAndModel kFewCluesWithCrown[]
const TextAndModel kOldManAndSea[]
const TextAndModel kUnlvRep3AO[]
const TextAndModel kSubtleCrown[]
const TextAndModel kNewZealandIndex[]
constexpr size_t countof(T const (&)[N]) noexcept
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
void AsciiToRowInfo(const char *text, int row_number, RowInfo *info)
const TextAndModel kTableOfContents[]
const ParagraphJustification kUnknown
const TextAndModel kSingleFullPageContinuation[]
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const std::string &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
const std::vector< std::string > split(const std::string &s, char c)
bool AsciiLikelyListItem(const std::string &word)
const TextAndModel kTinyParagraphs[]
const TextAndModel kCrownedParagraph[]
const TextAndModel kComplexPage1[]
void DetectParagraphs(int debug_level, std::vector< RowInfo > *row_infos, std::vector< PARA * > *row_owners, PARA_LIST *paragraphs, std::vector< ParagraphModel * > *models)
const ParagraphJustification kLeft
void EvaluateParagraphDetection(const TextAndModel *correct, int n, const std::vector< PARA * > &detector_output)
const TextAndModel kRightAligned[]
const TextAndModel kTwoSimpleParagraphs[]
const ParagraphJustification kCenter
void MakeAsciiRowInfos(const TextAndModel *row_infos, int n, std::vector< RowInfo > *output)
void TestParagraphDetection(const TextAndModel *correct, int num_rows)
TEST(TesseractInstanceTest, TestMultipleTessInstances)
bool lword_likely_ends_idea
bool rword_likely_ends_idea
int average_interword_space
bool rword_likely_starts_idea
bool lword_indicates_list_item
bool rword_indicates_list_item
bool lword_likely_starts_idea
bool Comparable(const ParagraphModel &other) const
std::string ToString() const
TextModelInputType model_type
bool is_very_first_or_continuation