tesseract v5.3.3.20231005
tesseract::PAGE_RES_IT Class Reference

#include <pageres.h>

Public Member Functions

 PAGE_RES_IT ()=default
 
 PAGE_RES_IT (PAGE_RES *the_page_res)
 
bool operator== (const PAGE_RES_IT &other) const
 
bool operator!= (const PAGE_RES_IT &other) const
 
int cmp (const PAGE_RES_IT &other) const
 
WERD_RES * restart_page ()
 
WERD_RES * restart_page_with_empties ()
 
WERD_RES * start_page (bool empty_ok)
 
WERD_RES * restart_row ()
 
WERD_RES * InsertSimpleCloneWord (const WERD_RES &clone_res, WERD *new_word)
 
void ReplaceCurrentWord (PointerVector< WERD_RES > *words)
 
void DeleteCurrentWord ()
 
void MakeCurrentWordFuzzy ()
 
WERD_RES * forward ()
 
WERD_RES * forward_with_empties ()
 
WERD_RES * forward_paragraph ()
 
WERD_RES * forward_block ()
 
WERD_RES * prev_word () const
 
ROW_RES * prev_row () const
 
BLOCK_RES * prev_block () const
 
WERD_RES * word () const
 
ROW_RES * row () const
 
BLOCK_RES * block () const
 
WERD_RES * next_word () const
 
ROW_RES * next_row () const
 
BLOCK_RES * next_block () const
 
void rej_stat_word ()
 
void ResetWordIterator ()
 

Public Attributes

PAGE_RES * page_res
 

Detailed Description

Definition at line 682 of file pageres.h.

Constructor & Destructor Documentation

◆ PAGE_RES_IT() [1/2]

tesseract::PAGE_RES_IT::PAGE_RES_IT ( )
default

◆ PAGE_RES_IT() [2/2]

tesseract::PAGE_RES_IT::PAGE_RES_IT ( PAGE_RES * the_page_res)
inline

Definition at line 688 of file pageres.h.

688 { // page result
689 page_res = the_page_res;
690 restart_page(); // ready to scan
691 }
PAGE_RES * page_res
Definition: pageres.h:684
WERD_RES * restart_page()
Definition: pageres.h:710

Member Function Documentation

◆ block()

BLOCK_RES * tesseract::PAGE_RES_IT::block ( ) const
inline

Definition at line 769 of file pageres.h.

769 { // block of cur. word
770 return block_res;
771 }

◆ cmp()

int tesseract::PAGE_RES_IT::cmp ( const PAGE_RES_IT & other) const

Definition at line 1183 of file pageres.cpp.

1183 {
1184 ASSERT_HOST(page_res == other.page_res);
1185 if (other.block_res == nullptr) {
1186 // other points to the end of the page.
1187 if (block_res == nullptr) {
1188 return 0;
1189 }
1190 return -1;
1191 }
1192 if (block_res == nullptr) {
1193 return 1; // we point to the end of the page.
1194 }
1195 if (block_res == other.block_res) {
1196 if (other.row_res == nullptr || row_res == nullptr) {
1197 // this should only happen if we hit an image block.
1198 return 0;
1199 }
1200 if (row_res == other.row_res) {
1201 // we point to the same block and row.
1202 ASSERT_HOST(other.word_res != nullptr && word_res != nullptr);
1203 if (word_res == other.word_res) {
1204 // we point to the same word!
1205 return 0;
1206 }
1207
1208 WERD_RES_IT word_res_it(&row_res->word_res_list);
1209 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
1210 word_res_it.forward()) {
1211 if (word_res_it.data() == word_res) {
1212 return -1;
1213 } else if (word_res_it.data() == other.word_res) {
1214 return 1;
1215 }
1216 }
1217 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1218 }
1219
1220 // we both point to the same block, but different rows.
1221 ROW_RES_IT row_res_it(&block_res->row_res_list);
1222 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
1223 row_res_it.forward()) {
1224 if (row_res_it.data() == row_res) {
1225 return -1;
1226 } else if (row_res_it.data() == other.row_res) {
1227 return 1;
1228 }
1229 }
1230 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1231 }
1232
1233 // We point to different blocks.
1234 BLOCK_RES_IT block_res_it(&page_res->block_res_list);
1235 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
1236 block_res_it.forward()) {
1237 if (block_res_it.data() == block_res) {
1238 return -1;
1239 } else if (block_res_it.data() == other.block_res) {
1240 return 1;
1241 }
1242 }
1243 // Shouldn't happen...
1244 ASSERT_HOST("Error: Incomparable PAGE_RES_ITs" == nullptr);
1245 return 0;
1246}
#define ASSERT_HOST(x)
Definition: errcode.h:54
BLOCK_RES_LIST block_res_list
Definition: pageres.h:81
ROW_RES_LIST row_res_list
Definition: pageres.h:129
WERD_RES_LIST word_res_list
Definition: pageres.h:148

◆ DeleteCurrentWord()

void tesseract::PAGE_RES_IT::DeleteCurrentWord ( )

Definition at line 1488 of file pageres.cpp.

1488 {
1489 // Check that this word is as we expect. part_of_combos are NEVER iterated
1490 // by the normal iterator, so we should never be trying to delete them.
1491 ASSERT_HOST(!word_res->part_of_combo);
1492 if (!word_res->combination) {
1493 // Combinations own their own word, so we won't find the word on the
1494 // row's word_list, but it is legitimate to try to delete them.
1495 // Delete word from the ROW when not a combination.
1496 WERD_IT w_it(row()->row->word_list());
1497 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1498 if (w_it.data() == word_res->word) {
1499 break;
1500 }
1501 }
1502 ASSERT_HOST(!w_it.cycled_list());
1503 delete w_it.extract();
1504 }
1505 // Remove the WERD_RES for the new_word.
1506 // Remove the WORD_RES from the ROW_RES.
1507 WERD_RES_IT wr_it(&row()->word_res_list);
1508 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1509 if (wr_it.data() == word_res) {
1510 word_res = nullptr;
1511 break;
1512 }
1513 }
1514 ASSERT_HOST(!wr_it.cycled_list());
1515 delete wr_it.extract();
1516 ResetWordIterator();
1517}
ROW_RES * row() const
Definition: pageres.h:766

◆ forward()

WERD_RES * tesseract::PAGE_RES_IT::forward ( )
inline

Definition at line 743 of file pageres.h.

743 { // Get next word.
744 return internal_forward(false, false);
745 }

◆ forward_block()

WERD_RES * tesseract::PAGE_RES_IT::forward_block ( )

Definition at line 1715 of file pageres.cpp.

1715 {
1716 while (block_res == next_block_res) {
1717 internal_forward(false, true);
1718 }
1719 return internal_forward(false, true);
1720}

◆ forward_paragraph()

WERD_RES * tesseract::PAGE_RES_IT::forward_paragraph ( )

Definition at line 1700 of file pageres.cpp.

1700 {
1701 while (block_res == next_block_res &&
1702 (next_row_res != nullptr && next_row_res->row != nullptr &&
1703 row_res->row->para() == next_row_res->row->para())) {
1704 internal_forward(false, true);
1705 }
1706 return internal_forward(false, true);
1707}
PARA * para() const
Definition: ocrrow.h:120

◆ forward_with_empties()

WERD_RES * tesseract::PAGE_RES_IT::forward_with_empties ( )
inline

Definition at line 747 of file pageres.h.

747 {
748 return internal_forward(false, true);
749 }

◆ InsertSimpleCloneWord()

WERD_RES * tesseract::PAGE_RES_IT::InsertSimpleCloneWord ( const WERD_RES & clone_res,
WERD * new_word
)

Definition at line 1252 of file pageres.cpp.

1253 {
1254 // Make a WERD_RES for the new_word.
1255 auto *new_res = new WERD_RES(new_word);
1256 new_res->CopySimpleFields(clone_res);
1257 new_res->combination = true;
1258 // Insert into the appropriate place in the ROW_RES.
1259 WERD_RES_IT wr_it(&row()->word_res_list);
1260 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1261 WERD_RES *word = wr_it.data();
1262 if (word == word_res) {
1263 break;
1264 }
1265 }
1266 ASSERT_HOST(!wr_it.cycled_list());
1267 wr_it.add_before_then_move(new_res);
1268 if (wr_it.at_first()) {
1269 // This is the new first word, so reset the member iterator so it
1270 // detects the cycled_list state correctly.
1271 ResetWordIterator();
1272 }
1273 return new_res;
1274}
WERD_RES * word() const
Definition: pageres.h:763

◆ MakeCurrentWordFuzzy()

void tesseract::PAGE_RES_IT::MakeCurrentWordFuzzy ( )

Definition at line 1521 of file pageres.cpp.

1521 {
1522 WERD *real_word = word_res->word;
1523 if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
1524 real_word->set_flag(W_FUZZY_SP, true);
1525 if (word_res->combination) {
1526 // The next word should be the corresponding part of combo, but we have
1527 // already stepped past it, so find it by search.
1528 WERD_RES_IT wr_it(&row()->word_res_list);
1529 for (wr_it.mark_cycle_pt();
1530 !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
1531 }
1532 wr_it.forward();
1533 ASSERT_HOST(wr_it.data()->part_of_combo);
1534 real_word = wr_it.data()->word;
1535 ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
1536 !real_word->flag(W_FUZZY_NON));
1537 real_word->set_flag(W_FUZZY_SP, true);
1538 }
1539 }
1540}
@ W_FUZZY_SP
fuzzy space
Definition: werd.h:41
@ W_FUZZY_NON
fuzzy nonspace
Definition: werd.h:42
void set_flag(WERD_FLAGS mask, bool value)
Definition: werd.h:131

◆ next_block()

BLOCK_RES * tesseract::PAGE_RES_IT::next_block ( ) const
inline

Definition at line 778 of file pageres.h.

778 { // block of next word
779 return next_block_res;
780 }

◆ next_row()

ROW_RES * tesseract::PAGE_RES_IT::next_row ( ) const
inline

Definition at line 775 of file pageres.h.

775 { // row of next word
776 return next_row_res;
777 }

◆ next_word()

WERD_RES * tesseract::PAGE_RES_IT::next_word ( ) const
inline

Definition at line 772 of file pageres.h.

772 { // next word
773 return next_word_res;
774 }

◆ operator!=()

bool tesseract::PAGE_RES_IT::operator!= ( const PAGE_RES_IT & other) const
inline

Definition at line 700 of file pageres.h.

700 {
701 return !(*this == other);
702 }

◆ operator==()

bool tesseract::PAGE_RES_IT::operator== ( const PAGE_RES_IT & other) const
inline

Definition at line 695 of file pageres.h.

695 {
696 return word_res == other.word_res && row_res == other.row_res &&
697 block_res == other.block_res;
698 }

◆ prev_block()

BLOCK_RES * tesseract::PAGE_RES_IT::prev_block ( ) const
inline

Definition at line 760 of file pageres.h.

760 { // block of prev word
761 return prev_block_res;
762 }

◆ prev_row()

ROW_RES * tesseract::PAGE_RES_IT::prev_row ( ) const
inline

Definition at line 757 of file pageres.h.

757 { // row of prev word
758 return prev_row_res;
759 }

◆ prev_word()

WERD_RES * tesseract::PAGE_RES_IT::prev_word ( ) const
inline

Definition at line 754 of file pageres.h.

754 { // previous word
755 return prev_word_res;
756 }

◆ rej_stat_word()

void tesseract::PAGE_RES_IT::rej_stat_word ( )

Definition at line 1722 of file pageres.cpp.

1722 {
1723 int16_t chars_in_word;
1724 int16_t rejects_in_word = 0;
1725
1726 chars_in_word = word_res->reject_map.length();
1727 page_res->char_count += chars_in_word;
1728 block_res->char_count += chars_in_word;
1729 row_res->char_count += chars_in_word;
1730
1731 rejects_in_word = word_res->reject_map.reject_count();
1732
1733 page_res->rej_count += rejects_in_word;
1734 block_res->rej_count += rejects_in_word;
1735 row_res->rej_count += rejects_in_word;
1736 if (chars_in_word == rejects_in_word) {
1737 row_res->whole_word_rej_count += rejects_in_word;
1738 }
1739}
int32_t rej_count
Definition: pageres.h:80
int32_t char_count
Definition: pageres.h:79
int32_t char_count
Definition: pageres.h:121
int32_t whole_word_rej_count
Definition: pageres.h:147
int32_t rej_count
Definition: pageres.h:146
int32_t char_count
Definition: pageres.h:145
int16_t reject_count() const
Definition: rejctmap.h:339
uint16_t length() const
Definition: rejctmap.h:333

◆ ReplaceCurrentWord()

void tesseract::PAGE_RES_IT::ReplaceCurrentWord ( tesseract::PointerVector< WERD_RES > *  words)

Definition at line 1378 of file pageres.cpp.

1379 {
1380 if (words->empty()) {
1381 DeleteCurrentWord();
1382 return;
1383 }
1384 WERD_RES *input_word = word();
1385 // Set the BOL/EOL flags on the words from the input word.
1386 if (input_word->word->flag(W_BOL)) {
1387 (*words)[0]->word->set_flag(W_BOL, true);
1388 } else {
1389 (*words)[0]->word->set_blanks(input_word->word->space());
1390 }
1391 words->back()->word->set_flag(W_EOL, input_word->word->flag(W_EOL));
1392
1393 // Move the blobs from the input word to the new set of words.
1394 // If the input word_res is a combination, then the replacements will also be
1395 // combinations, and will own their own words. If the input word_res is not a
1396 // combination, then the final replacements will not be either, (although it
1397 // is allowed for the input words to be combinations) and their words
1398 // will get put on the row list. This maintains the ownership rules.
1399 WERD_IT w_it(row()->row->word_list());
1400 if (!input_word->combination) {
1401 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
1402 WERD *word = w_it.data();
1403 if (word == input_word->word) {
1404 break;
1405 }
1406 }
1407 // w_it is now set to the input_word's word.
1408 ASSERT_HOST(!w_it.cycled_list());
1409 }
1410 // Insert into the appropriate place in the ROW_RES.
1411 WERD_RES_IT wr_it(&row()->word_res_list);
1412 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1413 WERD_RES *word = wr_it.data();
1414 if (word == input_word) {
1415 break;
1416 }
1417 }
1418 ASSERT_HOST(!wr_it.cycled_list());
1419 // Since we only have an estimate of the bounds between blobs, use the blob
1420 // x-middle as the determiner of where to put the blobs
1421 C_BLOB_IT src_b_it(input_word->word->cblob_list());
1422 src_b_it.sort(&C_BLOB::SortByXMiddle);
1423 C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
1424 rej_b_it.sort(&C_BLOB::SortByXMiddle);
1425 TBOX clip_box;
1426 for (size_t w = 0; w < words->size(); ++w) {
1427 WERD_RES *word_w = (*words)[w];
1428 clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
1429 // Compute blob boundaries.
1430 std::vector<int> blob_ends;
1431 C_BLOB_LIST *next_word_blobs =
1432 w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
1433 ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
1434 // Remove the fake blobs on the current word, but keep safe for back-up if
1435 // no blob can be found.
1436 C_BLOB_LIST fake_blobs;
1437 C_BLOB_IT fake_b_it(&fake_blobs);
1438 fake_b_it.add_list_after(word_w->word->cblob_list());
1439 fake_b_it.move_to_first();
1440 word_w->word->cblob_list()->clear();
1441 C_BLOB_IT dest_it(word_w->word->cblob_list());
1442 // Build the box word as we move the blobs.
1443 auto *box_word = new tesseract::BoxWord;
1444 for (size_t i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
1445 int end_x = blob_ends[i];
1446 TBOX blob_box;
1447 // Add the blobs up to end_x.
1448 while (!src_b_it.empty() &&
1449 src_b_it.data()->bounding_box().x_middle() < end_x) {
1450 blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
1451 src_b_it.forward();
1452 }
1453 while (!rej_b_it.empty() &&
1454 rej_b_it.data()->bounding_box().x_middle() < end_x) {
1455 blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
1456 rej_b_it.forward();
1457 }
1458 if (blob_box.null_box()) {
1459 // Use the original box as a back-up.
1460 blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
1461 }
1462 box_word->InsertBox(i, blob_box);
1463 }
1464 delete word_w->box_word;
1465 word_w->box_word = box_word;
1466 if (!input_word->combination) {
1467 // Insert word_w->word into the ROW. It doesn't own its word, so the
1468 // ROW needs to own it.
1469 w_it.add_before_stay_put(word_w->word);
1470 word_w->combination = false;
1471 }
1472 (*words)[w] = nullptr; // We are taking ownership.
1473 wr_it.add_before_stay_put(word_w);
1474 }
1475 // We have taken ownership of the words.
1476 words->clear();
1477 // Delete the current word, which has been replaced. We could just call
1478 // DeleteCurrentWord, but that would iterate both lists again, and we know
1479 // we are already in the right place.
1480 if (!input_word->combination) {
1481 delete w_it.extract();
1482 }
1483 delete wr_it.extract();
1484 ResetWordIterator();
1485}
@ TBOX
@ W_BOL
start of line
Definition: werd.h:34
@ W_EOL
end of line
Definition: werd.h:35
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:124
unsigned size() const
Definition: genericvector.h:70

◆ ResetWordIterator()

void tesseract::PAGE_RES_IT::ResetWordIterator ( )

Definition at line 1571 of file pageres.cpp.

1571 {
1572 if (row_res == next_row_res) {
1573 // Reset the member iterator so it can move forward and detect the
1574 // cycled_list state correctly.
1575 word_res_it.move_to_first();
1576 for (word_res_it.mark_cycle_pt();
1577 !word_res_it.cycled_list() && word_res_it.data() != next_word_res;
1578 word_res_it.forward()) {
1579 if (!word_res_it.data()->part_of_combo) {
1580 if (prev_row_res == row_res) {
1581 prev_word_res = word_res;
1582 }
1583 word_res = word_res_it.data();
1584 }
1585 }
1586 ASSERT_HOST(!word_res_it.cycled_list());
1587 wr_it_of_next_word = word_res_it;
1588 word_res_it.forward();
1589 } else {
1590 // word_res_it is OK, but reset word_res and prev_word_res if needed.
1591 WERD_RES_IT wr_it(&row_res->word_res_list);
1592 for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
1593 if (!wr_it.data()->part_of_combo) {
1594 if (prev_row_res == row_res) {
1595 prev_word_res = word_res;
1596 }
1597 word_res = wr_it.data();
1598 }
1599 }
1600 }
1601}

◆ restart_page()

WERD_RES * tesseract::PAGE_RES_IT::restart_page ( )
inline

Definition at line 710 of file pageres.h.

710 {
711 return start_page(false); // Skip empty blocks.
712 }
WERD_RES * start_page(bool empty_ok)
Definition: pageres.cpp:1548

◆ restart_page_with_empties()

WERD_RES * tesseract::PAGE_RES_IT::restart_page_with_empties ( )
inline

Definition at line 713 of file pageres.h.

713 {
714 return start_page(true); // Allow empty blocks.
715 }

◆ restart_row()

WERD_RES * tesseract::PAGE_RES_IT::restart_row ( )

Definition at line 1683 of file pageres.cpp.

1683 {
1684 ROW_RES *row = this->row();
1685 if (!row) {
1686 return nullptr;
1687 }
1688 for (restart_page(); this->row() != row; forward()) {
1689 // pass
1690 }
1691 return word();
1692}
WERD_RES * forward()
Definition: pageres.h:743

◆ row()

ROW_RES * tesseract::PAGE_RES_IT::row ( ) const
inline

Definition at line 766 of file pageres.h.

766 { // row of current word
767 return row_res;
768 }

◆ start_page()

WERD_RES * tesseract::PAGE_RES_IT::start_page ( bool  empty_ok)

Definition at line 1548 of file pageres.cpp.

1548 {
1549 block_res_it.set_to_list(&page_res->block_res_list);
1550 block_res_it.mark_cycle_pt();
1551 prev_block_res = nullptr;
1552 prev_row_res = nullptr;
1553 prev_word_res = nullptr;
1554 block_res = nullptr;
1555 row_res = nullptr;
1556 word_res = nullptr;
1557 next_block_res = nullptr;
1558 next_row_res = nullptr;
1559 next_word_res = nullptr;
1560 internal_forward(true, empty_ok);
1561 return internal_forward(false, empty_ok);
1562}

◆ word()

WERD_RES * tesseract::PAGE_RES_IT::word ( ) const
inline

Definition at line 763 of file pageres.h.

763 { // current word
764 return word_res;
765 }

Member Data Documentation

◆ page_res

PAGE_RES* tesseract::PAGE_RES_IT::page_res

Definition at line 684 of file pageres.h.


The documentation for this class was generated from the following files: