tesseract v5.3.3.20231005
tesseract::DocumentData Class Reference

#include <imagedata.h>

Public Member Functions

TESS_API DocumentData (const std::string &name)
 
TESS_API ~DocumentData ()
 
TESS_API bool LoadDocument (const char *filename, int start_page, int64_t max_memory, FileReader reader)
 
void SetDocument (const char *filename, int64_t max_memory, FileReader reader)
 
TESS_API bool SaveDocument (const char *filename, FileWriter writer)
 
TESS_API void AddPageToDocument (ImageData *page)
 
const std::string & document_name () const
 
int NumPages () const
 
size_t PagesSize () const
 
int64_t memory_used () const
 
void LoadPageInBackground (int index)
 
TESS_API const ImageData * GetPage (int index)
 
bool IsPageAvailable (int index, ImageData **page)
 
ImageData * TakePage (int index)
 
bool IsCached () const
 
int64_t UnCache ()
 
void Shuffle ()
 

Detailed Description

Definition at line 169 of file imagedata.h.

Constructor & Destructor Documentation

◆ DocumentData()

tesseract::DocumentData::DocumentData ( const std::string &  name)
explicit

Definition at line 381 of file imagedata.cpp.

382 : document_name_(name),
383 pages_offset_(-1),
384 total_pages_(-1),
385 memory_used_(0),
386 max_memory_(0),
387 reader_(nullptr) {}

◆ ~DocumentData()

tesseract::DocumentData::~DocumentData ( )

Definition at line 389 of file imagedata.cpp.

389 {
390 if (thread.joinable()) {
391 thread.join();
392 }
393 std::lock_guard<std::mutex> lock_p(pages_mutex_);
394 std::lock_guard<std::mutex> lock_g(general_mutex_);
395 for (auto data : pages_) {
396 delete data;
397 }
398}

Member Function Documentation

◆ AddPageToDocument()

void tesseract::DocumentData::AddPageToDocument ( ImageData *  page)

Definition at line 433 of file imagedata.cpp.

433 {
434 std::lock_guard<std::mutex> lock(pages_mutex_);
435 pages_.push_back(page);
436 set_memory_used(memory_used() + page->MemoryUsed());
437}
int64_t memory_used() const
Definition: imagedata.h:201

◆ document_name()

const std::string & tesseract::DocumentData::document_name ( ) const
inline

Definition at line 190 of file imagedata.h.

190 {
191 std::lock_guard<std::mutex> lock(general_mutex_);
192 return document_name_;
193 }

◆ GetPage()

const ImageData * tesseract::DocumentData::GetPage ( int  index)

Definition at line 467 of file imagedata.cpp.

467 {
468 ImageData *page = nullptr;
469 while (!IsPageAvailable(index, &page)) {
470 // If there is no background load scheduled, schedule one now.
471 pages_mutex_.lock();
472 bool needs_loading = pages_offset_ != index;
473 pages_mutex_.unlock();
474 if (needs_loading) {
475 LoadPageInBackground(index);
476 }
477 // We can't directly load the page, or the background load will delete it
478 // while the caller is using it, so give it a chance to work.
479 std::this_thread::yield();
480 }
481 return page;
482}
bool IsPageAvailable(int index, ImageData **page)
Definition: imagedata.cpp:487
void LoadPageInBackground(int index)
Definition: imagedata.cpp:441

◆ IsCached()

bool tesseract::DocumentData::IsCached ( ) const
inline

Definition at line 234 of file imagedata.h.

234 {
235 return NumPages() >= 0;
236 }
int NumPages() const
Definition: imagedata.h:194

◆ IsPageAvailable()

bool tesseract::DocumentData::IsPageAvailable ( int  index,
ImageData **  page 
)

Definition at line 487 of file imagedata.cpp.

487 {
488 std::lock_guard<std::mutex> lock(pages_mutex_);
489 int num_pages = NumPages();
490 if (num_pages == 0 || index < 0) {
491 *page = nullptr; // Empty Document.
492 return true;
493 }
494 if (num_pages > 0) {
495 index = Modulo(index, num_pages);
496 if (pages_offset_ <= index &&
497 static_cast<unsigned>(index) < pages_offset_ + pages_.size()) {
498 *page = pages_[index - pages_offset_]; // Page is available already.
499 return true;
500 }
501 }
502 return false;
503}
int Modulo(int a, int b)
Definition: helpers.h:153

◆ LoadDocument()

bool tesseract::DocumentData::LoadDocument ( const char *  filename,
int  start_page,
int64_t  max_memory,
FileReader  reader 
)

Definition at line 402 of file imagedata.cpp.

403 {
404 SetDocument(filename, max_memory, reader);
405 pages_offset_ = start_page;
406 return ReCachePages();
407}
void SetDocument(const char *filename, int64_t max_memory, FileReader reader)
Definition: imagedata.cpp:410

◆ LoadPageInBackground()

void tesseract::DocumentData::LoadPageInBackground ( int  index)

Definition at line 441 of file imagedata.cpp.

441 {
442 ImageData *page = nullptr;
443 if (IsPageAvailable(index, &page)) {
444 return;
445 }
446 {
447 std::lock_guard<std::mutex> lock(pages_mutex_);
448 if (pages_offset_ == index) {
449 return;
450 }
451 pages_offset_ = index;
452 for (auto page : pages_) {
453 delete page;
454 }
455 pages_.clear();
456 }
457 if (thread.joinable()) {
458 thread.join();
459 }
460 // Don't run next statement asynchronously because that would
461 // create too many threads on Linux (see issue #3111).
462 ReCachePages();
463}

◆ memory_used()

int64_t tesseract::DocumentData::memory_used ( ) const
inline

Definition at line 201 of file imagedata.h.

201 {
202 std::lock_guard<std::mutex> lock(general_mutex_);
203 return memory_used_;
204 }

◆ NumPages()

int tesseract::DocumentData::NumPages ( ) const
inline

Definition at line 194 of file imagedata.h.

194 {
195 std::lock_guard<std::mutex> lock(general_mutex_);
196 return total_pages_;
197 }

◆ PagesSize()

size_t tesseract::DocumentData::PagesSize ( ) const
inline

Definition at line 198 of file imagedata.h.

198 {
199 return pages_.size();
200 }

◆ SaveDocument()

bool tesseract::DocumentData::SaveDocument ( const char *  filename,
FileWriter  writer 
)

Definition at line 421 of file imagedata.cpp.

421 {
422 std::lock_guard<std::mutex> lock(pages_mutex_);
423 TFile fp;
424 fp.OpenWrite(nullptr);
425 if (!fp.Serialize(pages_) || !fp.CloseWrite(filename, writer)) {
426 tprintf("Serialize failed: %s\n", filename);
427 return false;
428 }
429 return true;
430}
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

◆ SetDocument()

void tesseract::DocumentData::SetDocument ( const char *  filename,
int64_t  max_memory,
FileReader  reader 
)

Definition at line 410 of file imagedata.cpp.

411 {
412 std::lock_guard<std::mutex> lock_p(pages_mutex_);
413 std::lock_guard<std::mutex> lock(general_mutex_);
414 document_name_ = filename;
415 pages_offset_ = -1;
416 max_memory_ = max_memory;
417 reader_ = reader;
418}

◆ Shuffle()

void tesseract::DocumentData::Shuffle ( )

Definition at line 523 of file imagedata.cpp.

523 {
524 TRand random;
525 // Different documents get shuffled differently, but the same for the same
526 // name.
527 random.set_seed(document_name_.c_str());
528 int num_pages = pages_.size();
529 // Execute one random swap for each page in the document.
530 for (int i = 0; i < num_pages; ++i) {
531 int src = random.IntRand() % num_pages;
532 int dest = random.IntRand() % num_pages;
533 std::swap(pages_[src], pages_[dest]);
534 }
535}
dest
Definition: upload.py:409

◆ TakePage()

ImageData * tesseract::DocumentData::TakePage ( int  index)
inline

Definition at line 226 of file imagedata.h.

226 {
227 std::lock_guard<std::mutex> lock(pages_mutex_);
228 ImageData *page = pages_[index];
229 pages_[index] = nullptr;
230 return page;
231 }

◆ UnCache()

int64_t tesseract::DocumentData::UnCache ( )

Definition at line 507 of file imagedata.cpp.

507 {
508 std::lock_guard<std::mutex> lock(pages_mutex_);
509 int64_t memory_saved = memory_used();
510 for (auto page : pages_) {
511 delete page;
512 }
513 pages_.clear();
514 pages_offset_ = -1;
515 set_total_pages(-1);
516 set_memory_used(0);
517 tprintf("Unloaded document %s, saving %" PRId64 " memory\n",
518 document_name_.c_str(), memory_saved);
519 return memory_saved;
520}

The documentation for this class was generated from the following files: