21# include "config_auto.h"
34#include <allheaders.h>
47 : page_number_(0), vertical_text_(vertical) {
51#ifdef TESSERACT_IMAGEDATA_AS_PIX
52 internal_pix_.destroy();
59 const char *imagedata,
int imagedatasize,
60 const char *truth_text,
const char *box_text) {
68 memcpy(&
image_data->image_data_[0], imagedata, imagedatasize);
70 if (truth_text ==
nullptr || truth_text[0] ==
'\0') {
71 tprintf(
"Error: No text corresponding to page %d from image %s!\n",
78 image_data->box_texts_.emplace_back(truth_text);
81 }
else if (truth_text !=
nullptr && truth_text[0] !=
'\0' &&
112 int8_t vertical = vertical_text_;
143 vertical_text_ = vertical != 0;
172 for (
int i = 0;
i < number;
i++) {
185#ifdef TESSERACT_IMAGEDATA_AS_PIX
188 SetPixInternal(pix, &image_data_);
194#ifdef TESSERACT_IMAGEDATA_AS_PIX
195# ifdef GRAPHICS_DISABLED
198 return internal_pix_.
clone();
202 return internal_pix_.
copy();
205 return GetPixInternal(image_data_);
216 float *scale_factor,
int *scaled_width,
217 int *scaled_height, std::vector<TBOX> *boxes)
const {
219 int input_height = 0;
222 input_width = pixGetWidth(src_pix);
223 input_height = pixGetHeight(src_pix);
224 if (target_height == 0) {
225 target_height = std::min(input_height, max_height);
227 float im_factor =
static_cast<float>(target_height) / input_height;
228 if (scaled_width !=
nullptr) {
231 if (scaled_height !=
nullptr) {
232 *scaled_height = target_height;
235 Image pix = pixScale(src_pix, im_factor, im_factor);
236 if (pix ==
nullptr) {
237 tprintf(
"Scaling pix of size %d, %d by factor %g made null pix!!\n",
238 input_width, input_height, im_factor);
242 if (scaled_width !=
nullptr) {
243 *scaled_width = pixGetWidth(pix);
245 if (scaled_height !=
nullptr) {
246 *scaled_height = pixGetHeight(pix);
249 if (
boxes !=
nullptr) {
252 for (
auto box : boxes_) {
253 box.scale(im_factor);
254 boxes->push_back(box);
256 if (
boxes->empty()) {
258 TBOX box(0, 0, im_factor * input_width, target_height);
259 boxes->push_back(box);
262 if (scale_factor !=
nullptr) {
263 *scale_factor = im_factor;
269 return image_data_.size();
272#ifndef GRAPHICS_DISABLED
276 const int kTextSize = 64;
279 if (pix ==
nullptr) {
282 int width = pixGetWidth(pix);
283 int height = pixGetHeight(pix);
284 auto *win =
new ScrollView(
"Imagedata", 100, 100, 2 * (width + 2 * kTextSize),
285 2 * (height + 4 * kTextSize), width + 10,
286 height + 3 * kTextSize,
true);
287 win->Draw(pix, 0, height - 1);
292 int text_size = kTextSize;
293 if (!boxes_.empty() && boxes_[0].height() * 2 < text_size) {
294 text_size = boxes_[0].height() * 2;
296 win->TextAttributes(
"Arial", text_size,
false,
false,
false);
297 if (!boxes_.empty()) {
298 for (
unsigned b = 0; b < boxes_.size(); ++b) {
300 win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].c_str());
305 win->Text(0, height + kTextSize * 2, transcription_.c_str());
316 const std::vector<std::string> &texts,
317 const std::vector<int> &box_pages) {
319 for (
unsigned i = 0;
i < box_pages.size(); ++
i) {
320 if (page_number_ >= 0 && box_pages[
i] != page_number_) {
323 transcription_ += texts[
i];
324 boxes_.push_back(
boxes[
i]);
325 box_texts_.push_back(texts[
i]);
329#ifndef TESSERACT_IMAGEDATA_AS_PIX
333void ImageData::SetPixInternal(
Image pix, std::vector<char> *image_data) {
337 ret = pixWriteMem(&data, &size, pix, IFF_PNG);
339 ret = pixWriteMem(&data, &size, pix, IFF_PNM);
349Image ImageData::GetPixInternal(
const std::vector<char> &image_data) {
354 reinterpret_cast<const unsigned char *
>(&
image_data[0]);
365 std::vector<TBOX>
boxes;
366 std::vector<std::string> texts;
367 std::vector<int> box_pages;
369 true, &
boxes, &texts,
nullptr,
374 tprintf(
"Error: No boxes for page %d from image %s!\n", page_number_,
375 imagefilename_.c_str());
382 : document_name_(name),
390 if (thread.joinable()) {
393 std::lock_guard<std::mutex> lock_p(pages_mutex_);
394 std::lock_guard<std::mutex> lock_g(general_mutex_);
395 for (
auto data : pages_) {
405 pages_offset_ = start_page;
406 return ReCachePages();
412 std::lock_guard<std::mutex> lock_p(pages_mutex_);
413 std::lock_guard<std::mutex> lock(general_mutex_);
414 document_name_ = filename;
416 max_memory_ = max_memory;
422 std::lock_guard<std::mutex> lock(pages_mutex_);
426 tprintf(
"Serialize failed: %s\n", filename);
434 std::lock_guard<std::mutex> lock(pages_mutex_);
435 pages_.push_back(page);
447 std::lock_guard<std::mutex> lock(pages_mutex_);
448 if (pages_offset_ == index) {
451 pages_offset_ = index;
452 for (
auto page : pages_) {
457 if (thread.joinable()) {
472 bool needs_loading = pages_offset_ != index;
473 pages_mutex_.unlock();
479 std::this_thread::yield();
488 std::lock_guard<std::mutex> lock(pages_mutex_);
490 if (num_pages == 0 || index < 0) {
495 index =
Modulo(index, num_pages);
496 if (pages_offset_ <= index &&
497 static_cast<unsigned>(index) < pages_offset_ + pages_.size()) {
498 *page = pages_[index - pages_offset_];
508 std::lock_guard<std::mutex> lock(pages_mutex_);
510 for (
auto page : pages_) {
517 tprintf(
"Unloaded document %s, saving %" PRId64
" memory\n",
518 document_name_.c_str(), memory_saved);
527 random.
set_seed(document_name_.c_str());
528 int num_pages = pages_.size();
530 for (
int i = 0;
i < num_pages; ++
i) {
531 int src = random.
IntRand() % num_pages;
533 std::swap(pages_[src], pages_[
dest]);
539bool DocumentData::ReCachePages() {
540 std::lock_guard<std::mutex> lock(pages_mutex_);
544 int loaded_pages = 0;
545 for (
auto page : pages_) {
550 if (!fp.Open(document_name_.c_str(), reader_) ||
551 !fp.DeSerializeSize(&loaded_pages) || loaded_pages <= 0) {
552 tprintf(
"Deserialize header failed: %s\n", document_name_.c_str());
555 pages_offset_ %= loaded_pages;
559 for (page = 0; page < loaded_pages; ++page) {
561 if (!fp.DeSerialize(&non_null)) {
564 if (page < pages_offset_ ||
565 (max_memory_ > 0 &&
memory_used() > max_memory_)) {
570 ImageData *image_data =
nullptr;
572 image_data =
new ImageData;
573 if (!image_data->DeSerialize(&fp)) {
578 pages_.push_back(image_data);
579 if (image_data->imagefilename().empty()) {
580 image_data->set_imagefilename(document_name_);
581 image_data->set_page_number(page);
583 set_memory_used(
memory_used() + image_data->MemoryUsed());
586 if (page < loaded_pages) {
587 tprintf(
"Deserialize failed: %s read %d/%d lines\n", document_name_.c_str(),
589 for (
auto page : pages_) {
593 }
else if (loaded_pages > 1) {
595 tprintf(
"Loaded %zu/%d lines (%d-%zu) of document %s\n", pages_.size(),
596 loaded_pages, pages_offset_ + 1, pages_offset_ + pages_.size(),
597 document_name_.c_str());
599 set_total_pages(loaded_pages);
600 return !pages_.empty();
607 for (
auto *document : documents_) {
617 cache_strategy_ = cache_strategy;
618 int64_t fair_share_memory = 0;
623 fair_share_memory = max_memory_ / filenames.size();
625 for (
const auto &filename : filenames) {
627 document->SetDocument(filename.c_str(), fair_share_memory, reader);
630 if (!documents_.empty()) {
635 tprintf(
"Load of page 0 failed!\n");
642 documents_.push_back(data);
648 const std::string &document_name)
const {
649 for (
auto *document : documents_) {
650 if (document->document_name() == document_name) {
663 if (num_pages_per_doc_ == 0) {
664 GetPageSequential(0);
666 return num_pages_per_doc_ * documents_.size();
669 for (
auto *document : documents_) {
671 document->GetPage(0);
672 total_pages += document->NumPages();
680const ImageData *DocumentCache::GetPageRoundRobin(
int serial) {
681 int num_docs = documents_.size();
682 int doc_index = serial % num_docs;
683 const ImageData *doc = documents_[doc_index]->GetPage(serial / num_docs);
684 for (
int offset = 1; offset <=
kMaxReadAhead && offset < num_docs; ++offset) {
685 doc_index = (serial + offset) % num_docs;
686 int page = (serial + offset) / num_docs;
687 documents_[doc_index]->LoadPageInBackground(page);
695const ImageData *DocumentCache::GetPageSequential(
int serial) {
696 int num_docs = documents_.size();
698 if (num_pages_per_doc_ == 0) {
700 documents_[0]->GetPage(0);
701 num_pages_per_doc_ = documents_[0]->NumPages();
702 if (num_pages_per_doc_ == 0) {
703 tprintf(
"First document cannot be empty!!\n");
707 if (serial / num_pages_per_doc_ % num_docs > 0) {
708 documents_[0]->UnCache();
711 int doc_index = serial / num_pages_per_doc_ % num_docs;
712 const ImageData *doc =
713 documents_[doc_index]->GetPage(serial % num_pages_per_doc_);
716 int64_t total_memory = 0;
717 for (
auto *document : documents_) {
718 total_memory += document->memory_used();
720 if (total_memory >= max_memory_) {
726 int num_in_front = CountNeighbourDocs(doc_index, 1);
727 for (
int offset = num_in_front - 2;
728 offset > 1 && total_memory >= max_memory_; --offset) {
729 int next_index = (doc_index + offset) % num_docs;
730 total_memory -= documents_[next_index]->UnCache();
735 int num_behind = CountNeighbourDocs(doc_index, -1);
736 for (
int offset = num_behind; offset < 0 && total_memory >= max_memory_;
738 int next_index = (doc_index + offset + num_docs) % num_docs;
739 total_memory -= documents_[next_index]->UnCache();
742 int next_index = (doc_index + 1) % num_docs;
743 if (!documents_[next_index]->IsCached() && total_memory < max_memory_) {
744 documents_[next_index]->LoadPageInBackground(0);
751int DocumentCache::CountNeighbourDocs(
int index,
int dir) {
752 int num_docs = documents_.size();
753 for (
int offset = dir; abs(offset) < num_docs; offset += dir) {
754 int offset_index = (index + offset + num_docs) % num_docs;
755 if (!documents_[offset_index]->IsCached()) {
bool ReadMemBoxes(int target_page, bool skip_blanks, const char *box_data, bool continue_on_failure, std::vector< TBOX > *boxes, std::vector< std::string > *texts, std::vector< std::string > *box_texts, std::vector< int > *pages)
bool(*)(const std::vector< char > &data, const char *filename) FileWriter
void tprintf(const char *format,...)
int IntCastRounded(double x)
bool(*)(const char *filename, std::vector< char > *data) FileReader
void AddBoxes(const std::vector< TBOX > &boxes, const std::vector< std::string > &texts, const std::vector< int > &box_pages)
static bool SkipDeSerialize(TFile *fp)
const std::vector< char > & image_data() const
const std::string & box_text(int index) const
bool Serialize(TFile *fp) const
bool DeSerialize(TFile *fp)
static ImageData * Build(const char *name, int page_number, const char *lang, const char *imagedata, int imagedatasize, const char *truth_text, const char *box_text)
Image PreScale(int target_height, int max_height, float *scale_factor, int *scaled_width, int *scaled_height, std::vector< TBOX > *boxes) const
const std::vector< TBOX > & boxes() const
bool IsPageAvailable(int index, ImageData **page)
TESS_API DocumentData(const std::string &name)
void SetDocument(const char *filename, int64_t max_memory, FileReader reader)
int64_t memory_used() const
TESS_API bool SaveDocument(const char *filename, FileWriter writer)
void LoadPageInBackground(int index)
TESS_API bool LoadDocument(const char *filename, int start_page, int64_t max_memory, FileReader reader)
TESS_API const ImageData * GetPage(int index)
TESS_API void AddPageToDocument(ImageData *page)
bool AddToCache(DocumentData *data)
const ImageData * GetPageBySerial(int serial)
DocumentData * FindDocument(const std::string &document_name) const
TESS_API bool LoadDocuments(const std::vector< std::string > &filenames, CachingStrategy cache_strategy, FileReader reader)
TESS_API ~DocumentCache()
TESS_API int TotalPages()
TESS_API DocumentCache(int64_t max_memory)
void set_seed(uint64_t seed)
void OpenWrite(std::vector< char > *data)
bool DeSerialize(std::string &data)
bool Serialize(const std::string &data)
bool DeSerializeSkip(size_t size=1)
bool CloseWrite(const char *filename, FileWriter writer)