All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
imagedata.h
Go to the documentation of this file.
1 // File: imagedata.h
3 // Description: Class to hold information about a single image and its
4 // corresponding boxes or text file.
5 // Author: Ray Smith
6 // Created: Mon Jul 22 14:17:06 PDT 2013
7 //
8 // (C) Copyright 2013, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
19 
20 #ifndef TESSERACT_IMAGE_IMAGEDATA_H_
21 #define TESSERACT_IMAGE_IMAGEDATA_H_
22 
23 
24 #include "genericvector.h"
25 #include "normalis.h"
26 #include "rect.h"
27 #include "strngs.h"
28 
29 struct Pix;
30 
31 namespace tesseract {
32 
33 // Padding, in output pixels, applied in feature mode.
34 const int kFeaturePadding = 2;
35 // Number of pixels of padding added around text boxes.
36 const int kImagePadding = 4;
37 // Number of training images combined into one mini-batch for training.
38 const int kNumPagesPerMiniBatch = 100;
39 
40 class WordFeature {  // Compact x/y/dir feature record.
41  public:
42  WordFeature();
43  WordFeature(const FCOORD& fcoord, uinT8 dir);
44 
45  // Computes the maximum x and y value over all of the given features.
46  static void ComputeSize(const GenericVector<WordFeature>& features,
47  int* max_x, int* max_y);
48  // Draws the features in the given window.
49  static void Draw(const GenericVector<WordFeature>& features,
50  ScrollView* window);
51 
52  // Accessors. The narrow stored types (inT16/uinT8) are widened to int.
53  int x() const { return x_; }
54  int y() const { return y_; }
55  int dir() const { return dir_; }
56 
57  // Writes to the given file. Returns false in case of error.
58  bool Serialize(FILE* fp) const;
59  // Reads from the given file. Returns false in case of error.
60  // If swap is true, assumes a big/little-endian swap is needed.
61  bool DeSerialize(bool swap, FILE* fp);
62 
63  private:
64  inT16 x_;  // Stored wider than y_ and dir_.
65  uinT8 y_;
66  uinT8 dir_;
67 };
68 
69 // A floating-point version of WordFeature, used as an intermediate during
70 // scaling. All members are public.
71 struct FloatWordFeature {
72  static void FromWordFeatures(const GenericVector<WordFeature>& word_features,
73  GenericVector<FloatWordFeature>* float_features);
74  // Sort function to sort first by x-bucket, then by y.
75  static int SortByXBucket(const void*, const void*);
76 
77  float x;
78  float y;
79  float dir;
80  int x_bucket;  // Primary key used by SortByXBucket.
81 };
82 
83 // Class to hold information on a single image:
84 // Filename, cached image as a Pix*, character boxes, text transcription.
85 // The text transcription is the ground truth UTF-8 text for the image.
86 // Character boxes are optional and indicate the desired segmentation of
87 // the text into recognition units.
88 class ImageData {
89  public:
90  ImageData();
91  // Takes ownership of the pix.
92  ImageData(bool vertical, Pix* pix);
93  ~ImageData();
94 
95  // Builds and returns an ImageData from the basic data. Note that imagedata,
96  // truth_text, and box_text are all the actual file data, NOT filenames.
97  static ImageData* Build(const char* name, int page_number, const char* lang,
98  const char* imagedata, int imagedatasize,
99  const char* truth_text, const char* box_text);
100 
101  // Writes to the given file. Returns false in case of error.
102  bool Serialize(TFile* fp) const;
103  // Reads from the given file. Returns false in case of error.
104  // If swap is true, assumes a big/little-endian swap is needed.
105  bool DeSerialize(bool swap, TFile* fp);
106 
107  // Other accessors.
108  const STRING& imagefilename() const {
109  return imagefilename_;
110  }
111  void set_imagefilename(const STRING& name) {
112  imagefilename_ = name;
113  }
114  int page_number() const {
115  return page_number_;
116  }
117  void set_page_number(int num) {
118  page_number_ = num;
119  }
121  return image_data_;
122  }
123  const STRING& language() const {
124  return language_;
125  }
126  void set_language(const STRING& lang) {
127  language_ = lang;
128  }
129  const STRING& transcription() const {
130  return transcription_;
131  }
132  const GenericVector<TBOX>& boxes() const {
133  return boxes_;
134  }
136  return box_texts_;
137  }
138  const STRING& box_text(int index) const {
139  return box_texts_[index];
140  }
141  // Saves the given Pix as a PNG-encoded string and destroys it.
142  void SetPix(Pix* pix);
143  // Returns the Pix image for *this. Must be pixDestroyed after use.
144  Pix* GetPix() const;
145  // Gets anything and everything with a non-NULL pointer, prescaled to a
146  // given target_height (if 0, then the original image height), and aligned.
147  // Also returns (if not NULL) the width and height of the scaled image.
148  // The return value is the scale factor that was applied to the image to
149  // achieve the target_height.
150  float PreScale(int target_height, Pix** pix,
151  int* scaled_width, int* scaled_height,
152  GenericVector<TBOX>* boxes) const;
153 
154  int MemoryUsed() const;
155 
156  // Draws the data in a new window.
157  void Display() const;
158 
159  // Adds the supplied boxes and transcriptions that correspond to the correct
160  // page number.
161  void AddBoxes(const GenericVector<TBOX>& boxes,
162  const GenericVector<STRING>& texts,
163  const GenericVector<int>& box_pages);
164 
165  private:
166  // Saves the given Pix as a PNG-encoded string and destroys it.
167  static void SetPixInternal(Pix* pix, GenericVector<char>* image_data);
168  // Returns the Pix image for the image_data. Must be pixDestroyed after use.
169  static Pix* GetPixInternal(const GenericVector<char>& image_data);
170  // Parses the text string as a box file and adds any discovered boxes that
171  // match the page number. Returns false on error.
172  bool AddBoxes(const char* box_text);
173 
174  private:
175  STRING imagefilename_; // File to read image from.
176  inT32 page_number_; // Page number if multi-page tif or -1.
177  GenericVector<char> image_data_; // PNG file data.
178  STRING language_; // Language code for image.
179  STRING transcription_; // UTF-8 ground truth of image.
180  GenericVector<TBOX> boxes_; // If non-empty boxes of the image.
181  GenericVector<STRING> box_texts_; // String for text in each box.
182  bool vertical_text_; // Image has been rotated from vertical.
183 };
184 
185 // A collection of ImageData that knows roughly how much memory it is using.
186 class DocumentData {
187  public:
188  explicit DocumentData(const STRING& name);
189  ~DocumentData();
190 
191  // Reads all the pages in the given lstmf filename to the cache. The reader
192  // is used to read the file.
193  bool LoadDocument(const char* filename, const char* lang, int start_page,
194  inT64 max_memory, FileReader reader);
195  // Writes all the pages to the given filename. Returns false on error.
196  bool SaveDocument(const char* filename, FileWriter writer);
197  bool SaveToBuffer(GenericVector<char>* buffer);
198 
199  // Adds the given page data to this document, counting up memory.
200  void AddPageToDocument(ImageData* page);
201 
202  const STRING& document_name() const {
203  return document_name_;
204  }
205  int NumPages() const {
206  return total_pages_;
207  }
208  inT64 memory_used() const {
209  return memory_used_;
210  }
211  // Returns a pointer to the page with the given index, modulo the total
212  // number of pages, recaching if needed.
213  const ImageData* GetPage(int index);
214  // Takes ownership of the given page index. The page is made NULL in *this.
215  ImageData* TakePage(int index) {
216  ImageData* page = pages_[index];  // index subscripts pages_ directly.
217  pages_[index] = NULL;
218  return page;
219  }
220 
221  private:
222  // Loads as many pages as can fit in max_memory_, starting at index
222  // pages_offset_.
223  bool ReCachePages();
224 
225  private:
226  // A name for this document.
227  STRING document_name_;
228  // The language of this document.
229  STRING lang_;
230  // A group of pages that corresponds in some loose way to a document.
231  PointerVector<ImageData> pages_;  // NOTE(review): container type inferred from TakePage(); confirm.
232  // Page number of the first index in pages_.
233  int pages_offset_;
234  // Total number of pages in document (may exceed size of pages_.)
235  int total_pages_;
236  // Total of all pix sizes in the document.
237  inT64 memory_used_;
238  // Max memory to use at any time.
239  inT64 max_memory_;
240  // Saved reader from LoadDocument to allow re-caching.
241  FileReader reader_;
242 };
243 
244 // A collection of DocumentData that knows roughly how much memory it is using.
245 class DocumentCache {
246  public:
247  explicit DocumentCache(inT64 max_memory);
248  ~DocumentCache();
249 
250  // Adds all the documents in the list of filenames, counting memory.
251  // The reader is used to read the files.
252  bool LoadDocuments(const GenericVector<STRING>& filenames, const char* lang,
253  FileReader reader);
254 
255  // Adds document to the cache, throwing out other documents if needed.
256  bool AddToCache(DocumentData* data);
257 
258  // Finds and returns a document by name.
259  DocumentData* FindDocument(const STRING& document_name) const;
260 
261  // Returns a page by serial number, selecting them in a round-robin fashion
262  // from all the documents.
263  const ImageData* GetPageBySerial(int serial);
264 
265  const PointerVector<DocumentData>& documents() const {
266  return documents_;
267  }
268  int total_pages() const {
269  return total_pages_;
270  }
271 
272  private:
273  // The documents held in this cache.
274  PointerVector<DocumentData> documents_;
275  // Total of all pages.
276  int total_pages_;
277  // Total of all memory used by the cache.
278  inT64 memory_used_;
279  // Max memory allowed in this cache.
280  inT64 max_memory_;
281 };
282 
283 } // namespace tesseract
284 
285 
286 #endif // TESSERACT_IMAGE_IMAGEDATA_H_
bool DeSerialize(bool swap, FILE *fp)
Definition: imagedata.cpp:77
const STRING & transcription() const
Definition: imagedata.h:129
static ImageData * Build(const char *name, int page_number, const char *lang, const char *imagedata, int imagedatasize, const char *truth_text, const char *box_text)
Definition: imagedata.cpp:120
const int kImagePadding
Definition: imagedata.h:36
bool DeSerialize(bool swap, TFile *fp)
Definition: imagedata.cpp:166
ImageData * TakePage(int index)
Definition: imagedata.h:215
static void ComputeSize(const GenericVector< WordFeature > &features, int *max_x, int *max_y)
Definition: imagedata.cpp:41
bool(* FileWriter)(const GenericVector< char > &data, const STRING &filename)
int dir() const
Definition: imagedata.h:55
const STRING & language() const
Definition: imagedata.h:123
const int kFeaturePadding
Definition: imagedata.h:34
void set_language(const STRING &lang)
Definition: imagedata.h:126
int MemoryUsed() const
Definition: imagedata.cpp:243
float PreScale(int target_height, Pix **pix, int *scaled_width, int *scaled_height, GenericVector< TBOX > *boxes) const
Definition: imagedata.cpp:196
bool(* FileReader)(const STRING &filename, GenericVector< char > *data)
const ImageData * GetPageBySerial(int serial)
Definition: imagedata.cpp:489
void AddBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, const GenericVector< int > &box_pages)
Definition: imagedata.cpp:285
int NumPages() const
Definition: imagedata.h:205
const ImageData * GetPage(int index)
Definition: imagedata.cpp:376
const PointerVector< DocumentData > & documents() const
Definition: imagedata.h:265
bool Serialize(FILE *fp) const
Definition: imagedata.cpp:69
const GenericVector< STRING > & box_texts() const
Definition: imagedata.h:135
bool SaveDocument(const char *filename, FileWriter writer)
Definition: imagedata.cpp:359
name_table name
int x() const
Definition: imagedata.h:53
const int kNumPagesPerMiniBatch
Definition: imagedata.h:38
DocumentCache(inT64 max_memory)
Definition: imagedata.cpp:435
Pix * GetPix() const
Definition: imagedata.cpp:187
bool LoadDocuments(const GenericVector< STRING > &filenames, const char *lang, FileReader reader)
Definition: imagedata.cpp:441
void SetPix(Pix *pix)
Definition: imagedata.cpp:182
DocumentData * FindDocument(const STRING &document_name) const
Definition: imagedata.cpp:479
void AddPageToDocument(ImageData *page)
Definition: imagedata.cpp:429
void Display() const
Definition: imagedata.cpp:248
int page_number() const
Definition: imagedata.h:114
static int SortByXBucket(const void *, const void *)
Definition: imagedata.cpp:100
void set_imagefilename(const STRING &name)
Definition: imagedata.h:111
const GenericVector< TBOX > & boxes() const
Definition: imagedata.h:132
bool AddToCache(DocumentData *data)
Definition: imagedata.cpp:461
DocumentData(const STRING &name)
Definition: imagedata.cpp:339
Definition: strngs.h:44
#define NULL
Definition: host.h:144
void set_page_number(int num)
Definition: imagedata.h:117
inT64 memory_used() const
Definition: imagedata.h:208
static void Draw(const GenericVector< WordFeature > &features, ScrollView *window)
Definition: imagedata.cpp:52
int y() const
Definition: imagedata.h:54
const GenericVector< char > & image_data() const
Definition: imagedata.h:120
int total_pages() const
Definition: imagedata.h:268
bool SaveToBuffer(GenericVector< char > *buffer)
Definition: imagedata.cpp:368
Definition: points.h:189
bool LoadDocument(const char *filename, const char *lang, int start_page, inT64 max_memory, FileReader reader)
Definition: imagedata.cpp:347
bool Serialize(TFile *fp) const
Definition: imagedata.cpp:151
const STRING & imagefilename() const
Definition: imagedata.h:108
const STRING & box_text(int index) const
Definition: imagedata.h:138
short inT16
Definition: host.h:100
const STRING & document_name() const
Definition: imagedata.h:202
int inT32
Definition: host.h:102
static void FromWordFeatures(const GenericVector< WordFeature > &word_features, GenericVector< FloatWordFeature > *float_features)
Definition: imagedata.cpp:85
unsigned char uinT8
Definition: host.h:99
long long int inT64
Definition: host.h:108