tesseract v5.3.3.20231005
tesseract::TrainingSample Class Reference

#include <trainingsample.h>

Inheritance diagram for tesseract::TrainingSample:
tesseract::ELIST_LINK

Public Member Functions

 TrainingSample ()
 
 ~TrainingSample ()
 
FEATURE_STRUCT * GetCNFeature () const
 
TrainingSample * RandomizedCopy (int index) const
 
TrainingSample * Copy () const
 
bool Serialize (FILE *fp) const
 
bool DeSerialize (bool swap, FILE *fp)
 
void ExtractCharDesc (int feature_type, int micro_type, int cn_type, int geo_type, CHAR_DESC_STRUCT *char_desc)
 
void IndexFeatures (const IntFeatureSpace &feature_space)
 
Image RenderToPix (const UNICHARSET *unicharset) const
 
void DisplayFeatures (ScrollView::Color color, ScrollView *window) const
 
Image GetSamplePix (int padding, Image page_pix) const
 
UNICHAR_ID class_id () const
 
void set_class_id (int id)
 
int font_id () const
 
void set_font_id (int id)
 
int page_num () const
 
void set_page_num (int page)
 
const TBOX & bounding_box () const
 
void set_bounding_box (const TBOX &box)
 
uint32_t num_features () const
 
const INT_FEATURE_STRUCT * features () const
 
uint32_t num_micro_features () const
 
const MicroFeature * micro_features () const
 
int outline_length () const
 
float cn_feature (int index) const
 
int geo_feature (int index) const
 
double weight () const
 
void set_weight (double value)
 
double max_dist () const
 
void set_max_dist (double value)
 
int sample_index () const
 
void set_sample_index (int value)
 
bool features_are_mapped () const
 
const std::vector< int > & mapped_features () const
 
const std::vector< int > & indexed_features () const
 
bool is_error () const
 
void set_is_error (bool value)
 
- Public Member Functions inherited from tesseract::ELIST_LINK
 ELIST_LINK ()
 
 ELIST_LINK (const ELIST_LINK &)
 
void operator= (const ELIST_LINK &)
 

Static Public Member Functions

static TrainingSample * CopyFromFeatures (const INT_FX_RESULT_STRUCT &fx_info, const TBOX &bounding_box, const INT_FEATURE_STRUCT *features, int num_features)
 
static TrainingSample * DeSerializeCreate (bool swap, FILE *fp)
 

Public Attributes

std::vector< int > mapped_features_
 
bool features_are_indexed_
 
bool features_are_mapped_
 

Detailed Description

Definition at line 54 of file trainingsample.h.

Constructor & Destructor Documentation

◆ TrainingSample()

tesseract::TrainingSample::TrainingSample ( )
inline

Definition at line 56 of file trainingsample.h.

57 : class_id_(INVALID_UNICHAR_ID)
58 , font_id_(0)
59 , page_num_(0)
60 , num_features_(0)
61 , num_micro_features_(0)
62 , outline_length_(0)
63 , features_(nullptr)
64 , micro_features_(nullptr)
65 , weight_(1.0)
66 , max_dist_(0.0)
67 , sample_index_(0)
68 , features_are_indexed_(false)
69 , features_are_mapped_(false)
70 , is_error_(false) {}

◆ ~TrainingSample()

tesseract::TrainingSample::~TrainingSample ( )

Definition at line 42 of file trainingsample.cpp.

42 {
43 delete[] features_;
44 delete[] micro_features_;
45}

Member Function Documentation

◆ bounding_box()

const TBOX & tesseract::TrainingSample::bounding_box ( ) const
inline

Definition at line 137 of file trainingsample.h.

137 {
138 return bounding_box_;
139 }

◆ class_id()

UNICHAR_ID tesseract::TrainingSample::class_id ( ) const
inline

Definition at line 119 of file trainingsample.h.

119 {
120 return class_id_;
121 }

◆ cn_feature()

float tesseract::TrainingSample::cn_feature ( int  index) const
inline

Definition at line 158 of file trainingsample.h.

158 {
159 return cn_feature_[index];
160 }

◆ Copy()

TrainingSample * tesseract::TrainingSample::Copy ( ) const

Definition at line 213 of file trainingsample.cpp.

213 {
214 auto *sample = new TrainingSample;
215 sample->class_id_ = class_id_;
216 sample->font_id_ = font_id_;
217 sample->weight_ = weight_;
218 sample->sample_index_ = sample_index_;
219 sample->num_features_ = num_features_;
220 if (num_features_ > 0) {
221 sample->features_ = new INT_FEATURE_STRUCT[num_features_];
222 memcpy(sample->features_, features_, num_features_ * sizeof(features_[0]));
223 }
224 sample->num_micro_features_ = num_micro_features_;
225 if (num_micro_features_ > 0) {
226 sample->micro_features_ = new MicroFeature[num_micro_features_];
227 memcpy(sample->micro_features_, micro_features_,
228 num_micro_features_ * sizeof(micro_features_[0]));
229 }
230 memcpy(sample->cn_feature_, cn_feature_, sizeof(*cn_feature_) * kNumCNParams);
231 memcpy(sample->geo_feature_, geo_feature_, sizeof(*geo_feature_) * GeoCount);
232 return sample;
233}
@ GeoCount
Definition: picofeat.h:40
std::array< float,(int) MicroFeatureParameter::MFCount > MicroFeature
Definition: mfdefs.h:36

◆ CopyFromFeatures()

TrainingSample * tesseract::TrainingSample::CopyFromFeatures ( const INT_FX_RESULT_STRUCT &  fx_info,
const TBOX &  bounding_box,
const INT_FEATURE_STRUCT *  features,
int  num_features 
)
static

Definition at line 158 of file trainingsample.cpp.

161 {
162 auto *sample = new TrainingSample;
163 sample->num_features_ = num_features;
164 sample->features_ = new INT_FEATURE_STRUCT[num_features];
165 sample->outline_length_ = fx_info.Length;
166 memcpy(sample->features_, features, num_features * sizeof(features[0]));
167 sample->geo_feature_[GeoBottom] = bounding_box.bottom();
168 sample->geo_feature_[GeoTop] = bounding_box.top();
169 sample->geo_feature_[GeoWidth] = bounding_box.width();
170
171 // Generate the cn_feature_ from the fx_info.
172 sample->cn_feature_[CharNormY] = MF_SCALE_FACTOR * (fx_info.Ymean - kBlnBaselineOffset);
173 sample->cn_feature_[CharNormLength] = MF_SCALE_FACTOR * fx_info.Length / LENGTH_COMPRESSION;
174 sample->cn_feature_[CharNormRx] = MF_SCALE_FACTOR * fx_info.Rx;
175 sample->cn_feature_[CharNormRy] = MF_SCALE_FACTOR * fx_info.Ry;
176
177 sample->features_are_indexed_ = false;
178 sample->features_are_mapped_ = false;
179 return sample;
180}
#define LENGTH_COMPRESSION
Definition: normfeat.h:26
const float MF_SCALE_FACTOR
Definition: mfoutline.h:61
@ GeoTop
Definition: picofeat.h:37
@ GeoWidth
Definition: picofeat.h:38
@ GeoBottom
Definition: picofeat.h:36
@ CharNormLength
Definition: normfeat.h:30
@ CharNormRy
Definition: normfeat.h:30
@ CharNormY
Definition: normfeat.h:30
@ CharNormRx
Definition: normfeat.h:30
const int kBlnBaselineOffset
Definition: normalis.h:34
TDimension width() const
Definition: rect.h:126
TDimension top() const
Definition: rect.h:68
TDimension bottom() const
Definition: rect.h:75
const INT_FEATURE_STRUCT * features() const
const TBOX & bounding_box() const
uint32_t num_features() const

◆ DeSerialize()

bool tesseract::TrainingSample::DeSerialize ( bool  swap,
FILE *  fp 
)

Definition at line 102 of file trainingsample.cpp.

102 {
103 if (fread(&class_id_, sizeof(class_id_), 1, fp) != 1) {
104 return false;
105 }
106 if (fread(&font_id_, sizeof(font_id_), 1, fp) != 1) {
107 return false;
108 }
109 if (fread(&page_num_, sizeof(page_num_), 1, fp) != 1) {
110 return false;
111 }
112 if (!bounding_box_.DeSerialize(swap, fp)) {
113 return false;
114 }
115 if (fread(&num_features_, sizeof(num_features_), 1, fp) != 1) {
116 return false;
117 }
118 if (fread(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1) {
119 return false;
120 }
121 if (fread(&outline_length_, sizeof(outline_length_), 1, fp) != 1) {
122 return false;
123 }
124 if (swap) {
125 ReverseN(&class_id_, sizeof(class_id_));
126 ReverseN(&num_features_, sizeof(num_features_));
127 ReverseN(&num_micro_features_, sizeof(num_micro_features_));
128 ReverseN(&outline_length_, sizeof(outline_length_));
129 }
130 // Arbitrarily limit the number of elements to protect against bad data.
131 if (num_features_ > UINT16_MAX) {
132 return false;
133 }
134 if (num_micro_features_ > UINT16_MAX) {
135 return false;
136 }
137 delete[] features_;
138 features_ = new INT_FEATURE_STRUCT[num_features_];
139 if (fread(features_, sizeof(*features_), num_features_, fp) != num_features_) {
140 return false;
141 }
142 delete[] micro_features_;
143 micro_features_ = new MicroFeature[num_micro_features_];
144 if (fread(micro_features_, sizeof(*micro_features_), num_micro_features_, fp) !=
145 num_micro_features_) {
146 return false;
147 }
148 if (fread(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) != kNumCNParams) {
149 return false;
150 }
151 if (fread(geo_feature_, sizeof(*geo_feature_), GeoCount, fp) != GeoCount) {
152 return false;
153 }
154 return true;
155}
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:184
bool DeSerialize(bool swap, FILE *fp)
Definition: rect.cpp:198

◆ DeSerializeCreate()

TrainingSample * tesseract::TrainingSample::DeSerializeCreate ( bool  swap,
FILE *  fp 
)
static

Definition at line 91 of file trainingsample.cpp.

91 {
92 auto *sample = new TrainingSample;
93 if (sample->DeSerialize(swap, fp)) {
94 return sample;
95 }
96 delete sample;
97 return nullptr;
98}

◆ DisplayFeatures()

void tesseract::TrainingSample::DisplayFeatures ( ScrollView::Color  color,
ScrollView *  window 
) const

Definition at line 330 of file trainingsample.cpp.

330 {
331 for (uint32_t f = 0; f < num_features_; ++f) {
332 RenderIntFeature(window, &features_[f], color);
333 }
334}
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1500

◆ ExtractCharDesc()

void tesseract::TrainingSample::ExtractCharDesc ( int  feature_type,
int  micro_type,
int  cn_type,
int  geo_type,
CHAR_DESC_STRUCT *  char_desc 
)

Definition at line 236 of file trainingsample.cpp.

237 {
238 // Extract the INT features.
239 delete[] features_;
240 FEATURE_SET_STRUCT *char_features = char_desc->FeatureSets[int_feature_type];
241 if (char_features == nullptr) {
242 tprintf("Error: no features to train on of type %s\n", kIntFeatureType);
243 num_features_ = 0;
244 features_ = nullptr;
245 } else {
246 num_features_ = char_features->NumFeatures;
247 features_ = new INT_FEATURE_STRUCT[num_features_];
248 for (uint32_t f = 0; f < num_features_; ++f) {
249 features_[f].X = static_cast<uint8_t>(char_features->Features[f]->Params[IntX]);
250 features_[f].Y = static_cast<uint8_t>(char_features->Features[f]->Params[IntY]);
251 features_[f].Theta = static_cast<uint8_t>(char_features->Features[f]->Params[IntDir]);
252 features_[f].CP_misses = 0;
253 }
254 }
255 // Extract the Micro features.
256 delete[] micro_features_;
257 char_features = char_desc->FeatureSets[micro_type];
258 if (char_features == nullptr) {
259 tprintf("Error: no features to train on of type %s\n", kMicroFeatureType);
260 num_micro_features_ = 0;
261 micro_features_ = nullptr;
262 } else {
263 num_micro_features_ = char_features->NumFeatures;
264 micro_features_ = new MicroFeature[num_micro_features_];
265 for (uint32_t f = 0; f < num_micro_features_; ++f) {
266 for (int d = 0; d < (int)MicroFeatureParameter::MFCount; ++d) {
267 micro_features_[f][d] = char_features->Features[f]->Params[d];
268 }
269 }
270 }
271 // Extract the CN feature.
272 char_features = char_desc->FeatureSets[cn_type];
273 if (char_features == nullptr) {
274 tprintf("Error: no CN feature to train on.\n");
275 } else {
276 ASSERT_HOST(char_features->NumFeatures == 1);
277 cn_feature_[CharNormY] = char_features->Features[0]->Params[CharNormY];
278 cn_feature_[CharNormLength] = char_features->Features[0]->Params[CharNormLength];
279 cn_feature_[CharNormRx] = char_features->Features[0]->Params[CharNormRx];
280 cn_feature_[CharNormRy] = char_features->Features[0]->Params[CharNormRy];
281 }
282 // Extract the Geo feature.
283 char_features = char_desc->FeatureSets[geo_type];
284 if (char_features == nullptr) {
285 tprintf("Error: no Geo feature to train on.\n");
286 } else {
287 ASSERT_HOST(char_features->NumFeatures == 1);
288 geo_feature_[GeoBottom] = char_features->Features[0]->Params[GeoBottom];
289 geo_feature_[GeoTop] = char_features->Features[0]->Params[GeoTop];
290 geo_feature_[GeoWidth] = char_features->Features[0]->Params[GeoWidth];
291 }
292 features_are_indexed_ = false;
293 features_are_mapped_ = false;
294}
#define ASSERT_HOST(x)
Definition: errcode.h:54
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const char *const kIntFeatureType
Definition: featdefs.cpp:35
@ IntDir
Definition: picofeat.h:31
const char *const kMicroFeatureType
Definition: featdefs.cpp:33

◆ features()

const INT_FEATURE_STRUCT * tesseract::TrainingSample::features ( ) const
inline

Definition at line 146 of file trainingsample.h.

146 {
147 return features_;
148 }

◆ features_are_mapped()

bool tesseract::TrainingSample::features_are_mapped ( ) const
inline

Definition at line 182 of file trainingsample.h.

182 {
183 return features_are_mapped_;
184 }

◆ font_id()

int tesseract::TrainingSample::font_id ( ) const
inline

Definition at line 125 of file trainingsample.h.

125 {
126 return font_id_;
127 }

◆ geo_feature()

int tesseract::TrainingSample::geo_feature ( int  index) const
inline

Definition at line 161 of file trainingsample.h.

161 {
162 return geo_feature_[index];
163 }

◆ GetCNFeature()

FEATURE_STRUCT * tesseract::TrainingSample::GetCNFeature ( ) const

Definition at line 183 of file trainingsample.cpp.

183 {
184 auto feature = new FEATURE_STRUCT(&CharNormDesc);
185 for (int i = 0; i < kNumCNParams; ++i) {
186 feature->Params[i] = cn_feature_[i];
187 }
188 return feature;
189}
const FEATURE_DESC_STRUCT CharNormDesc

◆ GetSamplePix()

Image tesseract::TrainingSample::GetSamplePix ( int  padding,
Image  page_pix 
) const

Definition at line 342 of file trainingsample.cpp.

342 {
343 if (page_pix == nullptr) {
344 return nullptr;
345 }
346 int page_width = pixGetWidth(page_pix);
347 int page_height = pixGetHeight(page_pix);
348 TBOX padded_box = bounding_box();
349 padded_box.pad(padding, padding);
350 // Clip the padded_box to the limits of the page
351 TBOX page_box(0, 0, page_width, page_height);
352 padded_box &= page_box;
353 Box *box =
354 boxCreate(page_box.left(), page_height - page_box.top(), page_box.width(), page_box.height());
355 Image sample_pix = pixClipRectangle(page_pix, box, nullptr);
356 boxDestroy(&box);
357 return sample_pix;
358}
@ TBOX

◆ indexed_features()

const std::vector< int > & tesseract::TrainingSample::indexed_features ( ) const
inline

Definition at line 189 of file trainingsample.h.

189 {
190 ASSERT_HOST(features_are_indexed_);
191 return mapped_features_;
192 }
std::vector< int > mapped_features_

◆ IndexFeatures()

void tesseract::TrainingSample::IndexFeatures ( const IntFeatureSpace &  feature_space)

Definition at line 298 of file trainingsample.cpp.

298 {
299 std::vector<int> indexed_features;
300 feature_space.IndexAndSortFeatures(features_, num_features_, &mapped_features_);
301 features_are_indexed_ = true;
302 features_are_mapped_ = false;
303}
const std::vector< int > & indexed_features() const

◆ is_error()

bool tesseract::TrainingSample::is_error ( ) const
inline

Definition at line 193 of file trainingsample.h.

193 {
194 return is_error_;
195 }

◆ mapped_features()

const std::vector< int > & tesseract::TrainingSample::mapped_features ( ) const
inline

Definition at line 185 of file trainingsample.h.

185 {
186 ASSERT_HOST(features_are_mapped_);
187 return mapped_features_;
188 }

◆ max_dist()

double tesseract::TrainingSample::max_dist ( ) const
inline

Definition at line 170 of file trainingsample.h.

170 {
171 return max_dist_;
172 }

◆ micro_features()

const MicroFeature * tesseract::TrainingSample::micro_features ( ) const
inline

Definition at line 152 of file trainingsample.h.

152 {
153 return micro_features_;
154 }

◆ num_features()

uint32_t tesseract::TrainingSample::num_features ( ) const
inline

Definition at line 143 of file trainingsample.h.

143 {
144 return num_features_;
145 }

◆ num_micro_features()

uint32_t tesseract::TrainingSample::num_micro_features ( ) const
inline

Definition at line 149 of file trainingsample.h.

149 {
150 return num_micro_features_;
151 }

◆ outline_length()

int tesseract::TrainingSample::outline_length ( ) const
inline

Definition at line 155 of file trainingsample.h.

155 {
156 return outline_length_;
157 }

◆ page_num()

int tesseract::TrainingSample::page_num ( ) const
inline

Definition at line 131 of file trainingsample.h.

131 {
132 return page_num_;
133 }

◆ RandomizedCopy()

TrainingSample * tesseract::TrainingSample::RandomizedCopy ( int  index) const

Definition at line 194 of file trainingsample.cpp.

194 {
195 TrainingSample *sample = Copy();
196 if (index >= 0 && index < kSampleRandomSize) {
197 ++index; // Remove the first combination.
198 const int yshift = kYShiftValues[index / kSampleScaleSize];
199 double scaling = kScaleValues[index % kSampleScaleSize];
200 for (uint32_t i = 0; i < num_features_; ++i) {
201 double result = (features_[i].X - kRandomizingCenter) * scaling;
202 result += kRandomizingCenter;
203 sample->features_[i].X = ClipToRange<int>(result + 0.5, 0, UINT8_MAX);
204 result = (features_[i].Y - kRandomizingCenter) * scaling;
205 result += kRandomizingCenter + yshift;
206 sample->features_[i].Y = ClipToRange<int>(result + 0.5, 0, UINT8_MAX);
207 }
208 }
209 return sample;
210}
const int kRandomizingCenter
TrainingSample * Copy() const

◆ RenderToPix()

Image tesseract::TrainingSample::RenderToPix ( const UNICHARSET *  unicharset) const

Definition at line 306 of file trainingsample.cpp.

306 {
307 Image pix = pixCreate(kIntFeatureExtent, kIntFeatureExtent, 1);
308 for (uint32_t f = 0; f < num_features_; ++f) {
309 int start_x = features_[f].X;
310 int start_y = kIntFeatureExtent - features_[f].Y;
311 double dx = cos((features_[f].Theta / 256.0) * 2.0 * M_PI - M_PI);
312 double dy = -sin((features_[f].Theta / 256.0) * 2.0 * M_PI - M_PI);
313 for (int i = 0; i <= 5; ++i) {
314 int x = static_cast<int>(start_x + dx * i);
315 int y = static_cast<int>(start_y + dy * i);
316 if (x >= 0 && x < 256 && y >= 0 && y < 256) {
317 pixSetPixel(pix, x, y, 1);
318 }
319 }
320 }
321 if (unicharset != nullptr) {
322 pixSetText(pix, unicharset->id_to_unichar(class_id_));
323 }
324 return pix;
325}
const int kIntFeatureExtent
const double y

◆ sample_index()

int tesseract::TrainingSample::sample_index ( ) const
inline

Definition at line 176 of file trainingsample.h.

176 {
177 return sample_index_;
178 }

◆ Serialize()

bool tesseract::TrainingSample::Serialize ( FILE *  fp) const

Definition at line 51 of file trainingsample.cpp.

51 {
52 if (fwrite(&class_id_, sizeof(class_id_), 1, fp) != 1) {
53 return false;
54 }
55 if (fwrite(&font_id_, sizeof(font_id_), 1, fp) != 1) {
56 return false;
57 }
58 if (fwrite(&page_num_, sizeof(page_num_), 1, fp) != 1) {
59 return false;
60 }
61 if (!bounding_box_.Serialize(fp)) {
62 return false;
63 }
64 if (fwrite(&num_features_, sizeof(num_features_), 1, fp) != 1) {
65 return false;
66 }
67 if (fwrite(&num_micro_features_, sizeof(num_micro_features_), 1, fp) != 1) {
68 return false;
69 }
70 if (fwrite(&outline_length_, sizeof(outline_length_), 1, fp) != 1) {
71 return false;
72 }
73 if (fwrite(features_, sizeof(*features_), num_features_, fp) != num_features_) {
74 return false;
75 }
76 if (fwrite(micro_features_, sizeof(*micro_features_), num_micro_features_, fp) !=
77 num_micro_features_) {
78 return false;
79 }
80 if (fwrite(cn_feature_, sizeof(*cn_feature_), kNumCNParams, fp) != kNumCNParams) {
81 return false;
82 }
83 if (fwrite(geo_feature_, sizeof(*geo_feature_), GeoCount, fp) != GeoCount) {
84 return false;
85 }
86 return true;
87}
bool Serialize(FILE *fp) const
Definition: rect.cpp:187

◆ set_bounding_box()

void tesseract::TrainingSample::set_bounding_box ( const TBOX &  box)
inline

Definition at line 140 of file trainingsample.h.

140 {
141 bounding_box_ = box;
142 }

◆ set_class_id()

void tesseract::TrainingSample::set_class_id ( int  id)
inline

Definition at line 122 of file trainingsample.h.

122 {
123 class_id_ = id;
124 }

◆ set_font_id()

void tesseract::TrainingSample::set_font_id ( int  id)
inline

Definition at line 128 of file trainingsample.h.

128 {
129 font_id_ = id;
130 }

◆ set_is_error()

void tesseract::TrainingSample::set_is_error ( bool  value)
inline

Definition at line 196 of file trainingsample.h.

196 {
197 is_error_ = value;
198 }
int value

◆ set_max_dist()

void tesseract::TrainingSample::set_max_dist ( double  value)
inline

Definition at line 173 of file trainingsample.h.

173 {
174 max_dist_ = value;
175 }

◆ set_page_num()

void tesseract::TrainingSample::set_page_num ( int  page)
inline

Definition at line 134 of file trainingsample.h.

134 {
135 page_num_ = page;
136 }

◆ set_sample_index()

void tesseract::TrainingSample::set_sample_index ( int  value)
inline

Definition at line 179 of file trainingsample.h.

179 {
180 sample_index_ = value;
181 }

◆ set_weight()

void tesseract::TrainingSample::set_weight ( double  value)
inline

Definition at line 167 of file trainingsample.h.

167 {
168 weight_ = value;
169 }

◆ weight()

double tesseract::TrainingSample::weight ( ) const
inline

Definition at line 164 of file trainingsample.h.

164 {
165 return weight_;
166 }

Member Data Documentation

◆ features_are_indexed_

bool tesseract::TrainingSample::features_are_indexed_

Definition at line 244 of file trainingsample.h.

◆ features_are_mapped_

bool tesseract::TrainingSample::features_are_mapped_

Definition at line 245 of file trainingsample.h.

◆ mapped_features_

std::vector<int> tesseract::TrainingSample::mapped_features_

Definition at line 243 of file trainingsample.h.


The documentation for this class was generated from the following files: