All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
trainingsampleset.cpp
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 //
15 
16 #include "trainingsampleset.h"
17 #include "allheaders.h"
18 #include "boxread.h"
19 #include "fontinfo.h"
20 #include "indexmapbidi.h"
21 #include "intfeaturedist.h"
22 #include "intfeaturemap.h"
23 #include "intfeaturespace.h"
24 #include "shapetable.h"
25 #include "trainingsample.h"
26 #include "unicity_table.h"
27 
28 namespace tesseract {
29 
30 const int kTestChar = -1; // 37;
31 // Max number of distances to compute the squared way
32 const int kSquareLimit = 25;
33 // Prime numbers for subsampling distances.
34 const int kPrime1 = 17;
35 const int kPrime2 = 13;
36 // Min samples from which to start discarding outliers.
37 const int kMinOutlierSamples = 5;
38 
39 TrainingSampleSet::FontClassInfo::FontClassInfo()
40  : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {
41 }
42 
43 // Writes to the given file. Returns false in case of error.
44 bool TrainingSampleSet::FontClassInfo::Serialize(FILE* fp) const {
45  if (fwrite(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
46  return false;
47  if (fwrite(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
48  return false;
49  if (fwrite(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
50  if (!samples.Serialize(fp)) return false;
51  return true;
52 }
53 // Reads from the given file. Returns false in case of error.
54 // If swap is true, assumes a big/little-endian swap is needed.
55 bool TrainingSampleSet::FontClassInfo::DeSerialize(bool swap, FILE* fp) {
56  if (fread(&num_raw_samples, sizeof(num_raw_samples), 1, fp) != 1)
57  return false;
58  if (fread(&canonical_sample, sizeof(canonical_sample), 1, fp) != 1)
59  return false;
60  if (fread(&canonical_dist, sizeof(canonical_dist), 1, fp) != 1) return false;
61  if (!samples.DeSerialize(swap, fp)) return false;
62  if (swap) {
63  ReverseN(&num_raw_samples, sizeof(num_raw_samples));
64  ReverseN(&canonical_sample, sizeof(canonical_sample));
65  ReverseN(&canonical_dist, sizeof(canonical_dist));
66  }
67  return true;
68 }
69 
70 TrainingSampleSet::TrainingSampleSet(const FontInfoTable& font_table)
71  : num_raw_samples_(0), unicharset_size_(0),
72  font_class_array_(NULL), fontinfo_table_(font_table) {
73 }
74 
76  delete font_class_array_;
77 }
78 
79 // Writes to the given file. Returns false in case of error.
80 bool TrainingSampleSet::Serialize(FILE* fp) const {
81  if (!samples_.Serialize(fp)) return false;
82  if (!unicharset_.save_to_file(fp)) return false;
83  if (!font_id_map_.Serialize(fp)) return false;
84  inT8 not_null = font_class_array_ != NULL;
85  if (fwrite(&not_null, sizeof(not_null), 1, fp) != 1) return false;
86  if (not_null) {
87  if (!font_class_array_->SerializeClasses(fp)) return false;
88  }
89  return true;
90 }
91 
92 // Reads from the given file. Returns false in case of error.
93 // If swap is true, assumes a big/little-endian swap is needed.
94 bool TrainingSampleSet::DeSerialize(bool swap, FILE* fp) {
95  if (!samples_.DeSerialize(swap, fp)) return false;
96  num_raw_samples_ = samples_.size();
97  if (!unicharset_.load_from_file(fp)) return false;
98  if (!font_id_map_.DeSerialize(swap, fp)) return false;
99  if (font_class_array_ != NULL) {
100  delete font_class_array_;
101  font_class_array_ = NULL;
102  }
103  inT8 not_null;
104  if (fread(&not_null, sizeof(not_null), 1, fp) != 1) return false;
105  if (not_null) {
106  FontClassInfo empty;
107  font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo >(1, 1 , empty);
108  if (!font_class_array_->DeSerializeClasses(swap, fp)) return false;
109  }
110  unicharset_size_ = unicharset_.size();
111  return true;
112 }
113 
114 // Load an initial unicharset, or set one up if the file cannot be read.
116  if (!unicharset_.load_from_file(filename)) {
117  tprintf("Failed to load unicharset from file %s\n"
118  "Building unicharset from scratch...\n",
119  filename);
120  unicharset_.clear();
121  // Add special characters as they were removed by the clear.
122  UNICHARSET empty;
123  unicharset_.AppendOtherUnicharset(empty);
124  }
125  unicharset_size_ = unicharset_.size();
126 }
127 
128 // Adds a character sample to this sample set.
129 // If the unichar is not already in the local unicharset, it is added.
130 // Returns the unichar_id of the added sample, from the local unicharset.
132  if (!unicharset_.contains_unichar(unichar)) {
133  unicharset_.unichar_insert(unichar);
134  if (unicharset_.size() > MAX_NUM_CLASSES) {
135  tprintf("Error: Size of unicharset in TrainingSampleSet::AddSample is "
136  "greater than MAX_NUM_CLASSES\n");
137  return -1;
138  }
139  }
140  UNICHAR_ID char_id = unicharset_.unichar_to_id(unichar);
141  AddSample(char_id, sample);
142  return char_id;
143 }
144 
145 // Adds a character sample to this sample set with the given unichar_id,
146 // which must correspond to the local unicharset (in this).
148  sample->set_class_id(unichar_id);
149  samples_.push_back(sample);
150  num_raw_samples_ = samples_.size();
151  unicharset_size_ = unicharset_.size();
152 }
153 
154 // Returns the number of samples for the given font,class pair.
155 // If randomize is true, returns the number of samples accessible
156 // with randomizing on. (Increases the number of samples if small.)
157 // OrganizeByFontAndClass must have been already called.
158 int TrainingSampleSet::NumClassSamples(int font_id, int class_id,
159  bool randomize) const {
160  ASSERT_HOST(font_class_array_ != NULL);
161  if (font_id < 0 || class_id < 0 ||
162  font_id >= font_id_map_.SparseSize() || class_id >= unicharset_size_) {
163  // There are no samples because the font or class doesn't exist.
164  return 0;
165  }
166  int font_index = font_id_map_.SparseToCompact(font_id);
167  if (font_index < 0)
168  return 0; // The font has no samples.
169  if (randomize)
170  return (*font_class_array_)(font_index, class_id).samples.size();
171  else
172  return (*font_class_array_)(font_index, class_id).num_raw_samples;
173 }
174 
175 // Gets a sample by its index.
177  return samples_[index];
178 }
179 
180 // Gets a sample by its font, class, index.
181 // OrganizeByFontAndClass must have been already called.
182 const TrainingSample* TrainingSampleSet::GetSample(int font_id, int class_id,
183  int index) const {
184  ASSERT_HOST(font_class_array_ != NULL);
185  int font_index = font_id_map_.SparseToCompact(font_id);
186  if (font_index < 0) return NULL;
187  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
188  return samples_[sample_index];
189 }
190 
191 // Get a sample by its font, class, index. Does not randomize.
192 // OrganizeByFontAndClass must have been already called.
194  int index) {
195  ASSERT_HOST(font_class_array_ != NULL);
196  int font_index = font_id_map_.SparseToCompact(font_id);
197  if (font_index < 0) return NULL;
198  int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
199  return samples_[sample_index];
200 }
201 
202 // Returns a string debug representation of the given sample:
203 // font, unichar_str, bounding box, page.
205  STRING boxfile_str;
206  MakeBoxFileStr(unicharset_.id_to_unichar(sample.class_id()),
207  sample.bounding_box(), sample.page_num(), &boxfile_str);
208  return STRING(fontinfo_table_.get(sample.font_id()).name) + " " + boxfile_str;
209 }
210 
211 // Gets the combined set of features used by all the samples of the given
212 // font/class combination.
214  int font_id, int class_id) const {
215  int font_index = font_id_map_.SparseToCompact(font_id);
216  ASSERT_HOST(font_index >= 0);
217  return (*font_class_array_)(font_index, class_id).cloud_features;
218 }
219 // Gets the indexed features of the canonical sample of the given
220 // font/class combination.
222  int font_id, int class_id) const {
223  int font_index = font_id_map_.SparseToCompact(font_id);
224  ASSERT_HOST(font_index >= 0);
225  return (*font_class_array_)(font_index, class_id).canonical_features;
226 }
227 
228 // Returns the distance between the given UniCharAndFonts pair.
229 // If matched_fonts, only matching fonts, are considered, unless that yields
230 // the empty set.
231 // OrganizeByFontAndClass must have been already called.
233  const UnicharAndFonts& uf2,
234  bool matched_fonts,
235  const IntFeatureMap& feature_map) {
236  int num_fonts1 = uf1.font_ids.size();
237  int c1 = uf1.unichar_id;
238  int num_fonts2 = uf2.font_ids.size();
239  int c2 = uf2.unichar_id;
240  double dist_sum = 0.0;
241  int dist_count = 0;
242  bool debug = false;
243  if (matched_fonts) {
244  // Compute distances only where fonts match.
245  for (int i = 0; i < num_fonts1; ++i) {
246  int f1 = uf1.font_ids[i];
247  for (int j = 0; j < num_fonts2; ++j) {
248  int f2 = uf2.font_ids[j];
249  if (f1 == f2) {
250  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
251  ++dist_count;
252  }
253  }
254  }
255  } else if (num_fonts1 * num_fonts2 <= kSquareLimit) {
256  // Small enough sets to compute all the distances.
257  for (int i = 0; i < num_fonts1; ++i) {
258  int f1 = uf1.font_ids[i];
259  for (int j = 0; j < num_fonts2; ++j) {
260  int f2 = uf2.font_ids[j];
261  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
262  if (debug) {
263  tprintf("Cluster dist %d %d %d %d = %g\n",
264  f1, c1, f2, c2,
265  ClusterDistance(f1, c1, f2, c2, feature_map));
266  }
267  ++dist_count;
268  }
269  }
270  } else {
271  // Subsample distances, using the largest set once, and stepping through
272  // the smaller set so as to ensure that all the pairs are different.
273  int increment = kPrime1 != num_fonts2 ? kPrime1 : kPrime2;
274  int index = 0;
275  int num_samples = MAX(num_fonts1, num_fonts2);
276  for (int i = 0; i < num_samples; ++i, index += increment) {
277  int f1 = uf1.font_ids[i % num_fonts1];
278  int f2 = uf2.font_ids[index % num_fonts2];
279  if (debug) {
280  tprintf("Cluster dist %d %d %d %d = %g\n",
281  f1, c1, f2, c2, ClusterDistance(f1, c1, f2, c2, feature_map));
282  }
283  dist_sum += ClusterDistance(f1, c1, f2, c2, feature_map);
284  ++dist_count;
285  }
286  }
287  if (dist_count == 0) {
288  if (matched_fonts)
289  return UnicharDistance(uf1, uf2, false, feature_map);
290  return 0.0f;
291  }
292  return dist_sum / dist_count;
293 }
294 
295 // Returns the distance between the given pair of font/class pairs.
296 // Finds in cache or computes and caches.
297 // OrganizeByFontAndClass must have been already called.
298 float TrainingSampleSet::ClusterDistance(int font_id1, int class_id1,
299  int font_id2, int class_id2,
300  const IntFeatureMap& feature_map) {
301  ASSERT_HOST(font_class_array_ != NULL);
302  int font_index1 = font_id_map_.SparseToCompact(font_id1);
303  int font_index2 = font_id_map_.SparseToCompact(font_id2);
304  if (font_index1 < 0 || font_index2 < 0)
305  return 0.0f;
306  FontClassInfo& fc_info = (*font_class_array_)(font_index1, class_id1);
307  if (font_id1 == font_id2) {
308  // Special case cache for speed.
309  if (fc_info.unichar_distance_cache.size() == 0)
310  fc_info.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
311  if (fc_info.unichar_distance_cache[class_id2] < 0) {
312  // Distance has to be calculated.
313  float result = ComputeClusterDistance(font_id1, class_id1,
314  font_id2, class_id2,
315  feature_map);
316  fc_info.unichar_distance_cache[class_id2] = result;
317  // Copy to the symmetric cache entry.
318  FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
319  if (fc_info2.unichar_distance_cache.size() == 0)
320  fc_info2.unichar_distance_cache.init_to_size(unicharset_size_, -1.0f);
321  fc_info2.unichar_distance_cache[class_id1] = result;
322  }
323  return fc_info.unichar_distance_cache[class_id2];
324  } else if (class_id1 == class_id2) {
325  // Another special-case cache for equal class-id.
326  if (fc_info.font_distance_cache.size() == 0)
327  fc_info.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
328  -1.0f);
329  if (fc_info.font_distance_cache[font_index2] < 0) {
330  // Distance has to be calculated.
331  float result = ComputeClusterDistance(font_id1, class_id1,
332  font_id2, class_id2,
333  feature_map);
334  fc_info.font_distance_cache[font_index2] = result;
335  // Copy to the symmetric cache entry.
336  FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
337  if (fc_info2.font_distance_cache.size() == 0)
338  fc_info2.font_distance_cache.init_to_size(font_id_map_.CompactSize(),
339  -1.0f);
340  fc_info2.font_distance_cache[font_index1] = result;
341  }
342  return fc_info.font_distance_cache[font_index2];
343  }
344  // Both font and class are different. Linear search for class_id2/font_id2
345  // in what is a hopefully short list of distances.
346  int cache_index = 0;
347  while (cache_index < fc_info.distance_cache.size() &&
348  (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
349  fc_info.distance_cache[cache_index].font_id != font_id2))
350  ++cache_index;
351  if (cache_index == fc_info.distance_cache.size()) {
352  // Distance has to be calculated.
353  float result = ComputeClusterDistance(font_id1, class_id1,
354  font_id2, class_id2,
355  feature_map);
356  FontClassDistance fc_dist = { class_id2, font_id2, result };
357  fc_info.distance_cache.push_back(fc_dist);
358  // Copy to the symmetric cache entry. We know it isn't there already, as
359  // we always copy to the symmetric entry.
360  FontClassInfo& fc_info2 = (*font_class_array_)(font_index2, class_id2);
361  fc_dist.unichar_id = class_id1;
362  fc_dist.font_id = font_id1;
363  fc_info2.distance_cache.push_back(fc_dist);
364  }
365  return fc_info.distance_cache[cache_index].distance;
366 }
367 
368 // Computes the distance between the given pair of font/class pairs.
370  int font_id1, int class_id1, int font_id2, int class_id2,
371  const IntFeatureMap& feature_map) const {
372  int dist = ReliablySeparable(font_id1, class_id1, font_id2, class_id2,
373  feature_map, false);
374  dist += ReliablySeparable(font_id2, class_id2, font_id1, class_id1,
375  feature_map, false);
376  int denominator = GetCanonicalFeatures(font_id1, class_id1).size();
377  denominator += GetCanonicalFeatures(font_id2, class_id2).size();
378  return static_cast<float>(dist) / denominator;
379 }
380 
381 // Helper to add a feature and its near neighbors to the good_features.
382 // levels indicates how many times to compute the offset features of what is
383 // already there. This is done by iteration rather than recursion.
384 static void AddNearFeatures(const IntFeatureMap& feature_map, int f, int levels,
385  GenericVector<int>* good_features) {
386  int prev_num_features = 0;
387  good_features->push_back(f);
388  int num_features = 1;
389  for (int level = 0; level < levels; ++level) {
390  for (int i = prev_num_features; i < num_features; ++i) {
391  int feature = (*good_features)[i];
392  for (int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
393  if (dir == 0) continue;
394  int f1 = feature_map.OffsetFeature(feature, dir);
395  if (f1 >= 0) {
396  good_features->push_back(f1);
397  }
398  }
399  }
400  prev_num_features = num_features;
401  num_features = good_features->size();
402  }
403 }
404 
405 // Returns the number of canonical features of font/class 2 for which
406 // neither the feature nor any of its near neighbors occurs in the cloud
407 // of font/class 1. Each such feature is a reliable separation between
408 // the classes, ASSUMING that the canonical sample is sufficiently
409 // representative that every sample has a feature near that particular
410 // feature. To check that this is so on the fly would be prohibitively
411 // expensive, but it might be possible to pre-qualify the canonical features
412 // to include only those for which this assumption is true.
413 // ComputeCanonicalFeatures and ComputeCloudFeatures must have been called
414 // first, or the results will be nonsense.
415 int TrainingSampleSet::ReliablySeparable(int font_id1, int class_id1,
416  int font_id2, int class_id2,
417  const IntFeatureMap& feature_map,
418  bool thorough) const {
419  int result = 0;
420  const TrainingSample* sample2 = GetCanonicalSample(font_id2, class_id2);
421  if (sample2 == NULL)
422  return 0; // There are no canonical features.
423  const GenericVector<int>& canonical2 = GetCanonicalFeatures(font_id2,
424  class_id2);
425  const BitVector& cloud1 = GetCloudFeatures(font_id1, class_id1);
426  if (cloud1.size() == 0)
427  return canonical2.size(); // There are no cloud features.
428 
429  // Find a canonical2 feature that is not in cloud1.
430  for (int f = 0; f < canonical2.size(); ++f) {
431  int feature = canonical2[f];
432  if (cloud1[feature])
433  continue;
434  // Gather the near neighbours of f.
435  GenericVector<int> good_features;
436  AddNearFeatures(feature_map, feature, 1, &good_features);
437  // Check that none of the good_features are in the cloud.
438  int i;
439  for (i = 0; i < good_features.size(); ++i) {
440  int good_f = good_features[i];
441  if (cloud1[good_f]) {
442  break;
443  }
444  }
445  if (i < good_features.size())
446  continue; // Found one in the cloud.
447  ++result;
448  }
449  return result;
450 }
451 
452 // Returns the total index of the requested sample.
453 // OrganizeByFontAndClass must have been already called.
454 int TrainingSampleSet::GlobalSampleIndex(int font_id, int class_id,
455  int index) const {
456  ASSERT_HOST(font_class_array_ != NULL);
457  int font_index = font_id_map_.SparseToCompact(font_id);
458  if (font_index < 0) return -1;
459  return (*font_class_array_)(font_index, class_id).samples[index];
460 }
461 
462 // Gets the canonical sample for the given font, class pair.
463 // ComputeCanonicalSamples must have been called first.
465  int font_id, int class_id) const {
466  ASSERT_HOST(font_class_array_ != NULL);
467  int font_index = font_id_map_.SparseToCompact(font_id);
468  if (font_index < 0) return NULL;
469  int sample_index = (*font_class_array_)(font_index,
470  class_id).canonical_sample;
471  return sample_index >= 0 ? samples_[sample_index] : NULL;
472 }
473 
474 // Gets the max distance for the given canonical sample.
475 // ComputeCanonicalSamples must have been called first.
476 float TrainingSampleSet::GetCanonicalDist(int font_id, int class_id) const {
477  ASSERT_HOST(font_class_array_ != NULL);
478  int font_index = font_id_map_.SparseToCompact(font_id);
479  if (font_index < 0) return 0.0f;
480  if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0)
481  return (*font_class_array_)(font_index, class_id).canonical_dist;
482  else
483  return 0.0f;
484 }
485 
486 // Generates indexed features for all samples with the supplied feature_space.
488  for (int s = 0; s < samples_.size(); ++s)
489  samples_[s]->IndexFeatures(feature_space);
490 }
491 
492 // Delete outlier samples with few features that are shared with others.
493 // IndexFeatures must have been called already.
495  bool debug) {
496  if (font_class_array_ == NULL)
498  Pixa* pixa = NULL;
499  if (debug)
500  pixa = pixaCreate(0);
501  GenericVector<int> feature_counts;
502  int fs_size = feature_space.Size();
503  int font_size = font_id_map_.CompactSize();
504  for (int font_index = 0; font_index < font_size; ++font_index) {
505  for (int c = 0; c < unicharset_size_; ++c) {
506  // Create a histogram of the features used by all samples of this
507  // font/class combination.
508  feature_counts.init_to_size(fs_size, 0);
509  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
510  int sample_count = fcinfo.samples.size();
511  if (sample_count < kMinOutlierSamples)
512  continue;
513  for (int i = 0; i < sample_count; ++i) {
514  int s = fcinfo.samples[i];
515  const GenericVector<int>& features = samples_[s]->indexed_features();
516  for (int f = 0; f < features.size(); ++f) {
517  ++feature_counts[features[f]];
518  }
519  }
520  for (int i = 0; i < sample_count; ++i) {
521  int s = fcinfo.samples[i];
522  const TrainingSample& sample = *samples_[s];
523  const GenericVector<int>& features = sample.indexed_features();
524  // A feature that has a histogram count of 1 is only used by this
525  // sample, making it 'bad'. All others are 'good'.
526  int good_features = 0;
527  int bad_features = 0;
528  for (int f = 0; f < features.size(); ++f) {
529  if (feature_counts[features[f]] > 1)
530  ++good_features;
531  else
532  ++bad_features;
533  }
534  // If more than 1/3 features are bad, then this is an outlier.
535  if (bad_features * 2 > good_features) {
536  tprintf("Deleting outlier sample of %s, %d good, %d bad\n",
537  SampleToString(sample).string(),
538  good_features, bad_features);
539  if (debug) {
540  pixaAddPix(pixa, sample.RenderToPix(&unicharset_), L_INSERT);
541  // Add the previous sample as well, so it is easier to see in
542  // the output what is wrong with this sample.
543  int t;
544  if (i == 0)
545  t = fcinfo.samples[1];
546  else
547  t = fcinfo.samples[i - 1];
548  const TrainingSample &csample = *samples_[t];
549  pixaAddPix(pixa, csample.RenderToPix(&unicharset_), L_INSERT);
550  }
551  // Mark the sample for deletion.
552  KillSample(samples_[s]);
553  }
554  }
555  }
556  }
557  // Truly delete all bad samples and renumber everything.
559  if (pixa != NULL) {
560  Pix* pix = pixaDisplayTiledInRows(pixa, 1, 2600, 1.0, 0, 10, 10);
561  pixaDestroy(&pixa);
562  pixWrite("outliers.png", pix, IFF_PNG);
563  pixDestroy(&pix);
564  }
565 }
566 
567 // Marks the given sample index for deletion.
568 // Deletion is actually completed by DeleteDeadSamples.
570  sample->set_sample_index(-1);
571 }
572 
573 // Deletes all samples with zero features marked by KillSample.
575  samples_.compact(
577  num_raw_samples_ = samples_.size();
578  // Samples must be re-organized now we have deleted a few.
579 }
580 
581 // Callback function returns true if the given sample is to be deleted, due
582 // to having a negative classid.
584  return sample == NULL || sample->class_id() < 0;
585 }
586 
587 static Pix* DebugSample(const UNICHARSET& unicharset,
589  tprintf("\nOriginal features:\n");
590  for (int i = 0; i < sample->num_features(); ++i) {
591  sample->features()[i].print();
592  }
593  if (sample->features_are_mapped()) {
594  tprintf("\nMapped features:\n");
595  for (int i = 0; i < sample->mapped_features().size(); ++i) {
596  tprintf("%d ", sample->mapped_features()[i]);
597  }
598  tprintf("\n");
599  }
600  return sample->RenderToPix(&unicharset);
601 }
602 
603 // Construct an array to access the samples by font,class pair.
605  // Font indexes are sparse, so we used a map to compact them, so we can
606  // have an efficient 2-d array of fonts and character classes.
607  SetupFontIdMap();
608  int compact_font_size = font_id_map_.CompactSize();
609  // Get a 2-d array of generic vectors.
610  if (font_class_array_ != NULL)
611  delete font_class_array_;
612  FontClassInfo empty;
613  font_class_array_ = new GENERIC_2D_ARRAY<FontClassInfo>(
614  compact_font_size, unicharset_size_, empty);
615  for (int s = 0; s < samples_.size(); ++s) {
616  int font_id = samples_[s]->font_id();
617  int class_id = samples_[s]->class_id();
618  if (font_id < 0 || font_id >= font_id_map_.SparseSize()) {
619  tprintf("Font id = %d/%d, class id = %d/%d on sample %d\n",
620  font_id, font_id_map_.SparseSize(), class_id, unicharset_size_,
621  s);
622  }
623  ASSERT_HOST(font_id >= 0 && font_id < font_id_map_.SparseSize());
624  ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
625  int font_index = font_id_map_.SparseToCompact(font_id);
626  (*font_class_array_)(font_index, class_id).samples.push_back(s);
627  }
628  // Set the num_raw_samples member of the FontClassInfo, to set the boundary
629  // between the raw samples and the replicated ones.
630  for (int f = 0; f < compact_font_size; ++f) {
631  for (int c = 0; c < unicharset_size_; ++c)
632  (*font_class_array_)(f, c).num_raw_samples =
633  (*font_class_array_)(f, c).samples.size();
634  }
635  // This is the global number of samples and also marks the boundary between
636  // real and replicated samples.
637  num_raw_samples_ = samples_.size();
638 }
639 
640 // Constructs the font_id_map_ which maps real font_ids (sparse) to a compact
641 // index for the font_class_array_.
643  // Number of samples for each font_id.
644  GenericVector<int> font_counts;
645  for (int s = 0; s < samples_.size(); ++s) {
646  int font_id = samples_[s]->font_id();
647  while (font_id >= font_counts.size())
648  font_counts.push_back(0);
649  ++font_counts[font_id];
650  }
651  font_id_map_.Init(font_counts.size(), false);
652  for (int f = 0; f < font_counts.size(); ++f) {
653  font_id_map_.SetMap(f, font_counts[f] > 0);
654  }
655  font_id_map_.Setup();
656 }
657 
658 
659 // Finds the sample for each font, class pair that has least maximum
660 // distance to all the other samples of the same font, class.
661 // OrganizeByFontAndClass must have been already called.
663  bool debug) {
664  ASSERT_HOST(font_class_array_ != NULL);
665  IntFeatureDist f_table;
666  if (debug) tprintf("feature table size %d\n", map.sparse_size());
667  f_table.Init(&map);
668  int worst_s1 = 0;
669  int worst_s2 = 0;
670  double global_worst_dist = 0.0;
671  // Compute distances independently for each font and char index.
672  int font_size = font_id_map_.CompactSize();
673  for (int font_index = 0; font_index < font_size; ++font_index) {
674  int font_id = font_id_map_.CompactToSparse(font_index);
675  for (int c = 0; c < unicharset_size_; ++c) {
676  int samples_found = 0;
677  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
678  if (fcinfo.samples.size() == 0 ||
679  (kTestChar >= 0 && c != kTestChar)) {
680  fcinfo.canonical_sample = -1;
681  fcinfo.canonical_dist = 0.0f;
682  if (debug) tprintf("Skipping class %d\n", c);
683  continue;
684  }
685  // The canonical sample will be the one with the min_max_dist, which
686  // is the sample with the lowest maximum distance to all other samples.
687  double min_max_dist = 2.0;
688  // We keep track of the farthest apart pair (max_s1, max_s2) which
689  // are max_max_dist apart, so we can see how bad the variability is.
690  double max_max_dist = 0.0;
691  int max_s1 = 0;
692  int max_s2 = 0;
693  fcinfo.canonical_sample = fcinfo.samples[0];
694  fcinfo.canonical_dist = 0.0f;
695  for (int i = 0; i < fcinfo.samples.size(); ++i) {
696  int s1 = fcinfo.samples[i];
697  const GenericVector<int>& features1 = samples_[s1]->indexed_features();
698  f_table.Set(features1, features1.size(), true);
699  double max_dist = 0.0;
700  // Run the full squared-order search for similar samples. It is still
701  // reasonably fast because f_table.FeatureDistance is fast, but we
702  // may have to reconsider if we start playing with too many samples
703  // of a single char/font.
704  for (int j = 0; j < fcinfo.samples.size(); ++j) {
705  int s2 = fcinfo.samples[j];
706  if (samples_[s2]->class_id() != c ||
707  samples_[s2]->font_id() != font_id ||
708  s2 == s1)
709  continue;
710  GenericVector<int> features2 = samples_[s2]->indexed_features();
711  double dist = f_table.FeatureDistance(features2);
712  if (dist > max_dist) {
713  max_dist = dist;
714  if (dist > max_max_dist) {
715  max_s1 = s1;
716  max_s2 = s2;
717  }
718  }
719  }
720  // Using Set(..., false) is far faster than re initializing, due to
721  // the sparseness of the feature space.
722  f_table.Set(features1, features1.size(), false);
723  samples_[s1]->set_max_dist(max_dist);
724  ++samples_found;
725  if (max_dist < min_max_dist) {
726  fcinfo.canonical_sample = s1;
727  fcinfo.canonical_dist = max_dist;
728  }
729  UpdateRange(max_dist, &min_max_dist, &max_max_dist);
730  }
731  if (max_max_dist > global_worst_dist) {
732  // Keep a record of the worst pair over all characters/fonts too.
733  global_worst_dist = max_max_dist;
734  worst_s1 = max_s1;
735  worst_s2 = max_s2;
736  }
737  if (debug) {
738  tprintf("Found %d samples of class %d=%s, font %d, "
739  "dist range [%g, %g], worst pair= %s, %s\n",
740  samples_found, c, unicharset_.debug_str(c).string(),
741  font_index, min_max_dist, max_max_dist,
742  SampleToString(*samples_[max_s1]).string(),
743  SampleToString(*samples_[max_s2]).string());
744  }
745  }
746  }
747  if (debug) {
748  tprintf("Global worst dist = %g, between sample %d and %d\n",
749  global_worst_dist, worst_s1, worst_s2);
750  Pix* pix1 = DebugSample(unicharset_, samples_[worst_s1]);
751  Pix* pix2 = DebugSample(unicharset_, samples_[worst_s2]);
752  pixOr(pix1, pix1, pix2);
753  pixWrite("worstpair.png", pix1, IFF_PNG);
754  pixDestroy(&pix1);
755  pixDestroy(&pix2);
756  }
757 }
758 
759 // Replicates the samples to a minimum frequency defined by
760 // 2 * kSampleRandomSize, or for larger counts duplicates all samples.
761 // After replication, the replicated samples are perturbed slightly, but
762 // in a predictable and repeatable way.
763 // Use after OrganizeByFontAndClass().
765  ASSERT_HOST(font_class_array_ != NULL);
766  int font_size = font_id_map_.CompactSize();
767  for (int font_index = 0; font_index < font_size; ++font_index) {
768  for (int c = 0; c < unicharset_size_; ++c) {
769  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
770  int sample_count = fcinfo.samples.size();
771  int min_samples = 2 * MAX(kSampleRandomSize, sample_count);
772  if (sample_count > 0 && sample_count < min_samples) {
773  int base_count = sample_count;
774  for (int base_index = 0; sample_count < min_samples; ++sample_count) {
775  int src_index = fcinfo.samples[base_index++];
776  if (base_index >= base_count) base_index = 0;
777  TrainingSample* sample = samples_[src_index]->RandomizedCopy(
778  sample_count % kSampleRandomSize);
779  int sample_index = samples_.size();
780  sample->set_sample_index(sample_index);
781  samples_.push_back(sample);
782  fcinfo.samples.push_back(sample_index);
783  }
784  }
785  }
786  }
787 }
788 
789 // Caches the indexed features of the canonical samples.
790 // ComputeCanonicalSamples must have been already called.
791 // TODO(rays) see note on ReliablySeparable and try restricting the
792 // canonical features to those that truly represent all samples.
794  ASSERT_HOST(font_class_array_ != NULL);
795  int font_size = font_id_map_.CompactSize();
796  for (int font_index = 0; font_index < font_size; ++font_index) {
797  int font_id = font_id_map_.CompactToSparse(font_index);
798  for (int c = 0; c < unicharset_size_; ++c) {
799  int num_samples = NumClassSamples(font_id, c, false);
800  if (num_samples == 0)
801  continue;
802  const TrainingSample* sample = GetCanonicalSample(font_id, c);
803  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
804  fcinfo.canonical_features = sample->indexed_features();
805  }
806  }
807 }
808 
809 // Computes the combined set of features used by all the samples of each
810 // font/class combination. Use after ReplicateAndRandomizeSamples.
811 void TrainingSampleSet::ComputeCloudFeatures(int feature_space_size) {
812  ASSERT_HOST(font_class_array_ != NULL);
813  int font_size = font_id_map_.CompactSize();
814  for (int font_index = 0; font_index < font_size; ++font_index) {
815  int font_id = font_id_map_.CompactToSparse(font_index);
816  for (int c = 0; c < unicharset_size_; ++c) {
817  int num_samples = NumClassSamples(font_id, c, false);
818  if (num_samples == 0)
819  continue;
820  FontClassInfo& fcinfo = (*font_class_array_)(font_index, c);
821  fcinfo.cloud_features.Init(feature_space_size);
822  for (int s = 0; s < num_samples; ++s) {
823  const TrainingSample* sample = GetSample(font_id, c, s);
824  const GenericVector<int>& sample_features = sample->indexed_features();
825  for (int i = 0; i < sample_features.size(); ++i)
826  fcinfo.cloud_features.SetBit(sample_features[i]);
827  }
828  }
829  }
830 }
831 
832 // Adds all fonts of the given class to the shape.
833 void TrainingSampleSet::AddAllFontsForClass(int class_id, Shape* shape) const {
834  for (int f = 0; f < font_id_map_.CompactSize(); ++f) {
835  int font_id = font_id_map_.CompactToSparse(f);
836  shape->AddToShape(class_id, font_id);
837  }
838 }
839 
840 // Display the samples with the given indexed feature that also match
841 // the given shape.
843  const Shape& shape,
844  const IntFeatureSpace& space,
845  ScrollView::Color color,
846  ScrollView* window) const {
847  for (int s = 0; s < num_raw_samples(); ++s) {
848  const TrainingSample* sample = GetSample(s);
849  if (shape.ContainsUnichar(sample->class_id())) {
850  GenericVector<int> indexed_features;
851  space.IndexAndSortFeatures(sample->features(), sample->num_features(),
852  &indexed_features);
853  for (int f = 0; f < indexed_features.size(); ++f) {
854  if (indexed_features[f] == f_index) {
855  sample->DisplayFeatures(color, window);
856  }
857  }
858  }
859  }
860 }
861 
862 
863 } // namespace tesseract.
void Init(int size, bool all_mapped)
const GenericVector< int > & GetCanonicalFeatures(int font_id, int class_id) const
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318
int size() const
Definition: genericvector.h:72
bool SerializeClasses(FILE *fp) const
Definition: matrix.h:128
bool Serialize(FILE *fp) const
Pix * RenderToPix(const UNICHARSET *unicharset) const
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
bool save_to_file(const char *const filename) const
Definition: unicharset.h:306
#define MAX(x, y)
Definition: ndminx.h:24
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
virtual int SparseSize() const
Definition: indexmapbidi.h:142
void ComputeCloudFeatures(int feature_space_size)
const INT_FEATURE_STRUCT * features() const
int push_back(T object)
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
#define tprintf(...)
Definition: tprintf.h:31
const TrainingSample * GetSample(int index) const
const GenericVector< int > & mapped_features() const
const int kTestChar
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346
void SetMap(int sparse_index, bool mapped)
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:125
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:110
bool ContainsUnichar(int unichar_id) const
Definition: shapetable.cpp:156
void IndexAndSortFeatures(const INT_FEATURE_STRUCT *features, int num_features, GenericVector< int > *sorted_features) const
int OffsetFeature(int index_feature, int dir) const
void KillSample(TrainingSample *sample)
#define ASSERT_HOST(x)
Definition: errcode.h:84
void IndexFeatures(const IntFeatureSpace &feature_space)
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:439
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
TrainingSample * MutableSample(int font_id, int class_id, int index)
const GenericVector< int > & indexed_features() const
void set_sample_index(int value)
void Init(const IntFeatureMap *feature_map)
bool Serialize(FILE *fp) const
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
name_table name
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
int GlobalSampleIndex(int font_id, int class_id, int index) const
const int kPrime1
void init_to_size(int size, T t)
virtual int SparseToCompact(int sparse_index) const
Definition: indexmapbidi.h:138
void LoadUnicharset(const char *filename)
bool DeleteableSample(const TrainingSample *sample)
int NumClassSamples(int font_id, int class_id, bool randomize) const
const int kMinOutlierSamples
float GetCanonicalDist(int font_id, int class_id) const
const BitVector & GetCloudFeatures(int font_id, int class_id) const
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, STRING *box_str)
Definition: boxread.cpp:225
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
int UNICHAR_ID
Definition: unichar.h:33
double FeatureDistance(const GenericVector< int > &features) const
GenericVector< inT32 > font_ids
Definition: shapetable.h:176
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
bool DeSerializeClasses(bool swap, FILE *fp)
Definition: matrix.h:141
bool features_are_mapped() const
void DeleteOutliers(const IntFeatureSpace &feature_space, bool debug)
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:177
UNICHAR_ID class_id() const
void AddAllFontsForClass(int class_id, Shape *shape) const
int AddSample(const char *unichar, TrainingSample *sample)
float ComputeClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map) const
int CompactSize() const
Definition: indexmapbidi.h:61
int CompactToSparse(int compact_index) const
Definition: indexmapbidi.h:53
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
Definition: cluster.h:32
const int kPrime2
int size() const
Definition: bitvector.h:57
void Set(const GenericVector< int > &indexed_features, int canonical_count, bool value)
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
Definition: strngs.h:44
#define NULL
Definition: host.h:144
void print() const
Definition: intproto.h:148
SIGNED char inT8
Definition: host.h:98
STRING SampleToString(const TrainingSample &sample) const
void clear()
Definition: unicharset.h:266
void DisplayFeatures(ScrollView::Color color, ScrollView *window) const
const TBOX & bounding_box() const
int size() const
Definition: unicharset.h:297
int ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map, bool thorough) const
const char * string() const
Definition: strngs.cpp:193
T & get(int index) const
bool DeSerialize(bool swap, FILE *fp)
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
const int kSquareLimit
bool DeSerialize(bool swap, FILE *fp)