All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
mastertrainer.cpp
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: mastertrainer.cpp
5 // Description: Trainer to build the MasterClassifier.
6 // Author: Ray Smith
7 // Created: Wed Nov 03 18:10:01 PDT 2010
8 //
9 // (C) Copyright 2010, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 // Include automatically generated configuration file if running autoconf.
23 #ifdef HAVE_CONFIG_H
24 #include "config_auto.h"
25 #endif
26 
27 #include "mastertrainer.h"
28 #include <math.h>
29 #include <time.h>
30 #include "allheaders.h"
31 #include "boxread.h"
32 #include "classify.h"
33 #include "efio.h"
34 #include "errorcounter.h"
35 #include "featdefs.h"
36 #include "sampleiterator.h"
37 #include "shapeclassifier.h"
38 #include "shapetable.h"
39 #include "svmnode.h"
40 
41 #include "scanutils.h"
42 
43 namespace tesseract {
44 
45 // Constants controlling clustering. With a low kMinClusteredShapes and a high
46 // kMaxUnicharsPerCluster, then kFontMergeDistance is the only limiting factor.
47 // Min number of shapes in the output.
48 const int kMinClusteredShapes = 1;
49 // Max number of unichars in any individual cluster.
50 const int kMaxUnicharsPerCluster = 2000;
51 // Mean font distance below which to merge fonts and unichars.
52 const float kFontMergeDistance = 0.025;
53 
55  bool shape_analysis,
56  bool replicate_samples,
57  int debug_level)
58  : norm_mode_(norm_mode), samples_(fontinfo_table_),
59  junk_samples_(fontinfo_table_), verify_samples_(fontinfo_table_),
60  charsetsize_(0),
61  enable_shape_anaylsis_(shape_analysis),
62  enable_replication_(replicate_samples),
63  fragments_(NULL), prev_unichar_id_(-1), debug_level_(debug_level) {
64 }
65 
67  delete [] fragments_;
68  for (int p = 0; p < page_images_.size(); ++p)
69  pixDestroy(&page_images_[p]);
70 }
71 
72 // WARNING! Serialize/DeSerialize are only partial, providing
73 // enough data to get the samples back and display them.
74 // Writes to the given file. Returns false in case of error.
75 bool MasterTrainer::Serialize(FILE* fp) const {
76  if (fwrite(&norm_mode_, sizeof(norm_mode_), 1, fp) != 1) return false;
77  if (!unicharset_.save_to_file(fp)) return false;
78  if (!feature_space_.Serialize(fp)) return false;
79  if (!samples_.Serialize(fp)) return false;
80  if (!junk_samples_.Serialize(fp)) return false;
81  if (!verify_samples_.Serialize(fp)) return false;
82  if (!master_shapes_.Serialize(fp)) return false;
83  if (!flat_shapes_.Serialize(fp)) return false;
84  if (!fontinfo_table_.Serialize(fp)) return false;
85  if (!xheights_.Serialize(fp)) return false;
86  return true;
87 }
88 
89 // Reads from the given file. Returns false in case of error.
90 // If swap is true, assumes a big/little-endian swap is needed.
91 bool MasterTrainer::DeSerialize(bool swap, FILE* fp) {
92  if (fread(&norm_mode_, sizeof(norm_mode_), 1, fp) != 1) return false;
93  if (swap) {
94  ReverseN(&norm_mode_, sizeof(norm_mode_));
95  }
96  if (!unicharset_.load_from_file(fp)) return false;
97  charsetsize_ = unicharset_.size();
98  if (!feature_space_.DeSerialize(swap, fp)) return false;
99  feature_map_.Init(feature_space_);
100  if (!samples_.DeSerialize(swap, fp)) return false;
101  if (!junk_samples_.DeSerialize(swap, fp)) return false;
102  if (!verify_samples_.DeSerialize(swap, fp)) return false;
103  if (!master_shapes_.DeSerialize(swap, fp)) return false;
104  if (!flat_shapes_.DeSerialize(swap, fp)) return false;
105  if (!fontinfo_table_.DeSerialize(swap, fp)) return false;
106  if (!xheights_.DeSerialize(swap, fp)) return false;
107  return true;
108 }
109 
110 // Load an initial unicharset, or set one up if the file cannot be read.
112  if (!unicharset_.load_from_file(filename)) {
113  tprintf("Failed to load unicharset from file %s\n"
114  "Building unicharset for training from scratch...\n",
115  filename);
116  unicharset_.clear();
117  UNICHARSET initialized;
118  // Add special characters, as they were removed by the clear, but the
119  // default constructor puts them in.
120  unicharset_.AppendOtherUnicharset(initialized);
121  }
122  charsetsize_ = unicharset_.size();
123  delete [] fragments_;
124  fragments_ = new int[charsetsize_];
125  memset(fragments_, 0, sizeof(*fragments_) * charsetsize_);
126  samples_.LoadUnicharset(filename);
127  junk_samples_.LoadUnicharset(filename);
128  verify_samples_.LoadUnicharset(filename);
129 }
130 
131 // Reads the samples and their features from the given .tr format file,
132 // adding them to the trainer with the font_id from the content of the file.
133 // See mftraining.cpp for a description of the file format.
134 // If verification, then these are verification samples, not training.
135 void MasterTrainer::ReadTrainingSamples(const char* page_name,
137  bool verification) {
138  char buffer[2048];
139  int int_feature_type = ShortNameToFeatureType(feature_defs, kIntFeatureType);
140  int micro_feature_type = ShortNameToFeatureType(feature_defs,
142  int cn_feature_type = ShortNameToFeatureType(feature_defs, kCNFeatureType);
143  int geo_feature_type = ShortNameToFeatureType(feature_defs, kGeoFeatureType);
144 
145  FILE* fp = Efopen(page_name, "rb");
146  if (fp == NULL) {
147  tprintf("Failed to open tr file: %s\n", page_name);
148  return;
149  }
150  tr_filenames_.push_back(STRING(page_name));
151  while (fgets(buffer, sizeof(buffer), fp) != NULL) {
152  if (buffer[0] == '\n')
153  continue;
154 
155  char* space = strchr(buffer, ' ');
156  if (space == NULL) {
157  tprintf("Bad format in tr file, reading fontname, unichar\n");
158  continue;
159  }
160  *space++ = '\0';
161  int font_id = GetFontInfoId(buffer);
162  if (font_id < 0) font_id = 0;
163  int page_number;
164  STRING unichar;
165  TBOX bounding_box;
166  if (!ParseBoxFileStr(space, &page_number, &unichar, &bounding_box)) {
167  tprintf("Bad format in tr file, reading box coords\n");
168  continue;
169  }
170  CHAR_DESC char_desc = ReadCharDescription(feature_defs, fp);
172  sample->set_font_id(font_id);
173  sample->set_page_num(page_number + page_images_.size());
174  sample->set_bounding_box(bounding_box);
175  sample->ExtractCharDesc(int_feature_type, micro_feature_type,
176  cn_feature_type, geo_feature_type, char_desc);
177  AddSample(verification, unichar.string(), sample);
178  FreeCharDescription(char_desc);
179  }
180  charsetsize_ = unicharset_.size();
181  fclose(fp);
182 }
183 
184 // Adds the given single sample to the trainer, setting the classid
185 // appropriately from the given unichar_str.
186 void MasterTrainer::AddSample(bool verification, const char* unichar,
188  if (verification) {
189  verify_samples_.AddSample(unichar, sample);
190  prev_unichar_id_ = -1;
191  } else if (unicharset_.contains_unichar(unichar)) {
192  if (prev_unichar_id_ >= 0)
193  fragments_[prev_unichar_id_] = -1;
194  prev_unichar_id_ = samples_.AddSample(unichar, sample);
195  if (flat_shapes_.FindShape(prev_unichar_id_, sample->font_id()) < 0)
196  flat_shapes_.AddShape(prev_unichar_id_, sample->font_id());
197  } else {
198  int junk_id = junk_samples_.AddSample(unichar, sample);
199  if (prev_unichar_id_ >= 0) {
201  if (frag != NULL && frag->is_natural()) {
202  if (fragments_[prev_unichar_id_] == 0)
203  fragments_[prev_unichar_id_] = junk_id;
204  else if (fragments_[prev_unichar_id_] != junk_id)
205  fragments_[prev_unichar_id_] = -1;
206  }
207  delete frag;
208  }
209  prev_unichar_id_ = -1;
210  }
211 }
212 
213 // Loads all pages from the given tif filename and append to page_images_.
214 // Must be called after ReadTrainingSamples, as the current number of images
215 // is used as an offset for page numbers in the samples.
217  int page;
218  Pix* pix;
219  for (page = 0; (pix = pixReadTiff(filename, page)) != NULL; ++page) {
220  page_images_.push_back(pix);
221  }
222  tprintf("Loaded %d page images from %s\n", page, filename);
223 }
224 
225 // Cleans up the samples after initial load from the tr files, and prior to
226 // saving the MasterTrainer:
227 // Remaps fragmented chars if running shape analysis.
228 // Sets up the samples appropriately for class/fontwise access.
229 // Deletes outlier samples.
231  if (debug_level_ > 0)
232  tprintf("PostLoadCleanup...\n");
233  if (enable_shape_anaylsis_)
234  ReplaceFragmentedSamples();
235  SampleIterator sample_it;
236  sample_it.Init(NULL, NULL, true, &verify_samples_);
237  sample_it.NormalizeSamples();
238  verify_samples_.OrganizeByFontAndClass();
239 
240  samples_.IndexFeatures(feature_space_);
241  // TODO(rays) DeleteOutliers is currently turned off to prove NOP-ness
242  // against current training.
243  // samples_.DeleteOutliers(feature_space_, debug_level_ > 0);
244  samples_.OrganizeByFontAndClass();
245  if (debug_level_ > 0)
246  tprintf("ComputeCanonicalSamples...\n");
247  samples_.ComputeCanonicalSamples(feature_map_, debug_level_ > 0);
248 }
249 
250 // Gets the samples ready for training. Use after both
251 // ReadTrainingSamples+PostLoadCleanup or DeSerialize.
252 // Re-indexes the features and computes canonical and cloud features.
254  if (debug_level_ > 0)
255  tprintf("PreTrainingSetup...\n");
256  samples_.IndexFeatures(feature_space_);
257  samples_.ComputeCanonicalFeatures();
258  if (debug_level_ > 0)
259  tprintf("ComputeCloudFeatures...\n");
260  samples_.ComputeCloudFeatures(feature_space_.Size());
261 }
262 
263 // Sets up the master_shapes_ table, which tells which fonts should stay
264 // together until they get to a leaf node classifier.
266  tprintf("Building master shape table\n");
267  int num_fonts = samples_.NumFonts();
268 
269  ShapeTable char_shapes_begin_fragment(samples_.unicharset());
270  ShapeTable char_shapes_end_fragment(samples_.unicharset());
271  ShapeTable char_shapes(samples_.unicharset());
272  for (int c = 0; c < samples_.charsetsize(); ++c) {
273  ShapeTable shapes(samples_.unicharset());
274  for (int f = 0; f < num_fonts; ++f) {
275  if (samples_.NumClassSamples(f, c, true) > 0)
276  shapes.AddShape(c, f);
277  }
278  ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, &shapes);
279 
280  const CHAR_FRAGMENT *fragment = samples_.unicharset().get_fragment(c);
281 
282  if (fragment == NULL)
283  char_shapes.AppendMasterShapes(shapes, NULL);
284  else if (fragment->is_beginning())
285  char_shapes_begin_fragment.AppendMasterShapes(shapes, NULL);
286  else if (fragment->is_ending())
287  char_shapes_end_fragment.AppendMasterShapes(shapes, NULL);
288  else
289  char_shapes.AppendMasterShapes(shapes, NULL);
290  }
291  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
292  kFontMergeDistance, &char_shapes_begin_fragment);
293  char_shapes.AppendMasterShapes(char_shapes_begin_fragment, NULL);
294  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
295  kFontMergeDistance, &char_shapes_end_fragment);
296  char_shapes.AppendMasterShapes(char_shapes_end_fragment, NULL);
297  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster,
298  kFontMergeDistance, &char_shapes);
299  master_shapes_.AppendMasterShapes(char_shapes, NULL);
300  tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().string());
301 }
302 
303 // Adds the junk_samples_ to the main samples_ set. Junk samples are initially
304 // fragments and n-grams (all incorrectly segmented characters).
305 // Various training functions may result in incorrectly segmented characters
306 // being added to the unicharset of the main samples, perhaps because they
307 // form a "radical" decomposition of some (Indic) grapheme, or because they
308 // just look the same as a real character (like rn/m)
309 // This function moves all the junk samples, to the main samples_ set, but
310 // desirable junk, being any sample for which the unichar already exists in
311 // the samples_ unicharset gets the unichar-ids re-indexed to match, but
312 // anything else gets re-marked as unichar_id 0 (space character) to identify
313 // it as junk to the error counter.
315  // Get ids of fragments in junk_samples_ that replace the dead chars.
316  const UNICHARSET& junk_set = junk_samples_.unicharset();
317  const UNICHARSET& sample_set = samples_.unicharset();
318  int num_junks = junk_samples_.num_samples();
319  tprintf("Moving %d junk samples to master sample set.\n", num_junks);
320  for (int s = 0; s < num_junks; ++s) {
321  TrainingSample* sample = junk_samples_.mutable_sample(s);
322  int junk_id = sample->class_id();
323  const char* junk_utf8 = junk_set.id_to_unichar(junk_id);
324  int sample_id = sample_set.unichar_to_id(junk_utf8);
325  if (sample_id == INVALID_UNICHAR_ID)
326  sample_id = 0;
327  sample->set_class_id(sample_id);
328  junk_samples_.extract_sample(s);
329  samples_.AddSample(sample_id, sample);
330  }
331  junk_samples_.DeleteDeadSamples();
332  samples_.OrganizeByFontAndClass();
333 }
334 
335 // Replicates the samples and perturbs them if the enable_replication_ flag
336 // is set. MUST be used after the last call to OrganizeByFontAndClass on
337 // the training samples, ie after IncludeJunk if it is going to be used, as
338 // OrganizeByFontAndClass will eat the replicated samples into the regular
339 // samples.
341  if (enable_replication_) {
342  if (debug_level_ > 0)
343  tprintf("ReplicateAndRandomize...\n");
344  verify_samples_.ReplicateAndRandomizeSamples();
345  samples_.ReplicateAndRandomizeSamples();
346  samples_.IndexFeatures(feature_space_);
347  }
348 }
349 
350 // Loads the basic font properties file into fontinfo_table_.
351 // Returns false on failure.
353  FILE* fp = fopen(filename, "rb");
354  if (fp == NULL) {
355  fprintf(stderr, "Failed to load font_properties from %s\n", filename);
356  return false;
357  }
358  int italic, bold, fixed, serif, fraktur;
359  while (!feof(fp)) {
360  FontInfo fontinfo;
361  char* font_name = new char[1024];
362  fontinfo.name = font_name;
363  fontinfo.properties = 0;
364  fontinfo.universal_id = 0;
365  if (tfscanf(fp, "%1024s %i %i %i %i %i\n", font_name,
366  &italic, &bold, &fixed, &serif, &fraktur) != 6)
367  continue;
368  fontinfo.properties =
369  (italic << 0) +
370  (bold << 1) +
371  (fixed << 2) +
372  (serif << 3) +
373  (fraktur << 4);
374  if (!fontinfo_table_.contains(fontinfo)) {
375  fontinfo_table_.push_back(fontinfo);
376  }
377  }
378  fclose(fp);
379  return true;
380 }
381 
382 // Loads the xheight font properties file into xheights_.
383 // Returns false on failure.
385  tprintf("fontinfo table is of size %d\n", fontinfo_table_.size());
386  xheights_.init_to_size(fontinfo_table_.size(), -1);
387  if (filename == NULL) return true;
388  FILE *f = fopen(filename, "rb");
389  if (f == NULL) {
390  fprintf(stderr, "Failed to load font xheights from %s\n", filename);
391  return false;
392  }
393  tprintf("Reading x-heights from %s ...\n", filename);
394  FontInfo fontinfo;
395  fontinfo.properties = 0; // Not used to lookup in the table.
396  fontinfo.universal_id = 0;
397  char buffer[1024];
398  int xht;
399  int total_xheight = 0;
400  int xheight_count = 0;
401  while (!feof(f)) {
402  if (tfscanf(f, "%1023s %d\n", buffer, &xht) != 2)
403  continue;
404  buffer[1023] = '\0';
405  fontinfo.name = buffer;
406  if (!fontinfo_table_.contains(fontinfo)) continue;
407  int fontinfo_id = fontinfo_table_.get_index(fontinfo);
408  xheights_[fontinfo_id] = xht;
409  total_xheight += xht;
410  ++xheight_count;
411  }
412  if (xheight_count == 0) {
413  fprintf(stderr, "No valid xheights in %s!\n", filename);
414  fclose(f);
415  return false;
416  }
417  int mean_xheight = DivRounded(total_xheight, xheight_count);
418  for (int i = 0; i < fontinfo_table_.size(); ++i) {
419  if (xheights_[i] < 0)
420  xheights_[i] = mean_xheight;
421  }
422  fclose(f);
423  return true;
424 } // LoadXHeights
425 
426 // Reads spacing stats from filename and adds them to fontinfo_table.
428  FILE* fontinfo_file = fopen(filename, "rb");
429  if (fontinfo_file == NULL)
430  return true; // We silently ignore missing files!
431  // Find the fontinfo_id.
432  int fontinfo_id = GetBestMatchingFontInfoId(filename);
433  if (fontinfo_id < 0) {
434  tprintf("No font found matching fontinfo filename %s\n", filename);
435  fclose(fontinfo_file);
436  return false;
437  }
438  tprintf("Reading spacing from %s for font %d...\n", filename, fontinfo_id);
439  // TODO(rays) scale should probably be a double, but keep as an int for now
440  // to duplicate current behavior.
441  int scale = kBlnXHeight / xheights_[fontinfo_id];
442  int num_unichars;
443  char uch[UNICHAR_LEN];
444  char kerned_uch[UNICHAR_LEN];
445  int x_gap, x_gap_before, x_gap_after, num_kerned;
446  ASSERT_HOST(tfscanf(fontinfo_file, "%d\n", &num_unichars) == 1);
447  FontInfo *fi = &fontinfo_table_.get(fontinfo_id);
448  fi->init_spacing(unicharset_.size());
449  FontSpacingInfo *spacing = NULL;
450  for (int l = 0; l < num_unichars; ++l) {
451  if (tfscanf(fontinfo_file, "%s %d %d %d",
452  uch, &x_gap_before, &x_gap_after, &num_kerned) != 4) {
453  tprintf("Bad format of font spacing file %s\n", filename);
454  fclose(fontinfo_file);
455  return false;
456  }
457  bool valid = unicharset_.contains_unichar(uch);
458  if (valid) {
459  spacing = new FontSpacingInfo();
460  spacing->x_gap_before = static_cast<inT16>(x_gap_before * scale);
461  spacing->x_gap_after = static_cast<inT16>(x_gap_after * scale);
462  }
463  for (int k = 0; k < num_kerned; ++k) {
464  if (tfscanf(fontinfo_file, "%s %d", kerned_uch, &x_gap) != 2) {
465  tprintf("Bad format of font spacing file %s\n", filename);
466  fclose(fontinfo_file);
467  delete spacing;
468  return false;
469  }
470  if (!valid || !unicharset_.contains_unichar(kerned_uch)) continue;
471  spacing->kerned_unichar_ids.push_back(
472  unicharset_.unichar_to_id(kerned_uch));
473  spacing->kerned_x_gaps.push_back(static_cast<inT16>(x_gap * scale));
474  }
475  if (valid) fi->add_spacing(unicharset_.unichar_to_id(uch), spacing);
476  }
477  fclose(fontinfo_file);
478  return true;
479 }
480 
481 // Returns the font id corresponding to the given font name.
482 // Returns -1 if the font cannot be found.
483 int MasterTrainer::GetFontInfoId(const char* font_name) {
484  FontInfo fontinfo;
485  // We are only borrowing the string, so it is OK to const cast it.
486  fontinfo.name = const_cast<char*>(font_name);
487  fontinfo.properties = 0; // Not used to lookup in the table
488  fontinfo.universal_id = 0;
489  return fontinfo_table_.get_index(fontinfo);
490 }
491 // Returns the font_id of the closest matching font name to the given
492 // filename. It is assumed that a substring of the filename will match
493 // one of the fonts. If more than one is matched, the longest is returned.
495  int fontinfo_id = -1;
496  int best_len = 0;
497  for (int f = 0; f < fontinfo_table_.size(); ++f) {
498  if (strstr(filename, fontinfo_table_.get(f).name) != NULL) {
499  int len = strlen(fontinfo_table_.get(f).name);
500  // Use the longest matching length in case a substring of a font matched.
501  if (len > best_len) {
502  best_len = len;
503  fontinfo_id = f;
504  }
505  }
506  }
507  return fontinfo_id;
508 }
509 
510 // Sets up a flat shapetable with one shape per class/font combination.
512  // To exactly mimic the results of the previous implementation, the shapes
513  // must be clustered in order the fonts arrived, and reverse order of the
514  // characters within each font.
515  // Get a list of the fonts in the order they appeared.
516  GenericVector<int> active_fonts;
517  int num_shapes = flat_shapes_.NumShapes();
518  for (int s = 0; s < num_shapes; ++s) {
519  int font = flat_shapes_.GetShape(s)[0].font_ids[0];
520  int f = 0;
521  for (f = 0; f < active_fonts.size(); ++f) {
522  if (active_fonts[f] == font)
523  break;
524  }
525  if (f == active_fonts.size())
526  active_fonts.push_back(font);
527  }
528  // For each font in order, add all the shapes with that font in reverse order.
529  int num_fonts = active_fonts.size();
530  for (int f = 0; f < num_fonts; ++f) {
531  for (int s = num_shapes - 1; s >= 0; --s) {
532  int font = flat_shapes_.GetShape(s)[0].font_ids[0];
533  if (font == active_fonts[f]) {
534  shape_table->AddShape(flat_shapes_.GetShape(s));
535  }
536  }
537  }
538 }
539 
540 // Sets up a Clusterer for mftraining on a single shape_id.
541 // Call FreeClusterer on the return value after use.
543  const ShapeTable& shape_table,
545  int shape_id,
546  int* num_samples) {
547 
548  int desc_index = ShortNameToFeatureType(feature_defs, kMicroFeatureType);
549  int num_params = feature_defs.FeatureDesc[desc_index]->NumParams;
550  ASSERT_HOST(num_params == MFCount);
551  CLUSTERER* clusterer = MakeClusterer(
552  num_params, feature_defs.FeatureDesc[desc_index]->ParamDesc);
553 
554  // We want to iterate over the samples of just the one shape.
555  IndexMapBiDi shape_map;
556  shape_map.Init(shape_table.NumShapes(), false);
557  shape_map.SetMap(shape_id, true);
558  shape_map.Setup();
559  // Reverse the order of the samples to match the previous behavior.
561  SampleIterator it;
562  it.Init(&shape_map, &shape_table, false, &samples_);
563  for (it.Begin(); !it.AtEnd(); it.Next()) {
564  sample_ptrs.push_back(&it.GetSample());
565  }
566  int sample_id = 0;
567  for (int i = sample_ptrs.size() - 1; i >= 0; --i) {
568  const TrainingSample* sample = sample_ptrs[i];
569  int num_features = sample->num_micro_features();
570  for (int f = 0; f < num_features; ++f)
571  MakeSample(clusterer, sample->micro_features()[f], sample_id);
572  ++sample_id;
573  }
574  *num_samples = sample_id;
575  return clusterer;
576 }
577 
578 // Writes the given float_classes (produced by SetupForFloat2Int) as inttemp
579 // to the given inttemp_file, and the corresponding pffmtable.
580 // The unicharset is the original encoding of graphemes, and shape_set should
581 // match the size of the shape_table, and may possibly be totally fake.
583  const UNICHARSET& shape_set,
584  const ShapeTable& shape_table,
585  CLASS_STRUCT* float_classes,
586  const char* inttemp_file,
587  const char* pffmtable_file) {
588  tesseract::Classify *classify = new tesseract::Classify();
589  // Move the fontinfo table to classify.
590  fontinfo_table_.MoveTo(&classify->get_fontinfo_table());
591  INT_TEMPLATES int_templates = classify->CreateIntTemplates(float_classes,
592  shape_set);
593  FILE* fp = fopen(inttemp_file, "wb");
594  classify->WriteIntTemplates(fp, int_templates, shape_set);
595  fclose(fp);
596  // Now write pffmtable. This is complicated by the fact that the adaptive
597  // classifier still wants one indexed by unichar-id, but the static
598  // classifier needs one indexed by its shape class id.
599  // We put the shapetable_cutoffs in a GenericVector, and compute the
600  // unicharset cutoffs along the way.
601  GenericVector<uinT16> shapetable_cutoffs;
602  GenericVector<uinT16> unichar_cutoffs;
603  for (int c = 0; c < unicharset.size(); ++c)
604  unichar_cutoffs.push_back(0);
605  /* then write out each class */
606  for (int i = 0; i < int_templates->NumClasses; ++i) {
607  INT_CLASS Class = ClassForClassId(int_templates, i);
608  // Todo: Test with min instead of max
609  // int MaxLength = LengthForConfigId(Class, 0);
610  uinT16 max_length = 0;
611  for (int config_id = 0; config_id < Class->NumConfigs; config_id++) {
612  // Todo: Test with min instead of max
613  // if (LengthForConfigId (Class, config_id) < MaxLength)
614  uinT16 length = Class->ConfigLengths[config_id];
615  if (length > max_length)
616  max_length = Class->ConfigLengths[config_id];
617  int shape_id = float_classes[i].font_set.get(config_id);
618  const Shape& shape = shape_table.GetShape(shape_id);
619  for (int c = 0; c < shape.size(); ++c) {
620  int unichar_id = shape[c].unichar_id;
621  if (length > unichar_cutoffs[unichar_id])
622  unichar_cutoffs[unichar_id] = length;
623  }
624  }
625  shapetable_cutoffs.push_back(max_length);
626  }
627  fp = fopen(pffmtable_file, "wb");
628  shapetable_cutoffs.Serialize(fp);
629  for (int c = 0; c < unicharset.size(); ++c) {
630  const char *unichar = unicharset.id_to_unichar(c);
631  if (strcmp(unichar, " ") == 0) {
632  unichar = "NULL";
633  }
634  fprintf(fp, "%s %d\n", unichar, unichar_cutoffs[c]);
635  }
636  fclose(fp);
637  free_int_templates(int_templates);
638  delete classify;
639 }
640 
641 // Generate debug output relating to the canonical distance between the
642 // two given UTF8 grapheme strings.
643 void MasterTrainer::DebugCanonical(const char* unichar_str1,
644  const char* unichar_str2) {
645  int class_id1 = unicharset_.unichar_to_id(unichar_str1);
646  int class_id2 = unicharset_.unichar_to_id(unichar_str2);
647  if (class_id2 == INVALID_UNICHAR_ID)
648  class_id2 = class_id1;
649  if (class_id1 == INVALID_UNICHAR_ID) {
650  tprintf("No unicharset entry found for %s\n", unichar_str1);
651  return;
652  } else {
653  tprintf("Font ambiguities for unichar %d = %s and %d = %s\n",
654  class_id1, unichar_str1, class_id2, unichar_str2);
655  }
656  int num_fonts = samples_.NumFonts();
657  const IntFeatureMap& feature_map = feature_map_;
658  // Iterate the fonts to get the similarity with other fonst of the same
659  // class.
660  tprintf(" ");
661  for (int f = 0; f < num_fonts; ++f) {
662  if (samples_.NumClassSamples(f, class_id2, false) == 0)
663  continue;
664  tprintf("%6d", f);
665  }
666  tprintf("\n");
667  for (int f1 = 0; f1 < num_fonts; ++f1) {
668  // Map the features of the canonical_sample.
669  if (samples_.NumClassSamples(f1, class_id1, false) == 0)
670  continue;
671  tprintf("%4d ", f1);
672  for (int f2 = 0; f2 < num_fonts; ++f2) {
673  if (samples_.NumClassSamples(f2, class_id2, false) == 0)
674  continue;
675  float dist = samples_.ClusterDistance(f1, class_id1, f2, class_id2,
676  feature_map);
677  tprintf(" %5.3f", dist);
678  }
679  tprintf("\n");
680  }
681  // Build a fake ShapeTable containing all the sample types.
682  ShapeTable shapes(unicharset_);
683  for (int f = 0; f < num_fonts; ++f) {
684  if (samples_.NumClassSamples(f, class_id1, true) > 0)
685  shapes.AddShape(class_id1, f);
686  if (class_id1 != class_id2 &&
687  samples_.NumClassSamples(f, class_id2, true) > 0)
688  shapes.AddShape(class_id2, f);
689  }
690 }
691 
692 #ifndef GRAPHICS_DISABLED
693 // Debugging for cloud/canonical features.
694 // Displays a Features window containing:
695 // If unichar_str2 is in the unicharset, and canonical_font is non-negative,
696 // displays the canonical features of the char/font combination in red.
697 // If unichar_str1 is in the unicharset, and cloud_font is non-negative,
698 // displays the cloud feature of the char/font combination in green.
699 // The canonical features are drawn first to show which ones have no
700 // matches in the cloud features.
701 // Until the features window is destroyed, each click in the features window
702 // will display the samples that have that feature in a separate window.
703 void MasterTrainer::DisplaySamples(const char* unichar_str1, int cloud_font,
704  const char* unichar_str2,
705  int canonical_font) {
706  const IntFeatureMap& feature_map = feature_map_;
707  const IntFeatureSpace& feature_space = feature_map.feature_space();
708  ScrollView* f_window = CreateFeatureSpaceWindow("Features", 100, 500);
710  f_window);
711  int class_id2 = samples_.unicharset().unichar_to_id(unichar_str2);
712  if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
713  const TrainingSample* sample = samples_.GetCanonicalSample(canonical_font,
714  class_id2);
715  for (int f = 0; f < sample->num_features(); ++f) {
716  RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED);
717  }
718  }
719  int class_id1 = samples_.unicharset().unichar_to_id(unichar_str1);
720  if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
721  const BitVector& cloud = samples_.GetCloudFeatures(cloud_font, class_id1);
722  for (int f = 0; f < cloud.size(); ++f) {
723  if (cloud[f]) {
724  INT_FEATURE_STRUCT feature =
725  feature_map.InverseIndexFeature(f);
726  RenderIntFeature(f_window, &feature, ScrollView::GREEN);
727  }
728  }
729  }
730  f_window->Update();
731  ScrollView* s_window = CreateFeatureSpaceWindow("Samples", 100, 500);
732  SVEventType ev_type;
733  do {
734  SVEvent* ev;
735  // Wait until a click or popup event.
736  ev = f_window->AwaitEvent(SVET_ANY);
737  ev_type = ev->type;
738  if (ev_type == SVET_CLICK) {
739  int feature_index = feature_space.XYToFeatureIndex(ev->x, ev->y);
740  if (feature_index >= 0) {
741  // Iterate samples and display those with the feature.
742  Shape shape;
743  shape.AddToShape(class_id1, cloud_font);
744  s_window->Clear();
745  samples_.DisplaySamplesWithFeature(feature_index, shape,
746  feature_space, ScrollView::GREEN,
747  s_window);
748  s_window->Update();
749  }
750  }
751  delete ev;
752  } while (ev_type != SVET_DESTROY);
753 }
754 #endif // GRAPHICS_DISABLED
755 
756 void MasterTrainer::TestClassifierVOld(bool replicate_samples,
757  ShapeClassifier* test_classifier,
758  ShapeClassifier* old_classifier) {
759  SampleIterator sample_it;
760  sample_it.Init(NULL, NULL, replicate_samples, &samples_);
761  ErrorCounter::DebugNewErrors(test_classifier, old_classifier,
762  CT_UNICHAR_TOPN_ERR, fontinfo_table_,
763  page_images_, &sample_it);
764 }
765 
766 // Tests the given test_classifier on the internal samples.
767 // See TestClassifier for details.
769  int report_level,
770  bool replicate_samples,
771  ShapeClassifier* test_classifier,
772  STRING* report_string) {
773  TestClassifier(error_mode, report_level, replicate_samples, &samples_,
774  test_classifier, report_string);
775 }
776 
777 // Tests the given test_classifier on the given samples.
778 // error_mode indicates what counts as an error.
779 // report_levels:
780 // 0 = no output.
781 // 1 = bottom-line error rate.
782 // 2 = bottom-line error rate + time.
783 // 3 = font-level error rate + time.
784 // 4 = list of all errors + short classifier debug output on 16 errors.
785 // 5 = list of all errors + short classifier debug output on 25 errors.
786 // If replicate_samples is true, then the test is run on an extended test
787 // sample including replicated and systematically perturbed samples.
788 // If report_string is non-NULL, a summary of the results for each font
789 // is appended to the report_string.
791  int report_level,
792  bool replicate_samples,
793  TrainingSampleSet* samples,
794  ShapeClassifier* test_classifier,
795  STRING* report_string) {
796  SampleIterator sample_it;
797  sample_it.Init(NULL, NULL, replicate_samples, samples);
798  if (report_level > 0) {
799  int num_samples = 0;
800  for (sample_it.Begin(); !sample_it.AtEnd(); sample_it.Next())
801  ++num_samples;
802  tprintf("Iterator has charset size of %d/%d, %d shapes, %d samples\n",
803  sample_it.SparseCharsetSize(), sample_it.CompactCharsetSize(),
804  test_classifier->GetShapeTable()->NumShapes(), num_samples);
805  tprintf("Testing %sREPLICATED:\n", replicate_samples ? "" : "NON-");
806  }
807  double unichar_error = 0.0;
808  ErrorCounter::ComputeErrorRate(test_classifier, report_level,
809  error_mode, fontinfo_table_,
810  page_images_, &sample_it, &unichar_error,
811  NULL, report_string);
812  return unichar_error;
813 }
814 
815 // Returns the average (in some sense) distance between the two given
816 // shapes, which may contain multiple fonts and/or unichars.
817 float MasterTrainer::ShapeDistance(const ShapeTable& shapes, int s1, int s2) {
818  const IntFeatureMap& feature_map = feature_map_;
819  const Shape& shape1 = shapes.GetShape(s1);
820  const Shape& shape2 = shapes.GetShape(s2);
821  int num_chars1 = shape1.size();
822  int num_chars2 = shape2.size();
823  float dist_sum = 0.0f;
824  int dist_count = 0;
825  if (num_chars1 > 1 || num_chars2 > 1) {
826  // In the multi-char case try to optimize the calculation by computing
827  // distances between characters of matching font where possible.
828  for (int c1 = 0; c1 < num_chars1; ++c1) {
829  for (int c2 = 0; c2 < num_chars2; ++c2) {
830  dist_sum += samples_.UnicharDistance(shape1[c1], shape2[c2],
831  true, feature_map);
832  ++dist_count;
833  }
834  }
835  } else {
836  // In the single unichar case, there is little alternative, but to compute
837  // the squared-order distance between pairs of fonts.
838  dist_sum = samples_.UnicharDistance(shape1[0], shape2[0],
839  false, feature_map);
840  ++dist_count;
841  }
842  return dist_sum / dist_count;
843 }
844 
845 // Replaces samples that are always fragmented with the corresponding
846 // fragment samples.
847 void MasterTrainer::ReplaceFragmentedSamples() {
848  if (fragments_ == NULL) return;
849  // Remove samples that are replaced by fragments. Each class that was
850  // always naturally fragmented should be replaced by its fragments.
851  int num_samples = samples_.num_samples();
852  for (int s = 0; s < num_samples; ++s) {
853  TrainingSample* sample = samples_.mutable_sample(s);
854  if (fragments_[sample->class_id()] > 0)
855  samples_.KillSample(sample);
856  }
857  samples_.DeleteDeadSamples();
858 
859  // Get ids of fragments in junk_samples_ that replace the dead chars.
860  const UNICHARSET& frag_set = junk_samples_.unicharset();
861 #if 0
862  // TODO(rays) The original idea was to replace only graphemes that were
863  // always naturally fragmented, but that left a lot of the Indic graphemes
864  // out. Determine whether we can go back to that idea now that spacing
865  // is fixed in the training images, or whether this code is obsolete.
866  bool* good_junk = new bool[frag_set.size()];
867  memset(good_junk, 0, sizeof(*good_junk) * frag_set.size());
868  for (int dead_ch = 1; dead_ch < unicharset_.size(); ++dead_ch) {
869  int frag_ch = fragments_[dead_ch];
870  if (frag_ch <= 0) continue;
871  const char* frag_utf8 = frag_set.id_to_unichar(frag_ch);
873  // Mark the chars for all parts of the fragment as good in good_junk.
874  for (int part = 0; part < frag->get_total(); ++part) {
875  frag->set_pos(part);
876  int good_ch = frag_set.unichar_to_id(frag->to_string().string());
877  if (good_ch != INVALID_UNICHAR_ID)
878  good_junk[good_ch] = true; // We want this one.
879  }
880  }
881 #endif
882  // For now just use all the junk that was from natural fragments.
883  // Get samples of fragments in junk_samples_ that replace the dead chars.
884  int num_junks = junk_samples_.num_samples();
885  for (int s = 0; s < num_junks; ++s) {
886  TrainingSample* sample = junk_samples_.mutable_sample(s);
887  int junk_id = sample->class_id();
888  const char* frag_utf8 = frag_set.id_to_unichar(junk_id);
890  if (frag != NULL && frag->is_natural()) {
891  junk_samples_.extract_sample(s);
892  samples_.AddSample(frag_set.id_to_unichar(junk_id), sample);
893  }
894  }
895  junk_samples_.DeleteDeadSamples();
896  junk_samples_.OrganizeByFontAndClass();
897  samples_.OrganizeByFontAndClass();
898  unicharset_.clear();
899  unicharset_.AppendOtherUnicharset(samples_.unicharset());
900  // delete [] good_junk;
901  // Fragments_ no longer needed?
902  delete [] fragments_;
903  fragments_ = NULL;
904 }
905 
906 // Runs a hierarchical agglomerative clustering to merge shapes in the given
907 // shape_table, while satisfying the given constraints:
908 // * End with at least min_shapes left in shape_table,
909 // * No shape shall have more than max_shape_unichars in it,
910 // * Don't merge shapes where the distance between them exceeds max_dist.
911 const float kInfiniteDist = 999.0f;
912 void MasterTrainer::ClusterShapes(int min_shapes, int max_shape_unichars,
913  float max_dist, ShapeTable* shapes) {
914  int num_shapes = shapes->NumShapes();
915  int max_merges = num_shapes - min_shapes;
916  GenericVector<ShapeDist>* shape_dists =
917  new GenericVector<ShapeDist>[num_shapes];
918  float min_dist = kInfiniteDist;
919  int min_s1 = 0;
920  int min_s2 = 0;
921  tprintf("Computing shape distances...");
922  for (int s1 = 0; s1 < num_shapes; ++s1) {
923  for (int s2 = s1 + 1; s2 < num_shapes; ++s2) {
924  ShapeDist dist(s1, s2, ShapeDistance(*shapes, s1, s2));
925  shape_dists[s1].push_back(dist);
926  if (dist.distance < min_dist) {
927  min_dist = dist.distance;
928  min_s1 = s1;
929  min_s2 = s2;
930  }
931  }
932  tprintf(" %d", s1);
933  }
934  tprintf("\n");
935  int num_merged = 0;
936  while (num_merged < max_merges && min_dist < max_dist) {
937  tprintf("Distance = %f: ", min_dist);
938  int num_unichars = shapes->MergedUnicharCount(min_s1, min_s2);
939  shape_dists[min_s1][min_s2 - min_s1 - 1].distance = kInfiniteDist;
940  if (num_unichars > max_shape_unichars) {
941  tprintf("Merge of %d and %d with %d would exceed max of %d unichars\n",
942  min_s1, min_s2, num_unichars, max_shape_unichars);
943  } else {
944  shapes->MergeShapes(min_s1, min_s2);
945  shape_dists[min_s2].clear();
946  ++num_merged;
947 
948  for (int s = 0; s < min_s1; ++s) {
949  if (!shape_dists[s].empty()) {
950  shape_dists[s][min_s1 - s - 1].distance =
951  ShapeDistance(*shapes, s, min_s1);
952  shape_dists[s][min_s2 - s -1].distance = kInfiniteDist;
953  }
954  }
955  for (int s2 = min_s1 + 1; s2 < num_shapes; ++s2) {
956  if (shape_dists[min_s1][s2 - min_s1 - 1].distance < kInfiniteDist)
957  shape_dists[min_s1][s2 - min_s1 - 1].distance =
958  ShapeDistance(*shapes, min_s1, s2);
959  }
960  for (int s = min_s1 + 1; s < min_s2; ++s) {
961  if (!shape_dists[s].empty()) {
962  shape_dists[s][min_s2 - s - 1].distance = kInfiniteDist;
963  }
964  }
965  }
966  min_dist = kInfiniteDist;
967  for (int s1 = 0; s1 < num_shapes; ++s1) {
968  for (int i = 0; i < shape_dists[s1].size(); ++i) {
969  if (shape_dists[s1][i].distance < min_dist) {
970  min_dist = shape_dists[s1][i].distance;
971  min_s1 = s1;
972  min_s2 = s1 + 1 + i;
973  }
974  }
975  }
976  }
977  tprintf("Stopped with %d merged, min dist %f\n", num_merged, min_dist);
978  delete [] shape_dists;
979  if (debug_level_ > 1) {
980  for (int s1 = 0; s1 < num_shapes; ++s1) {
981  if (shapes->MasterDestinationIndex(s1) == s1) {
982  tprintf("Master shape:%s\n", shapes->DebugStr(s1).string());
983  }
984  }
985  }
986 }
987 
988 
989 } // namespace tesseract.
const int kBlnXHeight
Definition: normalis.h:28
void Init(int size, bool all_mapped)
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:263
int size() const
Definition: shapetable.h:202
int size() const
Definition: genericvector.h:72
bool Serialize(FILE *fp) const
bool LoadFontInfo(const char *filename)
void LoadPageImages(const char *filename)
virtual const ShapeTable * GetShapeTable() const =0
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void set_bounding_box(const TBOX &box)
bool save_to_file(const char *const filename) const
Definition: unicharset.h:306
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:1104
void ComputeCloudFeatures(int feature_space_size)
const INT_FEATURE_STRUCT * features() const
int push_back(T object)
void TestClassifierVOld(bool replicate_samples, ShapeClassifier *test_classifier, ShapeClassifier *old_classifier)
void FreeCharDescription(CHAR_DESC CharDesc)
Definition: featdefs.cpp:141
static void Update()
Definition: scrollview.cpp:715
void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize, TrainingSampleSet *sample_set)
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
#define tprintf(...)
Definition: tprintf.h:31
const float kInfiniteDist
int y
Definition: scrollview.h:67
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:229
const char * kIntFeatureType
Definition: featdefs.cpp:43
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
void SetupFlatShapeTable(ShapeTable *shape_table)
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:250
Definition: mf.h:30
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:400
bool DeSerialize(bool swap, FILE *fp)
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:165
STRING SummaryStr() const
Definition: shapetable.cpp:323
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346
static CHAR_FRAGMENT * parse_from_string(const char *str)
void set_pos(int p)
Definition: unicharset.h:62
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
Definition: cluster.cpp:457
bool DeSerialize(bool swap, FILE *fp)
Definition: shapetable.cpp:256
void SetMap(int sparse_index, bool mapped)
void AddSample(bool verification, const char *unichar_str, TrainingSample *sample)
UnicityTableEqEq< int > font_set
Definition: protos.h:65
const int kMaxUnicharsPerCluster
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:110
int get_total() const
Definition: unicharset.h:66
bool Serialize(FILE *fp) const
const char * kCNFeatureType
Definition: featdefs.cpp:42
uinT16 ConfigLengths[MAX_NUM_CONFIGS]
Definition: intproto.h:113
bool Serialize(FILE *fp) const
const IntFeatureSpace & feature_space() const
Definition: intfeaturemap.h:60
INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:564
void KillSample(TrainingSample *sample)
#define ASSERT_HOST(x)
Definition: errcode.h:84
TrainingSample * extract_sample(int index)
void IndexFeatures(const IntFeatureSpace &feature_space)
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:439
void DisplaySamples(const char *unichar_str1, int cloud_font, const char *unichar_str2, int canonical_font)
SVEventType
Definition: scrollview.h:45
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix * > &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, STRING *fonts_report)
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
int MasterDestinationIndex(int shape_id) const
Definition: shapetable.cpp:541
void MergeShapes(int shape_id1, int shape_id2)
Definition: shapetable.cpp:523
int GetFontInfoId(const char *font_name)
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:682
void Clear()
Definition: scrollview.cpp:595
double TestClassifier(CountTypes error_mode, int report_level, bool replicate_samples, TrainingSampleSet *samples, ShapeClassifier *test_classifier, STRING *report_string)
const int kMinClusteredShapes
GenericVector< UNICHAR_ID > kerned_unichar_ids
Definition: fontinfo.h:54
FEATURE_DEFS_STRUCT feature_defs
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
void Init(const IntFeatureSpace &feature_space)
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
void init_to_size(int size, T t)
bool DeSerialize(bool swap, FILE *fp)
void LoadUnicharset(const char *filename)
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:449
bool LoadXHeights(const char *filename)
int NumClassSamples(int font_id, int class_id, bool randomize) const
bool is_natural() const
Definition: unicharset.h:107
bool Serialize(FILE *fp) const
void ReplicateAndRandomizeSamplesIfRequired()
const BitVector & GetCloudFeatures(int font_id, int class_id) const
const char * kGeoFeatureType
Definition: featdefs.cpp:44
const char * kMicroFeatureType
Definition: featdefs.cpp:41
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1770
void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info)
Definition: fontinfo.h:80
int GetBestMatchingFontInfoId(const char *filename)
void LoadUnicharset(const char *filename)
bool is_ending() const
Definition: unicharset.h:102
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:345
void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, STRING *report_string)
static STRING to_string(const char *unichar, int pos, int total, bool natural)
void MoveTo(UnicityTable< FontInfo > *target)
Definition: fontinfo.cpp:106
#define ClassForClassId(T, c)
Definition: intproto.h:181
bool contains(T object) const
bool Serialize(FILE *fp) const
Definition: fontinfo.cpp:49
void ExtractCharDesc(int feature_type, int micro_type, int cn_type, int geo_type, CHAR_DESC_STRUCT *char_desc)
STRING DebugStr(int shape_id) const
Definition: shapetable.cpp:291
const T & get(int id) const
Return the object from an id.
const TrainingSample & GetSample() const
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:177
float ShapeDistance(const ShapeTable &shapes, int s1, int s2)
UNICHAR_ID class_id() const
int DivRounded(int a, int b)
Definition: helpers.h:166
int AddSample(const char *unichar, TrainingSample *sample)
int MergedUnicharCount(int shape_id1, int shape_id2) const
Definition: shapetable.cpp:513
void free_int_templates(INT_TEMPLATES templates)
Definition: intproto.cpp:748
const UNICHARSET & unicharset() const
bool DeSerialize(bool swap, FILE *fp)
MasterTrainer(NormalizationMode norm_mode, bool shape_analysis, bool replicate_samples, int debug_level)
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
Definition: cluster.h:32
int size() const
Definition: bitvector.h:57
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix * > &page_images, SampleIterator *it)
Definition: rect.h:30
int FindShape(int unichar_id, int font_id) const
Definition: shapetable.cpp:396
SVEventType type
Definition: scrollview.h:64
void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
uinT8 NumConfigs
Definition: intproto.h:110
bool DeSerialize(bool swap, FILE *fp)
Definition: fontinfo.cpp:54
void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
Definition: intproto.cpp:1138
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:302
INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
Definition: strngs.h:44
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
void init_spacing(int unicharset_size)
Definition: fontinfo.h:73
#define NULL
Definition: host.h:144
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1936
const MicroFeature * micro_features() const
#define UNICHAR_LEN
Definition: unichar.h:30
int AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:346
void clear()
Definition: unicharset.h:266
const float kFontMergeDistance
TrainingSample * mutable_sample(int index)
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:59
void AppendMasterShapes(const ShapeTable &other, GenericVector< int > *shape_map)
Definition: shapetable.cpp:666
NormalizationMode
Definition: normalis.h:44
int size() const
Definition: unicharset.h:297
const char * string() const
Definition: strngs.cpp:193
const Shape & GetShape(int shape_id) const
Definition: shapetable.h:323
void DebugCanonical(const char *unichar_str1, const char *unichar_str2)
CLUSTERER * SetupForClustering(const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)
int get_index(T object) const
T & get(int index) const
int NumShapes() const
Definition: shapetable.h:278
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
int XYToFeatureIndex(int x, int y) const
unsigned short uinT16
Definition: host.h:101
int x
Definition: scrollview.h:66
short inT16
Definition: host.h:100
bool is_beginning() const
Definition: unicharset.h:99
bool AddSpacingInfo(const char *filename)
GenericVector< inT16 > kerned_x_gaps
Definition: fontinfo.h:55
bool DeSerialize(bool swap, FILE *fp)