17# include "config_auto.h"
22#include <allheaders.h>
44TrainingSampleSet::FontClassInfo::FontClassInfo()
45 : num_raw_samples(0), canonical_sample(-1), canonical_dist(0.0f) {}
49 if (fwrite(&num_raw_samples,
sizeof(num_raw_samples), 1, fp) != 1) {
52 if (fwrite(&canonical_sample,
sizeof(canonical_sample), 1, fp) != 1) {
55 if (fwrite(&canonical_dist,
sizeof(canonical_dist), 1, fp) != 1) {
66 if (fread(&num_raw_samples,
sizeof(num_raw_samples), 1, fp) != 1) {
69 if (fread(&canonical_sample,
sizeof(canonical_sample), 1, fp) != 1) {
72 if (fread(&canonical_dist,
sizeof(canonical_dist), 1, fp) != 1) {
79 ReverseN(&num_raw_samples,
sizeof(num_raw_samples));
80 ReverseN(&canonical_sample,
sizeof(canonical_sample));
81 ReverseN(&canonical_dist,
sizeof(canonical_dist));
89 , font_class_array_(nullptr)
90 , fontinfo_table_(font_table) {}
93 for (
auto sample : samples_) {
96 delete font_class_array_;
110 int8_t not_null = font_class_array_ !=
nullptr;
111 if (fwrite(&not_null,
sizeof(not_null), 1, fp) != 1) {
128 num_raw_samples_ = samples_.size();
135 delete font_class_array_;
136 font_class_array_ =
nullptr;
138 if (fread(&not_null,
sizeof(not_null), 1, fp) != 1) {
148 unicharset_size_ = unicharset_.
size();
156 "Failed to load unicharset from file %s\n"
157 "Building unicharset from scratch...\n",
164 unicharset_size_ = unicharset_.
size();
175 "Error: Size of unicharset in TrainingSampleSet::AddSample is "
176 "greater than MAX_NUM_CLASSES\n");
189 samples_.push_back(sample);
190 num_raw_samples_ = samples_.size();
191 unicharset_size_ = unicharset_.
size();
200 if (font_id < 0 || class_id < 0 || font_id >= font_id_map_.
SparseSize() ||
201 class_id >= unicharset_size_) {
206 if (font_index < 0) {
210 return (*font_class_array_)(font_index, class_id).samples.size();
218 return samples_[index];
226 if (font_index < 0) {
229 int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
230 return samples_[sample_index];
238 if (font_index < 0) {
241 int sample_index = (*font_class_array_)(font_index, class_id).samples[index];
242 return samples_[sample_index];
248 std::string boxfile_str;
251 return std::string(fontinfo_table_.
at(sample.
font_id()).
name) +
" " + boxfile_str;
259 return (*font_class_array_)(font_index, class_id).cloud_features;
266 return (*font_class_array_)(font_index, class_id).canonical_features;
275 int num_fonts1 = uf1.
font_ids.size();
277 int num_fonts2 = uf2.
font_ids.size();
279 double dist_sum = 0.0;
281 const bool debug =
false;
284 for (
int i = 0;
i < num_fonts1; ++
i) {
286 for (
int j = 0; j < num_fonts2; ++j) {
296 for (
int i = 0;
i < num_fonts1; ++
i) {
298 for (
int j = 0; j < num_fonts2; ++j) {
302 tprintf(
"Cluster dist %d %d %d %d = %g\n", f1, c1, f2, c2,
313 int num_samples = std::max(num_fonts1, num_fonts2);
316 int f2 = uf2.
font_ids[index % num_fonts2];
318 tprintf(
"Cluster dist %d %d %d %d = %g\n", f1, c1, f2, c2,
325 if (dist_count == 0) {
331 return dist_sum / dist_count;
342 if (font_index1 < 0 || font_index2 < 0) {
345 FontClassInfo &fc_info = (*font_class_array_)(font_index1, class_id1);
346 if (font_id1 == font_id2) {
348 if (fc_info.unichar_distance_cache.empty()) {
349 fc_info.unichar_distance_cache.resize(unicharset_size_, -1.0f);
351 if (fc_info.unichar_distance_cache[class_id2] < 0) {
354 fc_info.unichar_distance_cache[class_id2] = result;
356 FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2);
357 if (fc_info2.unichar_distance_cache.empty()) {
358 fc_info2.unichar_distance_cache.resize(unicharset_size_, -1.0f);
360 fc_info2.unichar_distance_cache[class_id1] = result;
362 return fc_info.unichar_distance_cache[class_id2];
363 }
else if (class_id1 == class_id2) {
365 if (fc_info.font_distance_cache.empty()) {
366 fc_info.font_distance_cache.resize(font_id_map_.
CompactSize(), -1.0f);
368 if (fc_info.font_distance_cache[font_index2] < 0) {
371 fc_info.font_distance_cache[font_index2] = result;
373 FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2);
374 if (fc_info2.font_distance_cache.empty()) {
375 fc_info2.font_distance_cache.resize(font_id_map_.
CompactSize(), -1.0f);
377 fc_info2.font_distance_cache[font_index1] = result;
379 return fc_info.font_distance_cache[font_index2];
383 size_t cache_index = 0;
384 while (cache_index < fc_info.distance_cache.size() &&
385 (fc_info.distance_cache[cache_index].unichar_id != class_id2 ||
386 fc_info.distance_cache[cache_index].font_id != font_id2)) {
389 if (cache_index == fc_info.distance_cache.size()) {
392 FontClassDistance fc_dist = {class_id2, font_id2, result};
393 fc_info.distance_cache.push_back(fc_dist);
396 FontClassInfo &fc_info2 = (*font_class_array_)(font_index2, class_id2);
397 fc_dist.unichar_id = class_id1;
398 fc_dist.font_id = font_id1;
399 fc_info2.distance_cache.push_back(fc_dist);
401 return fc_info.distance_cache[cache_index].distance;
408 int dist =
ReliablySeparable(font_id1, class_id1, font_id2, class_id2, feature_map,
false);
409 dist +=
ReliablySeparable(font_id2, class_id2, font_id1, class_id1, feature_map,
false);
412 return static_cast<float>(dist) / denominator;
418static void AddNearFeatures(
const IntFeatureMap &feature_map,
int f,
int levels,
419 std::vector<int> *good_features) {
420 int prev_num_features = 0;
421 good_features->push_back(f);
422 int num_features = 1;
423 for (
int level = 0; level < levels; ++level) {
424 for (
int i = prev_num_features;
i < num_features; ++
i) {
425 int feature = (*good_features)[
i];
426 for (
int dir = -kNumOffsetMaps; dir <= kNumOffsetMaps; ++dir) {
432 good_features->push_back(f1);
436 prev_num_features = num_features;
437 num_features = good_features->size();
455 if (sample2 ==
nullptr) {
460 if (cloud1.
empty()) {
461 return canonical2.size();
465 for (
int feature : canonical2) {
466 if (cloud1[feature]) {
470 std::vector<int> good_features;
471 AddNearFeatures(feature_map, feature, 1, &good_features);
474 for (
auto good_f : good_features) {
475 if (cloud1[good_f]) {
493 if (font_index < 0) {
496 return (*font_class_array_)(font_index, class_id).samples[index];
504 if (font_index < 0) {
507 const int sample_index = (*font_class_array_)(font_index, class_id).canonical_sample;
508 return sample_index >= 0 ? samples_[sample_index] :
nullptr;
516 if (font_index < 0) {
519 if ((*font_class_array_)(font_index, class_id).canonical_sample >= 0) {
520 return (*font_class_array_)(font_index, class_id).canonical_dist;
528 for (
auto &sample : samples_) {
529 sample->IndexFeatures(feature_space);
541 using namespace std::placeholders;
542 for (
auto &&it = samples_.begin(); it < samples_.end();) {
543 if (*it ==
nullptr || (*it)->class_id() < 0) {
550 num_raw_samples_ = samples_.size();
559 int compact_font_size = font_id_map_.
CompactSize();
561 delete font_class_array_;
565 for (
size_t s = 0; s < samples_.size(); ++s) {
566 int font_id = samples_[s]->font_id();
567 int class_id = samples_[s]->class_id();
568 if (font_id < 0 || font_id >= font_id_map_.
SparseSize()) {
569 tprintf(
"Font id = %d/%d, class id = %d/%d on sample %zu\n", font_id,
570 font_id_map_.
SparseSize(), class_id, unicharset_size_, s);
573 ASSERT_HOST(class_id >= 0 && class_id < unicharset_size_);
575 (*font_class_array_)(font_index, class_id).samples.push_back(s);
579 for (
int f = 0; f < compact_font_size; ++f) {
580 for (
int c = 0; c < unicharset_size_; ++c) {
581 (*font_class_array_)(f, c).
num_raw_samples = (*font_class_array_)(f, c).samples.size();
586 num_raw_samples_ = samples_.size();
593 std::vector<int> font_counts;
594 for (
auto &sample : samples_) {
595 const int font_id = sample->font_id();
596 while (font_id >= font_counts.size()) {
597 font_counts.push_back(0);
599 ++font_counts[font_id];
601 font_id_map_.
Init(font_counts.size(),
false);
602 for (
size_t f = 0; f < font_counts.size(); ++f) {
603 font_id_map_.
SetMap(f, font_counts[f] > 0);
605 font_id_map_.
Setup();
620 double global_worst_dist = 0.0;
623 for (
int font_index = 0; font_index < font_size; ++font_index) {
625 for (
int c = 0; c < unicharset_size_; ++c) {
626 int samples_found = 0;
627 FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);
629 fcinfo.canonical_sample = -1;
630 fcinfo.canonical_dist = 0.0f;
632 tprintf(
"Skipping class %d\n", c);
638 double min_max_dist = 2.0;
641 double max_max_dist = 0.0;
644 fcinfo.canonical_sample = fcinfo.samples[0];
645 fcinfo.canonical_dist = 0.0f;
646 for (
auto s1 : fcinfo.samples) {
647 const std::vector<int> &features1 = samples_[s1]->indexed_features();
648 f_table.
Set(features1, features1.size(),
true);
649 double max_dist = 0.0;
654 for (
int s2 : fcinfo.samples) {
655 if (samples_[s2]->class_id() != c || samples_[s2]->font_id() != font_id || s2 == s1) {
658 std::vector<int> features2 = samples_[s2]->indexed_features();
660 if (dist > max_dist) {
662 if (dist > max_max_dist) {
671 f_table.
Set(features1, features1.size(),
false);
672 samples_[s1]->set_max_dist(max_dist);
674 if (max_dist < min_max_dist) {
675 fcinfo.canonical_sample = s1;
676 fcinfo.canonical_dist = max_dist;
678 UpdateRange(max_dist, &min_max_dist, &max_max_dist);
680 if (max_max_dist > global_worst_dist) {
682 global_worst_dist = max_max_dist;
688 "Found %d samples of class %d=%s, font %d, "
689 "dist range [%g, %g], worst pair= %s, %s\n",
690 samples_found, c, unicharset_.
debug_str(c).c_str(), font_index, min_max_dist,
697 tprintf(
"Global worst dist = %g, between sample %d and %d\n", global_worst_dist, worst_s1,
710 for (
int font_index = 0; font_index < font_size; ++font_index) {
711 for (
int c = 0; c < unicharset_size_; ++c) {
712 FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);
713 int sample_count = fcinfo.samples.size();
714 int min_samples = 2 * std::max(kSampleRandomSize, sample_count);
715 if (sample_count > 0 && sample_count < min_samples) {
716 int base_count = sample_count;
717 for (
int base_index = 0; sample_count < min_samples; ++sample_count) {
718 int src_index = fcinfo.samples[base_index++];
719 if (base_index >= base_count) {
723 samples_[src_index]->RandomizedCopy(sample_count % kSampleRandomSize);
724 int sample_index = samples_.size();
726 samples_.push_back(sample);
727 fcinfo.samples.push_back(sample_index);
741 for (
int font_index = 0; font_index < font_size; ++font_index) {
743 for (
int c = 0; c < unicharset_size_; ++c) {
749 FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);
760 for (
int font_index = 0; font_index < font_size; ++font_index) {
762 for (
int c = 0; c < unicharset_size_; ++c) {
767 FontClassInfo &fcinfo = (*font_class_array_)(font_index, c);
768 fcinfo.cloud_features.Init(feature_space_size);
772 for (
int sample_feature : sample_features) {
773 fcinfo.cloud_features.SetBit(sample_feature);
782 for (
int f = 0; f < font_id_map_.
CompactSize(); ++f) {
788#ifndef GRAPHICS_DISABLED
799 std::vector<int> indexed_features;
801 for (
int indexed_feature : indexed_features) {
802 if (indexed_feature == f_index) {
void ReverseN(void *ptr, int num_bytes)
void tprintf(const char *format,...)
void MakeBoxFileStr(const char *unichar_str, const TBOX &box, int page_num, std::string &box_str)
bool DeSerialize(bool swap, FILE *fp, std::vector< T > &data)
bool Serialize(FILE *fp, const std::vector< T > &data)
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
bool DeSerializeClasses(bool swap, FILE *fp)
bool SerializeClasses(FILE *fp) const
int CompactToSparse(int compact_index) const
void Init(int size, bool all_mapped)
void SetMap(int sparse_index, bool mapped)
int SparseSize() const override
bool Serialize(FILE *fp) const
bool DeSerialize(bool swap, FILE *fp)
int SparseToCompact(int sparse_index) const override
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
bool load_from_file(const char *const filename, bool skip_fragments)
const char * id_to_unichar(UNICHAR_ID id) const
bool contains_unichar(const char *const unichar_repr) const
bool save_to_file(const char *const filename) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
void AppendOtherUnicharset(const UNICHARSET &src)
std::string debug_str(UNICHAR_ID id) const
void IndexAndSortFeatures(const INT_FEATURE_STRUCT *features, int num_features, std::vector< int > *sorted_features) const
std::vector< int32_t > font_ids
void AddToShape(int unichar_id, int font_id)
bool ContainsUnichar(int unichar_id) const
const INT_FEATURE_STRUCT * features() const
const TBOX & bounding_box() const
UNICHAR_ID class_id() const
void DisplayFeatures(ScrollView::Color color, ScrollView *window) const
uint32_t num_features() const
const std::vector< int > & indexed_features() const
void set_class_id(int id)
void set_sample_index(int value)
void Set(const std::vector< int > &indexed_features, int canonical_count, bool value)
void Init(const IntFeatureMap *feature_map)
double FeatureDistance(const std::vector< int > &features) const
int OffsetFeature(int index_feature, int dir) const
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
int NumClassSamples(int font_id, int class_id, bool randomize) const
int AddSample(const char *unichar, TrainingSample *sample)
bool Serialize(FILE *fp) const
void AddAllFontsForClass(int class_id, Shape *shape) const
void IndexFeatures(const IntFeatureSpace &feature_space)
float ComputeClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map) const
void KillSample(TrainingSample *sample)
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
void ComputeCloudFeatures(int feature_space_size)
bool DeSerialize(bool swap, FILE *fp)
int num_raw_samples() const
void LoadUnicharset(const char *filename)
const BitVector & GetCloudFeatures(int font_id, int class_id) const
int GlobalSampleIndex(int font_id, int class_id, int index) const
void ComputeCanonicalFeatures()
const std::vector< int > & GetCanonicalFeatures(int font_id, int class_id) const
void OrganizeByFontAndClass()
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
float GetCanonicalDist(int font_id, int class_id) const
void ReplicateAndRandomizeSamples()
const TrainingSample * GetSample(int index) const
std::string SampleToString(const TrainingSample &sample) const
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
int ReliablySeparable(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map, bool thorough) const
TrainingSample * MutableSample(int font_id, int class_id, int index)
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)