21# include "config_auto.h"
24#include <allheaders.h>
35#ifndef GRAPHICS_DISABLED
53 bool replicate_samples,
int debug_level)
54 : norm_mode_(norm_mode),
55 samples_(fontinfo_table_),
56 junk_samples_(fontinfo_table_),
57 verify_samples_(fontinfo_table_),
59 enable_shape_analysis_(shape_analysis),
60 enable_replication_(replicate_samples),
63 debug_level_(debug_level) {}
67 for (
auto &page_image : page_images_) {
76 uint32_t
value = norm_mode_;
114 "Failed to load unicharset from file %s\n"
115 "Building unicharset for training from scratch...\n",
123 charsetsize_ = unicharset_.
size();
125 fragments_ =
new int[charsetsize_];
126 memset(fragments_, 0,
sizeof(*fragments_) * charsetsize_);
140 const int int_feature_type =
142 const int micro_feature_type =
144 const int cn_feature_type =
146 const int geo_feature_type =
149 FILE *fp = fopen(page_name,
"rb");
151 tprintf(
"Failed to open tr file: %s\n", page_name);
154 tr_filenames_.emplace_back(page_name);
155 while (fgets(buffer,
sizeof(buffer), fp) !=
nullptr) {
156 if (buffer[0] ==
'\n') {
160 char *space = strchr(buffer,
' ');
161 if (space ==
nullptr) {
162 tprintf(
"Bad format in tr file, reading fontname, unichar\n");
174 tprintf(
"Bad format in tr file, reading box coords\n");
180 sample->set_page_num(page_number + page_images_.size());
181 sample->set_bounding_box(bounding_box);
182 sample->ExtractCharDesc(int_feature_type, micro_feature_type,
183 cn_feature_type, geo_feature_type, char_desc);
184 AddSample(verification, unichar.c_str(), sample);
187 charsetsize_ = unicharset_.
size();
196 verify_samples_.
AddSample(unichar, sample);
197 prev_unichar_id_ = -1;
199 if (prev_unichar_id_ >= 0) {
200 fragments_[prev_unichar_id_] = -1;
202 prev_unichar_id_ = samples_.
AddSample(unichar, sample);
207 const int junk_id = junk_samples_.
AddSample(unichar, sample);
208 if (prev_unichar_id_ >= 0) {
211 if (fragments_[prev_unichar_id_] == 0) {
212 fragments_[prev_unichar_id_] = junk_id;
213 }
else if (fragments_[prev_unichar_id_] != junk_id) {
214 fragments_[prev_unichar_id_] = -1;
219 prev_unichar_id_ = -1;
230 for (page = 0;; page++) {
231 pix = pixReadFromMultipageTiff(filename, &offset);
235 page_images_.push_back(pix);
240 tprintf(
"Loaded %d page images from %s\n", page, filename);
249 if (debug_level_ > 0) {
250 tprintf(
"PostLoadCleanup...\n");
252 if (enable_shape_analysis_) {
253 ReplaceFragmentedSamples();
256 sample_it.
Init(
nullptr,
nullptr,
true, &verify_samples_);
265 if (debug_level_ > 0) {
266 tprintf(
"ComputeCanonicalSamples...\n");
275 if (debug_level_ > 0) {
276 tprintf(
"PreTrainingSetup...\n");
280 if (debug_level_ > 0) {
281 tprintf(
"ComputeCloudFeatures...\n");
289 tprintf(
"Building master shape table\n");
290 const int num_fonts = samples_.
NumFonts();
297 for (
int f = 0; f < num_fonts; ++f) {
306 if (fragment ==
nullptr) {
317 &char_shapes_begin_fragment);
320 &char_shapes_end_fragment);
344 tprintf(
"Moving %d junk samples to master sample set.\n", num_junks);
345 for (
int s = 0; s < num_junks; ++s) {
350 if (sample_id == INVALID_UNICHAR_ID) {
367 if (enable_replication_) {
368 if (debug_level_ > 0) {
369 tprintf(
"ReplicateAndRandomize...\n");
380 FILE *fp = fopen(filename,
"rb");
382 fprintf(stderr,
"Failed to load font_properties from %s\n", filename);
385 int italic, bold, fixed, serif, fraktur;
388 char *font_name =
new char[1024];
389 fontinfo.
name = font_name;
392 if (
tfscanf(fp,
"%1024s %i %i %i %i %i\n", font_name, &italic, &bold,
393 &fixed, &serif, &fraktur) != 6) {
397 fontinfo.
properties = (italic << 0) + (bold << 1) + (fixed << 2) +
398 (serif << 3) + (fraktur << 4);
399 if (fontinfo_table_.
get_index(fontinfo) < 0) {
413 tprintf(
"fontinfo table is of size %d\n", fontinfo_table_.
size());
415 xheights_.resize(fontinfo_table_.
size(), -1);
416 if (filename ==
nullptr) {
419 FILE *f = fopen(filename,
"rb");
421 fprintf(stderr,
"Failed to load font xheights from %s\n", filename);
424 tprintf(
"Reading x-heights from %s ...\n", filename);
430 int total_xheight = 0;
431 int xheight_count = 0;
433 if (
tfscanf(f,
"%1023s %d\n", buffer, &xht) != 2) {
437 fontinfo.
name = buffer;
438 auto fontinfo_id = fontinfo_table_.
get_index(fontinfo);
439 if (fontinfo_id < 0) {
443 xheights_[fontinfo_id] = xht;
444 total_xheight += xht;
447 if (xheight_count == 0) {
448 fprintf(stderr,
"No valid xheights in %s!\n", filename);
452 int mean_xheight =
DivRounded(total_xheight, xheight_count);
453 for (
size_t i = 0;
i < fontinfo_table_.
size(); ++
i) {
454 if (xheights_[
i] < 0) {
455 xheights_[
i] = mean_xheight;
464 FILE *fontinfo_file = fopen(filename,
"rb");
465 if (fontinfo_file ==
nullptr) {
470 if (fontinfo_id < 0) {
471 tprintf(
"No font found matching fontinfo filename %s\n", filename);
472 fclose(fontinfo_file);
475 tprintf(
"Reading spacing from %s for font %d...\n", filename, fontinfo_id);
482 int x_gap, x_gap_before, x_gap_after, num_kerned;
484 FontInfo *fi = &fontinfo_table_.
at(fontinfo_id);
487 for (
int l = 0; l < num_unichars; ++l) {
488 if (
tfscanf(fontinfo_file,
"%s %d %d %d", uch, &x_gap_before, &x_gap_after,
490 tprintf(
"Bad format of font spacing file %s\n", filename);
491 fclose(fontinfo_file);
497 spacing->
x_gap_before =
static_cast<int16_t
>(x_gap_before * scale);
498 spacing->
x_gap_after =
static_cast<int16_t
>(x_gap_after * scale);
500 for (
int k = 0; k < num_kerned; ++k) {
501 if (
tfscanf(fontinfo_file,
"%s %d", kerned_uch, &x_gap) != 2) {
502 tprintf(
"Bad format of font spacing file %s\n", filename);
503 fclose(fontinfo_file);
512 spacing->
kerned_x_gaps.push_back(
static_cast<int16_t
>(x_gap * scale));
518 fclose(fontinfo_file);
527 fontinfo.
name =
const_cast<char *
>(font_name);
530 return fontinfo_table_.
get_index(fontinfo);
536 int fontinfo_id = -1;
538 for (
size_t f = 0; f < fontinfo_table_.
size(); ++f) {
539 if (strstr(filename, fontinfo_table_.
at(f).
name) !=
nullptr) {
540 int len = strlen(fontinfo_table_.
at(f).
name);
542 if (len > best_len) {
557 std::vector<int> active_fonts;
558 int num_shapes = flat_shapes_.
NumShapes();
559 for (
int s = 0; s < num_shapes; ++s) {
560 int font = flat_shapes_.
GetShape(s)[0].font_ids[0];
562 for (f = 0; f < active_fonts.size(); ++f) {
563 if (active_fonts[f] == font) {
567 if (f == active_fonts.size()) {
568 active_fonts.push_back(font);
572 int num_fonts = active_fonts.
size();
573 for (
int f = 0; f < num_fonts; ++f) {
574 for (
int s = num_shapes - 1; s >= 0; --s) {
575 int font = flat_shapes_.
GetShape(s)[0].font_ids[0];
576 if (font == active_fonts[f]) {
587 int shape_id,
int *num_samples) {
597 shape_map.
SetMap(shape_id,
true);
600 std::vector<const TrainingSample *> sample_ptrs;
602 it.
Init(&shape_map, &shape_table,
false, &samples_);
606 uint32_t sample_id = 0;
607 for (
int i = sample_ptrs.size() - 1;
i >= 0; --
i) {
610 for (uint32_t f = 0; f < num_features; ++f) {
615 *num_samples = sample_id;
627 const char *inttemp_file,
628 const char *pffmtable_file) {
631 fontinfo_table_.
MoveTo(&classify->get_fontinfo_table());
633 classify->CreateIntTemplates(float_classes, shape_set);
634 FILE *fp = fopen(inttemp_file,
"wb");
636 tprintf(
"Error, failed to open file \"%s\"\n", inttemp_file);
638 classify->WriteIntTemplates(fp, int_templates, shape_set);
646 std::vector<uint16_t> shapetable_cutoffs;
653 uint16_t max_length = 0;
654 for (
int config_id = 0; config_id < Class->
NumConfigs; config_id++) {
658 if (length > max_length) {
661 int shape_id = float_classes[
i].
font_set.
at(config_id);
663 for (
int c = 0; c < shape.
size(); ++c) {
664 int unichar_id = shape[c].unichar_id;
665 if (length > unichar_cutoffs[unichar_id]) {
666 unichar_cutoffs[unichar_id] = length;
670 shapetable_cutoffs.push_back(max_length);
672 fp = fopen(pffmtable_file,
"wb");
674 tprintf(
"Error, failed to open file \"%s\"\n", pffmtable_file);
679 if (strcmp(unichar,
" ") == 0) {
682 fprintf(fp,
"%s %d\n", unichar, unichar_cutoffs[c]);
686 delete int_templates;
693 const char *unichar_str2) {
696 if (class_id2 == INVALID_UNICHAR_ID) {
697 class_id2 = class_id1;
699 if (class_id1 == INVALID_UNICHAR_ID) {
700 tprintf(
"No unicharset entry found for %s\n", unichar_str1);
703 tprintf(
"Font ambiguities for unichar %d = %s and %d = %s\n", class_id1,
704 unichar_str1, class_id2, unichar_str2);
706 int num_fonts = samples_.
NumFonts();
711 for (
int f = 0; f < num_fonts; ++f) {
718 for (
int f1 = 0; f1 < num_fonts; ++f1) {
724 for (
int f2 = 0; f2 < num_fonts; ++f2) {
736 for (
int f = 0; f < num_fonts; ++f) {
740 if (class_id1 != class_id2 &&
747#ifndef GRAPHICS_DISABLED
759 const char *unichar_str2,
760 int canonical_font) {
767 if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
775 if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
777 for (
int f = 0; f < cloud.
size(); ++f) {
793 if (feature_index >= 0) {
811 sample_it.
Init(
nullptr,
nullptr, replicate_samples, &samples_);
814 page_images_, &sample_it);
821 bool replicate_samples,
823 std::string *report_string) {
824 TestClassifier(error_mode, report_level, replicate_samples, &samples_,
825 test_classifier, report_string);
842 bool replicate_samples,
845 std::string *report_string) {
847 sample_it.
Init(
nullptr,
nullptr, replicate_samples, samples);
848 if (report_level > 0) {
853 tprintf(
"Iterator has charset size of %d/%d, %d shapes, %d samples\n",
856 tprintf(
"Testing %sREPLICATED:\n", replicate_samples ?
"" :
"NON-");
858 double unichar_error = 0.0;
860 fontinfo_table_, page_images_, &sample_it,
861 &unichar_error,
nullptr, report_string);
862 return unichar_error;
871 int num_chars1 = shape1.
size();
872 int num_chars2 = shape2.
size();
873 float dist_sum = 0.0f;
875 if (num_chars1 > 1 || num_chars2 > 1) {
878 for (
int c1 = 0; c1 < num_chars1; ++c1) {
879 for (
int c2 = 0; c2 < num_chars2; ++c2) {
892 return dist_sum / dist_count;
897void MasterTrainer::ReplaceFragmentedSamples() {
898 if (fragments_ ==
nullptr) {
904 for (
int s = 0; s < num_samples; ++s) {
906 if (fragments_[sample->class_id()] > 0) {
913 const UNICHARSET &frag_set = junk_samples_.
unicharset();
919 bool* good_junk =
new bool[frag_set.size()];
920 memset(good_junk, 0,
sizeof(*good_junk) * frag_set.size());
921 for (
int dead_ch = 1; dead_ch < unicharset_.
size(); ++dead_ch) {
922 int frag_ch = fragments_[dead_ch];
923 if (frag_ch <= 0)
continue;
924 const char* frag_utf8 = frag_set.id_to_unichar(frag_ch);
927 for (
int part = 0; part < frag->get_total(); ++part) {
929 int good_ch = frag_set.unichar_to_id(frag->to_string().c_str());
930 if (good_ch != INVALID_UNICHAR_ID)
931 good_junk[good_ch] =
true;
939 for (
int s = 0; s < num_junks; ++s) {
942 const char *frag_utf8 = frag_set.id_to_unichar(junk_id);
944 if (frag !=
nullptr && frag->is_natural()) {
946 samples_.
AddSample(frag_set.id_to_unichar(junk_id), sample);
958 fragments_ =
nullptr;
967void MasterTrainer::ClusterShapes(
int min_shapes,
int max_shape_unichars,
970 int max_merges = num_shapes - min_shapes;
972 auto *shape_dists =
new std::vector<ShapeDist>[num_shapes];
976 tprintf(
"Computing shape distances...");
977 for (
int s1 = 0; s1 < num_shapes; ++s1) {
978 for (
int s2 = s1 + 1; s2 < num_shapes; ++s2) {
980 shape_dists[s1].push_back(dist);
991 while (num_merged < max_merges && min_dist < max_dist) {
992 tprintf(
"Distance = %f: ", min_dist);
994 shape_dists[min_s1][min_s2 - min_s1 - 1].distance =
kInfiniteDist;
995 if (num_unichars > max_shape_unichars) {
996 tprintf(
"Merge of %d and %d with %d would exceed max of %d unichars\n",
997 min_s1, min_s2, num_unichars, max_shape_unichars);
1000 shape_dists[min_s2].clear();
1003 for (
int s = 0; s < min_s1; ++s) {
1004 if (!shape_dists[s].empty()) {
1005 shape_dists[s][min_s1 - s - 1].distance =
1010 for (
int s2 = min_s1 + 1; s2 < num_shapes; ++s2) {
1012 shape_dists[min_s1][s2 - min_s1 - 1].distance =
1016 for (
int s = min_s1 + 1; s < min_s2; ++s) {
1017 if (!shape_dists[s].empty()) {
1023 for (
int s1 = 0; s1 < num_shapes; ++s1) {
1024 for (
unsigned i = 0;
i < shape_dists[s1].size(); ++
i) {
1025 if (shape_dists[s1][
i].
distance < min_dist) {
1026 min_dist = shape_dists[s1][
i].distance;
1028 min_s2 = s1 + 1 +
i;
1033 tprintf(
"Stopped with %d merged, min dist %f\n", num_merged, min_dist);
1034 delete[] shape_dists;
1035 if (debug_level_ > 1) {
1036 for (
int s1 = 0; s1 < num_shapes; ++s1) {
int tfscanf(FILE *stream, const char *format,...)
#define ClassForClassId(T, c)
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first, const UnicodeText::const_iterator &last)
const char *const kCNFeatureType
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str, TBOX *bounding_box)
CHAR_DESC_STRUCT * ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
void tprintf(const char *format,...)
int DivRounded(int a, int b)
const char *const kGeoFeatureType
const float kFontMergeDistance
bool Serialize(FILE *fp, const std::vector< T > &data)
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
const int kMinClusteredShapes
const char *const kIntFeatureType
const float kInfiniteDist
FEATURE_DEFS_STRUCT feature_defs
const int kMaxUnicharsPerCluster
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID)
const char *const kMicroFeatureType
const T & at(int id) const
Return the object from an id.
std::vector< int16_t > kerned_x_gaps
std::vector< UNICHAR_ID > kerned_unichar_ids
void init_spacing(int unicharset_size)
void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info)
TESS_API bool Serialize(FILE *fp) const
TESS_API void MoveTo(UnicityTable< FontInfo > *target)
int get_index(const T &object) const
void Init(int size, bool all_mapped)
void SetMap(int sparse_index, bool mapped)
static CHAR_FRAGMENT * parse_from_string(const char *str)
bool is_beginning() const
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
bool load_from_file(const char *const filename, bool skip_fragments)
const char * id_to_unichar(UNICHAR_ID id) const
bool contains_unichar(const char *const unichar_repr) const
bool save_to_file(const char *const filename) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
void AppendOtherUnicharset(const UNICHARSET &src)
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
int XYToFeatureIndex(int x, int y) const
bool Serialize(FILE *fp) const
uint16_t ConfigLengths[MAX_NUM_CONFIGS]
const PARAM_DESC * ParamDesc
UnicityTable< int > font_set
virtual const ShapeTable * GetShapeTable() const =0
void AddToShape(int unichar_id, int font_id)
int MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const
std::string DebugStr(unsigned shape_id) const
unsigned AddShape(int unichar_id, int font_id)
std::string SummaryStr() const
unsigned MasterDestinationIndex(unsigned shape_id) const
bool Serialize(FILE *fp) const
unsigned NumShapes() const
const Shape & GetShape(unsigned shape_id) const
void MergeShapes(unsigned shape_id1, unsigned shape_id2)
int FindShape(int unichar_id, int font_id) const
void AppendMasterShapes(const ShapeTable &other, std::vector< int > *shape_map)
const INT_FEATURE_STRUCT * features() const
UNICHAR_ID class_id() const
uint32_t num_features() const
void set_class_id(int id)
uint32_t num_micro_features() const
const MicroFeature * micro_features() const
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const std::vector< Image > &page_images, SampleIterator *it)
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const std::vector< Image > &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, std::string *fonts_report)
INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const
const IntFeatureSpace & feature_space() const
bool LoadFontInfo(const char *filename)
double TestClassifier(CountTypes error_mode, int report_level, bool replicate_samples, TrainingSampleSet *samples, ShapeClassifier *test_classifier, std::string *report_string)
int GetBestMatchingFontInfoId(const char *filename)
void DisplaySamples(const char *unichar_str1, int cloud_font, const char *unichar_str2, int canonical_font)
void LoadUnicharset(const char *filename)
void ReplicateAndRandomizeSamplesIfRequired()
void LoadPageImages(const char *filename)
int GetFontInfoId(const char *font_name)
bool Serialize(FILE *fp) const
float ShapeDistance(const ShapeTable &shapes, int s1, int s2)
void AddSample(bool verification, const char *unichar_str, TrainingSample *sample)
void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, std::string *report_string)
bool LoadXHeights(const char *filename)
void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
void TestClassifierVOld(bool replicate_samples, ShapeClassifier *test_classifier, ShapeClassifier *old_classifier)
MasterTrainer(NormalizationMode norm_mode, bool shape_analysis, bool replicate_samples, int debug_level)
void DebugCanonical(const char *unichar_str1, const char *unichar_str2)
const UNICHARSET & unicharset() const
void SetupFlatShapeTable(ShapeTable *shape_table)
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
bool AddSpacingInfo(const char *filename)
CLUSTERER * SetupForClustering(const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)
const TrainingSample & GetSample() const
void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize, TrainingSampleSet *sample_set)
int SparseCharsetSize() const
int CompactCharsetSize() const
double NormalizeSamples()
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
int NumClassSamples(int font_id, int class_id, bool randomize) const
int AddSample(const char *unichar, TrainingSample *sample)
TrainingSample * extract_sample(int index)
bool Serialize(FILE *fp) const
void IndexFeatures(const IntFeatureSpace &feature_space)
void KillSample(TrainingSample *sample)
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
void ComputeCloudFeatures(int feature_space_size)
void LoadUnicharset(const char *filename)
const BitVector & GetCloudFeatures(int font_id, int class_id) const
const UNICHARSET & unicharset() const
void ComputeCanonicalFeatures()
TrainingSample * mutable_sample(int index)
void OrganizeByFontAndClass()
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
void ReplicateAndRandomizeSamples()
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
std::unique_ptr< SVEvent > AwaitEvent(SVEventType type)