// Source: tesseract-ocr.github.io/5.3.3/a00803_source.html

// File:        mastertrainer.cpp

// Description: Trainer to build the MasterClassifier.

// Author:      Ray Smith

//

// (C) Copyright 2010, Google Inc.

// Licensed under the Apache License, Version 2.0 (the "License");

// you may not use this file except in compliance with the License.

// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software

// distributed under the License is distributed on an "AS IS" BASIS,

// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

// See the License for the specific language governing permissions and

// limitations under the License.

//


// Include automatically generated configuration file if running autoconf.

#ifdef HAVE_CONFIG_H

#  include "config_auto.h"

#endif


#include <allheaders.h>

#include <cmath>

#include <ctime>

#include "boxread.h"

#include "classify.h"

#include "errorcounter.h"

#include "featdefs.h"

#include "mastertrainer.h"

#include "sampleiterator.h"

#include "shapeclassifier.h"

#include "shapetable.h"

#ifndef GRAPHICS_DISABLED

#  include "svmnode.h"

#endif


#include "scanutils.h"


namespace tesseract {


// Constants controlling clustering. With a low kMinClusteredShapes and a high

// kMaxUnicharsPerCluster, then kFontMergeDistance is the only limiting factor.

// Min number of shapes in the output.
// Passed as the first argument of every ClusterShapes call made by
// SetupMasterShapes.

const int kMinClusteredShapes = 1;

// Max number of unichars in any individual cluster.
// SetupMasterShapes uses this for the fragment tables and the combined
// character table; its per-character clustering pass uses 1 instead.

const int kMaxUnicharsPerCluster = 2000;

// Mean font distance below which to merge fonts and unichars.

const float kFontMergeDistance = 0.025;


// Constructs an empty trainer.
// norm_mode is stored in norm_mode_, shape_analysis and replicate_samples
// become the enable_shape_analysis_ and enable_replication_ flags, and
// debug_level is stored in debug_level_.
// All three sample sets (samples_, junk_samples_, verify_samples_) are
// constructed from the trainer's own fontinfo_table_.
// fragments_ starts out null (LoadUnicharset allocates it) and
// prev_unichar_id_ starts at -1, which AddSample treats as "no previous
// character" because it only acts on values >= 0.
MasterTrainer::MasterTrainer(NormalizationMode norm_mode, bool shape_analysis,

                             bool replicate_samples, int debug_level)

    : norm_mode_(norm_mode),

      samples_(fontinfo_table_),

      junk_samples_(fontinfo_table_),

      verify_samples_(fontinfo_table_),

      charsetsize_(0),

      enable_shape_analysis_(shape_analysis),

      enable_replication_(replicate_samples),

      fragments_(nullptr),

      prev_unichar_id_(-1),

      debug_level_(debug_level) {}


// Destroys every image held in page_images_ and releases the fragments_
// array allocated by LoadUnicharset (deleting a null fragments_ is a no-op).
MasterTrainer::~MasterTrainer() {
  for (auto it = page_images_.begin(); it != page_images_.end(); ++it) {
    it->destroy();
  }
  delete[] fragments_;
}


// WARNING! Serialize/DeSerialize are only partial, providing
// enough data to get the samples back and display them.
// Writes the trainer to fp in a fixed order: the normalization mode,
// unicharset_, feature_space_, the three sample sets (samples_,
// junk_samples_, verify_samples_), master_shapes_, flat_shapes_,
// fontinfo_table_ and finally xheights_.
// Writing stops at the first component that fails.
// Returns true only if every component was written, false otherwise.
bool MasterTrainer::Serialize(FILE *fp) const {
  // The normalization mode is written as a 32-bit unsigned value.
  uint32_t mode_as_u32 = norm_mode_;
  // && short-circuits, so nothing after a failed write is attempted.
  return tesseract::Serialize(fp, &mode_as_u32) &&
         unicharset_.save_to_file(fp) &&
         feature_space_.Serialize(fp) &&
         samples_.Serialize(fp) &&
         junk_samples_.Serialize(fp) &&
         verify_samples_.Serialize(fp) &&
         master_shapes_.Serialize(fp) &&
         flat_shapes_.Serialize(fp) &&
         fontinfo_table_.Serialize(fp) &&
         tesseract::Serialize(fp, xheights_);
}


// Load an initial unicharset, or set one up if the file cannot be read.

void MasterTrainer::LoadUnicharset(const char *filename) {

  if (!unicharset_.load_from_file(filename)) {

    tprintf(

        "Failed to load unicharset from file %s\n"

        "Building unicharset for training from scratch...\n",

        filename);

    unicharset_.clear();

    UNICHARSET initialized;

    // Add special characters, as they were removed by the clear, but the

    // default constructor puts them in.

    unicharset_.AppendOtherUnicharset(initialized);

  }

  charsetsize_ = unicharset_.size();

  delete[] fragments_;

  fragments_ = new int[charsetsize_];

  memset(fragments_, 0, sizeof(*fragments_) * charsetsize_);

  samples_.LoadUnicharset(filename);

  junk_samples_.LoadUnicharset(filename);

  verify_samples_.LoadUnicharset(filename);

}


// Reads the samples and their features from the given .tr format file,
// adding them to the trainer with the font_id from the content of the file.
// See mftraining.cpp for a description of the file format.
// Each sample begins with a header line "<fontname> <box text>"; the box text
// is parsed by ParseBoxFileStr and the character description that follows is
// read by ReadCharDescription.
// Blank lines are skipped; header lines that cannot be parsed are reported
// and skipped. A font name that is not in the font table uses font id 0.
// If verification, then these are verification samples, not training.
// If the file cannot be opened, a message is printed and nothing is added.
void MasterTrainer::ReadTrainingSamples(const char *page_name,
                                        const FEATURE_DEFS_STRUCT &feature_defs,
                                        bool verification) {
  // Resolve the feature type indices once, before the file is touched.
  const int int_feature_type =
      ShortNameToFeatureType(feature_defs, kIntFeatureType);
  const int micro_feature_type =
      ShortNameToFeatureType(feature_defs, kMicroFeatureType);
  const int cn_feature_type =
      ShortNameToFeatureType(feature_defs, kCNFeatureType);
  const int geo_feature_type =
      ShortNameToFeatureType(feature_defs, kGeoFeatureType);

  FILE *tr_file = fopen(page_name, "rb");
  if (tr_file == nullptr) {
    tprintf("Failed to open tr file: %s\n", page_name);
    return;
  }
  tr_filenames_.emplace_back(page_name);

  char line[2048];
  while (fgets(line, sizeof(line), tr_file) != nullptr) {
    if (line[0] == '\n') {
      continue;
    }

    // Split the header at its first space: font name before, box text after.
    char *box_text = strchr(line, ' ');
    if (box_text == nullptr) {
      tprintf("Bad format in tr file, reading fontname, unichar\n");
      continue;
    }
    *box_text = '\0';
    ++box_text;

    // Unknown fonts are filed under font id 0.
    const int looked_up_font = GetFontInfoId(line);
    const int font_id = looked_up_font < 0 ? 0 : looked_up_font;

    int page_number;
    std::string unichar;
    TBOX bounding_box;
    if (!ParseBoxFileStr(box_text, &page_number, unichar, &bounding_box)) {
      tprintf("Bad format in tr file, reading box coords\n");
      continue;
    }

    auto char_desc = ReadCharDescription(feature_defs, tr_file);
    auto *new_sample = new TrainingSample;
    new_sample->set_font_id(font_id);
    // Page numbers are offset by the number of page images loaded so far.
    new_sample->set_page_num(page_number + page_images_.size());
    new_sample->set_bounding_box(bounding_box);
    new_sample->ExtractCharDesc(int_feature_type, micro_feature_type,
                                cn_feature_type, geo_feature_type, char_desc);
    AddSample(verification, unichar.c_str(), new_sample);
    delete char_desc;
  }
  charsetsize_ = unicharset_.size();
  fclose(tr_file);
}


// Adds the given single sample to the trainer, setting the classid

// appropriately from the given unichar_str.

void MasterTrainer::AddSample(bool verification, const char *unichar,

                              TrainingSample *sample) {

  if (verification) {

    verify_samples_.AddSample(unichar, sample);

    prev_unichar_id_ = -1;

  } else if (unicharset_.contains_unichar(unichar)) {

    if (prev_unichar_id_ >= 0) {

      fragments_[prev_unichar_id_] = -1;

    }

    prev_unichar_id_ = samples_.AddSample(unichar, sample);

    if (flat_shapes_.FindShape(prev_unichar_id_, sample->font_id()) < 0) {

      flat_shapes_.AddShape(prev_unichar_id_, sample->font_id());

    }

  } else {

    const int junk_id = junk_samples_.AddSample(unichar, sample);

    if (prev_unichar_id_ >= 0) {

      CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(unichar);

      if (frag != nullptr && frag->is_natural()) {

        if (fragments_[prev_unichar_id_] == 0) {

          fragments_[prev_unichar_id_] = junk_id;

        } else if (fragments_[prev_unichar_id_] != junk_id) {

          fragments_[prev_unichar_id_] = -1;

        }

      }

      delete frag;

    }

    prev_unichar_id_ = -1;

  }

}


// Loads all pages from the given tif filename and append to page_images_.
// Must be called after ReadTrainingSamples, as the current number of images
// is used as an offset for page numbers in the samples.
// Reading stops when a page cannot be read, or when the reader leaves the
// offset at 0 after a page (treated here as "no more pages").
// The number of pages actually appended is reported via tprintf.
void MasterTrainer::LoadPageImages(const char *filename) {
  size_t offset = 0;
  // Count the pages that were really appended. The loop has two exits and
  // only one of them had advanced the old loop index, so that index
  // under-reported the count by one whenever the file was read to its end.
  int num_loaded = 0;
  Image pix;
  for (;;) {
    pix = pixReadFromMultipageTiff(filename, &offset);
    if (!pix) {
      break;
    }
    page_images_.push_back(pix);
    ++num_loaded;
    if (!offset) {
      break;
    }
  }
  tprintf("Loaded %d page images from %s\n", num_loaded, filename);
}


// Cleans up the samples after initial load from the tr files, and prior to

// saving the MasterTrainer:

// Remaps fragmented chars if running shape analysis.

// Sets up the samples appropriately for class/fontwise access.

// Deletes outlier samples.

void MasterTrainer::PostLoadCleanup() {

  if (debug_level_ > 0) {

    tprintf("PostLoadCleanup...\n");

  }

  if (enable_shape_analysis_) {

    ReplaceFragmentedSamples();

  }

  SampleIterator sample_it;

  sample_it.Init(nullptr, nullptr, true, &verify_samples_);

  sample_it.NormalizeSamples();

  verify_samples_.OrganizeByFontAndClass();


  samples_.IndexFeatures(feature_space_);

  // TODO(rays) DeleteOutliers is currently turned off to prove NOP-ness

  // against current training.

  //  samples_.DeleteOutliers(feature_space_, debug_level_ > 0);

  samples_.OrganizeByFontAndClass();

  if (debug_level_ > 0) {

    tprintf("ComputeCanonicalSamples...\n");

  }

  samples_.ComputeCanonicalSamples(feature_map_, debug_level_ > 0);

}


// Gets the samples ready for training. Use after both
// ReadTrainingSamples+PostLoadCleanup or DeSerialize.
// Re-indexes the features and computes canonical and cloud features.
// Progress messages are printed when debug_level_ is positive.
void MasterTrainer::PreTrainingSetup() {
  const bool debug = debug_level_ > 0;
  if (debug) {
    tprintf("PreTrainingSetup...\n");
  }
  samples_.IndexFeatures(feature_space_);
  samples_.ComputeCanonicalFeatures();
  if (debug) {
    tprintf("ComputeCloudFeatures...\n");
  }
  // The cloud features are sized by the feature space.
  const auto feature_space_size = feature_space_.Size();
  samples_.ComputeCloudFeatures(feature_space_size);
}


// Sets up the master_shapes_ table, which tells which fonts should stay

// together until they get to a leaf node classifier.

void MasterTrainer::SetupMasterShapes() {

  tprintf("Building master shape table\n");

  const int num_fonts = samples_.NumFonts();


  ShapeTable char_shapes_begin_fragment(samples_.unicharset());

  ShapeTable char_shapes_end_fragment(samples_.unicharset());

  ShapeTable char_shapes(samples_.unicharset());

  for (int c = 0; c < samples_.charsetsize(); ++c) {

    ShapeTable shapes(samples_.unicharset());

    for (int f = 0; f < num_fonts; ++f) {

      if (samples_.NumClassSamples(f, c, true) > 0) {

        shapes.AddShape(c, f);

      }

    }

    ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, &shapes);


    const CHAR_FRAGMENT *fragment = samples_.unicharset().get_fragment(c);


    if (fragment == nullptr) {

      char_shapes.AppendMasterShapes(shapes, nullptr);

    } else if (fragment->is_beginning()) {

      char_shapes_begin_fragment.AppendMasterShapes(shapes, nullptr);

    } else if (fragment->is_ending()) {

      char_shapes_end_fragment.AppendMasterShapes(shapes, nullptr);

    } else {

      char_shapes.AppendMasterShapes(shapes, nullptr);

    }

  }

  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance,

                &char_shapes_begin_fragment);

  char_shapes.AppendMasterShapes(char_shapes_begin_fragment, nullptr);

  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance,

                &char_shapes_end_fragment);

  char_shapes.AppendMasterShapes(char_shapes_end_fragment, nullptr);

  ClusterShapes(kMinClusteredShapes, kMaxUnicharsPerCluster, kFontMergeDistance,

                &char_shapes);

  master_shapes_.AppendMasterShapes(char_shapes, nullptr);

  tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().c_str());

}


// Adds the junk_samples_ to the main samples_ set. Junk samples are initially

// fragments and n-grams (all incorrectly segmented characters).

// Various training functions may result in incorrectly segmented characters

// being added to the unicharset of the main samples, perhaps because they

// form a "radical" decomposition of some (Indic) grapheme, or because they

// just look the same as a real character (like rn/m)

// This function moves all the junk samples, to the main samples_ set, but

// desirable junk, being any sample for which the unichar already exists in

// the samples_ unicharset gets the unichar-ids re-indexed to match, but

// anything else gets re-marked as unichar_id 0 (space character) to identify

// it as junk to the error counter.

void MasterTrainer::IncludeJunk() {

  // Get ids of fragments in junk_samples_ that replace the dead chars.

  const UNICHARSET &junk_set = junk_samples_.unicharset();

  const UNICHARSET &sample_set = samples_.unicharset();

  int num_junks = junk_samples_.num_samples();

  tprintf("Moving %d junk samples to master sample set.\n", num_junks);

  for (int s = 0; s < num_junks; ++s) {

    TrainingSample *sample = junk_samples_.mutable_sample(s);

    int junk_id = sample->class_id();

    const char *junk_utf8 = junk_set.id_to_unichar(junk_id);

    int sample_id = sample_set.unichar_to_id(junk_utf8);

    if (sample_id == INVALID_UNICHAR_ID) {

      sample_id = 0;

    }

    sample->set_class_id(sample_id);

    junk_samples_.extract_sample(s);

    samples_.AddSample(sample_id, sample);

  }

  junk_samples_.DeleteDeadSamples();

  samples_.OrganizeByFontAndClass();

}


// Replicates the samples and perturbs them if the enable_replication_ flag
// is set. MUST be used after the last call to OrganizeByFontAndClass on
// the training samples, ie after IncludeJunk if it is going to be used, as
// OrganizeByFontAndClass will eat the replicated samples into the regular
// samples.
// Both the verification and the training samples are replicated; the
// training samples then have their features re-indexed.
void MasterTrainer::ReplicateAndRandomizeSamplesIfRequired() {
  if (!enable_replication_) {
    return;
  }
  if (debug_level_ > 0) {
    tprintf("ReplicateAndRandomize...\n");
  }
  verify_samples_.ReplicateAndRandomizeSamples();
  samples_.ReplicateAndRandomizeSamples();
  samples_.IndexFeatures(feature_space_);
}


// Loads the basic font properties file into fontinfo_table_.
// Each record is "<fontname> <italic> <bold> <fixed> <serif> <fraktur>"; the
// five flags are packed into FontInfo::properties as bits 0..4 in that order.
// Records that do not yield all six fields are skipped, and a font that is
// already in the table is not added a second time.
// Returns false if the file cannot be opened, true otherwise.
bool MasterTrainer::LoadFontInfo(const char *filename) {
  FILE *fp = fopen(filename, "rb");
  if (fp == nullptr) {
    fprintf(stderr, "Failed to load font_properties from %s\n", filename);
    return false;
  }
  int italic, bold, fixed, serif, fraktur;
  while (!feof(fp)) {
    FontInfo fontinfo;
    char *font_name = new char[1024];
    fontinfo.name = font_name;
    fontinfo.properties = 0;
    fontinfo.universal_id = 0;
    // The field width is one less than the buffer size so that the
    // terminating NUL always fits: a width of 1024 would allow 1024
    // characters plus the NUL, one byte past the end of font_name.
    // (Assumes tfscanf follows fscanf's width rules, as the "%1023s" read
    // into a 1024-byte buffer in LoadXHeights also does.)
    if (tfscanf(fp, "%1023s %i %i %i %i %i\n", font_name, &italic, &bold,
                &fixed, &serif, &fraktur) != 6) {
      delete[] font_name;
      continue;
    }
    fontinfo.properties = (italic << 0) + (bold << 1) + (fixed << 2) +
                          (serif << 3) + (fraktur << 4);
    if (fontinfo_table_.get_index(fontinfo) < 0) {
      // fontinfo not in table. The name buffer is deliberately not freed
      // here: the pushed entry still refers to it (presumably the table
      // owns it from now on -- confirm against FontInfoTable).
      fontinfo_table_.push_back(fontinfo);
    } else {
      // Duplicate font: the buffer was only needed for the lookup.
      delete[] font_name;
    }
  }
  fclose(fp);
  return true;
}


// Loads the xheight font properties file into xheights_.

// Returns false on failure.

bool MasterTrainer::LoadXHeights(const char *filename) {

  tprintf("fontinfo table is of size %d\n", fontinfo_table_.size());

  xheights_.clear();

  xheights_.resize(fontinfo_table_.size(), -1);

  if (filename == nullptr) {

    return true;

  }

  FILE *f = fopen(filename, "rb");

  if (f == nullptr) {

    fprintf(stderr, "Failed to load font xheights from %s\n", filename);

    return false;

  }

  tprintf("Reading x-heights from %s ...\n", filename);

  FontInfo fontinfo;

  fontinfo.properties = 0; // Not used to lookup in the table.

  fontinfo.universal_id = 0;

  char buffer[1024];

  int xht;

  int total_xheight = 0;

  int xheight_count = 0;

  while (!feof(f)) {

    if (tfscanf(f, "%1023s %d\n", buffer, &xht) != 2) {

      continue;

    }

    buffer[1023] = '\0';

    fontinfo.name = buffer;

    auto fontinfo_id = fontinfo_table_.get_index(fontinfo);

    if (fontinfo_id < 0) {

      // fontinfo not in table.

      continue;

    }

    xheights_[fontinfo_id] = xht;

    total_xheight += xht;

    ++xheight_count;

  }

  if (xheight_count == 0) {

    fprintf(stderr, "No valid xheights in %s!\n", filename);

    fclose(f);

    return false;

  }

  int mean_xheight = DivRounded(total_xheight, xheight_count);

  for (size_t i = 0; i < fontinfo_table_.size(); ++i) {

    if (xheights_[i] < 0) {

      xheights_[i] = mean_xheight;

    }

  }

  fclose(f);

  return true;

} // LoadXHeights


// Reads spacing stats from filename and adds them to fontinfo_table.
// The font is the one whose name is the longest substring match of filename
// (see GetBestMatchingFontInfoId). The file starts with the number of
// records; each record is "<unichar> <gap_before> <gap_after> <num_kerned>"
// followed by num_kerned "<kerned_unichar> <gap>" pairs. All gaps are
// multiplied by kBlnXHeight / xheight-of-the-font (integer division).
// Records and kerning pairs for unichars that are not in unicharset_ are
// read but discarded.
// Returns true on success and also when the file does not exist; returns
// false if no font matches, the font has no usable x-height, or the file is
// malformed.
bool MasterTrainer::AddSpacingInfo(const char *filename) {
  FILE *fontinfo_file = fopen(filename, "rb");
  if (fontinfo_file == nullptr) {
    return true; // We silently ignore missing files!
  }
  // Find the fontinfo_id.
  int fontinfo_id = GetBestMatchingFontInfoId(filename);
  if (fontinfo_id < 0) {
    tprintf("No font found matching fontinfo filename %s\n", filename);
    fclose(fontinfo_file);
    return false;
  }
  // The scale below divides by this font's x-height, so the entry must exist
  // and must not be zero.
  if (static_cast<size_t>(fontinfo_id) >= xheights_.size() ||
      xheights_[fontinfo_id] == 0) {
    tprintf("No usable xheight for font %d, cannot read spacing from %s\n",
            fontinfo_id, filename);
    fclose(fontinfo_file);
    return false;
  }
  tprintf("Reading spacing from %s for font %d...\n", filename, fontinfo_id);
  // TODO(rays) scale should probably be a double, but keep as an int for now
  // to duplicate current behavior.
  int scale = kBlnXHeight / xheights_[fontinfo_id];
  int num_unichars;
  char uch[UNICHAR_LEN];
  char kerned_uch[UNICHAR_LEN];
  // Build the scan formats with an explicit field width so that a token
  // longer than the buffers cannot be written past their end. Both buffers
  // have the same size; the width leaves room for the terminating NUL.
  char record_format[32];
  char kerned_format[32];
  snprintf(record_format, sizeof(record_format), "%%%ds %%d %%d %%d",
           static_cast<int>(sizeof(uch) - 1));
  snprintf(kerned_format, sizeof(kerned_format), "%%%ds %%d",
           static_cast<int>(sizeof(kerned_uch) - 1));
  int x_gap, x_gap_before, x_gap_after, num_kerned;
  ASSERT_HOST(tfscanf(fontinfo_file, "%d\n", &num_unichars) == 1);
  FontInfo *fi = &fontinfo_table_.at(fontinfo_id);
  fi->init_spacing(unicharset_.size());
  for (int l = 0; l < num_unichars; ++l) {
    if (tfscanf(fontinfo_file, record_format, uch, &x_gap_before,
                &x_gap_after, &num_kerned) != 4) {
      tprintf("Bad format of font spacing file %s\n", filename);
      fclose(fontinfo_file);
      return false;
    }
    bool valid = unicharset_.contains_unichar(uch);
    // Scoped to this record and null unless the record is valid. A pointer
    // kept from an earlier record has already been handed to add_spacing,
    // so it must never reach the delete in the error path below.
    FontSpacingInfo *spacing = nullptr;
    if (valid) {
      spacing = new FontSpacingInfo();
      spacing->x_gap_before = static_cast<int16_t>(x_gap_before * scale);
      spacing->x_gap_after = static_cast<int16_t>(x_gap_after * scale);
    }
    for (int k = 0; k < num_kerned; ++k) {
      if (tfscanf(fontinfo_file, kerned_format, kerned_uch, &x_gap) != 2) {
        tprintf("Bad format of font spacing file %s\n", filename);
        fclose(fontinfo_file);
        // Only this record's not-yet-registered object (or null) is freed.
        delete spacing;
        return false;
      }
      if (!valid || !unicharset_.contains_unichar(kerned_uch)) {
        continue;
      }
      spacing->kerned_unichar_ids.push_back(
          unicharset_.unichar_to_id(kerned_uch));
      spacing->kerned_x_gaps.push_back(static_cast<int16_t>(x_gap * scale));
    }
    if (valid) {
      fi->add_spacing(unicharset_.unichar_to_id(uch), spacing);
    }
  }
  fclose(fontinfo_file);
  return true;
}


// Returns the font id corresponding to the given font name.

// Returns -1 if the font cannot be found.

int MasterTrainer::GetFontInfoId(const char *font_name) {

  FontInfo fontinfo;

  // We are only borrowing the string, so it is OK to const cast it.

  fontinfo.name = const_cast<char *>(font_name);

  fontinfo.properties = 0; // Not used to lookup in the table

  fontinfo.universal_id = 0;

  return fontinfo_table_.get_index(fontinfo);

}

// Returns the font_id of the closest matching font name to the given
// filename. It is assumed that a substring of the filename will match
// one of the fonts. If more than one is matched, the longest is returned.
// Returns -1 if no font name of non-zero length occurs in filename.
int MasterTrainer::GetBestMatchingFontInfoId(const char *filename) {
  int best_id = -1;
  int longest_match = 0;
  for (size_t f = 0; f < fontinfo_table_.size(); ++f) {
    const char *candidate = fontinfo_table_.at(f).name;
    if (strstr(filename, candidate) == nullptr) {
      continue;
    }
    // Prefer the longest name, in case one font name is a substring of
    // another.
    const int candidate_len = strlen(candidate);
    if (candidate_len > longest_match) {
      longest_match = candidate_len;
      best_id = f;
    }
  }
  return best_id;
}


// Sets up a flat shapetable with one shape per class/font combination.
// The shapes of flat_shapes_ are appended to shape_table grouped by font, in
// the order in which the fonts first appear in flat_shapes_, and in reverse
// shape order within each font. This ordering exactly mimics the results of
// the previous implementation.
void MasterTrainer::SetupFlatShapeTable(ShapeTable *shape_table) {
  const int num_shapes = flat_shapes_.NumShapes();

  // Collect the distinct fonts in order of first appearance.
  std::vector<int> fonts_in_order;
  for (int s = 0; s < num_shapes; ++s) {
    const int font = flat_shapes_.GetShape(s)[0].font_ids[0];
    bool already_listed = false;
    for (int listed : fonts_in_order) {
      if (listed == font) {
        already_listed = true;
        break;
      }
    }
    if (!already_listed) {
      fonts_in_order.push_back(font);
    }
  }

  // For each font in order, add all the shapes with that font in reverse
  // order.
  for (int wanted_font : fonts_in_order) {
    for (int s = num_shapes - 1; s >= 0; --s) {
      const Shape &shape = flat_shapes_.GetShape(s);
      if (shape[0].font_ids[0] == wanted_font) {
        shape_table->AddShape(shape);
      }
    }
  }
}


// Sets up a Clusterer for mftraining on a single shape_id.
// Every micro-feature of every training sample belonging to shape_id is
// added to the clusterer; all features of one sample share the same sample
// id. Samples are fed in reverse iteration order to match the previous
// behavior.
// *num_samples receives the number of samples that were visited.
// Call FreeClusterer on the return value after use.
CLUSTERER *MasterTrainer::SetupForClustering(
    const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs,
    int shape_id, int *num_samples) {
  const int desc_index = ShortNameToFeatureType(feature_defs, kMicroFeatureType);
  const auto *micro_desc = feature_defs.FeatureDesc[desc_index];
  const int num_params = micro_desc->NumParams;
  ASSERT_HOST(num_params == static_cast<int>(MicroFeatureParameter::MFCount));
  CLUSTERER *clusterer = MakeClusterer(num_params, micro_desc->ParamDesc);

  // Restrict iteration to the samples of just the one shape.
  IndexMapBiDi shape_map;
  shape_map.Init(shape_table.NumShapes(), false);
  shape_map.SetMap(shape_id, true);
  shape_map.Setup();

  // Gather the samples first so they can be walked backwards.
  std::vector<const TrainingSample *> collected;
  SampleIterator it;
  it.Init(&shape_map, &shape_table, false, &samples_);
  it.Begin();
  while (!it.AtEnd()) {
    collected.push_back(&it.GetSample());
    it.Next();
  }

  uint32_t sample_id = 0;
  for (auto rit = collected.rbegin(); rit != collected.rend(); ++rit) {
    const TrainingSample *sample = *rit;
    const uint32_t num_features = sample->num_micro_features();
    for (uint32_t f = 0; f < num_features; ++f) {
      MakeSample(clusterer, sample->micro_features()[f].data(), sample_id);
    }
    ++sample_id;
  }
  *num_samples = sample_id;
  return clusterer;
}


// Writes the given float_classes (produced by SetupForFloat2Int) as inttemp
// to the given inttemp_file, and the corresponding pffmtable.
// The unicharset is the original encoding of graphemes, and shape_set should
// match the size of the shape_table, and may possibly be totally fake.
// Note that fontinfo_table_ is moved into a temporary Classify object that
// is deleted before returning.
// A file that cannot be opened is reported via tprintf and skipped; the
// other file is still attempted.
void MasterTrainer::WriteInttempAndPFFMTable(const UNICHARSET &unicharset,
                                             const UNICHARSET &shape_set,
                                             const ShapeTable &shape_table,
                                             CLASS_STRUCT *float_classes,
                                             const char *inttemp_file,
                                             const char *pffmtable_file) {
  auto *classify = new tesseract::Classify();
  // Move the fontinfo table to classify.
  fontinfo_table_.MoveTo(&classify->get_fontinfo_table());
  INT_TEMPLATES_STRUCT *int_templates =
      classify->CreateIntTemplates(float_classes, shape_set);

  FILE *inttemp_fp = fopen(inttemp_file, "wb");
  if (inttemp_fp != nullptr) {
    classify->WriteIntTemplates(inttemp_fp, int_templates, shape_set);
    fclose(inttemp_fp);
  } else {
    tprintf("Error, failed to open file \"%s\"\n", inttemp_file);
  }

  // Now write pffmtable. This is complicated by the fact that the adaptive
  // classifier still wants one indexed by unichar-id, but the static
  // classifier needs one indexed by its shape class id.
  // The per-class cutoffs go in shapetable_cutoffs, and the per-unichar
  // cutoffs are accumulated along the way.
  std::vector<uint16_t> shapetable_cutoffs;
  std::vector<uint16_t> unichar_cutoffs(unicharset.size());
  for (unsigned i = 0; i < int_templates->NumClasses; ++i) {
    INT_CLASS_STRUCT *Class = ClassForClassId(int_templates, i);
    // Todo: Test with min instead of max
    uint16_t longest_config = 0;
    for (int config_id = 0; config_id < Class->NumConfigs; config_id++) {
      const uint16_t length = Class->ConfigLengths[config_id];
      if (length > longest_config) {
        longest_config = length;
      }
      // Each config maps to a shape; raise the cutoff of every unichar in
      // that shape to at least this config's length.
      const int shape_id = float_classes[i].font_set.at(config_id);
      const Shape &shape = shape_table.GetShape(shape_id);
      for (int c = 0; c < shape.size(); ++c) {
        uint16_t &cutoff = unichar_cutoffs[shape[c].unichar_id];
        if (length > cutoff) {
          cutoff = length;
        }
      }
    }
    shapetable_cutoffs.push_back(longest_config);
  }

  FILE *pffm_fp = fopen(pffmtable_file, "wb");
  if (pffm_fp != nullptr) {
    tesseract::Serialize(pffm_fp, shapetable_cutoffs);
    for (size_t c = 0; c < unicharset.size(); ++c) {
      const char *unichar = unicharset.id_to_unichar(c);
      // The space character is written out as the word NULL.
      if (strcmp(unichar, " ") == 0) {
        unichar = "NULL";
      }
      fprintf(pffm_fp, "%s %d\n", unichar, unichar_cutoffs[c]);
    }
    fclose(pffm_fp);
  } else {
    tprintf("Error, failed to open file \"%s\"\n", pffmtable_file);
  }
  delete int_templates;
  delete classify;
}


// Generate debug output relating to the canonical distance between the

// two given UTF8 grapheme strings.

void MasterTrainer::DebugCanonical(const char *unichar_str1,

                                   const char *unichar_str2) {

  int class_id1 = unicharset_.unichar_to_id(unichar_str1);

  int class_id2 = unicharset_.unichar_to_id(unichar_str2);

  if (class_id2 == INVALID_UNICHAR_ID) {

    class_id2 = class_id1;

  }

  if (class_id1 == INVALID_UNICHAR_ID) {

    tprintf("No unicharset entry found for %s\n", unichar_str1);

    return;

  } else {

    tprintf("Font ambiguities for unichar %d = %s and %d = %s\n", class_id1,

            unichar_str1, class_id2, unichar_str2);

  }

  int num_fonts = samples_.NumFonts();

  const IntFeatureMap &feature_map = feature_map_;

  // Iterate the fonts to get the similarity with other fonst of the same

  // class.

  tprintf("      ");

  for (int f = 0; f < num_fonts; ++f) {

    if (samples_.NumClassSamples(f, class_id2, false) == 0) {

      continue;

    }

    tprintf("%6d", f);

  }

  tprintf("\n");

  for (int f1 = 0; f1 < num_fonts; ++f1) {

    // Map the features of the canonical_sample.

    if (samples_.NumClassSamples(f1, class_id1, false) == 0) {

      continue;

    }

    tprintf("%4d  ", f1);

    for (int f2 = 0; f2 < num_fonts; ++f2) {

      if (samples_.NumClassSamples(f2, class_id2, false) == 0) {

        continue;

      }

      float dist =

          samples_.ClusterDistance(f1, class_id1, f2, class_id2, feature_map);

      tprintf(" %5.3f", dist);

    }

    tprintf("\n");

  }

  // Build a fake ShapeTable containing all the sample types.

  ShapeTable shapes(unicharset_);

  for (int f = 0; f < num_fonts; ++f) {

    if (samples_.NumClassSamples(f, class_id1, true) > 0) {

      shapes.AddShape(class_id1, f);

    }

    if (class_id1 != class_id2 &&

        samples_.NumClassSamples(f, class_id2, true) > 0) {

      shapes.AddShape(class_id2, f);

    }

  }

}


#ifndef GRAPHICS_DISABLED

// Debugging for cloud/canonical features.

// Displays a Features window containing:

// If unichar_str2 is in the unicharset, and canonical_font is non-negative,

// displays the canonical features of the char/font combination in red.

// If unichar_str1 is in the unicharset, and cloud_font is non-negative,

// displays the cloud feature of the char/font combination in green.

// The canonical features are drawn first to show which ones have no

// matches in the cloud features.

// Until the features window is destroyed, each click in the features window

// will display the samples that have that feature in a separate window.

void MasterTrainer::DisplaySamples(const char *unichar_str1, int cloud_font,

                                   const char *unichar_str2,

                                   int canonical_font) {

  const IntFeatureMap &feature_map = feature_map_;

  const IntFeatureSpace &feature_space = feature_map.feature_space();

  ScrollView *f_window = CreateFeatureSpaceWindow("Features", 100, 500);

  ClearFeatureSpaceWindow(norm_mode_ == NM_BASELINE ? baseline : character,

                          f_window);

  int class_id2 = samples_.unicharset().unichar_to_id(unichar_str2);

  if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {

    const TrainingSample *sample =

        samples_.GetCanonicalSample(canonical_font, class_id2);

    for (uint32_t f = 0; f < sample->num_features(); ++f) {

      RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED);

    }

  }

  int class_id1 = samples_.unicharset().unichar_to_id(unichar_str1);

  if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {

    const BitVector &cloud = samples_.GetCloudFeatures(cloud_font, class_id1);

    for (int f = 0; f < cloud.size(); ++f) {

      if (cloud[f]) {

        INT_FEATURE_STRUCT feature = feature_map.InverseIndexFeature(f);

        RenderIntFeature(f_window, &feature, ScrollView::GREEN);

      }

    }

  }

  f_window->Update();

  ScrollView *s_window = CreateFeatureSpaceWindow("Samples", 100, 500);

  SVEventType ev_type;

  do {

    // Wait until a click or popup event.

    auto ev = f_window->AwaitEvent(SVET_ANY);

    ev_type = ev->type;

    if (ev_type == SVET_CLICK) {

      int feature_index = feature_space.XYToFeatureIndex(ev->x, ev->y);

      if (feature_index >= 0) {

        // Iterate samples and display those with the feature.

        Shape shape;

        shape.AddToShape(class_id1, cloud_font);

        s_window->Clear();

        samples_.DisplaySamplesWithFeature(feature_index, shape, feature_space,

                                           ScrollView::GREEN, s_window);

        s_window->Update();

      }

    }

  } while (ev_type != SVET_DESTROY);

}

#endif // !GRAPHICS_DISABLED


void MasterTrainer::TestClassifierVOld(bool replicate_samples,

                                       ShapeClassifier *test_classifier,

                                       ShapeClassifier *old_classifier) {

  SampleIterator sample_it;

  sample_it.Init(nullptr, nullptr, replicate_samples, &samples_);

  ErrorCounter::DebugNewErrors(test_classifier, old_classifier,

                               CT_UNICHAR_TOPN_ERR, fontinfo_table_,

                               page_images_, &sample_it);

}


// Tests the given test_classifier on the internal samples.
// See TestClassifier for details.
// The error rate returned by TestClassifier is not passed on to the caller.
void MasterTrainer::TestClassifierOnSamples(CountTypes error_mode,
                                            int report_level,
                                            bool replicate_samples,
                                            ShapeClassifier *test_classifier,
                                            std::string *report_string) {
  TrainingSampleSet *const internal_samples = &samples_;
  TestClassifier(error_mode, report_level, replicate_samples,
                 internal_samples, test_classifier, report_string);
}


// Tests the given test_classifier on the given samples.
// error_mode indicates what counts as an error.
// report_levels:
// 0 = no output.
// 1 = bottom-line error rate.
// 2 = bottom-line error rate + time.
// 3 = font-level error rate + time.
// 4 = list of all errors + short classifier debug output on 16 errors.
// 5 = list of all errors + short classifier debug output on 25 errors.
// If replicate_samples is true, then the test is run on an extended test
// sample including replicated and systematically perturbed samples.
// If report_string is non-nullptr, a summary of the results for each font
// is appended to the report_string.
// Returns the unichar error value filled in by
// ErrorCounter::ComputeErrorRate.
double MasterTrainer::TestClassifier(CountTypes error_mode, int report_level,
                                     bool replicate_samples,
                                     TrainingSampleSet *samples,
                                     ShapeClassifier *test_classifier,
                                     std::string *report_string) {
  SampleIterator test_it;
  test_it.Init(nullptr, nullptr, replicate_samples, samples);
  if (report_level > 0) {
    // Walk the iterator once just to count the samples for the banner.
    int num_samples = 0;
    test_it.Begin();
    while (!test_it.AtEnd()) {
      ++num_samples;
      test_it.Next();
    }
    tprintf("Iterator has charset size of %d/%d, %d shapes, %d samples\n",
            test_it.SparseCharsetSize(), test_it.CompactCharsetSize(),
            test_classifier->GetShapeTable()->NumShapes(), num_samples);
    tprintf("Testing %sREPLICATED:\n", replicate_samples ? "" : "NON-");
  }
  double unichar_error = 0.0;
  ErrorCounter::ComputeErrorRate(test_classifier, report_level, error_mode,
                                 fontinfo_table_, page_images_, &test_it,
                                 &unichar_error, nullptr, report_string);
  return unichar_error;
}


// Returns the average (in some sense) distance between the two given

// shapes, which may contain multiple fonts and/or unichars.

float MasterTrainer::ShapeDistance(const ShapeTable &shapes, int s1, int s2) {

  const IntFeatureMap &feature_map = feature_map_;

  const Shape &shape1 = shapes.GetShape(s1);

  const Shape &shape2 = shapes.GetShape(s2);

  int num_chars1 = shape1.size();

  int num_chars2 = shape2.size();

  float dist_sum = 0.0f;

  int dist_count = 0;

  if (num_chars1 > 1 || num_chars2 > 1) {

    // In the multi-char case try to optimize the calculation by computing

    // distances between characters of matching font where possible.

    for (int c1 = 0; c1 < num_chars1; ++c1) {

      for (int c2 = 0; c2 < num_chars2; ++c2) {

        dist_sum +=

            samples_.UnicharDistance(shape1[c1], shape2[c2], true, feature_map);

        ++dist_count;

      }

    }

  } else {

    // In the single unichar case, there is little alternative, but to compute

    // the squared-order distance between pairs of fonts.

    dist_sum =

        samples_.UnicharDistance(shape1[0], shape2[0], false, feature_map);

    ++dist_count;

  }

  return dist_sum / dist_count;

}


// Replaces samples that are always fragmented with the corresponding

// fragment samples.

void MasterTrainer::ReplaceFragmentedSamples() {

  if (fragments_ == nullptr) {

    return;

  }

  // Remove samples that are replaced by fragments. Each class that was

  // always naturally fragmented should be replaced by its fragments.

  int num_samples = samples_.num_samples();

  for (int s = 0; s < num_samples; ++s) {

    TrainingSample *sample = samples_.mutable_sample(s);

    if (fragments_[sample->class_id()] > 0) {

      samples_.KillSample(sample);

    }

  }

  samples_.DeleteDeadSamples();


  // Get ids of fragments in junk_samples_ that replace the dead chars.

  const UNICHARSET &frag_set = junk_samples_.unicharset();

#if 0

  // TODO(rays) The original idea was to replace only graphemes that were

  // always naturally fragmented, but that left a lot of the Indic graphemes

  // out. Determine whether we can go back to that idea now that spacing

  // is fixed in the training images, or whether this code is obsolete.

  bool* good_junk = new bool[frag_set.size()];

  memset(good_junk, 0, sizeof(*good_junk) * frag_set.size());

  for (int dead_ch = 1; dead_ch < unicharset_.size(); ++dead_ch) {

    int frag_ch = fragments_[dead_ch];

    if (frag_ch <= 0) continue;

    const char* frag_utf8 = frag_set.id_to_unichar(frag_ch);

    CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(frag_utf8);

    // Mark the chars for all parts of the fragment as good in good_junk.

    for (int part = 0; part < frag->get_total(); ++part) {

      frag->set_pos(part);

      int good_ch = frag_set.unichar_to_id(frag->to_string().c_str());

      if (good_ch != INVALID_UNICHAR_ID)

        good_junk[good_ch] = true;  // We want this one.

    }

    delete frag;

  }

#endif

  // For now just use all the junk that was from natural fragments.

  // Get samples of fragments in junk_samples_ that replace the dead chars.

  int num_junks = junk_samples_.num_samples();

  for (int s = 0; s < num_junks; ++s) {

    TrainingSample *sample = junk_samples_.mutable_sample(s);

    int junk_id = sample->class_id();

    const char *frag_utf8 = frag_set.id_to_unichar(junk_id);

    CHAR_FRAGMENT *frag = CHAR_FRAGMENT::parse_from_string(frag_utf8);

    if (frag != nullptr && frag->is_natural()) {

      junk_samples_.extract_sample(s);

      samples_.AddSample(frag_set.id_to_unichar(junk_id), sample);

    }

    delete frag;

  }

  junk_samples_.DeleteDeadSamples();

  junk_samples_.OrganizeByFontAndClass();

  samples_.OrganizeByFontAndClass();

  unicharset_.clear();

  unicharset_.AppendOtherUnicharset(samples_.unicharset());

  // delete [] good_junk;

  // Fragments_ no longer needed?

  delete[] fragments_;

  fragments_ = nullptr;

}


// Runs a hierarchical agglomerative clustering to merge shapes in the given

// shape_table, while satisfying the given constraints:

// * End with at least min_shapes left in shape_table,

// * No shape shall have more than max_shape_unichars in it,

// * Don't merge shapes where the distance between them exceeds max_dist.

const float kInfiniteDist = 999.0f;

void MasterTrainer::ClusterShapes(int min_shapes, int max_shape_unichars,

                                  float max_dist, ShapeTable *shapes) {

  int num_shapes = shapes->NumShapes();

  int max_merges = num_shapes - min_shapes;

  // TODO: avoid new / delete.

  auto *shape_dists = new std::vector<ShapeDist>[num_shapes];

  float min_dist = kInfiniteDist;

  int min_s1 = 0;

  int min_s2 = 0;

  tprintf("Computing shape distances...");

  for (int s1 = 0; s1 < num_shapes; ++s1) {

    for (int s2 = s1 + 1; s2 < num_shapes; ++s2) {

      ShapeDist dist(s1, s2, ShapeDistance(*shapes, s1, s2));

      shape_dists[s1].push_back(dist);

      if (dist.distance < min_dist) {

        min_dist = dist.distance;

        min_s1 = s1;

        min_s2 = s2;

      }

    }

    tprintf(" %d", s1);

  }

  tprintf("\n");

  int num_merged = 0;

  while (num_merged < max_merges && min_dist < max_dist) {

    tprintf("Distance = %f: ", min_dist);

    int num_unichars = shapes->MergedUnicharCount(min_s1, min_s2);

    shape_dists[min_s1][min_s2 - min_s1 - 1].distance = kInfiniteDist;

    if (num_unichars > max_shape_unichars) {

      tprintf("Merge of %d and %d with %d would exceed max of %d unichars\n",

              min_s1, min_s2, num_unichars, max_shape_unichars);

    } else {

      shapes->MergeShapes(min_s1, min_s2);

      shape_dists[min_s2].clear();

      ++num_merged;


      for (int s = 0; s < min_s1; ++s) {

        if (!shape_dists[s].empty()) {

          shape_dists[s][min_s1 - s - 1].distance =

              ShapeDistance(*shapes, s, min_s1);

          shape_dists[s][min_s2 - s - 1].distance = kInfiniteDist;

        }

      }

      for (int s2 = min_s1 + 1; s2 < num_shapes; ++s2) {

        if (shape_dists[min_s1][s2 - min_s1 - 1].distance < kInfiniteDist) {

          shape_dists[min_s1][s2 - min_s1 - 1].distance =

              ShapeDistance(*shapes, min_s1, s2);

        }

      }

      for (int s = min_s1 + 1; s < min_s2; ++s) {

        if (!shape_dists[s].empty()) {

          shape_dists[s][min_s2 - s - 1].distance = kInfiniteDist;

        }

      }

    }

    min_dist = kInfiniteDist;

    for (int s1 = 0; s1 < num_shapes; ++s1) {

      for (unsigned i = 0; i < shape_dists[s1].size(); ++i) {

        if (shape_dists[s1][i].distance < min_dist) {

          min_dist = shape_dists[s1][i].distance;

          min_s1 = s1;

          min_s2 = s1 + 1 + i;

        }

      }

    }

  }

  tprintf("Stopped with %d merged, min dist %f\n", num_merged, min_dist);

  delete[] shape_dists;

  if (debug_level_ > 1) {

    for (int s1 = 0; s1 < num_shapes; ++s1) {

      if (shapes->MasterDestinationIndex(s1) == s1) {

        tprintf("Master shape:%s\n", shapes->DebugStr(s1).c_str());

      }

    }

  }

}


} // namespace tesseract.

UNICHAR_LEN
#define UNICHAR_LEN
Definition: unichar.h:31

tfscanf
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:189

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:54

scanutils.h

boxread.h

svmnode.h

sampleiterator.h

errorcounter.h

mastertrainer.h

ClassForClassId
#define ClassForClassId(T, c)
Definition: intproto.h:156

classify.h

shapetable.h

featdefs.h

shapeclassifier.h

distance
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first, const UnicodeText::const_iterator &last)
Definition: unicodetext.cc:44

value
int value
Definition: gmock-matchers_test.cc:664

i
int i
Definition: gmock-matchers_test.cc:718

tesseract
Definition: baseapi.h:39

tesseract::kCNFeatureType
const char *const kCNFeatureType
Definition: featdefs.cpp:34

tesseract::ShortNameToFeatureType
uint32_t ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:203

tesseract::ParseBoxFileStr
bool ParseBoxFileStr(const char *boxfile_str, int *page_number, std::string &utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:205

tesseract::ReadCharDescription
CHAR_DESC_STRUCT * ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:172

tesseract::tprintf
void tprintf(const char *format,...)
Definition: tprintf.cpp:41

tesseract::DivRounded
int DivRounded(int a, int b)
Definition: helpers.h:162

tesseract::kGeoFeatureType
const char *const kGeoFeatureType
Definition: featdefs.cpp:36

tesseract::SVEventType
SVEventType
Definition: scrollview.h:53

tesseract::SVET_DESTROY
@ SVET_DESTROY
Definition: scrollview.h:54

tesseract::SVET_CLICK
@ SVET_CLICK
Definition: scrollview.h:56

tesseract::SVET_ANY
@ SVET_ANY
Definition: scrollview.h:64

tesseract::character
@ character
Definition: mfoutline.h:53

tesseract::baseline
@ baseline
Definition: mfoutline.h:53

tesseract::kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:33

tesseract::kFontMergeDistance
const float kFontMergeDistance
Definition: mastertrainer.cpp:50

tesseract::Serialize
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:236

tesseract::CreateFeatureSpaceWindow
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1622

tesseract::RenderIntFeature
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1500

tesseract::kMinClusteredShapes
const int kMinClusteredShapes
Definition: mastertrainer.cpp:46

tesseract::kIntFeatureType
const char *const kIntFeatureType
Definition: featdefs.cpp:35

tesseract::kInfiniteDist
const float kInfiniteDist
Definition: mastertrainer.cpp:966

tesseract::feature_defs
FEATURE_DEFS_STRUCT feature_defs
Definition: commontraining.cpp:90

tesseract::kMaxUnicharsPerCluster
const int kMaxUnicharsPerCluster
Definition: mastertrainer.cpp:48

tesseract::MakeClusterer
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:1440

tesseract::ClearFeatureSpaceWindow
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:889

tesseract::CountTypes
CountTypes
Definition: errorcounter.h:69

tesseract::CT_UNICHAR_TOPN_ERR
@ CT_UNICHAR_TOPN_ERR
Definition: errorcounter.h:76

tesseract::NormalizationMode
NormalizationMode
Definition: normalis.h:46

tesseract::NM_BASELINE
@ NM_BASELINE
Definition: normalis.h:47

tesseract::MicroFeatureParameter::MFCount
@ MFCount

tesseract::MakeSample
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID)
Definition: cluster.cpp:1491

tesseract::kMicroFeatureType
const char *const kMicroFeatureType
Definition: featdefs.cpp:33

tesseract::UnicityTable::at
const T & at(int id) const
Return the object from an id.
Definition: unicity_table.h:56

tesseract::FontSpacingInfo
Definition: fontinfo.h:53

tesseract::FontSpacingInfo::x_gap_after
int16_t x_gap_after
Definition: fontinfo.h:55

tesseract::FontSpacingInfo::kerned_x_gaps
std::vector< int16_t > kerned_x_gaps
Definition: fontinfo.h:57

tesseract::FontSpacingInfo::x_gap_before
int16_t x_gap_before
Definition: fontinfo.h:54

tesseract::FontSpacingInfo::kerned_unichar_ids
std::vector< UNICHAR_ID > kerned_unichar_ids
Definition: fontinfo.h:56

tesseract::FontInfo
Definition: fontinfo.h:64

tesseract::FontInfo::universal_id
int32_t universal_id
Definition: fontinfo.h:140

tesseract::FontInfo::name
char * name
Definition: fontinfo.h:134

tesseract::FontInfo::properties
uint32_t properties
Definition: fontinfo.h:135

tesseract::FontInfo::init_spacing
void init_spacing(int unicharset_size)
Definition: fontinfo.h:79

tesseract::FontInfo::add_spacing
void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info)
Definition: fontinfo.h:85

tesseract::FontInfoTable::Serialize
TESS_API bool Serialize(FILE *fp) const
Definition: fontinfo.cpp:55

tesseract::FontInfoTable::MoveTo
TESS_API void MoveTo(UnicityTable< FontInfo > *target)
Definition: fontinfo.cpp:116

tesseract::Image
Definition: image.h:25

tesseract::TBOX
Definition: rect.h:37

tesseract::BitVector
Definition: bitvector.h:33

tesseract::BitVector::size
int size() const
Definition: bitvector.h:62

tesseract::GenericVector::get_index
int get_index(const T &object) const
Definition: genericvector.h:563

tesseract::GenericVector::at
T & at(int index) const
Definition: genericvector.h:89

tesseract::GenericVector::size
unsigned size() const
Definition: genericvector.h:70

tesseract::GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:574

tesseract::IndexMapBiDi
Definition: indexmapbidi.h:104

tesseract::IndexMapBiDi::Init
void Init(int size, bool all_mapped)
Definition: indexmapbidi.cpp:97

tesseract::IndexMapBiDi::SetMap
void SetMap(int sparse_index, bool mapped)
Definition: indexmapbidi.cpp:110

tesseract::IndexMapBiDi::Setup
void Setup()
Definition: indexmapbidi.cpp:117

tesseract::CHAR_FRAGMENT
Definition: unicharset.h:50

tesseract::CHAR_FRAGMENT::is_ending
bool is_ending() const
Definition: unicharset.h:121

tesseract::CHAR_FRAGMENT::parse_from_string
static CHAR_FRAGMENT * parse_from_string(const char *str)
Definition: unicharset.cpp:1103

tesseract::CHAR_FRAGMENT::is_natural
bool is_natural() const
Definition: unicharset.h:128

tesseract::CHAR_FRAGMENT::is_beginning
bool is_beginning() const
Definition: unicharset.h:116

tesseract::UNICHARSET
Definition: unicharset.h:164

tesseract::UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:768

tesseract::UNICHARSET::load_from_file
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391

tesseract::UNICHARSET::clear
void clear()
Definition: unicharset.h:324

tesseract::UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279

tesseract::UNICHARSET::contains_unichar
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695

tesseract::UNICHARSET::save_to_file
bool save_to_file(const char *const filename) const
Definition: unicharset.h:361

tesseract::UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186

tesseract::UNICHARSET::size
size_t size() const
Definition: unicharset.h:355

tesseract::UNICHARSET::AppendOtherUnicharset
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:454

tesseract::Classify
Definition: classify.h:94

tesseract::CLUSTERER
Definition: cluster.h:91

tesseract::FEATURE_DEFS_STRUCT
Definition: featdefs.h:41

tesseract::FEATURE_DEFS_STRUCT::FeatureDesc
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:43

tesseract::IntFeatureSpace
Definition: intfeaturespace.h:36

tesseract::IntFeatureSpace::Size
int Size() const
Definition: intfeaturespace.h:49

tesseract::IntFeatureSpace::XYToFeatureIndex
int XYToFeatureIndex(int x, int y) const
Definition: intfeaturespace.cpp:79

tesseract::IntFeatureSpace::Serialize
bool Serialize(FILE *fp) const
Definition: intfeaturespace.cpp:36

tesseract::INT_CLASS_STRUCT
Definition: intproto.h:93

tesseract::INT_CLASS_STRUCT::ConfigLengths
uint16_t ConfigLengths[MAX_NUM_CONFIGS]
Definition: intproto.h:102

tesseract::INT_CLASS_STRUCT::NumConfigs
uint8_t NumConfigs
Definition: intproto.h:99

tesseract::INT_TEMPLATES_STRUCT
Definition: intproto.h:106

tesseract::INT_TEMPLATES_STRUCT::NumClasses
unsigned NumClasses
Definition: intproto.h:109

tesseract::INT_FEATURE_STRUCT
Definition: intproto.h:119

tesseract::FEATURE_DESC_STRUCT::ParamDesc
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:54

tesseract::FEATURE_DESC_STRUCT::NumParams
uint16_t NumParams
Definition: ocrfeatures.h:52

tesseract::CLASS_STRUCT
Definition: protos.h:40

tesseract::CLASS_STRUCT::font_set
UnicityTable< int > font_set
Definition: protos.h:47

tesseract::ShapeClassifier
Definition: shapeclassifier.h:43

tesseract::ShapeClassifier::GetShapeTable
virtual const ShapeTable * GetShapeTable() const =0

tesseract::Shape
Definition: shapetable.h:154

tesseract::Shape::AddToShape
void AddToShape(int unichar_id, int font_id)
Definition: shapetable.cpp:103

tesseract::Shape::size
int size() const
Definition: shapetable.h:169

tesseract::ShapeTable
Definition: shapetable.h:230

tesseract::ShapeTable::MergedUnicharCount
int MergedUnicharCount(unsigned shape_id1, unsigned shape_id2) const
Definition: shapetable.cpp:520

tesseract::ShapeTable::DebugStr
std::string DebugStr(unsigned shape_id) const
Definition: shapetable.cpp:292

tesseract::ShapeTable::AddShape
unsigned AddShape(int unichar_id, int font_id)
Definition: shapetable.cpp:351

tesseract::ShapeTable::SummaryStr
std::string SummaryStr() const
Definition: shapetable.cpp:325

tesseract::ShapeTable::MasterDestinationIndex
unsigned MasterDestinationIndex(unsigned shape_id) const
Definition: shapetable.cpp:548

tesseract::ShapeTable::Serialize
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:250

tesseract::ShapeTable::NumShapes
unsigned NumShapes() const
Definition: shapetable.h:248

tesseract::ShapeTable::GetShape
const Shape & GetShape(unsigned shape_id) const
Definition: shapetable.h:292

tesseract::ShapeTable::MergeShapes
void MergeShapes(unsigned shape_id1, unsigned shape_id2)
Definition: shapetable.cpp:530

tesseract::ShapeTable::FindShape
int FindShape(int unichar_id, int font_id) const
Definition: shapetable.cpp:400

tesseract::ShapeTable::AppendMasterShapes
void AppendMasterShapes(const ShapeTable &other, std::vector< int > *shape_map)
Definition: shapetable.cpp:683

tesseract::TrainingSample
Definition: trainingsample.h:54

tesseract::TrainingSample::features
const INT_FEATURE_STRUCT * features() const
Definition: trainingsample.h:146

tesseract::TrainingSample::class_id
UNICHAR_ID class_id() const
Definition: trainingsample.h:119

tesseract::TrainingSample::set_font_id
void set_font_id(int id)
Definition: trainingsample.h:128

tesseract::TrainingSample::num_features
uint32_t num_features() const
Definition: trainingsample.h:143

tesseract::TrainingSample::set_class_id
void set_class_id(int id)
Definition: trainingsample.h:122

tesseract::TrainingSample::num_micro_features
uint32_t num_micro_features() const
Definition: trainingsample.h:149

tesseract::TrainingSample::micro_features
const MicroFeature * micro_features() const
Definition: trainingsample.h:152

tesseract::TrainingSample::font_id
int font_id() const
Definition: trainingsample.h:125

tesseract::ErrorCounter::DebugNewErrors
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const std::vector< Image > &page_images, SampleIterator *it)
Definition: errorcounter.cpp:109

tesseract::ErrorCounter::ComputeErrorRate
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const std::vector< Image > &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, std::string *fonts_report)
Definition: errorcounter.cpp:44

tesseract::IntFeatureMap
Definition: intfeaturemap.h:48

tesseract::IntFeatureMap::InverseIndexFeature
INT_FEATURE_STRUCT InverseIndexFeature(int index_feature) const
Definition: intfeaturemap.cpp:54

tesseract::IntFeatureMap::feature_space
const IntFeatureSpace & feature_space() const
Definition: intfeaturemap.h:60

tesseract::ShapeDist
Definition: mastertrainer.h:48

tesseract::ShapeDist::distance
float distance
Definition: mastertrainer.h:59

tesseract::MasterTrainer::LoadFontInfo
bool LoadFontInfo(const char *filename)
Definition: mastertrainer.cpp:379

tesseract::MasterTrainer::TestClassifier
double TestClassifier(CountTypes error_mode, int report_level, bool replicate_samples, TrainingSampleSet *samples, ShapeClassifier *test_classifier, std::string *report_string)
Definition: mastertrainer.cpp:841

tesseract::MasterTrainer::GetBestMatchingFontInfoId
int GetBestMatchingFontInfoId(const char *filename)
Definition: mastertrainer.cpp:535

tesseract::MasterTrainer::DisplaySamples
void DisplaySamples(const char *unichar_str1, int cloud_font, const char *unichar_str2, int canonical_font)
Definition: mastertrainer.cpp:758

tesseract::MasterTrainer::LoadUnicharset
void LoadUnicharset(const char *filename)
Definition: mastertrainer.cpp:111

tesseract::MasterTrainer::PostLoadCleanup
void PostLoadCleanup()
Definition: mastertrainer.cpp:248

tesseract::MasterTrainer::ReplicateAndRandomizeSamplesIfRequired
void ReplicateAndRandomizeSamplesIfRequired()
Definition: mastertrainer.cpp:366

tesseract::MasterTrainer::IncludeJunk
void IncludeJunk()
Definition: mastertrainer.cpp:339

tesseract::MasterTrainer::LoadPageImages
void LoadPageImages(const char *filename)
Definition: mastertrainer.cpp:226

tesseract::MasterTrainer::~MasterTrainer
~MasterTrainer()
Definition: mastertrainer.cpp:65

tesseract::MasterTrainer::SetupMasterShapes
void SetupMasterShapes()
Definition: mastertrainer.cpp:288

tesseract::MasterTrainer::GetFontInfoId
int GetFontInfoId(const char *font_name)
Definition: mastertrainer.cpp:524

tesseract::MasterTrainer::PreTrainingSetup
void PreTrainingSetup()
Definition: mastertrainer.cpp:274

tesseract::MasterTrainer::Serialize
bool Serialize(FILE *fp) const
Definition: mastertrainer.cpp:75

tesseract::MasterTrainer::ShapeDistance
float ShapeDistance(const ShapeTable &shapes, int s1, int s2)
Definition: mastertrainer.cpp:867

tesseract::MasterTrainer::AddSample
void AddSample(bool verification, const char *unichar_str, TrainingSample *sample)
Definition: mastertrainer.cpp:193

tesseract::MasterTrainer::TestClassifierOnSamples
void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, std::string *report_string)
Definition: mastertrainer.cpp:819

tesseract::MasterTrainer::LoadXHeights
bool LoadXHeights(const char *filename)
Definition: mastertrainer.cpp:412

tesseract::MasterTrainer::WriteInttempAndPFFMTable
void WriteInttempAndPFFMTable(const UNICHARSET &unicharset, const UNICHARSET &shape_set, const ShapeTable &shape_table, CLASS_STRUCT *float_classes, const char *inttemp_file, const char *pffmtable_file)
Definition: mastertrainer.cpp:623

tesseract::MasterTrainer::TestClassifierVOld
void TestClassifierVOld(bool replicate_samples, ShapeClassifier *test_classifier, ShapeClassifier *old_classifier)
Definition: mastertrainer.cpp:807

tesseract::MasterTrainer::MasterTrainer
MasterTrainer(NormalizationMode norm_mode, bool shape_analysis, bool replicate_samples, int debug_level)
Definition: mastertrainer.cpp:52

tesseract::MasterTrainer::DebugCanonical
void DebugCanonical(const char *unichar_str1, const char *unichar_str2)
Definition: mastertrainer.cpp:692

tesseract::MasterTrainer::unicharset
const UNICHARSET & unicharset() const
Definition: mastertrainer.h:178

tesseract::MasterTrainer::SetupFlatShapeTable
void SetupFlatShapeTable(ShapeTable *shape_table)
Definition: mastertrainer.cpp:552

tesseract::MasterTrainer::ReadTrainingSamples
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
Definition: mastertrainer.cpp:136

tesseract::MasterTrainer::AddSpacingInfo
bool AddSpacingInfo(const char *filename)
Definition: mastertrainer.cpp:463

tesseract::MasterTrainer::SetupForClustering
CLUSTERER * SetupForClustering(const ShapeTable &shape_table, const FEATURE_DEFS_STRUCT &feature_defs, int shape_id, int *num_samples)
Definition: mastertrainer.cpp:585

tesseract::SampleIterator
Definition: sampleiterator.h:91

tesseract::SampleIterator::GetSample
const TrainingSample & GetSample() const
Definition: sampleiterator.cpp:102

tesseract::SampleIterator::Init
void Init(const IndexMapBiDi *charset_map, const ShapeTable *shape_table, bool randomize, TrainingSampleSet *sample_set)
Definition: sampleiterator.cpp:49

tesseract::SampleIterator::AtEnd
bool AtEnd() const
Definition: sampleiterator.cpp:98

tesseract::SampleIterator::SparseCharsetSize
int SparseCharsetSize() const
Definition: sampleiterator.cpp:200

tesseract::SampleIterator::Begin
void Begin()
Definition: sampleiterator.cpp:86

tesseract::SampleIterator::CompactCharsetSize
int CompactCharsetSize() const
Definition: sampleiterator.cpp:195

tesseract::SampleIterator::NormalizeSamples
double NormalizeSamples()
Definition: sampleiterator.cpp:241

tesseract::SampleIterator::Next
void Next()
Definition: sampleiterator.cpp:154

tesseract::TrainingSampleSet
Definition: trainingsampleset.h:41

tesseract::TrainingSampleSet::GetCanonicalSample
const TrainingSample * GetCanonicalSample(int font_id, int class_id) const
Definition: trainingsampleset.cpp:501

tesseract::TrainingSampleSet::NumClassSamples
int NumClassSamples(int font_id, int class_id, bool randomize) const
Definition: trainingsampleset.cpp:198

tesseract::TrainingSampleSet::AddSample
int AddSample(const char *unichar, TrainingSample *sample)
Definition: trainingsampleset.cpp:170

tesseract::TrainingSampleSet::extract_sample
TrainingSample * extract_sample(int index)
Definition: trainingsampleset.h:157

tesseract::TrainingSampleSet::Serialize
bool Serialize(FILE *fp) const
Definition: trainingsampleset.cpp:100

tesseract::TrainingSampleSet::IndexFeatures
void IndexFeatures(const IntFeatureSpace &feature_space)
Definition: trainingsampleset.cpp:527

tesseract::TrainingSampleSet::charsetsize
int charsetsize() const
Definition: trainingsampleset.h:65

tesseract::TrainingSampleSet::DeleteDeadSamples
void DeleteDeadSamples()
Definition: trainingsampleset.cpp:540

tesseract::TrainingSampleSet::KillSample
void KillSample(TrainingSample *sample)
Definition: trainingsampleset.cpp:535

tesseract::TrainingSampleSet::ComputeCanonicalSamples
void ComputeCanonicalSamples(const IntFeatureMap &map, bool debug)
Definition: trainingsampleset.cpp:611

tesseract::TrainingSampleSet::ComputeCloudFeatures
void ComputeCloudFeatures(int feature_space_size)
Definition: trainingsampleset.cpp:757

tesseract::TrainingSampleSet::LoadUnicharset
void LoadUnicharset(const char *filename)
Definition: trainingsampleset.cpp:153

tesseract::TrainingSampleSet::NumFonts
int NumFonts() const
Definition: trainingsampleset.h:59

tesseract::TrainingSampleSet::GetCloudFeatures
const BitVector & GetCloudFeatures(int font_id, int class_id) const
Definition: trainingsampleset.cpp:256

tesseract::TrainingSampleSet::num_samples
int num_samples() const
Definition: trainingsampleset.h:53

tesseract::TrainingSampleSet::unicharset
const UNICHARSET & unicharset() const
Definition: trainingsampleset.h:62

tesseract::TrainingSampleSet::ComputeCanonicalFeatures
void ComputeCanonicalFeatures()
Definition: trainingsampleset.cpp:738

tesseract::TrainingSampleSet::mutable_sample
TrainingSample * mutable_sample(int index)
Definition: trainingsampleset.h:153

tesseract::TrainingSampleSet::OrganizeByFontAndClass
void OrganizeByFontAndClass()
Definition: trainingsampleset.cpp:555

tesseract::TrainingSampleSet::ClusterDistance
float ClusterDistance(int font_id1, int class_id1, int font_id2, int class_id2, const IntFeatureMap &feature_map)
Definition: trainingsampleset.cpp:337

tesseract::TrainingSampleSet::ReplicateAndRandomizeSamples
void ReplicateAndRandomizeSamples()
Definition: trainingsampleset.cpp:707

tesseract::TrainingSampleSet::DisplaySamplesWithFeature
void DisplaySamplesWithFeature(int f_index, const Shape &shape, const IntFeatureSpace &feature_space, ScrollView::Color color, ScrollView *window) const
Definition: trainingsampleset.cpp:792

tesseract::TrainingSampleSet::UnicharDistance
float UnicharDistance(const UnicharAndFonts &uf1, const UnicharAndFonts &uf2, bool matched_fonts, const IntFeatureMap &feature_map)
Definition: trainingsampleset.cpp:273

tesseract::ScrollView
Definition: scrollview.h:109

tesseract::ScrollView::AwaitEvent
std::unique_ptr< SVEvent > AwaitEvent(SVEventType type)
Definition: scrollview.cpp:432

tesseract::ScrollView::GREEN
@ GREEN
Definition: scrollview.h:118

tesseract::ScrollView::RED
@ RED
Definition: scrollview.h:116

tesseract::ScrollView::Clear
void Clear()
Definition: scrollview.cpp:565

tesseract::ScrollView::Update
static void Update()
Definition: scrollview.cpp:700