18#define _USE_MATH_DEFINES
22# include "config_auto.h"
31#include <allheaders.h>
39#ifdef INCLUDE_TENSORFLOW
71#ifndef GRAPHICS_DISABLED
78 : randomly_rotate_(false), training_data_(0), sub_trainer_(nullptr) {
84 int debug_interval, int64_t max_memory)
85 : randomly_rotate_(false),
86 training_data_(max_memory),
87 sub_trainer_(nullptr) {
95#ifndef GRAPHICS_DISABLED
106 const char *old_traineddata) {
107 std::vector<char> data;
111 tprintf(
"Loaded file %s, unpacking...\n", filename);
116 tprintf(
"Error, %s is an integer (fast) model, cannot continue training\n",
120 if (((old_traineddata ==
nullptr || *old_traineddata ==
'\0') &&
122 filename == old_traineddata) {
127 if (old_traineddata ==
nullptr || *old_traineddata ==
'\0') {
128 tprintf(
"Must supply the old traineddata for code conversion!\n");
148 std::vector<int> code_map =
MapRecoder(old_chset, old_recoder);
165 int net_flags,
float weight_range,
166 float learning_rate,
float momentum,
174 append_index, net_flags, weight_range,
182 "Training parameters:\n Debug interval = %d,"
183 " weights = %g, learning rate = %g, momentum=%g\n",
191#ifdef INCLUDE_TENSORFLOW
194 TFNetwork *tf_net =
new TFNetwork(
"TensorFlow");
197 tprintf(
"InitFromProtoStr failed!!\n");
237 const ImageData *trainingdata,
int iteration,
double min_dict_ratio,
238 double dict_ratio_step,
double max_dict_ratio,
double min_cert_offset,
239 double cert_offset_step,
double max_cert_offset, std::string &results) {
249 std::vector<int> truth_labels, ocr_labels, xcoords;
260 results +=
"0,0=" + std::to_string(baseline_error);
263 for (
double r = min_dict_ratio; r < max_dict_ratio; r += dict_ratio_step) {
264 for (
double c = min_cert_offset; c < max_cert_offset;
265 c += cert_offset_step) {
268 search.ExtractBestPathAsLabels(&ocr_labels, &xcoords);
273 if ((r == min_dict_ratio && c == min_cert_offset) ||
274 !std::isfinite(word_error)) {
277 tprintf(
"r=%g, c=%g, truth=%s, ocr=%s, wderr=%g, truth[0]=%d\n", r, c,
278 t.c_str(), o.c_str(), word_error, truth_labels[0]);
280 results +=
" " + std::to_string(r);
281 results +=
"," + std::to_string(c);
282 results +=
"=" + std::to_string(word_error);
298 bool randomly_rotate) {
310 std::stringstream &log_msg) {
332 std::vector<char> rec_model_data;
335 log_msg <<
" New best BCER = " << error_rate;
348 log_msg <<
" failed to write best model:";
350 log_msg <<
" wrote best model:";
353 log_msg << best_model_name;
357 log_msg <<
" New worst BCER = " << error_rate;
362 log_msg <<
"\nDivergence! ";
379 result = sub_trainer_result !=
STR_NONE;
383 std::vector<char> checkpoint;
386 log_msg <<
" failed to write checkpoint.";
388 log_msg <<
" wrote checkpoint.";
397 log_msg << std::fixed << std::setprecision(3)
408 std::stringstream &log_msg)
const {
455 uint8_t amount = serialize_amount;
459 if (serialize_amount ==
LIGHT) {
492 std::vector<char> sub_data;
519 tprintf(
"Warning: LSTMTrainer deserialized an LSTMRecognizer!\n");
548 if (amount ==
LIGHT) {
581 std::vector<char> sub_data;
585 if (sub_data.empty()) {
608 log_msg <<
" Failed to revert to previous best for trial!";
611 log_msg <<
" Trial sub_trainer_ from iteration "
636 double sub_margin = (training_error - sub_error) / sub_error;
638 log_msg <<
" sub_trainer=" << sub_error
639 <<
" margin=" << 100.0 * sub_margin <<
"\n";
642 while (
sub_trainer_->training_iteration() < end_iteration &&
644 int target_iteration =
646 while (
sub_trainer_->training_iteration() < target_iteration) {
649 std::stringstream batch_log(
"Sub:");
650 batch_log.imbue(std::locale::classic());
653 tprintf(
"UpdateSubtrainer:%s", batch_log.str().c_str());
654 log_msg << batch_log.str();
656 sub_margin = (training_error - sub_error) / sub_error;
661 std::vector<char> updated_trainer;
664 log_msg <<
" Sub trainer wins at iteration "
676 std::stringstream &log_msg) {
680 log_msg <<
"\nReduced learning rate on layers: " << num_reduced;
702 int num_layers = layers.size();
703 std::vector<int> num_weights(num_layers);
704 std::vector<TFloat> bad_sums[LR_COUNT];
705 std::vector<TFloat> ok_sums[LR_COUNT];
706 for (
int i = 0;
i < LR_COUNT; ++
i) {
707 bad_sums[
i].resize(num_layers, 0.0);
708 ok_sums[
i].resize(num_layers, 0.0);
710 auto momentum_factor = 1 / (1 -
momentum_);
711 std::vector<char> orig_trainer;
713 for (
int i = 0;
i < num_layers; ++
i) {
718 for (
int s = 0; s < num_samples; ++s) {
720 for (
int ww = 0; ww < LR_COUNT; ++ww) {
722 auto ww_factor = momentum_factor;
732 for (
int i = 0;
i < num_layers; ++
i) {
733 if (num_weights[
i] == 0) {
743 if (trainingdata ==
nullptr) {
747 std::vector<char> updated_trainer;
749 for (
int i = 0;
i < num_layers; ++
i) {
750 if (num_weights[
i] == 0) {
760 layer->
Update(0.0, 0.0, 0.0, 0);
764 float before_bad = bad_sums[ww][
i];
765 float before_ok = ok_sums[ww][
i];
767 &ok_sums[ww][
i], &bad_sums[ww][
i]);
769 bad_sums[ww][
i] + ok_sums[ww][
i] - before_bad - before_ok;
770 if (bad_frac > 0.0f) {
771 bad_frac = (bad_sums[ww][
i] - before_bad) / bad_frac;
778 for (
int i = 0;
i < num_layers; ++
i) {
779 if (num_weights[
i] == 0) {
784 TFloat total_down = bad_sums[LR_DOWN][
i] + ok_sums[LR_DOWN][
i];
785 TFloat total_same = bad_sums[LR_SAME][
i] + ok_sums[LR_SAME][
i];
786 TFloat frac_down = bad_sums[LR_DOWN][
i] / total_down;
787 TFloat frac_same = bad_sums[LR_SAME][
i] / total_same;
788 tprintf(
"Layer %d=%s: lr %g->%g%%, lr %g->%g%%",
i, layer->
name().c_str(),
789 lr * factor, 100.0 * frac_down, lr, 100.0 * frac_same);
798 if (num_lowered == 0) {
800 for (
int i = 0;
i < num_layers; ++
i) {
801 if (num_weights[
i] > 0) {
816 int null_char, std::vector<int> *labels) {
817 if (str.c_str() ==
nullptr || str.length() <= 0) {
818 tprintf(
"Empty truth string!\n");
822 std::vector<int> internal_labels;
828 if (unicharset.
encode_string(cleaned.c_str(),
true, &internal_labels,
nullptr,
831 for (
auto internal_label : internal_labels) {
832 if (recoder !=
nullptr) {
837 for (
int j = 0; j < len; ++j) {
838 labels->push_back(code(j));
849 labels->push_back(internal_label);
859 tprintf(
"Encoding of string failed! Failure bytes:");
860 while (err_index < cleaned.size()) {
861 tprintf(
" %x", cleaned[err_index++] & 0xff);
890#ifndef GRAPHICS_DISABLED
905 if (trainingdata ==
nullptr) {
906 tprintf(
"Null trainingdata.\n");
912 std::vector<int> truth_labels;
914 tprintf(
"Can't encode transcription: '%s' in language '%s'\n",
919 bool upside_down =
false;
929 for (
auto truth_label : truth_labels) {
934 std::reverse(truth_labels.begin(), truth_labels.end());
938 while (w < truth_labels.size() &&
942 if (w == truth_labels.size()) {
948 bool invert = trainingdata->
boxes().empty();
949 if (!
RecognizeLine(*trainingdata, invert ? 0.5f : 0.0f, debug, invert, upside_down,
950 &image_scale, &inputs, fwd_outputs)) {
958 tprintf(
"Compute simple targets failed for %s!\n",
962 }
else if (loss_type ==
LT_CTC) {
964 tprintf(
"Compute CTC targets failed for %s!\n",
969 tprintf(
"Logistic outputs not implemented yet!\n");
972 std::vector<int> ocr_labels;
973 std::vector<int> xcoords;
976 if (loss_type !=
LT_CTC) {
988 if (truth_text != ocr_text) {
998 trainingdata->
page_number(), delta_error == 0.0 ?
"(Perfect)" :
"");
1000 if (delta_error == 0.0) {
1015 std::vector<char> *data)
const {
1023 const char *data,
int size) {
1025 tprintf(
"Warning: data size is 0 in LSTMTrainer::ReadLocalTrainingDump\n");
1029 fp.
Open(data, size);
1035 std::vector<char> recognizer_data;
1038 recognizer_data.size());
1054 std::stringstream filename;
1055 filename.imbue(std::locale::classic());
1056 filename <<
model_base_ << std::fixed << std::setprecision(3)
1061 return filename.str();
1078 std::vector<int> code_map(num_new_codes, -1);
1079 for (
int c = 0; c < num_new_codes; ++c) {
1083 for (
int uid = 0; uid <= num_new_unichars; ++uid) {
1087 while (code_index < length && codes(code_index) != c) {
1090 if (code_index == length) {
1095 uid < num_new_unichars
1097 : old_chset.
size() - 1;
1098 if (old_uid == INVALID_UNICHAR_ID) {
1103 if (code_index < old_recoder.
EncodeUnichar(old_uid, &old_codes)) {
1104 old_code = old_codes(code_index);
1108 code_map[c] = old_code;
1121 "Must provide a traineddata containing lstm_unicharset and"
1122 " lstm_recoder!\n" !=
nullptr);
1138#ifndef GRAPHICS_DISABLED
1157 const std::vector<int> &truth_labels,
1159 const std::string &truth_text =
DecodeLabels(truth_labels);
1160 if (truth_text.c_str() ==
nullptr || truth_text.length() <= 0) {
1161 tprintf(
"Empty truth string at decode time!\n");
1166 std::vector<int> labels;
1167 std::vector<int> xcoords;
1171 truth_text.c_str());
1172 if (truth_text != text) {
1177 tprintf(
"TRAINING activation path for truth string %s\n",
1178 truth_text.c_str());
1180#ifndef GRAPHICS_DISABLED
1192#ifndef GRAPHICS_DISABLED
1196 const char *window_name,
ScrollView **window) {
1197 int width = targets.
Width();
1201 for (
int c = 0; c < num_features; ++c) {
1205 for (
int t = 0; t < width; ++t) {
1206 double target = targets.
f(t)[c];
1210 (*window)->SetCursor(t - 1, 0);
1213 (*window)->DrawTo(t, target);
1214 }
else if (start_t >= 0) {
1215 (*window)->DrawTo(t, 0);
1216 (*window)->DrawTo(start_t - 1, 0);
1221 (*window)->DrawTo(width, 0);
1222 (*window)->DrawTo(start_t - 1, 0);
1225 (*window)->Update();
1233 const std::vector<int> &truth_labels,
1235 if (truth_labels.size() > targets->
Width()) {
1236 tprintf(
"Error: transcription %s too long to fit into target of width %d\n",
1241 for (
auto truth_label : truth_labels) {
1245 for (
i = truth_labels.size(); i < targets->Width(); ++
i) {
1266 double char_error,
double word_error) {
1286 double total_error = 0.0;
1287 int width = deltas.
Width();
1289 for (
int t = 0; t < width; ++t) {
1290 const float *class_errs = deltas.
f(t);
1291 for (
int c = 0; c < num_classes; ++c) {
1292 double error = class_errs[c];
1293 total_error += error * error;
1296 return sqrt(total_error / (width * num_classes));
1306 int width = deltas.
Width();
1308 for (
int t = 0; t < width; ++t) {
1309 const float *class_errs = deltas.
f(t);
1310 for (
int c = 0; c < num_classes; ++c) {
1311 float abs_delta = std::fabs(class_errs[c]);
1314 if (0.5 <= abs_delta) {
1319 return static_cast<double>(num_errors) / width;
1324 const std::vector<int> &ocr_str) {
1326 unsigned truth_size = 0;
1327 for (
auto ch : truth_str) {
1333 for (
auto ch : ocr_str) {
1338 unsigned char_errors = 0;
1339 for (
auto label_count : label_counts) {
1340 char_errors += abs(label_count);
1343 if (truth_size <= char_errors) {
1344 return (char_errors == 0) ? 0.0 : 1.0;
1346 return static_cast<double>(char_errors) / truth_size;
1352 std::string *ocr_str) {
1353 using StrMap = std::unordered_map<std::string, int, std::hash<std::string>>;
1354 std::vector<std::string> truth_words =
split(*truth_str,
' ');
1355 if (truth_words.empty()) {
1358 std::vector<std::string> ocr_words =
split(*ocr_str,
' ');
1360 for (
const auto &truth_word : truth_words) {
1361 std::string truth_word_string(truth_word.c_str());
1362 auto it = word_counts.find(truth_word_string);
1363 if (it == word_counts.end()) {
1364 word_counts.insert(std::make_pair(truth_word_string, 1));
1369 for (
const auto &ocr_word : ocr_words) {
1370 std::string ocr_word_string(ocr_word.c_str());
1371 auto it = word_counts.find(ocr_word_string);
1372 if (it == word_counts.end()) {
1373 word_counts.insert(std::make_pair(ocr_word_string, -1));
1378 int word_recall_errs = 0;
1379 for (
const auto &word_count : word_counts) {
1380 if (word_count.second > 0) {
1381 word_recall_errs += word_count.second;
1384 return static_cast<double>(word_recall_errs) / truth_words.size();
1395 double buffer_sum = 0.0;
1396 for (
int i = 0;
i < mean_count; ++
i) {
1399 double mean = buffer_sum / mean_count;
1414 tprintf(
"Mean rms=%g%%, delta=%g%%, train=%g%%(%g%%), skip ratio=%g%%\n",
1426 const std::vector<char> &model_data,
1462 double two_percent_more = error_rate + 2.0;
1469 tprintf(
"2 Percent improvement time=%d, best error was %g @ %d\n",
1474 if (tester !=
nullptr) {
1487 if (result.length() > 0) {
const double kLearningRateDecay
const double kImprovementFraction
const int kMinStartedErrorRate
void tprintf(const char *format,...)
int IntCastRounded(double x)
@ TESSDATA_LSTM_UNICHARSET
const double kSubTrainerMarginFraction
std::function< std::string(int, const double *, const TessdataManager &, int)> TestCallback
const int kErrorGraphInterval
constexpr size_t countof(T const (&)[N]) noexcept
bool SaveDataToFile(const GenericVector< char > &data, const char *filename)
LIST search(LIST list, void *key, int_compare is_equal)
const double kMinDivergenceRate
const int kNumAdjustmentIterations
const double kHighConfidence
const double kBestCheckpointFraction
const int kNumPagesPerBatch
const std::vector< std::string > split(const std::string &s, char c)
const int kMinStallIterations
const double kStageTransitionThreshold
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
const std::string & imagefilename() const
const std::string & transcription() const
const std::string & language() const
const std::vector< TBOX > & boxes() const
TESS_API bool LoadDocuments(const std::vector< std::string > &filenames, CachingStrategy cache_strategy, FileReader reader)
double SignedRand(double range)
void OpenWrite(std::vector< char > *data)
bool DeSerialize(std::string &data)
bool Serialize(const std::string &data)
bool Open(const char *filename, FileReader reader)
void OverwriteEntry(TessdataType type, const char *data, int size)
std::string VersionString() const
void SetVersionString(const std::string &v_str)
bool GetComponent(TessdataType type, TFile *fp)
bool SaveFile(const char *filename, FileWriter writer) const
bool Init(const char *data_file_name)
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
bool DeSerialize(TFile *fp)
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
bool has_special_codes() const
bool load_from_file(const char *const filename, bool skip_fragments)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
static std::string CleanupString(const char *utf8_str)
void DebugActivationPath(const NetworkIO &outputs, const std::vector< int > &labels, const std::vector< int > &xcoords)
LossType OutputLossType() const
std::string DecodeLabels(const std::vector< int > &labels)
bool SimpleTextOutput() const
NetworkScratch scratch_space_
bool LoadCharsets(const TessdataManager *mgr)
void LabelsFromOutputs(const NetworkIO &outputs, std::vector< int > *labels, std::vector< int > *xcoords)
void RecognizeLine(const ImageData &image_data, float invert_threshold, bool debug, double worst_dict_cert, const TBOX &line_box, PointerVector< WERD_RES > *words, int lstm_choice_mode=0, int lstm_choice_amount=5)
void DisplayForward(const NetworkIO &inputs, const std::vector< int > &labels, const std::vector< int > &label_coords, const char *window_name, ScrollView **window)
void SetIteration(int iteration)
void ScaleLearningRate(double factor)
void ScaleLayerLearningRate(const std::string &id, double factor)
float learning_rate() const
int32_t training_iteration_
int training_iteration() const
int sample_iteration() const
std::vector< std::string > EnumerateLayers() const
float GetLayerLearningRate(const std::string &id) const
Network * GetLayer(const std::string &id) const
bool Serialize(const TessdataManager *mgr, TFile *fp) const
const UNICHARSET & GetUnicharset() const
int32_t sample_iteration_
bool DeSerialize(const TessdataManager *mgr, TFile *fp)
virtual int RemapOutputs(int old_no, const std::vector< int > &code_map)
const std::string & name() const
static void ClearWindow(bool tess_coords, const char *window_name, int width, int height, ScrollView **window)
virtual void SetEnableTraining(TrainingState state)
virtual bool Backward(bool debug, const NetworkIO &fwd_deltas, NetworkScratch *scratch, NetworkIO *back_deltas)=0
virtual void Update(float learning_rate, float momentum, float adam_beta, int num_samples)
virtual void DebugWeights()=0
bool TestFlag(NetworkFlags flag) const
virtual std::string spec() const
virtual void CountAlternators(const Network &other, TFloat *same, TFloat *changed) const
void Resize(const NetworkIO &src, int num_features)
void SetActivations(int t, int label, float ok_score)
bool AnySuspiciousTruth(float confidence_thr) const
void SubtractAllFromFloat(const NetworkIO &src)
const GENERIC_2D_ARRAY< float > & float_array() const
void Decode(const NetworkIO &output, double dict_ratio, double cert_offset, double worst_dict_cert, const UNICHARSET *charset, int lstm_choice_mode=0)
void ExtractBestPathAsLabels(std::vector< int > *labels, std::vector< int > *xcoords) const
static constexpr float kMinCertainty
static bool ComputeCTCTargets(const std::vector< int > &truth_labels, int null_char, const GENERIC_2D_ARRAY< float > &outputs, NetworkIO *targets)
static void NormalizeProbs(NetworkIO *probs)
static bool InitNetwork(int num_outputs, const char *network_spec, int append_index, int net_flags, float weight_range, TRand *randomizer, Network **network)
bool TransitionTrainingStage(float error_threshold)
std::vector< int32_t > best_error_iterations_
std::vector< char > worst_model_data_
Trainability PrepareForBackward(const ImageData *trainingdata, NetworkIO *fwd_outputs, NetworkIO *targets)
bool ReadLocalTrainingDump(const TessdataManager *mgr, const char *data, int size)
bool MaintainCheckpoints(const TestCallback &tester, std::stringstream &log_msg)
std::string UpdateErrorGraph(int iteration, double error_rate, const std::vector< char > &model_data, const TestCallback &tester)
bool EncodeString(const std::string &str, std::vector< int > *labels) const
double error_rates_[ET_COUNT]
bool LoadAllTrainingData(const std::vector< std::string > &filenames, CachingStrategy cache_strategy, bool randomly_rotate)
double ComputeErrorRates(const NetworkIO &deltas, double char_error, double word_error)
int InitTensorFlowNetwork(const std::string &tf_proto)
void LogIterations(const char *intro_str, std::stringstream &log_msg) const
double ComputeWordError(std::string *truth_str, std::string *ocr_str)
double NewSingleError(ErrorTypes type) const
void StartSubtrainer(std::stringstream &log_msg)
bool ComputeCTCTargets(const std::vector< int > &truth_labels, NetworkIO *outputs, NetworkIO *targets)
std::vector< char > best_trainer_
double worst_error_rates_[ET_COUNT]
void SaveRecognitionDump(std::vector< char > *data) const
bool Serialize(SerializeAmount serialize_amount, const TessdataManager *mgr, TFile *fp) const
bool ComputeTextTargets(const NetworkIO &outputs, const std::vector< int > &truth_labels, NetworkIO *targets)
float error_rate_of_last_saved_best_
int last_perfect_training_iteration_
void FillErrorBuffer(double new_error, ErrorTypes type)
int learning_iteration() const
bool SaveTraineddata(const char *filename)
SubTrainerResult UpdateSubtrainer(std::stringstream &log_msg)
bool DeSerialize(const TessdataManager *mgr, TFile *fp)
double ComputeRMSError(const NetworkIO &deltas)
Trainability GridSearchDictParams(const ImageData *trainingdata, int iteration, double min_dict_ratio, double dict_ratio_step, double max_dict_ratio, double min_cert_offset, double cert_offset_step, double max_cert_offset, std::string &results)
double ComputeWinnerError(const NetworkIO &deltas)
std::string checkpoint_name_
bool InitNetwork(const char *network_spec, int append_index, int net_flags, float weight_range, float learning_rate, float momentum, float adam_beta)
void UpdateErrorBuffer(double new_error, ErrorTypes type)
int32_t improvement_steps_
int CurrentTrainingStage() const
std::string DumpFilename() const
std::vector< char > best_model_data_
bool SaveTrainingDump(SerializeAmount serialize_amount, const LSTMTrainer &trainer, std::vector< char > *data) const
bool DebugLSTMTraining(const NetworkIO &inputs, const ImageData &trainingdata, const NetworkIO &fwd_outputs, const std::vector< int > &truth_labels, const NetworkIO &outputs)
DocumentCache training_data_
int checkpoint_iteration_
static const int kRollingBufferSize_
int prev_sample_iteration_
std::vector< double > error_buffers_[ET_COUNT]
std::unique_ptr< LSTMTrainer > sub_trainer_
double ComputeCharError(const std::vector< int > &truth_str, const std::vector< int > &ocr_str)
void DisplayTargets(const NetworkIO &targets, const char *window_name, ScrollView **window)
bool ReadTrainingDump(const std::vector< char > &data, LSTMTrainer &trainer) const
int ReduceLayerLearningRates(TFloat factor, int num_samples, LSTMTrainer *samples_trainer)
const ImageData * TrainOnLine(LSTMTrainer *samples_trainer, bool batch)
std::vector< double > best_error_history_
void PrepareLogMsg(std::stringstream &log_msg) const
bool TryLoadingCheckpoint(const char *filename, const char *old_traineddata)
void ReduceLearningRates(LSTMTrainer *samples_trainer, std::stringstream &log_msg)
double best_error_rates_[ET_COUNT]
std::vector< int > MapRecoder(const UNICHARSET &old_chset, const UnicharCompress &old_recoder) const
std::unique_ptr< SVEvent > AwaitEvent(SVEventType type)