17# include "config_auto.h"
47 double *unichar_error,
double *scaled_error,
48 std::string *fonts_report) {
51 std::vector<UnicharRating> results;
53 clock_t start = clock();
54 unsigned total_samples = 0;
55 double unscaled_error = 0.0;
57 int error_samples = report_level > 3 ? report_level * report_level : 0;
61 int page_index = mutable_sample->
page_num();
63 0 <= page_index && page_index < page_images.size() ? page_images[page_index] :
nullptr;
66 bool debug_it =
false;
67 int correct_id = mutable_sample->
class_id();
72 debug_it = counter.AccumulateJunk(report_level > 3, results, mutable_sample);
74 debug_it = counter.AccumulateErrors(report_level > 3, boosting_mode, fontinfo_table, results,
77 if (debug_it && error_samples > 0) {
81#ifndef GRAPHICS_DISABLED
82 classifier->
DebugDisplay(*mutable_sample, page_pix, correct_id);
88 const double total_time = 1.0 * (clock() - start) / CLOCKS_PER_SEC;
90 unscaled_error = counter.ReportErrors(report_level, boosting_mode, fontinfo_table, *it,
91 unichar_error, fonts_report);
92 if (scaled_error !=
nullptr) {
93 *scaled_error = counter.scaled_error_;
95 if (report_level > 1 && total_samples > 0) {
97 tprintf(
"Errors computed in %.2fs at %.1f μs/char\n", total_time,
98 1000000.0 * total_time / total_samples);
100 return unscaled_error;
115 std::vector<UnicharRating> results;
117 int total_samples = 0;
118 int error_samples = 25;
119 int total_new_errors = 0;
123 int page_index = mutable_sample->
page_num();
125 0 <= page_index && page_index < page_images.size() ? page_images[page_index] :
nullptr;
129 int correct_id = mutable_sample->
class_id();
130 if (correct_id != 0 && !old_counter.AccumulateErrors(
true, boosting_mode, fontinfo_table,
131 results, mutable_sample)) {
135 if (correct_id != 0 && new_counter.AccumulateErrors(
true, boosting_mode, fontinfo_table,
136 results, mutable_sample)) {
140 if (results.size() > 0 && error_samples > 0) {
141#ifndef GRAPHICS_DISABLED
142 new_classifier->
DebugDisplay(*mutable_sample, page_pix, correct_id);
150 tprintf(
"Total new errors = %d\n", total_new_errors);
155ErrorCounter::ErrorCounter(
const UNICHARSET &unicharset,
int fontsize)
158 , unichar_counts_(unicharset.size(), unicharset.size(), 0)
159 , ok_score_hist_(0, 101)
160 , bad_score_hist_(0, 101)
161 , unicharset_(unicharset) {
163 font_counts_.clear();
164 font_counts_.resize(fontsize, empty_counts);
165 multi_unichar_counts_.clear();
166 multi_unichar_counts_.resize(unicharset.
size(), 0);
176bool ErrorCounter::AccumulateErrors(
bool debug,
CountTypes boosting_mode,
177 const FontInfoTable &font_table,
178 const std::vector<UnicharRating> &results,
179 TrainingSample *sample) {
180 int num_results = results.size();
181 int answer_actual_rank = -1;
182 int font_id = sample->font_id();
183 int unichar_id = sample->class_id();
184 sample->set_is_error(
false);
185 if (num_results == 0) {
189 sample->set_is_error(
true);
194 int epsilon_rank = 0;
195 int answer_epsilon_rank = -1;
196 int num_top_answers = 0;
197 double prev_rating = results[0].rating;
201 while (res_index < num_results) {
202 if (results[res_index].rating < prev_rating - rating_epsilon_) {
204 prev_rating = results[res_index].rating;
206 if (results[res_index].unichar_id == unichar_id && answer_epsilon_rank < 0) {
207 answer_epsilon_rank = epsilon_rank;
208 answer_actual_rank = res_index;
215 }
else if (epsilon_rank == 0) {
220 if (answer_actual_rank != 0) {
224 sample->set_is_error(
true);
227 if (answer_epsilon_rank == 0) {
230 if (num_top_answers > 1) {
232 ++multi_unichar_counts_[unichar_id];
237 if (font_table.SetContainsFontProperties(font_id, results[answer_actual_rank].fonts)) {
240 if (font_table.SetContainsMultipleFontProperties(results[answer_actual_rank].fonts)) {
251 sample->set_is_error(
true);
254 ++unichar_counts_(unichar_id, results[0].unichar_id);
255 if (answer_epsilon_rank < 0 || answer_epsilon_rank >= 2) {
259 sample->set_is_error(
true);
262 if (answer_epsilon_rank < 0) {
266 sample->set_is_error(
true);
268 answer_epsilon_rank = epsilon_rank;
273 font_counts_[font_id].n[
CT_RANK] += answer_epsilon_rank;
282 if (sample->is_error()) {
283 scaled_error_ += sample->weight();
285 tprintf(
"%d results for char %s font %d :", num_results,
287 for (
int i = 0;
i < num_results; ++
i) {
288 tprintf(
" %.3f : %s\n", results[
i].rating,
294 if (num_results > 0) {
297 bad_score_hist_.
add(percent, 1);
300 if (answer_actual_rank >= 0) {
301 percent =
IntCastRounded(results[answer_actual_rank].rating * 100);
303 ok_score_hist_.
add(percent, 1);
310bool ErrorCounter::AccumulateJunk(
bool debug,
const std::vector<UnicharRating> &results,
311 TrainingSample *sample) {
314 const int num_results = results.size();
315 const int font_id = sample->font_id();
316 const int unichar_id = sample->class_id();
318 if (num_results > 0) {
321 if (num_results > 0 && results[0].unichar_id != unichar_id) {
324 sample->set_is_error(
true);
326 scaled_error_ += sample->weight();
327 bad_score_hist_.
add(percent, 1);
332 sample->set_is_error(
false);
333 ok_score_hist_.
add(percent, 1);
350double ErrorCounter::ReportErrors(
int report_level,
CountTypes boosting_mode,
351 const FontInfoTable &fontinfo_table,
const SampleIterator &it,
352 double *unichar_error, std::string *fonts_report) {
356 int fontsize = font_counts_.size();
357 for (
int f = 0; f < fontsize; ++f) {
359 totals += font_counts_[f];
360 std::string font_report;
361 if (ReportString(
false, font_counts_[f], font_report)) {
362 if (fonts_report !=
nullptr) {
363 *fonts_report += fontinfo_table.at(f).name;
364 *fonts_report +=
": ";
365 *fonts_report += font_report;
366 *fonts_report +=
"\n";
368 if (report_level > 2) {
370 tprintf(
"%s: %s\n", fontinfo_table.at(f).name, font_report.c_str());
375 std::string total_report;
376 bool any_results = ReportString(
true, totals, total_report);
377 if (fonts_report !=
nullptr && fonts_report->empty()) {
379 *fonts_report =
"NoSamplesFound: ";
380 *fonts_report += total_report;
381 *fonts_report +=
"\n";
383 if (report_level > 0) {
385 std::string total_report;
387 tprintf(
"TOTAL Scaled Err=%.4g%%, %s\n", scaled_error_ * 100.0, total_report.c_str());
391 int charsetsize = unicharset_.
size();
392 int worst_uni_id = 0;
393 int worst_result_id = 0;
395 for (
int u = 0; u < charsetsize; ++u) {
396 for (
int v = 0; v < charsetsize; ++v) {
397 if (unichar_counts_(u, v) > worst_err) {
398 worst_err = unichar_counts_(u, v);
405 tprintf(
"Worst error = %d:%s -> %s with %d/%d=%.2f%% errors\n", worst_uni_id,
411 tprintf(
"Multi-unichar shape use:\n");
412 for (
int u = 0; u < multi_unichar_counts_.size(); ++u) {
413 if (multi_unichar_counts_[u] > 0) {
414 tprintf(
"%d multiple answers for unichar: %s\n", multi_unichar_counts_[u],
418 tprintf(
"OK Score histogram:\n");
419 ok_score_hist_.
print();
420 tprintf(
"ERROR Score histogram:\n");
421 bad_score_hist_.
print();
425 if (!ComputeRates(totals, rates)) {
429 if (unichar_error !=
nullptr) {
432 return rates[boosting_mode];
439bool ErrorCounter::ReportString(
bool even_if_empty,
const Counts &counts, std::string &report) {
442 if (!ComputeRates(counts, rates) && !even_if_empty) {
448 const int kMaxExtraLength = 5;
450 const char format_str[] =
451 "Unichar=%.4g%%[1], %.4g%%[2], %.4g%%[n], %.4g%%[T] "
452 "Mult=%.4g%%, Jn=%.4g%%, Brk=%.4g%%, Rej=%.4g%%, "
453 "FontAttr=%.4g%%, Multi=%.4g%%, "
454 "Answers=%.3g, Rank=%.3g, "
455 "OKjunk=%.4g%%, Badjunk=%.4g%%";
456 constexpr size_t max_str_len =
sizeof(format_str) + kMaxExtraLength * (
CT_SIZE - 1) + 1;
457 char formatted_str[max_str_len];
464 report = formatted_str;
467 for (
int ct : counts.n) {
468 report +=
"\t" + std::to_string(ct);
475bool ErrorCounter::ComputeRates(
const Counts &counts,
double rates[
CT_SIZE]) {
476 const int ok_samples =
480 double denominator =
static_cast<double>(std::max(ok_samples, 1));
481 for (
int ct = 0; ct <=
CT_RANK; ++ct) {
482 rates[ct] = counts.n[ct] / denominator;
485 denominator =
static_cast<double>(std::max(junk_samples, 1));
487 rates[ct] = counts.n[ct] / denominator;
489 return ok_samples != 0 || junk_samples != 0;
492ErrorCounter::Counts::Counts() {
493 memset(n, 0,
sizeof(n[0]) *
CT_SIZE);
497 for (
int ct = 0; ct <
CT_SIZE; ++ct) {
498 n[ct] += other.n[ct];
ICOORD & operator+=(ICOORD &op1, const ICOORD &op2)
const double kRatingEpsilon
void tprintf(const char *format,...)
int IntCastRounded(double x)
void add(int32_t value, int32_t count)
bool has_special_codes() const
const char * id_to_unichar(UNICHAR_ID id) const
virtual int UnicharClassifySample(const TrainingSample &sample, Image page_pix, int debug, UNICHAR_ID keep_this, std::vector< UnicharRating > *results)
virtual const UNICHARSET & GetUnicharset() const
void DebugDisplay(const TrainingSample &sample, Image page_pix, UNICHAR_ID unichar_id)
UNICHAR_ID class_id() const
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const std::vector< Image > &page_images, SampleIterator *it)
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const std::vector< Image > &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, std::string *fonts_report)
const TrainingSampleSet * sample_set() const
int GlobalSampleIndex() const
TrainingSample * MutableSample() const
std::string SampleToString(const TrainingSample &sample) const