tesseract v5.3.3.20231005
errorcounter.h
Go to the documentation of this file.
1// Copyright 2011 Google Inc. All Rights Reserved.
2// Author: rays@google.com (Ray Smith)
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7// http://www.apache.org/licenses/LICENSE-2.0
8// Unless required by applicable law or agreed to in writing, software
9// distributed under the License is distributed on an "AS IS" BASIS,
10// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11// See the License for the specific language governing permissions and
12// limitations under the License.
13//
15
16#ifndef THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
17#define THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
18
19#include "matrix.h"
20#include "statistc.h"
21
22struct Pix;
23
24namespace tesseract {
25
26template <typename T>
27class UnicityTable;
28struct FontInfo;
29class FontInfoTable;
30class SampleIterator;
31class ShapeClassifier;
32class TrainingSample;
33struct UnicharRating;
34
35// Enumeration of the different types of error count.
36// Error counts work as follows:
37//
38// Ground truth is a valid unichar-id / font-id pair:
39// Number of classifier answers?
40// 0 >0
41// CT_REJECT unichar-id matches top shape?
42// __________ yes! no
43// CT_UNICHAR_TOP_OK CT_UNICHAR_TOP1_ERR
44// Top shape-id has multiple unichars? 2nd shape unichar id matches?
45// yes! no yes! no
46// CT_OK_MULTI_UNICHAR | _____ CT_UNICHAR_TOP2_ERR
47// Font attributes match? Any unichar-id matches?
48// yes! no yes! no
49// CT_FONT_ATTR_OK CT_FONT_ATTR_ERR ______ CT_UNICHAR_TOPN_ERR
50// | __________________ _________________
51// Top shape-id has multiple font attrs?
52// yes! no
53// CT_OK_MULTI_FONT
54// _____________________________
55//
56// Note that multiple counts may be activated for a single sample!
57//
58// Ground truth is for a fragment/n-gram that is NOT in the unicharset.
59// This is called junk and is expected to be rejected:
60// Number of classifier answers?
61// 0 >0
62// CT_REJECTED_JUNK CT_ACCEPTED_JUNK
63//
64// Also, CT_NUM_RESULTS stores the mean number of results, and CT_RANK stores
65// the mean rank of the correct result, counting from 0, and with an error
66// receiving the number of answers as the correct rank.
67//
68// Keep in sync with the ReportString function.
70 CT_UNICHAR_TOP_OK, // Top shape contains correct unichar id.
71 // The rank of the results in TOP1, TOP2, TOPN is determined by a gap of
72 // kRatingEpsilon from the first result in each group. The real top choice
73 // is measured using TOPTOP.
74 CT_UNICHAR_TOP1_ERR, // Top shape does not contain correct unichar id.
75 CT_UNICHAR_TOP2_ERR, // Top 2 shapes don't contain correct unichar id.
76 CT_UNICHAR_TOPN_ERR, // No output shape contains correct unichar id.
77 CT_UNICHAR_TOPTOP_ERR, // Very top choice not correct.
78 CT_OK_MULTI_UNICHAR, // Top shape id has correct unichar id, and others.
79 CT_OK_JOINED, // Top shape id is correct but marked joined.
80 CT_OK_BROKEN, // Top shape id is correct but marked broken.
81 CT_REJECT, // Classifier hates this.
82 CT_FONT_ATTR_ERR, // Top unichar OK, but font attributes incorrect.
83 CT_OK_MULTI_FONT, // CT_FONT_ATTR_OK but there are multiple font attrs.
84 CT_NUM_RESULTS, // Number of answers produced.
85 CT_RANK, // Rank of correct answer.
86 CT_REJECTED_JUNK, // Junk that was correctly rejected.
87 CT_ACCEPTED_JUNK, // Junk that was incorrectly classified otherwise.
88
89 CT_SIZE // Number of types for array sizing.
90};
91
92// Class to encapsulate all the functionality and sub-structures required
93// to count errors for an isolated character classifier (ShapeClassifier).
95public:
96 // Computes and returns the unweighted boosting_mode error rate of the given
97 // classifier. Can be used for testing, or inside an iterative training
98 // system, including one that uses boosting.
99 // report_levels:
100 // 0 = no output.
101 // 1 = bottom-line error rate.
102 // 2 = bottom-line error rate + time.
103 // 3 = font-level error rate + time.
104 // 4 = list of all errors + short classifier debug output on 16 errors.
105 // 5 = list of all errors + short classifier debug output on 25 errors.
106 // * The boosting_mode determines which error type is used for computing the
107 // scaled_error output, and setting the is_error flag in the samples.
108 // * The fontinfo_table is used to get string font names for the debug
109 // output, and also to count font attributes errors.
110 // * The page_images vector may contain a Pix* (which may be nullptr) for each
111 // page index assigned to the samples.
112 // * The it provides encapsulated iteration over some sample set.
113 // * The outputs unichar_error, scaled_error and totals_report are all
114 // optional.
115 // * If not nullptr, unichar error gets the top1 unichar error rate.
116 // * Scaled_error gets the error chosen by boosting_mode weighted by the
117 // weights on the samples.
118 // * Fonts_report gets a string summarizing the error rates for each font in
119 // both human-readable form and as a tab-separated list of error counts.
120 // The human-readable form is all before the first tab.
121 // * The return value is the un-weighted version of the scaled_error.
122 static double ComputeErrorRate(ShapeClassifier *classifier, int report_level,
123 CountTypes boosting_mode, const FontInfoTable &fontinfo_table,
124 const std::vector<Image > &page_images, SampleIterator *it,
125 double *unichar_error, double *scaled_error, std::string *fonts_report);
126 // Tests a pair of classifiers, debugging errors of the new against the old.
127 // See errorcounter.h for description of arguments.
128 // Iterates over the samples, calling the classifiers in normal/silent mode.
129 // If the new_classifier makes a boosting_mode error that the old_classifier
130 // does not, and the appropriate, it will then call the new_classifier again
131 // with a debug flag and a keep_this argument to find out what is going on.
132 static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier,
133 CountTypes boosting_mode, const FontInfoTable &fontinfo_table,
134 const std::vector<Image > &page_images, SampleIterator *it);
135
136private:
137 // Simple struct to hold an array of counts.
138 struct Counts {
139 Counts();
140 // Adds other into this for computing totals.
141 void operator+=(const Counts &other);
142
143 int n[CT_SIZE];
144 };
145
146 // Constructor is private. Only anticipated use of ErrorCounter is via
147 // the static ComputeErrorRate.
148 ErrorCounter(const UNICHARSET &unicharset, int fontsize);
149 ~ErrorCounter() = default;
150
151 // Accumulates the errors from the classifier results on a single sample.
152 // Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
153 // boosting_mode selects the type of error to be used for boosting and the
154 // is_error_ member of sample is set according to whether the required type
155 // of error occurred. The font_table provides access to font properties
156 // for error counting and shape_table is used to understand the relationship
157 // between unichar_ids and shape_ids in the results
158 bool AccumulateErrors(bool debug, CountTypes boosting_mode, const FontInfoTable &font_table,
159 const std::vector<UnicharRating> &results, TrainingSample *sample);
160
161 // Accumulates counts for junk. Counts only whether the junk was correctly
162 // rejected or not.
163 bool AccumulateJunk(bool debug, const std::vector<UnicharRating> &results,
164 TrainingSample *sample);
165
166 // Creates a report of the error rate. The report_level controls the detail
167 // that is reported to stderr via tprintf:
168 // 0 -> no output.
169 // >=1 -> bottom-line error rate.
170 // >=3 -> font-level error rate.
171 // boosting_mode determines the return value. It selects which (un-weighted)
172 // error rate to return.
173 // The fontinfo_table from MasterTrainer provides the names of fonts.
174 // The it determines the current subset of the training samples.
175 // If not nullptr, the top-choice unichar error rate is saved in
176 // unichar_error. If not nullptr, the report string is saved in fonts_report.
177 // (Ignoring report_level).
178 double ReportErrors(int report_level, CountTypes boosting_mode,
179 const FontInfoTable &fontinfo_table, const SampleIterator &it,
180 double *unichar_error, std::string *fonts_report);
181
182 // Sets the report string to a combined human and machine-readable report
183 // string of the error rates.
184 // Returns false if there is no data, leaving report unchanged, unless
185 // even_if_empty is true.
186 static bool ReportString(bool even_if_empty, const Counts &counts, std::string &report);
187
188 // Computes the error rates and returns in rates which is an array of size
189 // CT_SIZE. Returns false if there is no data, leaving rates unchanged.
190 static bool ComputeRates(const Counts &counts, double rates[CT_SIZE]);
191
192 // Total scaled error used by boosting algorithms.
193 double scaled_error_;
194 // Difference in result rating to be thought of as an "equal" choice.
195 double rating_epsilon_;
196 // Vector indexed by font_id from the samples of error accumulators.
197 std::vector<Counts> font_counts_;
198 // Counts of the results that map each unichar_id (from samples) to an
199 // incorrect shape_id.
200 GENERIC_2D_ARRAY<int> unichar_counts_;
201 // Count of the number of times each shape_id occurs, is correct, and multi-
202 // unichar.
203 std::vector<int> multi_unichar_counts_;
204 // Histogram of scores (as percent) for correct answers.
205 STATS ok_score_hist_;
206 // Histogram of scores (as percent) for incorrect answers.
207 STATS bad_score_hist_;
208 // Unicharset for printing character ids in results.
209 const UNICHARSET &unicharset_;
210};
211
212} // namespace tesseract.
213
214#endif /* THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ */
ICOORD & operator+=(ICOORD &op1, const ICOORD &op2)
Definition: points.h:372
@ CT_UNICHAR_TOPN_ERR
Definition: errorcounter.h:76
@ CT_UNICHAR_TOP_OK
Definition: errorcounter.h:70
@ CT_UNICHAR_TOP1_ERR
Definition: errorcounter.h:74
@ CT_UNICHAR_TOP2_ERR
Definition: errorcounter.h:75
@ CT_OK_MULTI_FONT
Definition: errorcounter.h:83
@ CT_REJECTED_JUNK
Definition: errorcounter.h:86
@ CT_UNICHAR_TOPTOP_ERR
Definition: errorcounter.h:77
@ CT_FONT_ATTR_ERR
Definition: errorcounter.h:82
@ CT_ACCEPTED_JUNK
Definition: errorcounter.h:87
@ CT_OK_MULTI_UNICHAR
Definition: errorcounter.h:78
@ CT_NUM_RESULTS
Definition: errorcounter.h:84
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const std::vector< Image > &page_images, SampleIterator *it)
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const std::vector< Image > &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, std::string *fonts_report)