tesseract v5.3.3.20231005
statistc.h
Go to the documentation of this file.
1/**********************************************************************
2 * File: statistc.h (Formerly stats.h)
3 * Description: Class description for STATS class.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1991, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19#ifndef TESSERACT_CCSTRUCT_STATISTC_H_
20#define TESSERACT_CCSTRUCT_STATISTC_H_
21
22#include <cstdio>
23#include "kdpair.h"
24#include "scrollview.h"
25
26namespace tesseract {
27
28// Simple histogram-based statistics for integer values in a known
29// range, such that the range is small compared to the number of samples.
31public:
32 // The histogram buckets are in the range
33 // [min_bucket_value, max_bucket_value].
34 // Any data under min_bucket_value is silently mapped to min_bucket_value,
35 // and likewise, any data over max_bucket_value is silently mapped to
36 // max_bucket_value.
37 // In the internal array, min_bucket_value maps to index 0, and the array
38 // size is 1 + max_bucket_value - min_bucket_value.
39 STATS(int32_t min_bucket_value, int32_t max_bucket_value);
40 STATS() = default; // empty for arrays
41
42 ~STATS();
43
44 // (Re)Sets the range and clears the counts.
45 // See the constructor for info on max and min values.
46 bool set_range(int32_t min_bucket_value, int32_t max_bucket_value);
47
48 void clear(); // empty buckets
49
50 void add(int32_t value, int32_t count);
51
52 // "Accessors" return various statistics on the data.
53 int32_t mode() const; // get mode of samples
54 double mean() const; // get mean of samples
55 double sd() const; // standard deviation
56 // Returns the fractile value such that frac fraction (in [0,1]) of samples
57 // has a value less than the return value.
58 double ile(double frac) const;
59 // Returns the minimum used entry in the histogram (ie the minimum of the
60 // data, NOT the minimum of the supplied range, nor is it an index.)
61 // Would normally be called min(), but that is a reserved word in VC++.
62 int32_t min_bucket() const; // Find min
63 // Returns the maximum used entry in the histogram (ie the maximum of the
64 // data, NOT the maximum of the supplied range, nor is it an index.)
65 int32_t max_bucket() const; // Find max
66 // Finds a more useful estimate of median than ile(0.5).
67 // Overcomes a problem with ile() - if the samples are, for example,
68 // 6,6,13,14 then ile(0.5) returns 7.0 - when a more useful value would be
69 // midway between 6 and 13 = 9.5
70 double median() const; // get median of samples
71 // Returns the count of the given value.
72 int32_t pile_count(int32_t value) const {
73 if (buckets_ == nullptr) {
74 return 0;
75 }
76 if (value <= rangemin_) {
77 return buckets_[0];
78 }
79 if (value >= rangemax_) {
80 return buckets_[rangemax_ - rangemin_];
81 }
82 return buckets_[value - rangemin_];
83 }
84 // Returns the total count of all buckets.
85 int32_t get_total() const {
86 return total_count_; // total of all piles
87 }
88 // Returns true if x is a local min.
89 bool local_min(int32_t x) const;
90
91 // Apply a triangular smoothing filter to the stats.
92 // This makes the modes a bit more useful.
93 // The factor gives the height of the triangle, i.e. the weight of the
94 // centre.
95 void smooth(int32_t factor);
96
97 // Cluster the samples into max_clusters clusters.
98 // Each call runs one iteration. The array of clusters must be
99 // max_clusters+1 in size as cluster 0 is used to indicate which samples
100 // have been used.
101 // The return value is the current number of clusters.
102 int32_t cluster(float lower, // thresholds
103 float upper,
104 float multiple, // distance threshold
105 int32_t max_clusters, // max no to make
106 STATS *clusters); // array of clusters
107
108 // Finds (at most) the top max_modes modes, well actually the whole peak
109 // around each mode, returning them in the given modes vector as a <mean of
110 // peak, total count of peak> pair in order of decreasing total count. Since
111 // the mean is the key and the count the data in the pair, a single call to
112 // sort on the output will re-sort by increasing mean of peak if that is more
113 // useful than decreasing total count. Returns the actual number of modes
114 // found.
115 int top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes) const;
116
117 // Prints a summary and table of the histogram.
118 void print() const;
119 // Prints summary stats only of the histogram.
120 void print_summary() const;
121
122#ifndef GRAPHICS_DISABLED
123 // Draws the histogram as a series of rectangles.
124 void plot(ScrollView *window, // window to draw in
125 float xorigin, // x origin of the histogram
126 float yorigin, // y origin of the histogram
127 float xscale, // size of one unit
128 float yscale, // size of one unit
129 ScrollView::Color colour) const; // colour to draw in
130
131 // Draws a line graph of the histogram.
132 void plotline(ScrollView *window, // window to draw in
133 float xorigin, // x origin of the histogram
134 float yorigin, // y origin of the histogram
135 float xscale, // size of one unit
136 float yscale, // size of one unit
137 ScrollView::Color colour) const; // colour to draw in
138#endif // !GRAPHICS_DISABLED
139
140private:
141 int32_t rangemin_ = 0; // min of range
142 int32_t rangemax_ = 0; // max of range
143 int32_t total_count_ = 0; // no of samples
144 int32_t *buckets_ = nullptr; // array of cells
145};
146
147} // namespace tesseract
148
149#endif // TESSERACT_CCSTRUCT_STATISTC_H_
int value
int * count
int32_t pile_count(int32_t value) const
Definition: statistc.h:72
int32_t get_total() const
Definition: statistc.h:85
#define TESS_API
Definition: export.h:32