tesseract v5.3.3.20231005
statistc.cpp
Go to the documentation of this file.
1/**********************************************************************
2 * File: statistc.cpp (Formerly stats.c)
3 * Description: Simple statistical package for integer values.
4 * Author: Ray Smith
5 *
6 * (C) Copyright 1991, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19// Include automatically generated configuration file if running autoconf.
20#ifdef HAVE_CONFIG_H
21# include "config_auto.h"
22#endif
23
24#include "statistc.h"
25
26#include "errcode.h"
27#include "scrollview.h"
28#include "tprintf.h"
29
30#include "helpers.h"
31
32#include <cmath>
33#include <cstdlib>
34#include <cstring>
35
36namespace tesseract {
37
38/**********************************************************************
39 * STATS::STATS
40 *
41 * Construct a new stats element by allocating and zeroing the memory.
42 **********************************************************************/
43STATS::STATS(int32_t min_bucket_value, int32_t max_bucket_value) {
44 if (max_bucket_value < min_bucket_value) {
45 min_bucket_value = 0;
46 max_bucket_value = 1;
47 }
48 rangemin_ = min_bucket_value; // setup
49 rangemax_ = max_bucket_value;
50 buckets_ = new int32_t[1 + rangemax_ - rangemin_];
51 clear();
52}
53
54/**********************************************************************
55 * STATS::set_range
56 *
57 * Alter the range on an existing stats element.
58 **********************************************************************/
59bool STATS::set_range(int32_t min_bucket_value, int32_t max_bucket_value) {
60 if (max_bucket_value < min_bucket_value) {
61 return false;
62 }
63 if (rangemax_ - rangemin_ != max_bucket_value - min_bucket_value) {
64 delete[] buckets_;
65 buckets_ = new int32_t[1 + max_bucket_value - min_bucket_value];
66 }
67 rangemin_ = min_bucket_value; // setup
68 rangemax_ = max_bucket_value;
69 clear(); // zero it
70 return true;
71}
72
73/**********************************************************************
74 * STATS::clear
75 *
76 * Clear out the STATS class by zeroing all the buckets.
77 **********************************************************************/
78void STATS::clear() { // clear out buckets
79 total_count_ = 0;
80 if (buckets_ != nullptr) {
81 memset(buckets_, 0, (1 + rangemax_ - rangemin_) * sizeof(buckets_[0]));
82 }
83}
84
85/**********************************************************************
86 * STATS::~STATS
87 *
88 * Destructor for a stats class.
89 **********************************************************************/
91 delete[] buckets_;
92}
93
94/**********************************************************************
95 * STATS::add
96 *
97 * Add a set of samples to (or delete from) a pile.
98 **********************************************************************/
99void STATS::add(int32_t value, int32_t count) {
100 if (buckets_ != nullptr) {
101 value = ClipToRange(value, rangemin_, rangemax_);
102 buckets_[value - rangemin_] += count;
103 total_count_ += count; // keep count of total
104 }
105}
106
107/**********************************************************************
108 * STATS::mode
109 *
110 * Find the mode of a stats class.
111 **********************************************************************/
112int32_t STATS::mode() const { // get mode of samples
113 if (buckets_ == nullptr) {
114 return rangemin_;
115 }
116 int32_t max = buckets_[0]; // max cell count
117 int32_t maxindex = 0; // index of max
118 for (int index = rangemax_ - rangemin_; index > 0; --index) {
119 if (buckets_[index] > max) {
120 max = buckets_[index]; // find biggest
121 maxindex = index;
122 }
123 }
124 return maxindex + rangemin_; // index of biggest
125}
126
127/**********************************************************************
128 * STATS::mean
129 *
130 * Find the mean of a stats class.
131 **********************************************************************/
132double STATS::mean() const { // get mean of samples
133 if (buckets_ == nullptr || total_count_ <= 0) {
134 return static_cast<double>(rangemin_);
135 }
136 int64_t sum = 0;
137 for (int index = rangemax_ - rangemin_; index >= 0; --index) {
138 sum += static_cast<int64_t>(index) * buckets_[index];
139 }
140 return static_cast<double>(sum) / total_count_ + rangemin_;
141}
142
143/**********************************************************************
144 * STATS::sd
145 *
146 * Find the standard deviation of a stats class.
147 **********************************************************************/
148double STATS::sd() const { // standard deviation
149 if (buckets_ == nullptr || total_count_ <= 0) {
150 return 0.0;
151 }
152 int64_t sum = 0;
153 double sqsum = 0.0;
154 for (int index = rangemax_ - rangemin_; index >= 0; --index) {
155 sum += static_cast<int64_t>(index) * buckets_[index];
156 sqsum += static_cast<double>(index) * index * buckets_[index];
157 }
158 double variance = static_cast<double>(sum) / total_count_;
159 variance = sqsum / total_count_ - variance * variance;
160 if (variance > 0.0) {
161 return sqrt(variance);
162 }
163 return 0.0;
164}
165
166/**********************************************************************
167 * STATS::ile
168 *
169 * Returns the fractile value such that frac fraction (in [0,1]) of samples
170 * has a value less than the return value.
171 **********************************************************************/
172double STATS::ile(double frac) const {
173 if (buckets_ == nullptr || total_count_ == 0) {
174 return static_cast<double>(rangemin_);
175 }
176#if 0
177 // TODO(rays) The existing code doesn't seem to be doing the right thing
178 // with target a double but this substitute crashes the code that uses it.
179 // Investigate and fix properly.
180 int target = IntCastRounded(frac * total_count_);
181 target = ClipToRange(target, 1, total_count_);
182#else
183 double target = frac * total_count_;
184 target = ClipToRange(target, 1.0, static_cast<double>(total_count_));
185#endif
186 int sum = 0;
187 int index = 0;
188 for (index = 0; index <= rangemax_ - rangemin_ && sum < target; sum += buckets_[index++]) {
189 ;
190 }
191 if (index > 0) {
192 ASSERT_HOST(buckets_[index - 1] > 0);
193 return rangemin_ + index - static_cast<double>(sum - target) / buckets_[index - 1];
194 } else {
195 return static_cast<double>(rangemin_);
196 }
197}
198
199/**********************************************************************
200 * STATS::min_bucket
201 *
202 * Find REAL minimum bucket - ile(0.0) isn't necessarily correct
203 **********************************************************************/
204int32_t STATS::min_bucket() const { // Find min
205 if (buckets_ == nullptr || total_count_ == 0) {
206 return rangemin_;
207 }
208 int32_t min = 0;
209 for (min = 0; (min <= rangemax_ - rangemin_) && (buckets_[min] == 0); min++) {
210 ;
211 }
212 return rangemin_ + min;
213}
214
215/**********************************************************************
216 * STATS::max_bucket
217 *
218 * Find REAL maximum bucket - ile(1.0) isn't necessarily correct
219 **********************************************************************/
220
221int32_t STATS::max_bucket() const { // Find max
222 if (buckets_ == nullptr || total_count_ == 0) {
223 return rangemin_;
224 }
225 int32_t max;
226 for (max = rangemax_ - rangemin_; max > 0 && buckets_[max] == 0; max--) {
227 ;
228 }
229 return rangemin_ + max;
230}
231
232/**********************************************************************
233 * STATS::median
234 *
235 * Finds a more useful estimate of median than ile(0.5).
236 *
237 * Overcomes a problem with ile() - if the samples are, for example,
238 * 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway
239 * between 6 and 13 = 9.5
240 **********************************************************************/
241double STATS::median() const { // get median
242 if (buckets_ == nullptr) {
243 return static_cast<double>(rangemin_);
244 }
245 double median = ile(0.5);
246 int median_pile = static_cast<int>(floor(median));
247 if ((total_count_ > 1) && (pile_count(median_pile) == 0)) {
248 int32_t min_pile;
249 int32_t max_pile;
250 /* Find preceding non zero pile */
251 for (min_pile = median_pile; pile_count(min_pile) == 0; min_pile--) {
252 ;
253 }
254 /* Find following non zero pile */
255 for (max_pile = median_pile; pile_count(max_pile) == 0; max_pile++) {
256 ;
257 }
258 median = (min_pile + max_pile) / 2.0;
259 }
260 return median;
261}
262
263/**********************************************************************
264 * STATS::local_min
265 *
266 * Return true if this point is a local min.
267 **********************************************************************/
268bool STATS::local_min(int32_t x) const {
269 if (buckets_ == nullptr) {
270 return false;
271 }
272 x = ClipToRange(x, rangemin_, rangemax_) - rangemin_;
273 if (buckets_[x] == 0) {
274 return true;
275 }
276 int32_t index; // table index
277 for (index = x - 1; index >= 0 && buckets_[index] == buckets_[x]; --index) {
278 ;
279 }
280 if (index >= 0 && buckets_[index] < buckets_[x]) {
281 return false;
282 }
283 for (index = x + 1; index <= rangemax_ - rangemin_ && buckets_[index] == buckets_[x]; ++index) {
284 ;
285 }
286 if (index <= rangemax_ - rangemin_ && buckets_[index] < buckets_[x]) {
287 return false;
288 } else {
289 return true;
290 }
291}
292
293/**********************************************************************
294 * STATS::smooth
295 *
296 * Apply a triangular smoothing filter to the stats.
297 * This makes the modes a bit more useful.
298 * The factor gives the height of the triangle, i.e. the weight of the
299 * centre.
300 **********************************************************************/
301void STATS::smooth(int32_t factor) {
302 if (buckets_ == nullptr || factor < 2) {
303 return;
304 }
305 STATS result(rangemin_, rangemax_);
306 int entrycount = 1 + rangemax_ - rangemin_;
307 for (int entry = 0; entry < entrycount; entry++) {
308 // centre weight
309 int count = buckets_[entry] * factor;
310 for (int offset = 1; offset < factor; offset++) {
311 if (entry - offset >= 0) {
312 count += buckets_[entry - offset] * (factor - offset);
313 }
314 if (entry + offset < entrycount) {
315 count += buckets_[entry + offset] * (factor - offset);
316 }
317 }
318 result.add(entry + rangemin_, count);
319 }
320 total_count_ = result.total_count_;
321 memcpy(buckets_, result.buckets_, entrycount * sizeof(buckets_[0]));
322}
323
324/**********************************************************************
325 * STATS::cluster
326 *
327 * Cluster the samples into max_cluster clusters.
328 * Each call runs one iteration. The array of clusters must be
329 * max_clusters+1 in size as cluster 0 is used to indicate which samples
330 * have been used.
331 * The return value is the current number of clusters.
332 **********************************************************************/
333
334int32_t STATS::cluster(float lower, // thresholds
335 float upper,
336 float multiple, // distance threshold
337 int32_t max_clusters, // max no to make
338 STATS *clusters) { // array of clusters
339 bool new_cluster; // added one
340 float *centres; // cluster centres
341 int32_t entry; // bucket index
342 int32_t cluster; // cluster index
343 int32_t best_cluster; // one to assign to
344 int32_t new_centre = 0; // residual mode
345 int32_t new_mode; // pile count of new_centre
346 int32_t count; // pile to place
347 float dist; // from cluster
348 float min_dist; // from best_cluster
349 int32_t cluster_count; // no of clusters
350
351 if (buckets_ == nullptr || max_clusters < 1) {
352 return 0;
353 }
354 centres = new float[max_clusters + 1];
355 for (cluster_count = 1;
356 cluster_count <= max_clusters && clusters[cluster_count].buckets_ != nullptr &&
357 clusters[cluster_count].total_count_ > 0;
358 cluster_count++) {
359 centres[cluster_count] = static_cast<float>(clusters[cluster_count].ile(0.5));
360 new_centre = clusters[cluster_count].mode();
361 for (entry = new_centre - 1; centres[cluster_count] - entry < lower && entry >= rangemin_ &&
362 pile_count(entry) <= pile_count(entry + 1);
363 entry--) {
364 count = pile_count(entry) - clusters[0].pile_count(entry);
365 if (count > 0) {
366 clusters[cluster_count].add(entry, count);
367 clusters[0].add(entry, count);
368 }
369 }
370 for (entry = new_centre + 1; entry - centres[cluster_count] < lower && entry <= rangemax_ &&
371 pile_count(entry) <= pile_count(entry - 1);
372 entry++) {
373 count = pile_count(entry) - clusters[0].pile_count(entry);
374 if (count > 0) {
375 clusters[cluster_count].add(entry, count);
376 clusters[0].add(entry, count);
377 }
378 }
379 }
380 cluster_count--;
381
382 if (cluster_count == 0) {
383 clusters[0].set_range(rangemin_, rangemax_);
384 }
385 do {
386 new_cluster = false;
387 new_mode = 0;
388 for (entry = 0; entry <= rangemax_ - rangemin_; entry++) {
389 count = buckets_[entry] - clusters[0].buckets_[entry];
390 // remaining pile
391 if (count > 0) { // any to handle
392 min_dist = static_cast<float>(INT32_MAX);
393 best_cluster = 0;
394 for (cluster = 1; cluster <= cluster_count; cluster++) {
395 dist = entry + rangemin_ - centres[cluster];
396 // find distance
397 if (dist < 0) {
398 dist = -dist;
399 }
400 if (dist < min_dist) {
401 min_dist = dist; // find least
402 best_cluster = cluster;
403 }
404 }
405 if (min_dist > upper // far enough for new
406 && (best_cluster == 0 || entry + rangemin_ > centres[best_cluster] * multiple ||
407 entry + rangemin_ < centres[best_cluster] / multiple)) {
408 if (count > new_mode) {
409 new_mode = count;
410 new_centre = entry + rangemin_;
411 }
412 }
413 }
414 }
415 // need new and room
416 if (new_mode > 0 && cluster_count < max_clusters) {
417 cluster_count++;
418 new_cluster = true;
419 if (!clusters[cluster_count].set_range(rangemin_, rangemax_)) {
420 delete[] centres;
421 return 0;
422 }
423 centres[cluster_count] = static_cast<float>(new_centre);
424 clusters[cluster_count].add(new_centre, new_mode);
425 clusters[0].add(new_centre, new_mode);
426 for (entry = new_centre - 1; centres[cluster_count] - entry < lower && entry >= rangemin_ &&
427 pile_count(entry) <= pile_count(entry + 1);
428 entry--) {
429 count = pile_count(entry) - clusters[0].pile_count(entry);
430 if (count > 0) {
431 clusters[cluster_count].add(entry, count);
432 clusters[0].add(entry, count);
433 }
434 }
435 for (entry = new_centre + 1; entry - centres[cluster_count] < lower && entry <= rangemax_ &&
436 pile_count(entry) <= pile_count(entry - 1);
437 entry++) {
438 count = pile_count(entry) - clusters[0].pile_count(entry);
439 if (count > 0) {
440 clusters[cluster_count].add(entry, count);
441 clusters[0].add(entry, count);
442 }
443 }
444 centres[cluster_count] = static_cast<float>(clusters[cluster_count].ile(0.5));
445 }
446 } while (new_cluster && cluster_count < max_clusters);
447 delete[] centres;
448 return cluster_count;
449}
450
451// Helper tests that the current index is still part of the peak and gathers
452// the data into the peak, returning false when the peak is ended.
453// src_buckets[index] - used_buckets[index] is the unused part of the histogram.
454// prev_count is the histogram count of the previous index on entry and is
455// updated to the current index on return.
456// total_count and total_value are accumulating the mean of the peak.
457static bool GatherPeak(int index, const int *src_buckets, int *used_buckets, int *prev_count,
458 int *total_count, double *total_value) {
459 int pile_count = src_buckets[index] - used_buckets[index];
460 if (pile_count <= *prev_count && pile_count > 0) {
461 // Accumulate count and index.count product.
462 *total_count += pile_count;
463 *total_value += index * pile_count;
464 // Mark this index as used
465 used_buckets[index] = src_buckets[index];
466 *prev_count = pile_count;
467 return true;
468 } else {
469 return false;
470 }
471}
472
473// Finds (at most) the top max_modes modes, well actually the whole peak around
474// each mode, returning them in the given modes vector as a <mean of peak,
475// total count of peak> pair in order of decreasing total count.
476// Since the mean is the key and the count the data in the pair, a single call
477// to sort on the output will re-sort by increasing mean of peak if that is
478// more useful than decreasing total count.
479// Returns the actual number of modes found.
480int STATS::top_n_modes(int max_modes, std::vector<KDPairInc<float, int>> &modes) const {
481 if (max_modes <= 0) {
482 return 0;
483 }
484 int src_count = 1 + rangemax_ - rangemin_;
485 // Used copies the counts in buckets_ as they get used.
486 STATS used(rangemin_, rangemax_);
487 modes.clear();
488 // Total count of the smallest peak found so far.
489 int least_count = 1;
490 // Mode that is used as a seed for each peak
491 int max_count = 0;
492 do {
493 // Find an unused mode.
494 max_count = 0;
495 int max_index = 0;
496 for (int src_index = 0; src_index < src_count; src_index++) {
497 int pile_count = buckets_[src_index] - used.buckets_[src_index];
498 if (pile_count > max_count) {
499 max_count = pile_count;
500 max_index = src_index;
501 }
502 }
503 if (max_count > 0) {
504 // Copy the bucket count to used so it doesn't get found again.
505 used.buckets_[max_index] = max_count;
506 // Get the entire peak.
507 double total_value = max_index * max_count;
508 int total_count = max_count;
509 int prev_pile = max_count;
510 for (int offset = 1; max_index + offset < src_count; ++offset) {
511 if (!GatherPeak(max_index + offset, buckets_, used.buckets_, &prev_pile, &total_count,
512 &total_value)) {
513 break;
514 }
515 }
516 prev_pile = buckets_[max_index];
517 for (int offset = 1; max_index - offset >= 0; ++offset) {
518 if (!GatherPeak(max_index - offset, buckets_, used.buckets_, &prev_pile, &total_count,
519 &total_value)) {
520 break;
521 }
522 }
523 if (total_count > least_count || modes.size() < static_cast<size_t>(max_modes)) {
524 // We definitely want this mode, so if we have enough discard the least.
525 if (modes.size() == static_cast<size_t>(max_modes)) {
526 modes.resize(max_modes - 1);
527 }
528 size_t target_index = 0;
529 // Linear search for the target insertion point.
530 while (target_index < modes.size() && modes[target_index].data() >= total_count) {
531 ++target_index;
532 }
533 auto peak_mean = static_cast<float>(total_value / total_count + rangemin_);
534 modes.insert(modes.begin() + target_index, KDPairInc<float, int>(peak_mean, total_count));
535 least_count = modes.back().data();
536 }
537 }
538 } while (max_count > 0);
539 return modes.size();
540}
541
542/**********************************************************************
543 * STATS::print
544 *
545 * Prints a summary and table of the histogram.
546 **********************************************************************/
547void STATS::print() const {
548 if (buckets_ == nullptr) {
549 return;
550 }
551 int32_t min = min_bucket() - rangemin_;
552 int32_t max = max_bucket() - rangemin_;
553
554 int num_printed = 0;
555 for (int index = min; index <= max; index++) {
556 if (buckets_[index] != 0) {
557 tprintf("%4d:%-3d ", rangemin_ + index, buckets_[index]);
558 if (++num_printed % 8 == 0) {
559 tprintf("\n");
560 }
561 }
562 }
563 tprintf("\n");
565}
566
567/**********************************************************************
568 * STATS::print_summary
569 *
570 * Print a summary of the stats.
571 **********************************************************************/
573 if (buckets_ == nullptr) {
574 return;
575 }
576 int32_t min = min_bucket();
577 int32_t max = max_bucket();
578 tprintf("Total count=%d\n", total_count_);
579 tprintf("Min=%.2f Really=%d\n", ile(0.0), min);
580 tprintf("Lower quartile=%.2f\n", ile(0.25));
581 tprintf("Median=%.2f, ile(0.5)=%.2f\n", median(), ile(0.5));
582 tprintf("Upper quartile=%.2f\n", ile(0.75));
583 tprintf("Max=%.2f Really=%d\n", ile(1.0), max);
584 tprintf("Range=%d\n", max + 1 - min);
585 tprintf("Mean= %.2f\n", mean());
586 tprintf("SD= %.2f\n", sd());
587}
588
589/**********************************************************************
590 * STATS::plot
591 *
592 * Draw a histogram of the stats table.
593 **********************************************************************/
594
595#ifndef GRAPHICS_DISABLED
596void STATS::plot(ScrollView *window, // to draw in
597 float xorigin, // bottom left
598 float yorigin,
599 float xscale, // one x unit
600 float yscale, // one y unit
601 ScrollView::Color colour) const { // colour to draw in
602 if (buckets_ == nullptr) {
603 return;
604 }
605 window->Pen(colour);
606
607 for (int index = 0; index <= rangemax_ - rangemin_; index++) {
608 window->Rectangle(xorigin + xscale * index, yorigin, xorigin + xscale * (index + 1),
609 yorigin + yscale * buckets_[index]);
610 }
611}
612#endif
613
614/**********************************************************************
615 * STATS::plotline
616 *
617 * Draw a histogram of the stats table. (Line only)
618 **********************************************************************/
619
620#ifndef GRAPHICS_DISABLED
621void STATS::plotline(ScrollView *window, // to draw in
622 float xorigin, // bottom left
623 float yorigin,
624 float xscale, // one x unit
625 float yscale, // one y unit
626 ScrollView::Color colour) const { // colour to draw in
627 if (buckets_ == nullptr) {
628 return;
629 }
630 window->Pen(colour);
631 window->SetCursor(xorigin, yorigin + yscale * buckets_[0]);
632 for (int index = 0; index <= rangemax_ - rangemin_; index++) {
633 window->DrawTo(xorigin + xscale * index, yorigin + yscale * buckets_[index]);
634 }
635}
636#endif
637
638} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:54
int value
int * count
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int IntCastRounded(double x)
Definition: helpers.h:170
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:105
void print_summary() const
Definition: statistc.cpp:572
void print() const
Definition: statistc.cpp:547
void add(int32_t value, int32_t count)
Definition: statistc.cpp:99
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:596
int32_t pile_count(int32_t value) const
Definition: statistc.h:72
double median() const
Definition: statistc.cpp:241
int32_t min_bucket() const
Definition: statistc.cpp:204
int top_n_modes(int max_modes, std::vector< KDPairInc< float, int > > &modes) const
Definition: statistc.cpp:480
int32_t cluster(float lower, float upper, float multiple, int32_t max_clusters, STATS *clusters)
Definition: statistc.cpp:334
bool local_min(int32_t x) const
Definition: statistc.cpp:268
int32_t mode() const
Definition: statistc.cpp:112
void smooth(int32_t factor)
Definition: statistc.cpp:301
int32_t max_bucket() const
Definition: statistc.cpp:221
double ile(double frac) const
Definition: statistc.cpp:172
void plotline(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:621
bool set_range(int32_t min_bucket_value, int32_t max_bucket_value)
Definition: statistc.cpp:59
double sd() const
Definition: statistc.cpp:148
double mean() const
Definition: statistc.cpp:132
void Pen(Color color)
Definition: scrollview.cpp:710
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:576
void SetCursor(int x, int y)
Definition: scrollview.cpp:485
void DrawTo(int x, int y)
Definition: scrollview.cpp:491