tesseract v5.3.3.20231005
cluster.h
Go to the documentation of this file.
/******************************************************************************
 ** Filename: cluster.h
 ** Purpose: Definition of feature space clustering routines
 ** Author: Dan Johnson
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *****************************************************************************/
17
18#ifndef CLUSTER_H
19#define CLUSTER_H
20
21#include "kdtree.h"
22#include "oldlist.h"
23
24namespace tesseract {
25
// Opaque histogram-bucket type; only pointers to it are used in this header.
struct BUCKETS;

// Inclusive bounds on the number of histogram buckets. The clusterer's
// bucket cache is dimensioned as MAXBUCKETS + 1 - MINBUCKETS entries per
// distribution type.
#define MINBUCKETS 5
#define MAXBUCKETS 39
30
/*----------------------------------------------------------------------
          Types
----------------------------------------------------------------------*/
// Node of the binary cluster tree. A node owns its two sub-clusters: the
// destructor deletes Left and Right, so destroying a node releases the whole
// subtree below it.
//
// NOTE(review): the struct has an owning destructor but compiler-generated
// copy operations; copying a node whose children are set would lead to a
// double delete. Nodes are presumably only handled through pointers - confirm
// before copying one by value.
struct CLUSTER {
  // Creates a childless cluster whose mean vector holds n floats (all 0.0f).
  // Every member is given a defined value so that destroying a node before
  // the caller fills it in does not delete indeterminate pointers.
  CLUSTER(size_t n)
      : Clustered(false),
        Prototype(false),
        SampleCount(0),
        Left(nullptr),
        Right(nullptr),
        CharID(0),
        Mean(n) {
  }

  // Releases both sub-clusters; deleting a null child is a no-op.
  ~CLUSTER() {
    delete Left;
    delete Right;
  }

  bool Clustered : 1;        // true if included in a higher cluster
  bool Prototype : 1;        // true if cluster represented by a proto
  unsigned SampleCount : 30; // number of samples in this cluster
  CLUSTER *Left;             // ptr to left sub-cluster (owned, may be null)
  CLUSTER *Right;            // ptr to right sub-cluster (owned, may be null)
  int32_t CharID;            // identifier of char sample came from
  std::vector<float> Mean;   // mean of cluster - SampleSize floats
};
using SAMPLE = CLUSTER; // can refer to as either sample or cluster

// Kind of prototype the clustering may produce.
// NOTE(review): this definition was missing from the listing and has been
// restored; the enumerator names are known, their order should be confirmed
// against the upstream header.
enum PROTOSTYLE { spherical, elliptical, mixed, automatic };

55struct CLUSTERCONFIG { // parameters to control clustering
56 PROTOSTYLE ProtoStyle; // specifies types of protos to be made
57 float MinSamples; // min # of samples per proto - % of total
58 float MaxIllegal; // max percentage of samples in a cluster which
59 // have more than 1 feature in that cluster
60 float Independence; // desired independence between dimensions
61 double Confidence; // desired confidence in prototypes created
62 int MagicSamples; // Ideal number of samples in a cluster.
63};

// Statistical distribution assumed for one dimension of a prototype.
// DISTRIBUTION_COUNT is the number of real distributions and is used as an
// array dimension, so it must stay last.
// NOTE(review): this definition was missing from the listing and has been
// restored; confirm the enumerator order against the upstream header.
enum DISTRIBUTION { normal, uniform, D_random, DISTRIBUTION_COUNT };

// Holds either one float or a pointer to an array of floats in the same
// storage. Only the member written last may be read; the union itself does
// not record which one is active and does not own the pointed-to array.
// NOTE(review): presumably Spherical is used for spherical prototypes and
// Elliptical (one value per dimension) otherwise - confirm against users.
union FLOATUNION {
  float Spherical;
  float *Elliptical;
};
71
72struct PROTOTYPE {
73 bool Significant : 1; // true if prototype is significant
74 bool Merged : 1; // Merged after clustering so do not output
75 // but kept for display purposes. If it has no
76 // samples then it was actually merged.
77 // Otherwise it matched an already significant
78 // cluster.
79 unsigned Style : 2; // spherical, elliptical, or mixed
80 unsigned NumSamples : 28; // number of samples in the cluster
81 CLUSTER *Cluster; // ptr to cluster which made prototype
82 std::vector<DISTRIBUTION> Distrib; // different distribution for each dimension
83 std::vector<float> Mean; // prototype mean
84 float TotalMagnitude; // total magnitude over all dimensions
85 float LogMagnitude; // log base e of TotalMagnitude
86 FLOATUNION Variance; // prototype variance
87 FLOATUNION Magnitude; // magnitude of density function
88 FLOATUNION Weight; // weight of density function
89};
90
91struct CLUSTERER {
92 int16_t SampleSize; // number of parameters per sample
93 PARAM_DESC *ParamDesc; // description of each parameter
94 int32_t NumberOfSamples; // total number of samples being clustered
95 KDTREE *KDTree; // for optimal nearest neighbor searching
96 CLUSTER *Root; // ptr to root cluster of cluster tree
97 LIST ProtoList; // list of prototypes
98 uint32_t NumChar; // # of characters represented by samples
99 // cache of reusable histograms by distribution type and number of buckets.
101};
102
104 int32_t NumSamples; // number of samples in list
105 int32_t MaxNumSamples; // maximum size of list
106 SAMPLE *Sample[1]; // array of ptrs to sample data structures
107};
108
// low level cluster tree analysis routines.
// InitSampleSearch(S, C) sets the search state S to NIL_LIST when the cluster
// C is null, otherwise to a one-element list holding C. It is an expression
// macro: S is assigned as a side effect and C is evaluated more than once,
// so do not pass an argument with side effects.
#define InitSampleSearch(S, C) (((C) == nullptr) ? (S = NIL_LIST) : (S = push(NIL_LIST, (C))))
111
112/*--------------------------------------------------------------------------
113 Public Function Prototypes
114--------------------------------------------------------------------------*/
116CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]);
117
119SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID);
120
122LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
123
125void FreeClusterer(CLUSTERER *Clusterer);
126
128void FreeProtoList(LIST *ProtoList);
129
130void FreePrototype(void *arg); // PROTOTYPE *Prototype);
131
132CLUSTER *NextSample(LIST *SearchState);
133
134float Mean(PROTOTYPE *Proto, uint16_t Dimension);
135
136float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension);
137
139int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[],
140 float m1[], float m2[]);
141
142} // namespace tesseract
143
144#endif
#define MAXBUCKETS
Definition: cluster.h:29
#define MINBUCKETS
Definition: cluster.h:28
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
Definition: cluster.cpp:1870
list_rec * LIST
Definition: oldlist.h:125
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:1662
CLUSTERCONFIG Config
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:1638
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:1597
void FreePrototype(void *arg)
Definition: cluster.cpp:1608
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:1440
float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:1673
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:1575
PROTOSTYLE
Definition: cluster.h:53
@ spherical
Definition: cluster.h:53
@ mixed
Definition: cluster.h:53
@ elliptical
Definition: cluster.h:53
@ automatic
Definition: cluster.h:53
CLUSTER SAMPLE
Definition: cluster.h:51
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID)
Definition: cluster.cpp:1491
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:1543
DISTRIBUTION
Definition: cluster.h:65
@ D_random
Definition: cluster.h:65
@ DISTRIBUTION_COUNT
Definition: cluster.h:65
@ uniform
Definition: cluster.h:65
@ normal
Definition: cluster.h:65
CLUSTER * Right
Definition: cluster.h:47
CLUSTER(size_t n)
Definition: cluster.h:35
int32_t CharID
Definition: cluster.h:48
unsigned SampleCount
Definition: cluster.h:45
CLUSTER * Left
Definition: cluster.h:46
std::vector< float > Mean
Definition: cluster.h:49
PROTOSTYLE ProtoStyle
Definition: cluster.h:56
float * Elliptical
Definition: cluster.h:69
float TotalMagnitude
Definition: cluster.h:84
unsigned Style
Definition: cluster.h:79
std::vector< float > Mean
Definition: cluster.h:83
CLUSTER * Cluster
Definition: cluster.h:81
FLOATUNION Magnitude
Definition: cluster.h:87
FLOATUNION Variance
Definition: cluster.h:86
unsigned NumSamples
Definition: cluster.h:80
std::vector< DISTRIBUTION > Distrib
Definition: cluster.h:82
FLOATUNION Weight
Definition: cluster.h:88
int16_t SampleSize
Definition: cluster.h:92
CLUSTER * Root
Definition: cluster.h:96
PARAM_DESC * ParamDesc
Definition: cluster.h:93
KDTREE * KDTree
Definition: cluster.h:95
uint32_t NumChar
Definition: cluster.h:98
int32_t NumberOfSamples
Definition: cluster.h:94
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:100
SAMPLE * Sample[1]
Definition: cluster.h:106
int32_t MaxNumSamples
Definition: cluster.h:105
#define TESS_API
Definition: export.h:32