tesseract  4.0.0-beta.1-59-g2cc4
cluster.h
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: cluster.h
3  ** Purpose: Definition of feature space clustering routines
4  ** Author: Dan Johnson
5  ** History: 5/29/89, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 #ifndef CLUSTER_H
19 #define CLUSTER_H
20 
21 #include "kdtree.h"
22 #include "oldlist.h"
23 
24 struct BUCKETS; // forward declaration; the full type is not defined in the visible part of this header
25 
26 #define MINBUCKETS 5 // smallest bucket count (presumably for the histogram cache mentioned in CLUSTERER - confirm in cluster.cpp)
27 #define MAXBUCKETS 39 // largest bucket count (same assumption as MINBUCKETS)
28 
29 /*----------------------------------------------------------------------
30  Types
31 ----------------------------------------------------------------------*/
32 typedef struct sample { // one node of the cluster tree; the same layout is used for a single sample (see SAMPLE)
33  unsigned Clustered:1; // 1-bit flag: TRUE once this node is included in a higher-level cluster
34  unsigned Prototype:1; // 1-bit flag: TRUE if this cluster is represented by a prototype
35  unsigned SampleCount:30; // number of samples in this cluster (30-bit field)
36  struct sample *Left; // pointer to the left sub-cluster
37  struct sample *Right; // pointer to the right sub-cluster
38  int32_t CharID; // identifier of the character the sample came from
39  FLOAT32 Mean[1]; // mean of the cluster: declared with one element but holds SampleSize floats (presumably over-allocated by the creator - confirm in cluster.cpp)
40 } CLUSTER;
41 
42 typedef CLUSTER SAMPLE; // the same struct may be referred to as either a sample or a cluster
43 
44 typedef enum { // NOTE(review): the enumerator list (listing line 45) is missing from this extract - restore it from the original header
46 } PROTOSTYLE; // type of the ProtoStyle field in the clustering-parameter struct below ("types of protos to be made")
47 
48 typedef struct { // parameters to control clustering
49  PROTOSTYLE ProtoStyle; // specifies types of protos to be made
50  FLOAT32 MinSamples; // min # of samples per proto - % of total
51  FLOAT32 MaxIllegal; // max percentage of samples in a cluster which have
52  // more than 1 feature in that cluster
53  FLOAT32 Independence; // desired independence between dimensions
54  FLOAT64 Confidence; // desired confidence in prototypes created
55  int MagicSamples; // Ideal number of samples in a cluster.
57 
58 typedef enum { // NOTE(review): the enumerator list (listing line 59) is missing from this extract - restore it from the original header
60 } DISTRIBUTION; // distribution kind; PROTOTYPE keeps one per dimension through its Distrib pointer
61 
62 typedef union { // a per-prototype quantity stored either as one value or as an array
63  FLOAT32 Spherical; // single value (presumably shared by all dimensions of a spherical prototype - confirm in cluster.cpp)
64  FLOAT32 *Elliptical; // pointer to an array of values (presumably one per dimension - confirm in cluster.cpp)
65 } FLOATUNION;
66 
67 typedef struct { // a prototype derived from a cluster
68  unsigned Significant:1; // 1-bit flag: TRUE if the prototype is significant
69  unsigned Merged:1; // 1-bit flag: merged after clustering, so it is not output
70  // but is kept for display purposes. If it has no
71  // samples then it was actually merged;
72  // otherwise it matched an already significant
73  // cluster.
74  unsigned Style:2; // spherical, elliptical, or mixed (2-bit field)
75  unsigned NumSamples:28; // number of samples in the cluster (28-bit field)
76  CLUSTER *Cluster; // pointer to the cluster from which this prototype was made
77  DISTRIBUTION *Distrib; // a different distribution for each dimension
78  FLOAT32 *Mean; // prototype mean
79  FLOAT32 TotalMagnitude; // total magnitude over all dimensions
80  FLOAT32 LogMagnitude; // natural logarithm (base e) of TotalMagnitude
81  FLOATUNION Variance; // prototype variance
82  FLOATUNION Magnitude; // magnitude of the density function
83  FLOATUNION Weight; // weight of the density function
84 } PROTOTYPE;
85 
86 typedef struct { // state of one clustering run
87  int16_t SampleSize; // number of parameters per sample
88  PARAM_DESC *ParamDesc; // description of each parameter
89  int32_t NumberOfSamples; // total number of samples being clustered
90  KDTREE *KDTree; // k-d tree used for nearest-neighbor searching
91  CLUSTER *Root; // pointer to the root cluster of the cluster tree
92  LIST ProtoList; // list of prototypes
93  int32_t NumChar; // number of characters represented by the samples
94  // Cache of reusable histograms by distribution type and number of buckets. NOTE(review): the member this comment describes (listing line 95) is missing from this extract - restore it from the original header.
96 } CLUSTERER;
97 
98 typedef struct { // a list of sample pointers
99  int32_t NumSamples; // number of samples currently in the list
100  int32_t MaxNumSamples; // maximum size of the list
101  SAMPLE *Sample[1]; // array of pointers to sample data structures; declared with one element (presumably over-allocated to MaxNumSamples entries - confirm at the allocation site)
102 } SAMPLELIST;
103 
104 // Low-level cluster tree analysis routines. InitSampleSearch assigns NIL_LIST to S when C is NULL, otherwise the result of push(NIL_LIST, C); C is evaluated twice, so pass an expression without side effects.
105 #define InitSampleSearch(S,C) (((C)==NULL)?((S)=NIL_LIST):((S)=push(NIL_LIST,(C))))
106 
107 /*--------------------------------------------------------------------------
108  Public Function Prototypes
109 --------------------------------------------------------------------------*/
110 CLUSTERER *MakeClusterer (int16_t SampleSize, const PARAM_DESC ParamDesc[]); // presumably creates a clusterer for samples of SampleSize parameters - confirm in cluster.cpp
111 
112 SAMPLE *MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature, int32_t CharID); // presumably adds one sample built from Feature, tagged with CharID, to Clusterer - confirm in cluster.cpp
113 
114 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config); // presumably clusters the collected samples under Config and returns the prototype list - confirm in cluster.cpp
115 
116 void FreeClusterer(CLUSTERER *Clusterer); // presumably releases Clusterer and what it owns - confirm in cluster.cpp
117 
118 void FreeProtoList(LIST *ProtoList); // presumably releases the prototypes held in *ProtoList - confirm in cluster.cpp
119 
120 void FreePrototype(void *arg); // arg is expected to point to a PROTOTYPE
121 
122 CLUSTER *NextSample(LIST *SearchState); // presumably returns the next sample of a search started with InitSampleSearch - confirm in cluster.cpp
123 
124 FLOAT32 Mean(PROTOTYPE *Proto, uint16_t Dimension); // presumably the mean of Proto in the given dimension - confirm in cluster.cpp
125 
126 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension); // presumably the standard deviation of Proto in the given dimension - confirm in cluster.cpp
127 
128 int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2,
129  FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[]); // presumably combines two N-dimensional cluster means m1/m2 (sample counts n1/n2) into m - confirm in cluster.cpp
130 
131 //--------------Global Data Definitions and Declarations---------------------------
132 // Error codes that callers can trap.
133 #define ALREADYCLUSTERED 4000 // presumably reported when an operation is attempted on data that has already been clustered - confirm in cluster.cpp
134 #endif
Definition: cluster.h:45
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 MaxIllegal
Definition: cluster.h:51
unsigned SampleCount
Definition: cluster.h:35
unsigned NumSamples
Definition: cluster.h:75
#define MAXBUCKETS
Definition: cluster.h:27
FLOAT64 Confidence
Definition: cluster.h:54
int MagicSamples
Definition: cluster.h:55
#define MINBUCKETS
Definition: cluster.h:26
int32_t MaxNumSamples
Definition: cluster.h:100
int16_t SampleSize
Definition: cluster.h:87
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:573
float FLOAT32
Definition: host.h:34
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:546
DISTRIBUTION * Distrib
Definition: cluster.h:77
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[])
Definition: cluster.cpp:880
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, int32_t CharID)
Definition: cluster.cpp:455
Definition: cluster.h:32
FLOAT32 * Mean
Definition: cluster.h:78
double FLOAT64
Definition: host.h:35
void FreePrototype(void *arg)
Definition: cluster.cpp:587
int32_t NumSamples
Definition: cluster.h:99
FLOAT32 Independence
Definition: cluster.h:53
unsigned Style
Definition: cluster.h:74
FLOAT32 Mean[1]
Definition: cluster.h:39
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOAT32 StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension)
Definition: cluster.cpp:657
CLUSTER * Root
Definition: cluster.h:91
KDTREE * KDTree
Definition: cluster.h:90
Definition: kdtree.h:49
int32_t NumberOfSamples
Definition: cluster.h:89
FLOATUNION Magnitude
Definition: cluster.h:82
CLUSTER * Cluster
Definition: cluster.h:76
FLOATUNION Weight
Definition: cluster.h:83
PARAM_DESC * ParamDesc
Definition: cluster.h:88
FLOAT32 MinSamples
Definition: cluster.h:50
unsigned Prototype
Definition: cluster.h:34
Definition: cluster.h:59
struct sample CLUSTER
LIST ProtoList
Definition: cluster.h:92
FLOAT32 Spherical
Definition: cluster.h:63
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:399
struct sample * Right
Definition: cluster.h:37
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:512
int32_t CharID
Definition: cluster.h:38
unsigned Clustered
Definition: cluster.h:33
PROTOSTYLE ProtoStyle
Definition: cluster.h:49
unsigned Merged
Definition: cluster.h:69
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:620
FLOAT32 TotalMagnitude
Definition: cluster.h:79
DISTRIBUTION
Definition: cluster.h:58
FLOAT32 LogMagnitude
Definition: cluster.h:80
CLUSTERCONFIG Config
PROTOSTYLE
Definition: cluster.h:44
unsigned Significant
Definition: cluster.h:68
struct sample * Left
Definition: cluster.h:36
CLUSTER SAMPLE
Definition: cluster.h:42
int32_t NumChar
Definition: cluster.h:93