All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
cluster.cpp File Reference
#include "const.h"
#include "cluster.h"
#include "emalloc.h"
#include "genericheap.h"
#include "helpers.h"
#include "kdpair.h"
#include "matrix.h"
#include "tprintf.h"
#include "danerror.h"
#include "freelist.h"
#include <math.h>

Go to the source code of this file.

Classes

struct  TEMPCLUSTER
 
struct  STATISTICS
 
struct  BUCKETS
 
struct  CHISTRUCT
 
struct  ClusteringContext
 

Macros

#define HOTELLING   1
 
#define FTABLE_X   10
 
#define FTABLE_Y   100
 
#define MINVARIANCE   0.0004
 
#define MINSAMPLESPERBUCKET   5
 
#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)
 
#define MINSAMPLESNEEDED   1
 
#define BUCKETTABLESIZE   1024
 
#define NORMALEXTENT   3.0
 
#define Odd(N)   ((N)%2)
 
#define Mirror(N, R)   ((R) - (N) - 1)
 
#define Abs(N)   ( ( (N) < 0 ) ? ( -(N) ) : (N) )
 
#define SqrtOf2Pi   2.506628275
 
#define LOOKUPTABLESIZE   8
 
#define MAXDEGREESOFFREEDOM   MAXBUCKETS
 
#define MAXNEIGHBORS   2
 
#define MAXDISTANCE   MAX_FLOAT32
 
#define CHIACCURACY   0.01
 
#define MINALPHA   (1e-200)
 
#define INITIALDELTA   0.1
 
#define DELTARATIO   0.1
 
#define ILLEGAL_CHAR   2
 

Typedefs

typedef tesseract::KDPairInc
< float, TEMPCLUSTER * > 
ClusterPair
 
typedef tesseract::GenericHeap
< ClusterPair
ClusterHeap
 
typedef FLOAT64(* DENSITYFUNC )(inT32)
 
typedef FLOAT64(* SOLVEFUNC )(CHISTRUCT *, double)
 

Functions

void CreateClusterTree (CLUSTERER *Clusterer)
 
void MakePotentialClusters (ClusteringContext *context, CLUSTER *Cluster, inT32 Level)
 
CLUSTERFindNearestNeighbor (KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
 
CLUSTERMakeNewCluster (CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
 
inT32 MergeClusters (inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
 
void ComputePrototypes (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
PROTOTYPEMakePrototype (CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
 
PROTOTYPEMakeDegenerateProto (uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, inT32 MinSamples)
 
PROTOTYPETestEllipticalProto (CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPEMakeSphericalProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
 
PROTOTYPEMakeEllipticalProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
 
PROTOTYPEMakeMixedProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, FLOAT64 Confidence)
 
void MakeDimRandom (uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
 
void MakeDimUniform (uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics)
 
STATISTICSComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
 
PROTOTYPENewSphericalProto (uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewEllipticalProto (inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewMixedProto (inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewSimpleProto (inT16 N, CLUSTER *Cluster)
 
BOOL8 Independent (PARAM_DESC ParamDesc[], inT16 N, FLOAT32 *CoVariance, FLOAT32 Independence)
 
BUCKETSGetBuckets (CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
 
BUCKETSMakeBuckets (DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
 
uinT16 OptimumNumberOfBuckets (uinT32 SampleCount)
 
FLOAT64 ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
 
FLOAT64 NormalDensity (inT32 x)
 
FLOAT64 UniformDensity (inT32 x)
 
FLOAT64 Integral (FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx)
 
void FillBuckets (BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
 
uinT16 NormalBucket (PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
 
uinT16 UniformBucket (PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
 
BOOL8 DistributionOK (BUCKETS *Buckets)
 
void FreeStatistics (STATISTICS *Statistics)
 
void FreeBuckets (BUCKETS *Buckets)
 
void FreeCluster (CLUSTER *Cluster)
 
uinT16 DegreesOfFreedom (DISTRIBUTION Distribution, uinT16 HistogramBuckets)
 
int NumBucketsMatch (void *arg1, void *arg2)
 
int ListEntryMatch (void *arg1, void *arg2)
 
void AdjustBuckets (BUCKETS *Buckets, uinT32 NewSampleCount)
 
void InitBuckets (BUCKETS *Buckets)
 
int AlphaMatch (void *arg1, void *arg2)
 
CHISTRUCTNewChiStruct (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
 
FLOAT64 Solve (SOLVEFUNC Function, void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
 
FLOAT64 ChiArea (CHISTRUCT *ChiParams, FLOAT64 x)
 
BOOL8 MultipleCharSamples (CLUSTERER *Clusterer, CLUSTER *Cluster, FLOAT32 MaxIllegal)
 
double InvertMatrix (const float *input, int size, float *inv)
 
CLUSTERERMakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[])
 
SAMPLEMakeSample (CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
 
LIST ClusterSamples (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void FreeClusterer (CLUSTERER *Clusterer)
 
void FreeProtoList (LIST *ProtoList)
 
void FreePrototype (void *arg)
 
CLUSTERNextSample (LIST *SearchState)
 
FLOAT32 Mean (PROTOTYPE *Proto, uinT16 Dimension)
 
FLOAT32 StandardDeviation (PROTOTYPE *Proto, uinT16 Dimension)
 
inT32 MergeClusters (inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2, FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[])
 

Variables

const double FTable [FTABLE_Y][FTABLE_X]
 

Macro Definition Documentation

#define Abs (   N)    ( ( (N) < 0 ) ? ( -(N) ) : (N) )

Definition at line 208 of file cluster.cpp.

#define BUCKETTABLESIZE   1024

define the size of the table which maps normalized samples to histogram buckets. Also define the number of standard deviations in a normal distribution which are considered to be significant. The mapping table will be defined in such a way that it covers the specified number of standard deviations on either side of the mean. BUCKETTABLESIZE should always be even.

Definition at line 160 of file cluster.cpp.

#define CHIACCURACY   0.01
#define DELTARATIO   0.1
#define FTABLE_X   10

Definition at line 31 of file cluster.cpp.

#define FTABLE_Y   100

Definition at line 32 of file cluster.cpp.

#define HOTELLING   1

Definition at line 30 of file cluster.cpp.

#define ILLEGAL_CHAR   2
#define INITIALDELTA   0.1
#define LOOKUPTABLESIZE   8

define lookup tables used to compute the number of histogram buckets that should be used for a given number of samples.

Definition at line 228 of file cluster.cpp.

#define MAXDEGREESOFFREEDOM   MAXBUCKETS

Definition at line 229 of file cluster.cpp.

#define MAXDISTANCE   MAX_FLOAT32
#define MAXNEIGHBORS   2
#define MINALPHA   (1e-200)
#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)

Definition at line 151 of file cluster.cpp.

#define MINSAMPLESNEEDED   1

Definition at line 152 of file cluster.cpp.

#define MINSAMPLESPERBUCKET   5

define the absolute minimum number of samples which must be present in order to accurately test hypotheses about underlying probability distributions. Define separately the minimum samples that are needed before a statistical analysis is attempted; this number should be equal to MINSAMPLES but can be set to a lower number for early testing when very few samples are available.

Definition at line 150 of file cluster.cpp.

#define MINVARIANCE   0.0004

define the variance which will be used as a minimum variance for any dimension of any feature. Since most features are calculated from numbers with a precision no better than 1 in 128, the variance should never be less than the square of this number for parameters whose range is 1.

Definition at line 142 of file cluster.cpp.

#define Mirror (   N,
 
)    ((R) - (N) - 1)

Definition at line 207 of file cluster.cpp.

#define NORMALEXTENT   3.0

Definition at line 161 of file cluster.cpp.

#define Odd (   N)    ((N)%2)

Definition at line 206 of file cluster.cpp.

#define SqrtOf2Pi   2.506628275

the following variables describe a discrete normal distribution which is used by NormalDensity() and NormalBucket(). The constant NORMALEXTENT determines how many standard deviations of the distribution are mapped onto the fixed discrete range of x. x=0 is mapped to -NORMALEXTENT standard deviations and x=BUCKETTABLESIZE is mapped to +NORMALEXTENT standard deviations.

Definition at line 218 of file cluster.cpp.

Typedef Documentation

Definition at line 169 of file cluster.cpp.

Definition at line 168 of file cluster.cpp.

typedef FLOAT64(* DENSITYFUNC)(inT32)

Definition at line 203 of file cluster.cpp.

typedef FLOAT64(* SOLVEFUNC)(CHISTRUCT *, double)

Definition at line 204 of file cluster.cpp.

Function Documentation

void AdjustBuckets ( BUCKETS Buckets,
uinT32  NewSampleCount 
)

This routine multiplies each ExpectedCount histogram entry by NewSampleCount/OldSampleCount so that the histogram is now adjusted to the new sample count.

Parameters
Bucketshistogram data structure to adjust
NewSampleCountnew sample count to adjust to
Returns
none
Note
Exceptions: none
History: Thu Aug 3 14:31:14 1989, DSJ, Created.

Definition at line 2301 of file cluster.cpp.

2301  {
2302  int i;
2303  FLOAT64 AdjustFactor;
2304 
2305  AdjustFactor = (((FLOAT64) NewSampleCount) /
2306  ((FLOAT64) Buckets->SampleCount));
2307 
2308  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2309  Buckets->ExpectedCount[i] *= AdjustFactor;
2310  }
2311 
2312  Buckets->SampleCount = NewSampleCount;
2313 
2314 } // AdjustBuckets
uinT32 SampleCount
Definition: cluster.cpp:180
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
FLOAT32 * ExpectedCount
Definition: cluster.cpp:186
double FLOAT64
Definition: host.h:112
int AlphaMatch ( void *  arg1,
void *  arg2 
)

This routine is used to search a list of structures which hold pre-computed chi-squared values for a chi-squared value whose corresponding alpha field matches the alpha field of SearchKey.

It is called by the list search routines.

Parameters
arg1chi-squared struct being tested for a match
arg2chi-squared struct that is the search key
Returns
TRUE if ChiStruct's Alpha matches SearchKey's Alpha
Note
Exceptions: none
History: Thu Aug 3 14:17:33 1989, DSJ, Created.

Definition at line 2349 of file cluster.cpp.

2350  { //CHISTRUCT *SearchKey)
2351  CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
2352  CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
2353 
2354  return (ChiStruct->Alpha == SearchKey->Alpha);
2355 
2356 } // AlphaMatch
FLOAT64 Alpha
Definition: cluster.cpp:191
FLOAT64 ChiArea ( CHISTRUCT ChiParams,
FLOAT64  x 
)

This routine computes the area under a chi density curve from 0 to x, minus the desired area under the curve. The number of degrees of freedom of the chi curve is specified in the ChiParams structure. The desired area is also specified in the ChiParams structure as Alpha ( or 1 minus the desired area ). This routine is intended to be passed to the Solve() function to find the value of chi-squared which will yield a desired area under the right tail of the chi density curve. The function will only work for even degrees of freedom. The equations are based on integrating the chi density curve in parts to obtain a series that can be used to compute the area under the curve.

Parameters
ChiParamscontains degrees of freedom and alpha
xvalue of chi-squared to evaluate
Returns
Error between actual and desired area under the chi curve.
Note
Exceptions: none
History: Fri Aug 4 12:48:41 1989, DSJ, Created.

Definition at line 2464 of file cluster.cpp.

2464  {
2465  int i, N;
2466  FLOAT64 SeriesTotal;
2467  FLOAT64 Denominator;
2468  FLOAT64 PowerOfx;
2469 
2470  N = ChiParams->DegreesOfFreedom / 2 - 1;
2471  SeriesTotal = 1;
2472  Denominator = 1;
2473  PowerOfx = 1;
2474  for (i = 1; i <= N; i++) {
2475  Denominator *= 2 * i;
2476  PowerOfx *= x;
2477  SeriesTotal += PowerOfx / Denominator;
2478  }
2479  return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
2480 
2481 } // ChiArea
uinT16 DegreesOfFreedom
Definition: cluster.cpp:190
FLOAT64 Alpha
Definition: cluster.cpp:191
double FLOAT64
Definition: host.h:112
LIST ClusterSamples ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

This routine first checks to see if the samples in this clusterer have already been clustered before; if so, it does not bother to recreate the cluster tree. It simply recomputes the prototypes based on the new Config info.

If the samples have not been clustered before, the samples in the KD tree are formed into a cluster tree and then the prototypes are computed from the cluster tree.

In either case this routine returns a pointer to a list of prototypes that best represent the samples given the constraints specified in Config.

Parameters
Clustererdata struct containing samples to be clustered
Configparameters which control clustering process
Returns
Pointer to a list of prototypes
Note
Exceptions: None
History: 5/29/89, DSJ, Created.

Definition at line 515 of file cluster.cpp.

515  {
516  //only create cluster tree if samples have never been clustered before
517  if (Clusterer->Root == NULL)
518  CreateClusterTree(Clusterer);
519 
520  //deallocate the old prototype list if one exists
521  FreeProtoList (&Clusterer->ProtoList);
522  Clusterer->ProtoList = NIL_LIST;
523 
524  //compute prototypes starting at the root node in the tree
525  ComputePrototypes(Clusterer, Config);
526  return (Clusterer->ProtoList);
527 } // ClusterSamples
#define NIL_LIST
Definition: oldlist.h:126
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:571
CLUSTER * Root
Definition: cluster.h:91
LIST ProtoList
Definition: cluster.h:92
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:936
void CreateClusterTree(CLUSTERER *Clusterer)
Definition: cluster.cpp:705
#define NULL
Definition: host.h:144
FLOAT64 ComputeChiSquared ( uinT16  DegreesOfFreedom,
FLOAT64  Alpha 
)

This routine computes the chi-squared value which will leave a cumulative probability of Alpha in the right tail of a chi-squared distribution with the specified number of degrees of freedom. Alpha must be between 0 and 1. DegreesOfFreedom must be even. The routine maintains an array of lists. Each list corresponds to a different number of degrees of freedom. Each entry in the list corresponds to a different alpha value and its corresponding chi-squared value. Therefore, once a particular chi-squared value is computed, it is stored in the list and never needs to be computed again.

Parameters
DegreesOfFreedomdetermines shape of distribution
Alphaprobability of right tail
Returns
Desired chi-squared value
Note
Exceptions: none
History: 6/5/89, DSJ, Created.

Definition at line 1897 of file cluster.cpp.

1900 {
1901  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
1902 
1903  CHISTRUCT *OldChiSquared;
1904  CHISTRUCT SearchKey;
1905 
1906  // limit the minimum alpha that can be used - if alpha is too small
1907  // it may not be possible to compute chi-squared.
1908  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
1909  if (Odd (DegreesOfFreedom))
1910  DegreesOfFreedom++;
1911 
1912  /* find the list of chi-squared values which have already been computed
1913  for the specified number of degrees of freedom. Search the list for
1914  the desired chi-squared. */
1915  SearchKey.Alpha = Alpha;
1916  OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
1917  &SearchKey, AlphaMatch));
1918 
1919  if (OldChiSquared == NULL) {
1920  OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
1921  OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
1922  (FLOAT64) DegreesOfFreedom,
1923  (FLOAT64) CHIACCURACY);
1924  ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
1925  OldChiSquared);
1926  }
1927  else {
1928  // further optimization might move OldChiSquared to front of list
1929  }
1930 
1931  return (OldChiSquared->ChiSquared);
1932 
1933 } // ComputeChiSquared
#define Odd(N)
Definition: cluster.cpp:206
CHISTRUCT * NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:2370
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:115
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:413
FLOAT64 ChiSquared
Definition: cluster.cpp:192
int AlphaMatch(void *arg1, void *arg2)
Definition: cluster.cpp:2349
FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x)
Definition: cluster.cpp:2464
FLOAT64 Solve(SOLVEFUNC Function, void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
Definition: cluster.cpp:2397
#define CHIACCURACY
#define first_node(l)
Definition: oldlist.h:139
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2243
#define NULL
Definition: host.h:144
#define MAXDEGREESOFFREEDOM
Definition: cluster.cpp:229
FLOAT64 Alpha
Definition: cluster.cpp:191
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
double FLOAT64
Definition: host.h:112
#define MINALPHA
void ComputePrototypes ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

This routine decides which clusters in the cluster tree should be represented by prototypes, forms a list of these prototypes, and places the list in the Clusterer data structure.

Parameters
Clustererdata structure holding cluster tree
Configparameters used to control prototype generation
Returns
None
Note
Exceptions: None
History: 5/30/89, DSJ, Created.

Definition at line 936 of file cluster.cpp.

936  {
937  LIST ClusterStack = NIL_LIST;
938  CLUSTER *Cluster;
939  PROTOTYPE *Prototype;
940 
941  // use a stack to keep track of clusters waiting to be processed
942  // initially the only cluster on the stack is the root cluster
943  if (Clusterer->Root != NULL)
944  ClusterStack = push (NIL_LIST, Clusterer->Root);
945 
946  // loop until we have analyzed all clusters which are potential prototypes
947  while (ClusterStack != NIL_LIST) {
948  // remove the next cluster to be analyzed from the stack
949  // try to make a prototype from the cluster
950  // if successful, put it on the proto list, else split the cluster
951  Cluster = (CLUSTER *) first_node (ClusterStack);
952  ClusterStack = pop (ClusterStack);
953  Prototype = MakePrototype(Clusterer, Config, Cluster);
954  if (Prototype != NULL) {
955  Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
956  }
957  else {
958  ClusterStack = push (ClusterStack, Cluster->Right);
959  ClusterStack = push (ClusterStack, Cluster->Left);
960  }
961  }
962 } // ComputePrototypes
struct sample * Left
Definition: cluster.h:36
#define NIL_LIST
Definition: oldlist.h:126
struct sample * Right
Definition: cluster.h:37
CLUSTER * Root
Definition: cluster.h:91
PROTOTYPE * MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
Definition: cluster.cpp:982
#define first_node(l)
Definition: oldlist.h:139
LIST pop(LIST list)
Definition: oldlist.cpp:305
LIST ProtoList
Definition: cluster.h:92
Definition: cluster.h:32
#define NULL
Definition: host.h:144
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
STATISTICS * ComputeStatistics ( inT16  N,
PARAM_DESC  ParamDesc[],
CLUSTER Cluster 
)

This routine searches the cluster tree for all leaf nodes which are samples in the specified cluster. It computes a full covariance matrix for these samples as well as keeping track of the ranges (min and max) for each dimension. A special data structure is allocated to return this information to the caller. An incremental algorithm for computing statistics is not used because it will not work with circular dimensions.

Parameters
Nnumber of dimensions
ParamDescarray of dimension descriptions
Clustercluster whose stats are to be computed
Returns
Pointer to new data structure containing statistics
Note
Exceptions: None
History: 6/2/89, DSJ, Created.

Definition at line 1431 of file cluster.cpp.

1431  {
1432  STATISTICS *Statistics;
1433  int i, j;
1434  FLOAT32 *CoVariance;
1435  FLOAT32 *Distance;
1436  LIST SearchState;
1437  SAMPLE *Sample;
1438  uinT32 SampleCountAdjustedForBias;
1439 
1440  // allocate memory to hold the statistics results
1441  Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
1442  Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
1443  Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1444  Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1445 
1446  // allocate temporary memory to hold the sample to mean distances
1447  Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1448 
1449  // initialize the statistics
1450  Statistics->AvgVariance = 1.0;
1451  CoVariance = Statistics->CoVariance;
1452  for (i = 0; i < N; i++) {
1453  Statistics->Min[i] = 0.0;
1454  Statistics->Max[i] = 0.0;
1455  for (j = 0; j < N; j++, CoVariance++)
1456  *CoVariance = 0;
1457  }
1458  // find each sample in the cluster and merge it into the statistics
1459  InitSampleSearch(SearchState, Cluster);
1460  while ((Sample = NextSample (&SearchState)) != NULL) {
1461  for (i = 0; i < N; i++) {
1462  Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
1463  if (ParamDesc[i].Circular) {
1464  if (Distance[i] > ParamDesc[i].HalfRange)
1465  Distance[i] -= ParamDesc[i].Range;
1466  if (Distance[i] < -ParamDesc[i].HalfRange)
1467  Distance[i] += ParamDesc[i].Range;
1468  }
1469  if (Distance[i] < Statistics->Min[i])
1470  Statistics->Min[i] = Distance[i];
1471  if (Distance[i] > Statistics->Max[i])
1472  Statistics->Max[i] = Distance[i];
1473  }
1474  CoVariance = Statistics->CoVariance;
1475  for (i = 0; i < N; i++)
1476  for (j = 0; j < N; j++, CoVariance++)
1477  *CoVariance += Distance[i] * Distance[j];
1478  }
1479  // normalize the variances by the total number of samples
1480  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
1481  // also compute the geometic mean of the diagonal variances
1482  // ensure that clusters with only 1 sample are handled correctly
1483  if (Cluster->SampleCount > 1)
1484  SampleCountAdjustedForBias = Cluster->SampleCount - 1;
1485  else
1486  SampleCountAdjustedForBias = 1;
1487  CoVariance = Statistics->CoVariance;
1488  for (i = 0; i < N; i++)
1489  for (j = 0; j < N; j++, CoVariance++) {
1490  *CoVariance /= SampleCountAdjustedForBias;
1491  if (j == i) {
1492  if (*CoVariance < MINVARIANCE)
1493  *CoVariance = MINVARIANCE;
1494  Statistics->AvgVariance *= *CoVariance;
1495  }
1496  }
1497  Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
1498  1.0 / N);
1499 
1500  // release temporary memory and return
1501  memfree(Distance);
1502  return (Statistics);
1503 } // ComputeStatistics
#define MINVARIANCE
Definition: cluster.cpp:142
FLOAT32 * Max
Definition: cluster.cpp:175
void memfree(void *element)
Definition: freelist.cpp:30
FLOAT32 * Min
Definition: cluster.cpp:174
FLOAT32 AvgVariance
Definition: cluster.cpp:172
float FLOAT32
Definition: host.h:111
unsigned SampleCount
Definition: cluster.h:35
unsigned int uinT32
Definition: host.h:103
FLOAT32 Mean[1]
Definition: cluster.h:39
void * Emalloc(int Size)
Definition: emalloc.cpp:47
FLOAT32 Range
Definition: ocrfeatures.h:51
Definition: cluster.h:32
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:625
#define InitSampleSearch(S, C)
Definition: cluster.h:105
#define NULL
Definition: host.h:144
FLOAT32 * CoVariance
Definition: cluster.cpp:173
void CreateClusterTree ( CLUSTERER Clusterer)

This routine performs a bottoms-up clustering on the samples held in the kd-tree of the Clusterer data structure. The result is a cluster tree. Each node in the tree represents a cluster which conceptually contains a subset of the samples. More precisely, the cluster contains all of the samples which are contained in its two sub-clusters. The leaves of the tree are the individual samples themselves; they have no sub-clusters. The root node of the tree conceptually contains all of the samples.

Parameters
Clustererdata structure holdings samples to be clustered
Returns
None (the Clusterer data structure is changed)
Note
Exceptions: None
History: 5/29/89, DSJ, Created.

Definition at line 705 of file cluster.cpp.

705  {
706  ClusteringContext context;
707  ClusterPair HeapEntry;
708  TEMPCLUSTER *PotentialCluster;
709 
710  // each sample and its nearest neighbor form a "potential" cluster
711  // save these in a heap with the "best" potential clusters on top
712  context.tree = Clusterer->KDTree;
713  context.candidates = (TEMPCLUSTER *)
714  Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
715  context.next = 0;
716  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
717  KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);
718 
719  // form potential clusters into actual clusters - always do "best" first
720  while (context.heap->Pop(&HeapEntry)) {
721  PotentialCluster = HeapEntry.data;
722 
723  // if main cluster of potential cluster is already in another cluster
724  // then we don't need to worry about it
725  if (PotentialCluster->Cluster->Clustered) {
726  continue;
727  }
728 
729  // if main cluster is not yet clustered, but its nearest neighbor is
730  // then we must find a new nearest neighbor
731  else if (PotentialCluster->Neighbor->Clustered) {
732  PotentialCluster->Neighbor =
733  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
734  &HeapEntry.key);
735  if (PotentialCluster->Neighbor != NULL) {
736  context.heap->Push(&HeapEntry);
737  }
738  }
739 
740  // if neither cluster is already clustered, form permanent cluster
741  else {
742  PotentialCluster->Cluster =
743  MakeNewCluster(Clusterer, PotentialCluster);
744  PotentialCluster->Neighbor =
745  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
746  &HeapEntry.key);
747  if (PotentialCluster->Neighbor != NULL) {
748  context.heap->Push(&HeapEntry);
749  }
750  }
751  }
752 
753  // the root node in the cluster tree is now the only node in the kd-tree
754  Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree);
755 
756  // free up the memory used by the K-D tree, heap, and temp clusters
757  FreeKDTree(context.tree);
758  Clusterer->KDTree = NULL;
759  delete context.heap;
760  memfree(context.candidates);
761 } // CreateClusterTree
bool Pop(Pair *entry)
Definition: genericheap.h:116
void memfree(void *element)
Definition: freelist.cpp:30
unsigned Clustered
Definition: cluster.h:33
KDTREE * KDTree
Definition: cluster.h:90
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:351
inT32 NumberOfSamples
Definition: cluster.h:89
void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, inT32 Level)
Definition: cluster.cpp:773
CLUSTER * Neighbor
Definition: cluster.cpp:165
CLUSTER * Root
Definition: cluster.h:91
void(* void_proc)(...)
Definition: cutil.h:66
#define RootOf(T)
Definition: kdtree.h:58
void * Emalloc(int Size)
Definition: emalloc.cpp:47
ClusterHeap * heap
Definition: cluster.cpp:197
void Push(Pair *entry)
Definition: genericheap.h:95
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
Definition: cluster.cpp:807
CLUSTER * MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
Definition: cluster.cpp:846
Definition: cluster.h:32
void KDWalk(KDTREE *Tree, void_proc action, void *context)
Definition: kdtree.cpp:332
CLUSTER * Cluster
Definition: cluster.cpp:164
#define NULL
Definition: host.h:144
TEMPCLUSTER * candidates
Definition: cluster.cpp:198
tesseract::GenericHeap< ClusterPair > ClusterHeap
Definition: cluster.cpp:169
uinT16 DegreesOfFreedom ( DISTRIBUTION  Distribution,
uinT16  HistogramBuckets 
)

This routine computes the degrees of freedom that should be used in a chi-squared test with the specified number of histogram buckets. The result is always rounded up to the next even number so that the value of chi-squared can be computed more easily. This will cause the value of chi-squared to be higher than the optimum value, resulting in the chi-square test being more lenient than optimum.

Parameters
Distributiondistribution being tested for
HistogramBucketsnumber of buckets in chi-square test
Returns
The number of degrees of freedom for a chi-square test
Note
Exceptions: none
History: Thu Aug 3 14:04:18 1989, DSJ, Created.

Definition at line 2243 of file cluster.cpp.

2243  {
2244  static uinT8 DegreeOffsets[] = { 3, 3, 1 };
2245 
2246  uinT16 AdjustedNumBuckets;
2247 
2248  AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2249  if (Odd (AdjustedNumBuckets))
2250  AdjustedNumBuckets++;
2251  return (AdjustedNumBuckets);
2252 
2253 } // DegreesOfFreedom
#define Odd(N)
Definition: cluster.cpp:206
unsigned short uinT16
Definition: host.h:101
unsigned char uinT8
Definition: host.h:99
BOOL8 DistributionOK ( BUCKETS Buckets)

This routine performs a chi-square goodness of fit test on the histogram data in the Buckets data structure. TRUE is returned if the histogram matches the probability distribution which was specified when the Buckets structure was originally created. Otherwise FALSE is returned.

Parameters
Bucketshistogram data to perform chi-square test on
Returns
TRUE if samples match distribution, FALSE otherwise
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 2159 of file cluster.cpp.

2159  {
2160  FLOAT32 FrequencyDifference;
2161  FLOAT32 TotalDifference;
2162  int i;
2163 
2164  // compute how well the histogram matches the expected histogram
2165  TotalDifference = 0.0;
2166  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2167  FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
2168  TotalDifference += (FrequencyDifference * FrequencyDifference) /
2169  Buckets->ExpectedCount[i];
2170  }
2171 
2172  // test to see if the difference is more than expected
2173  if (TotalDifference > Buckets->ChiSquared)
2174  return FALSE;
2175  else
2176  return TRUE;
2177 } // DistributionOK
float FLOAT32
Definition: host.h:111
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
uinT32 * Count
Definition: cluster.cpp:185
FLOAT32 * ExpectedCount
Definition: cluster.cpp:186
FLOAT64 ChiSquared
Definition: cluster.cpp:182
void FillBuckets ( BUCKETS Buckets,
CLUSTER Cluster,
uinT16  Dim,
PARAM_DESC ParamDesc,
FLOAT32  Mean,
FLOAT32  StdDev 
)

This routine counts the number of cluster samples which fall within the various histogram buckets in Buckets. Only one dimension of each sample is examined. The exact meaning of the Mean and StdDev parameters depends on the distribution which is being analyzed (this info is in the Buckets data structure). For normal distributions, Mean and StdDev have the expected meanings. For uniform and random distributions the Mean is the center point of the range and the StdDev is 1/2 the range. A dimension with zero standard deviation cannot be statistically analyzed. In this case, a pseudo-analysis is used.

Parameters
Bucketshistogram buckets to count samples
Clustercluster whose samples are being analyzed
Dimdimension of samples which is being analyzed
ParamDescdescription of the dimension
Mean"mean" of the distribution
StdDev"standard deviation" of the distribution
Returns
None (the Buckets data structure is filled in)
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 2015 of file cluster.cpp.

2020  {
2021  uinT16 BucketID;
2022  int i;
2023  LIST SearchState;
2024  SAMPLE *Sample;
2025 
2026  // initialize the histogram bucket counts to 0
2027  for (i = 0; i < Buckets->NumberOfBuckets; i++)
2028  Buckets->Count[i] = 0;
2029 
2030  if (StdDev == 0.0) {
2031  /* if the standard deviation is zero, then we can't statistically
2032  analyze the cluster. Use a pseudo-analysis: samples exactly on
2033  the mean are distributed evenly across all buckets. Samples greater
2034  than the mean are placed in the last bucket; samples less than the
2035  mean are placed in the first bucket. */
2036 
2037  InitSampleSearch(SearchState, Cluster);
2038  i = 0;
2039  while ((Sample = NextSample (&SearchState)) != NULL) {
2040  if (Sample->Mean[Dim] > Mean)
2041  BucketID = Buckets->NumberOfBuckets - 1;
2042  else if (Sample->Mean[Dim] < Mean)
2043  BucketID = 0;
2044  else
2045  BucketID = i;
2046  Buckets->Count[BucketID] += 1;
2047  i++;
2048  if (i >= Buckets->NumberOfBuckets)
2049  i = 0;
2050  }
2051  }
2052  else {
2053  // search for all samples in the cluster and add to histogram buckets
2054  InitSampleSearch(SearchState, Cluster);
2055  while ((Sample = NextSample (&SearchState)) != NULL) {
2056  switch (Buckets->Distribution) {
2057  case normal:
2058  BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
2059  Mean, StdDev);
2060  break;
2061  case D_random:
2062  case uniform:
2063  BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
2064  Mean, StdDev);
2065  break;
2066  default:
2067  BucketID = 0;
2068  }
2069  Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2070  }
2071  }
2072 } // FillBuckets
uinT16 UniformBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2124
uinT16 Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:184
Definition: cluster.h:59
FLOAT32 Mean[1]
Definition: cluster.h:39
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
uinT16 NormalBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2088
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:650
Definition: cluster.h:32
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:625
#define InitSampleSearch(S, C)
Definition: cluster.h:105
uinT32 * Count
Definition: cluster.cpp:185
#define NULL
Definition: host.h:144
DISTRIBUTION Distribution
Definition: cluster.cpp:179
unsigned short uinT16
Definition: host.h:101
CLUSTER * FindNearestNeighbor ( KDTREE Tree,
CLUSTER Cluster,
FLOAT32 Distance 
)

This routine searches the specified kd-tree for the nearest neighbor of the specified cluster. It actually uses the kd routines to find the 2 nearest neighbors since one of them will be the original cluster. A pointer to the nearest neighbor is returned, if it can be found, otherwise NULL is returned. The distance between the 2 nodes is placed in the specified variable.

Parameters
Treekd-tree to search in for nearest neighbor
Clustercluster whose nearest neighbor is to be found
Distanceptr to variable to report distance found
Returns
Pointer to the nearest neighbor of Cluster, or NULL
Note
Exceptions: none
History: 5/29/89, DSJ, Created. 7/13/89, DSJ, Removed visibility of kd-tree node data struct

Definition at line 807 of file cluster.cpp.

810 {
811  CLUSTER *Neighbor[MAXNEIGHBORS];
812  FLOAT32 Dist[MAXNEIGHBORS];
813  int NumberOfNeighbors;
814  inT32 i;
815  CLUSTER *BestNeighbor;
816 
817  // find the 2 nearest neighbors of the cluster
819  &NumberOfNeighbors, (void **)Neighbor, Dist);
820 
821  // search for the nearest neighbor that is not the cluster itself
822  *Distance = MAXDISTANCE;
823  BestNeighbor = NULL;
824  for (i = 0; i < NumberOfNeighbors; i++) {
825  if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
826  *Distance = Dist[i];
827  BestNeighbor = Neighbor[i];
828  }
829  }
830  return BestNeighbor;
831 } // FindNearestNeighbor
float FLOAT32
Definition: host.h:111
void KDNearestNeighborSearch(KDTREE *Tree, FLOAT32 Query[], int QuerySize, FLOAT32 MaxDistance, int *NumberOfResults, void **NBuffer, FLOAT32 DBuffer[])
Definition: kdtree.cpp:322
#define MAXNEIGHBORS
#define MAXDISTANCE
FLOAT32 Mean[1]
Definition: cluster.h:39
Definition: cluster.h:32
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
void FreeBuckets ( BUCKETS buckets)

This routine properly frees the memory used by a BUCKETS.

Parameters
bucketspointer to data structure to be freed

Definition at line 2201 of file cluster.cpp.

2201  {
2202  Efree(buckets->Count);
2203  Efree(buckets->ExpectedCount);
2204  Efree(buckets);
2205 } // FreeBuckets
void Efree(void *ptr)
Definition: emalloc.cpp:79
uinT32 * Count
Definition: cluster.cpp:185
FLOAT32 * ExpectedCount
Definition: cluster.cpp:186
void FreeCluster ( CLUSTER Cluster)

This routine frees the memory consumed by the specified cluster and all of its subclusters. This is done by recursive calls to FreeCluster().

Parameters
Clusterpointer to cluster to be freed
Returns
None
Note
Exceptions: None
History: 6/6/89, DSJ, Created.

Definition at line 2220 of file cluster.cpp.

2220  {
2221  if (Cluster != NULL) {
2222  FreeCluster (Cluster->Left);
2223  FreeCluster (Cluster->Right);
2224  memfree(Cluster);
2225  }
2226 } // FreeCluster
void memfree(void *element)
Definition: freelist.cpp:30
struct sample * Left
Definition: cluster.h:36
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2220
struct sample * Right
Definition: cluster.h:37
#define NULL
Definition: host.h:144
void FreeClusterer ( CLUSTERER Clusterer)

This routine frees all of the memory allocated to the specified data structure. It will not, however, free the memory used by the prototype list. The pointers to the clusters for each prototype in the list will be set to NULL to indicate that the cluster data structures no longer exist. Any sample lists that have been obtained via calls to GetSamples are no longer valid.

Parameters
Clustererpointer to data structure to be freed
Returns
None
Note
Exceptions: None
History: 6/6/89, DSJ, Created.

Definition at line 543 of file cluster.cpp.

543  {
544  if (Clusterer != NULL) {
545  memfree (Clusterer->ParamDesc);
546  if (Clusterer->KDTree != NULL)
547  FreeKDTree (Clusterer->KDTree);
548  if (Clusterer->Root != NULL)
549  FreeCluster (Clusterer->Root);
550  // Free up all used buckets structures.
551  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
552  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
553  if (Clusterer->bucket_cache[d][c] != NULL)
554  FreeBuckets(Clusterer->bucket_cache[d][c]);
555  }
556 
557  memfree(Clusterer);
558  }
559 } // FreeClusterer
void memfree(void *element)
Definition: freelist.cpp:30
void FreeBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2201
#define MINBUCKETS
Definition: cluster.h:26
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2220
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1-MINBUCKETS]
Definition: cluster.h:95
KDTREE * KDTree
Definition: cluster.h:90
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:351
CLUSTER * Root
Definition: cluster.h:91
PARAM_DESC * ParamDesc
Definition: cluster.h:88
#define MAXBUCKETS
Definition: cluster.h:27
#define NULL
Definition: host.h:144
void FreeProtoList ( LIST ProtoList)

This routine frees all of the memory allocated to the specified list of prototypes. The clusters which are pointed to by the prototypes are not freed.

Parameters
ProtoListpointer to list of prototypes to be freed
Returns
None
Note
Exceptions: None
History: 6/6/89, DSJ, Created.

Definition at line 571 of file cluster.cpp.

571  {
572  destroy_nodes(*ProtoList, FreePrototype);
573 } // FreeProtoList
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:204
void FreePrototype(void *arg)
Definition: cluster.cpp:586
void FreePrototype ( void *  arg)

This routine deallocates the memory consumed by the specified prototype and modifies the corresponding cluster so that it is no longer marked as a prototype. The cluster is NOT deallocated by this routine.

Parameters
argprototype data structure to be deallocated
Returns
None
Note
Exceptions: None
History: 5/30/89, DSJ, Created.

Definition at line 586 of file cluster.cpp.

586  { //PROTOTYPE *Prototype)
587  PROTOTYPE *Prototype = (PROTOTYPE *) arg;
588 
589  // unmark the corresponding cluster (if there is one
590  if (Prototype->Cluster != NULL)
591  Prototype->Cluster->Prototype = FALSE;
592 
593  // deallocate the prototype statistics and then the prototype itself
594  if (Prototype->Distrib != NULL)
595  memfree (Prototype->Distrib);
596  if (Prototype->Mean != NULL)
597  memfree (Prototype->Mean);
598  if (Prototype->Style != spherical) {
599  if (Prototype->Variance.Elliptical != NULL)
600  memfree (Prototype->Variance.Elliptical);
601  if (Prototype->Magnitude.Elliptical != NULL)
602  memfree (Prototype->Magnitude.Elliptical);
603  if (Prototype->Weight.Elliptical != NULL)
604  memfree (Prototype->Weight.Elliptical);
605  }
606  memfree(Prototype);
607 } // FreePrototype
void memfree(void *element)
Definition: freelist.cpp:30
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 * Mean
Definition: cluster.h:78
FLOATUNION Weight
Definition: cluster.h:83
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Elliptical
Definition: cluster.h:64
CLUSTER * Cluster
Definition: cluster.h:76
unsigned Prototype
Definition: cluster.h:34
#define FALSE
Definition: capi.h:29
unsigned Style
Definition: cluster.h:74
#define NULL
Definition: host.h:144
void FreeStatistics ( STATISTICS Statistics)

This routine frees the memory used by the statistics data structure.

Parameters
Statisticspointer to data structure to be freed
Returns
None
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 2188 of file cluster.cpp.

2188  {
2189  memfree (Statistics->CoVariance);
2190  memfree (Statistics->Min);
2191  memfree (Statistics->Max);
2192  memfree(Statistics);
2193 } // FreeStatistics
FLOAT32 * Max
Definition: cluster.cpp:175
void memfree(void *element)
Definition: freelist.cpp:30
FLOAT32 * Min
Definition: cluster.cpp:174
FLOAT32 * CoVariance
Definition: cluster.cpp:173
BUCKETS * GetBuckets ( CLUSTERER clusterer,
DISTRIBUTION  Distribution,
uinT32  SampleCount,
FLOAT64  Confidence 
)

This routine returns a histogram data structure which can be used by other routines to place samples into histogram buckets, and then apply a goodness of fit test to the histogram data to determine if the samples belong to the specified probability distribution. The routine keeps a list of bucket data structures which have already been created so that it minimizes the computation time needed to create a new bucket.

Parameters
clustererwhich keeps a bucket_cache for us.
Distributiontype of probability distribution to test for
SampleCountnumber of samples that are available
Confidenceprobability of a Type I error
Returns
Bucket data structure
Note
Exceptions: none
History: Thu Aug 3 12:58:10 1989, DSJ, Created.

Definition at line 1713 of file cluster.cpp.

1716  {
1717  // Get an old bucket structure with the same number of buckets.
1718  uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1719  BUCKETS *Buckets =
1720  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
1721 
1722  // If a matching bucket structure is not found, make one and save it.
1723  if (Buckets == NULL) {
1724  Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
1725  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
1726  Buckets;
1727  } else {
1728  // Just adjust the existing buckets.
1729  if (SampleCount != Buckets->SampleCount)
1730  AdjustBuckets(Buckets, SampleCount);
1731  if (Confidence != Buckets->Confidence) {
1732  Buckets->Confidence = Confidence;
1733  Buckets->ChiSquared = ComputeChiSquared(
1734  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),
1735  Confidence);
1736  }
1737  InitBuckets(Buckets);
1738  }
1739  return Buckets;
1740 } // GetBuckets
#define MINBUCKETS
Definition: cluster.h:26
void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount)
Definition: cluster.cpp:2301
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1-MINBUCKETS]
Definition: cluster.h:95
FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:1897
uinT16 OptimumNumberOfBuckets(uinT32 SampleCount)
Definition: cluster.cpp:1859
FLOAT64 Confidence
Definition: cluster.cpp:181
BUCKETS * MakeBuckets(DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1761
uinT32 SampleCount
Definition: cluster.cpp:180
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
void InitBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2325
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2243
#define NULL
Definition: host.h:144
unsigned short uinT16
Definition: host.h:101
FLOAT64 ChiSquared
Definition: cluster.cpp:182
BOOL8 Independent ( PARAM_DESC  ParamDesc[],
inT16  N,
FLOAT32 CoVariance,
FLOAT32  Independence 
)

This routine returns TRUE if the specified covariance matrix indicates that all N dimensions are independent of one another. One dimension is judged to be independent of another when the magnitude of the corresponding correlation coefficient is less than the specified Independence factor. The correlation coefficient is calculated as: (see Duda and Hart, pg. 247) coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj]) The covariance matrix is assumed to be symmetric (which should always be true).

Parameters
ParamDescdescriptions of each feature space dimension
Nnumber of dimensions
CoVarianceptr to a covariance matrix
Independencemax off-diagonal correlation coefficient
Returns
TRUE if dimensions are independent, FALSE otherwise
Note
Exceptions: None
History: 6/4/89, DSJ, Created.

Definition at line 1665 of file cluster.cpp.

1666  {
1667  int i, j;
1668  FLOAT32 *VARii; // points to ith on-diagonal element
1669  FLOAT32 *VARjj; // points to jth on-diagonal element
1670  FLOAT32 CorrelationCoeff;
1671 
1672  VARii = CoVariance;
1673  for (i = 0; i < N; i++, VARii += N + 1) {
1674  if (ParamDesc[i].NonEssential)
1675  continue;
1676 
1677  VARjj = VARii + N + 1;
1678  CoVariance = VARii + 1;
1679  for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
1680  if (ParamDesc[j].NonEssential)
1681  continue;
1682 
1683  if ((*VARii == 0.0) || (*VARjj == 0.0))
1684  CorrelationCoeff = 0.0;
1685  else
1686  CorrelationCoeff =
1687  sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
1688  if (CorrelationCoeff > Independence)
1689  return (FALSE);
1690  }
1691  }
1692  return (TRUE);
1693 } // Independent
float FLOAT32
Definition: host.h:111
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
void InitBuckets ( BUCKETS Buckets)

This routine sets the bucket counts in the specified histogram to zero.

Parameters
Bucketshistogram data structure to init
Returns
none
Note
Exceptions: none
History: Thu Aug 3 14:31:14 1989, DSJ, Created.

Definition at line 2325 of file cluster.cpp.

2325  {
2326  int i;
2327 
2328  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2329  Buckets->Count[i] = 0;
2330  }
2331 
2332 } // InitBuckets
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
uinT32 * Count
Definition: cluster.cpp:185
FLOAT64 Integral ( FLOAT64  f1,
FLOAT64  f2,
FLOAT64  Dx 
)

This routine computes a trapezoidal approximation to the integral of a function over a small delta in x.

Parameters
f1value of function at x1
f2value of function at x2
Dxx2 - x1 (should always be positive)
Returns
Approximation of the integral of the function from x1 to x2.
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 1988 of file cluster.cpp.

1988  {
1989  return (f1 + f2) * Dx / 2.0;
1990 } // Integral
double InvertMatrix ( const float *  input,
int  size,
float *  inv 
)

Compute the inverse of a matrix using LU decomposition with partial pivoting. The return value is the sum of norms of the off-diagonal terms of the product of a and inv. (A measure of the error.)

Definition at line 2569 of file cluster.cpp.

2569  {
2570  // Allocate memory for the 2D arrays.
2571  GENERIC_2D_ARRAY<double> U(size, size, 0.0);
2572  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
2573  GENERIC_2D_ARRAY<double> L(size, size, 0.0);
2574 
2575  // Initialize the working matrices. U starts as input, L as I and U_inv as O.
2576  int row;
2577  int col;
2578  for (row = 0; row < size; row++) {
2579  for (col = 0; col < size; col++) {
2580  U[row][col] = input[row*size + col];
2581  L[row][col] = row == col ? 1.0 : 0.0;
2582  U_inv[row][col] = 0.0;
2583  }
2584  }
2585 
2586  // Compute forward matrix by inversion by LU decomposition of input.
2587  for (col = 0; col < size; ++col) {
2588  // Find best pivot
2589  int best_row = 0;
2590  double best_pivot = -1.0;
2591  for (row = col; row < size; ++row) {
2592  if (Abs(U[row][col]) > best_pivot) {
2593  best_pivot = Abs(U[row][col]);
2594  best_row = row;
2595  }
2596  }
2597  // Exchange pivot rows.
2598  if (best_row != col) {
2599  for (int k = 0; k < size; ++k) {
2600  double tmp = U[best_row][k];
2601  U[best_row][k] = U[col][k];
2602  U[col][k] = tmp;
2603  tmp = L[best_row][k];
2604  L[best_row][k] = L[col][k];
2605  L[col][k] = tmp;
2606  }
2607  }
2608  // Now do the pivot itself.
2609  for (row = col + 1; row < size; ++row) {
2610  double ratio = -U[row][col] / U[col][col];
2611  for (int j = col; j < size; ++j) {
2612  U[row][j] += U[col][j] * ratio;
2613  }
2614  for (int k = 0; k < size; ++k) {
2615  L[row][k] += L[col][k] * ratio;
2616  }
2617  }
2618  }
2619  // Next invert U.
2620  for (col = 0; col < size; ++col) {
2621  U_inv[col][col] = 1.0 / U[col][col];
2622  for (row = col - 1; row >= 0; --row) {
2623  double total = 0.0;
2624  for (int k = col; k > row; --k) {
2625  total += U[row][k] * U_inv[k][col];
2626  }
2627  U_inv[row][col] = -total / U[row][row];
2628  }
2629  }
2630  // Now the answer is U_inv.L.
2631  for (row = 0; row < size; row++) {
2632  for (col = 0; col < size; col++) {
2633  double sum = 0.0;
2634  for (int k = row; k < size; ++k) {
2635  sum += U_inv[row][k] * L[k][col];
2636  }
2637  inv[row*size + col] = sum;
2638  }
2639  }
2640  // Check matrix product.
2641  double error_sum = 0.0;
2642  for (row = 0; row < size; row++) {
2643  for (col = 0; col < size; col++) {
2644  double sum = 0.0;
2645  for (int k = 0; k < size; ++k) {
2646  sum += input[row*size + k] * inv[k *size + col];
2647  }
2648  if (row != col) {
2649  error_sum += Abs(sum);
2650  }
2651  }
2652  }
2653  return error_sum;
2654 }
#define Abs(N)
Definition: cluster.cpp:208
int ListEntryMatch ( void *  arg1,
void *  arg2 
)

This routine is used to search a list for a list node whose contents match Key. It is called by the list delete_d routine.

Returns
TRUE if ListNode matches Key
Note
Exceptions: none
History: Thu Aug 3 14:23:58 1989, DSJ, Created.

Definition at line 2284 of file cluster.cpp.

2285  { //Key
2286  return (arg1 == arg2);
2287 
2288 } // ListEntryMatch
BUCKETS * MakeBuckets ( DISTRIBUTION  Distribution,
uinT32  SampleCount,
FLOAT64  Confidence 
)

This routine creates a histogram data structure which can be used by other routines to place samples into histogram buckets, and then apply a goodness of fit test to the histogram data to determine if the samples belong to the specified probability distribution. The buckets are allocated in such a way that the expected frequency of samples in each bucket is approximately the same. In order to make this possible, a mapping table is computed which maps "normalized" samples into the appropriate bucket.

Parameters
Distributiontype of probability distribution to test for
SampleCountnumber of samples that are available
Confidenceprobability of a Type I error
Returns
Pointer to new histogram data structure
Note
Exceptions: None
History: 6/4/89, DSJ, Created.

Definition at line 1761 of file cluster.cpp.

1763  {
1764  const DENSITYFUNC DensityFunction[] =
1765  { NormalDensity, UniformDensity, UniformDensity };
1766  int i, j;
1767  BUCKETS *Buckets;
1768  FLOAT64 BucketProbability;
1769  FLOAT64 NextBucketBoundary;
1770  FLOAT64 Probability;
1771  FLOAT64 ProbabilityDelta;
1772  FLOAT64 LastProbDensity;
1773  FLOAT64 ProbDensity;
1774  uinT16 CurrentBucket;
1775  BOOL8 Symmetrical;
1776 
1777  // allocate memory needed for data structure
1778  Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS)));
1779  Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1780  Buckets->SampleCount = SampleCount;
1781  Buckets->Confidence = Confidence;
1782  Buckets->Count = reinterpret_cast<uinT32*>(
1783  Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32)));
1784  Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>(
1785  Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32)));
1786 
1787  // initialize simple fields
1788  Buckets->Distribution = Distribution;
1789  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
1790  Buckets->Count[i] = 0;
1791  Buckets->ExpectedCount[i] = 0.0;
1792  }
1793 
1794  // all currently defined distributions are symmetrical
1795  Symmetrical = TRUE;
1796  Buckets->ChiSquared = ComputeChiSquared(
1797  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
1798 
1799  if (Symmetrical) {
1800  // allocate buckets so that all have approx. equal probability
1801  BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);
1802 
1803  // distribution is symmetric so fill in upper half then copy
1804  CurrentBucket = Buckets->NumberOfBuckets / 2;
1805  if (Odd (Buckets->NumberOfBuckets))
1806  NextBucketBoundary = BucketProbability / 2;
1807  else
1808  NextBucketBoundary = BucketProbability;
1809 
1810  Probability = 0.0;
1811  LastProbDensity =
1812  (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
1813  for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
1814  ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
1815  ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
1816  Probability += ProbabilityDelta;
1817  if (Probability > NextBucketBoundary) {
1818  if (CurrentBucket < Buckets->NumberOfBuckets - 1)
1819  CurrentBucket++;
1820  NextBucketBoundary += BucketProbability;
1821  }
1822  Buckets->Bucket[i] = CurrentBucket;
1823  Buckets->ExpectedCount[CurrentBucket] +=
1824  (FLOAT32) (ProbabilityDelta * SampleCount);
1825  LastProbDensity = ProbDensity;
1826  }
1827  // place any leftover probability into the last bucket
1828  Buckets->ExpectedCount[CurrentBucket] +=
1829  (FLOAT32) ((0.5 - Probability) * SampleCount);
1830 
1831  // copy upper half of distribution to lower half
1832  for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
1833  Buckets->Bucket[i] =
1834  Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
1835 
1836  // copy upper half of expected counts to lower half
1837  for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
1838  Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
1839  }
1840  return Buckets;
1841 } // MakeBuckets
#define Odd(N)
Definition: cluster.cpp:206
float FLOAT32
Definition: host.h:111
#define BUCKETTABLESIZE
Definition: cluster.cpp:160
uinT16 Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:184
unsigned char BOOL8
Definition: host.h:113
FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:1897
uinT16 OptimumNumberOfBuckets(uinT32 SampleCount)
Definition: cluster.cpp:1859
FLOAT64 Confidence
Definition: cluster.cpp:181
FLOAT64 UniformDensity(inT32 x)
Definition: cluster.cpp:1968
unsigned int uinT32
Definition: host.h:103
uinT32 SampleCount
Definition: cluster.cpp:180
#define Mirror(N, R)
Definition: cluster.cpp:207
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
void * Emalloc(int Size)
Definition: emalloc.cpp:47
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2243
#define TRUE
Definition: capi.h:28
uinT32 * Count
Definition: cluster.cpp:185
FLOAT64 NormalDensity(inT32 x)
Definition: cluster.cpp:1951
FLOAT64(* DENSITYFUNC)(inT32)
Definition: cluster.cpp:203
DISTRIBUTION Distribution
Definition: cluster.cpp:179
FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx)
Definition: cluster.cpp:1988
FLOAT32 * ExpectedCount
Definition: cluster.cpp:186
unsigned short uinT16
Definition: host.h:101
double FLOAT64
Definition: host.h:112
FLOAT64 ChiSquared
Definition: cluster.cpp:182
CLUSTERER* MakeClusterer ( inT16  SampleSize,
const PARAM_DESC  ParamDesc[] 
)

This routine creates a new clusterer data structure, initializes it, and returns a pointer to it.

Parameters
SampleSizenumber of dimensions in feature space
ParamDescdescription of each dimension
Returns
pointer to the new clusterer data structure
Note
Exceptions: None
History: 5/29/89, DSJ, Created.

Definition at line 400 of file cluster.cpp.

400  {
401  CLUSTERER *Clusterer;
402  int i;
403 
404  // allocate main clusterer data structure and init simple fields
405  Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
406  Clusterer->SampleSize = SampleSize;
407  Clusterer->NumberOfSamples = 0;
408  Clusterer->NumChar = 0;
409 
410  // init fields which will not be used initially
411  Clusterer->Root = NULL;
412  Clusterer->ProtoList = NIL_LIST;
413 
414  // maintain a copy of param descriptors in the clusterer data structure
415  Clusterer->ParamDesc =
416  (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
417  for (i = 0; i < SampleSize; i++) {
418  Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
419  Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
420  Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
421  Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
422  Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
423  Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
424  Clusterer->ParamDesc[i].MidRange =
425  (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
426  }
427 
428  // allocate a kd tree to hold the samples
429  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
430 
431  // Initialize cache of histogram buckets to minimize recomputing them.
432  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
433  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
434  Clusterer->bucket_cache[d][c] = NULL;
435  }
436 
437  return Clusterer;
438 } // MakeClusterer
FLOAT32 Min
Definition: ocrfeatures.h:49
#define MINBUCKETS
Definition: cluster.h:26
#define NIL_LIST
Definition: oldlist.h:126
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1-MINBUCKETS]
Definition: cluster.h:95
KDTREE * KDTree
Definition: cluster.h:90
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
inT32 NumberOfSamples
Definition: cluster.h:89
FLOAT32 MidRange
Definition: ocrfeatures.h:53
CLUSTER * Root
Definition: cluster.h:91
void * Emalloc(int Size)
Definition: emalloc.cpp:47
inT8 NonEssential
Definition: ocrfeatures.h:48
inT32 NumChar
Definition: cluster.h:93
inT8 Circular
Definition: ocrfeatures.h:47
LIST ProtoList
Definition: cluster.h:92
PARAM_DESC * ParamDesc
Definition: cluster.h:88
FLOAT32 Range
Definition: ocrfeatures.h:51
#define MAXBUCKETS
Definition: cluster.h:27
#define NULL
Definition: host.h:144
KDTREE * MakeKDTree(inT16 KeySize, const PARAM_DESC KeyDesc[])
Definition: kdtree.cpp:182
inT16 SampleSize
Definition: cluster.h:87
FLOAT32 Max
Definition: ocrfeatures.h:50
PROTOTYPE * MakeDegenerateProto ( uinT16  N,
CLUSTER Cluster,
STATISTICS Statistics,
PROTOSTYLE  Style,
inT32  MinSamples 
)

This routine checks for clusters which are degenerate and therefore cannot be analyzed in a statistically valid way. A cluster is defined as degenerate if it does not have at least MINSAMPLESNEEDED samples in it. If the cluster is found to be degenerate, a prototype of the specified style is generated and marked as insignificant. A cluster is also degenerate if it does not have at least MinSamples samples in it.

If the cluster is not degenerate, NULL is returned.

Parameters
Nnumber of dimensions
Clustercluster being analyzed
Statisticsstatistical info about cluster
Styletype of prototype to be generated
MinSamplesminimum number of samples in a cluster
Returns
Pointer to degenerate prototype or NULL.
Note
Exceptions: None
History: 6/20/89, DSJ, Created. 7/12/89, DSJ, Changed name and added check for 0 stddev. 8/8/89, DSJ, Removed check for 0 stddev (handled elsewhere).

Definition at line 1077 of file cluster.cpp.

1082  {
1083  PROTOTYPE *Proto = NULL;
1084 
1085  if (MinSamples < MINSAMPLESNEEDED)
1086  MinSamples = MINSAMPLESNEEDED;
1087 
1088  if (Cluster->SampleCount < MinSamples) {
1089  switch (Style) {
1090  case spherical:
1091  Proto = NewSphericalProto (N, Cluster, Statistics);
1092  break;
1093  case elliptical:
1094  case automatic:
1095  Proto = NewEllipticalProto (N, Cluster, Statistics);
1096  break;
1097  case mixed:
1098  Proto = NewMixedProto (N, Cluster, Statistics);
1099  break;
1100  }
1101  Proto->Significant = FALSE;
1102  }
1103  return (Proto);
1104 } // MakeDegenerateProto
PROTOTYPE * NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1598
unsigned SampleCount
Definition: cluster.h:35
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1553
unsigned Significant
Definition: cluster.h:68
#define MINSAMPLESNEEDED
Definition: cluster.cpp:152
Definition: cluster.h:45
#define FALSE
Definition: capi.h:29
PROTOTYPE * NewSphericalProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1519
#define NULL
Definition: host.h:144
void MakeDimRandom ( uinT16  i,
PROTOTYPE Proto,
PARAM_DESC ParamDesc 
)

This routine alters the ith dimension of the specified mixed prototype to be D_random.

Parameters
iindex of dimension to be changed
Protoprototype whose dimension is to be altered
ParamDescdescription of specified dimension
Returns
None
Note
Exceptions: None
History: 6/20/89, DSJ, Created.

Definition at line 1369 of file cluster.cpp.

1369  {
1370  Proto->Distrib[i] = D_random;
1371  Proto->Mean[i] = ParamDesc->MidRange;
1372  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
1373 
1374  // subtract out the previous magnitude of this dimension from the total
1375  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1376  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
1377  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1378  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1379 
1380  // note that the proto Weight is irrelevant for D_random protos
1381 } // MakeDimRandom
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 * Mean
Definition: cluster.h:78
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FLOAT32 MidRange
Definition: ocrfeatures.h:53
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOAT32 Range
Definition: ocrfeatures.h:51
void MakeDimUniform ( uinT16  i,
PROTOTYPE Proto,
STATISTICS Statistics 
)

This routine alters the ith dimension of the specified mixed prototype to be uniform.

Parameters
iindex of dimension to be changed
Protoprototype whose dimension is to be altered
Statisticsstatistical info about prototype
Returns
None
Note
Exceptions: None
History: 6/20/89, DSJ, Created.

Definition at line 1394 of file cluster.cpp.

1394  {
1395  Proto->Distrib[i] = uniform;
1396  Proto->Mean[i] = Proto->Cluster->Mean[i] +
1397  (Statistics->Min[i] + Statistics->Max[i]) / 2;
1398  Proto->Variance.Elliptical[i] =
1399  (Statistics->Max[i] - Statistics->Min[i]) / 2;
1400  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1401  Proto->Variance.Elliptical[i] = MINVARIANCE;
1402 
1403  // subtract out the previous magnitude of this dimension from the total
1404  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1405  Proto->Magnitude.Elliptical[i] =
1406  1.0 / (2.0 * Proto->Variance.Elliptical[i]);
1407  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1408  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1409 
1410  // note that the proto Weight is irrelevant for uniform protos
1411 } // MakeDimUniform
#define MINVARIANCE
Definition: cluster.cpp:142
FLOAT32 * Max
Definition: cluster.cpp:175
FLOAT32 * Min
Definition: cluster.cpp:174
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 * Mean
Definition: cluster.h:78
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FLOAT32 Mean[1]
Definition: cluster.h:39
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Elliptical
Definition: cluster.h:64
CLUSTER * Cluster
Definition: cluster.h:76
PROTOTYPE * MakeEllipticalProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS Buckets 
)

This routine tests the specified cluster to see if it can be approximated by an elliptical normal distribution. If it can be, then a new prototype is formed and returned to the caller. If it can't be, then NULL is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Clustercluster to be made into an elliptical prototype
Statisticsstatistical info about cluster
Bucketshistogram struct used to analyze distribution
Returns
Pointer to new elliptical prototype or NULL.
Note
Exceptions: None
History: 6/12/89, DSJ, Created.

Definition at line 1264 of file cluster.cpp.

1267  {
1268  PROTOTYPE *Proto = NULL;
1269  int i;
1270 
1271  // check that each dimension is a normal distribution
1272  for (i = 0; i < Clusterer->SampleSize; i++) {
1273  if (Clusterer->ParamDesc[i].NonEssential)
1274  continue;
1275 
1276  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1277  Cluster->Mean[i],
1278  sqrt ((FLOAT64) Statistics->
1279  CoVariance[i * (Clusterer->SampleSize + 1)]));
1280  if (!DistributionOK (Buckets))
1281  break;
1282  }
1283  // if all dimensions matched a normal distribution, make a proto
1284  if (i >= Clusterer->SampleSize)
1285  Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1286  return (Proto);
1287 } // MakeEllipticalProto
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1553
FLOAT32 Mean[1]
Definition: cluster.h:39
inT8 NonEssential
Definition: ocrfeatures.h:48
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2159
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2015
PARAM_DESC * ParamDesc
Definition: cluster.h:88
#define NULL
Definition: host.h:144
inT16 SampleSize
Definition: cluster.h:87
double FLOAT64
Definition: host.h:112
PROTOTYPE * MakeMixedProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS NormalBuckets,
FLOAT64  Confidence 
)

This routine tests each dimension of the specified cluster to see what distribution would best approximate that dimension. Each dimension is compared to the following distributions in order: normal, random, uniform. If each dimension can be represented by one of these distributions, then a new prototype is formed and returned to the caller. If it can't be, then NULL is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Clustercluster to be made into a prototype
Statisticsstatistical info about cluster
NormalBucketshistogram struct used to analyze distribution
Confidenceconfidence level for alternate distributions
Returns
Pointer to new mixed prototype or NULL.
Note
Exceptions: None
History: 6/12/89, DSJ, Created.

Definition at line 1307 of file cluster.cpp.

1311  {
1312  PROTOTYPE *Proto;
1313  int i;
1314  BUCKETS *UniformBuckets = NULL;
1315  BUCKETS *RandomBuckets = NULL;
1316 
1317  // create a mixed proto to work on - initially assume all dimensions normal*/
1318  Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
1319 
1320  // find the proper distribution for each dimension
1321  for (i = 0; i < Clusterer->SampleSize; i++) {
1322  if (Clusterer->ParamDesc[i].NonEssential)
1323  continue;
1324 
1325  FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1326  Proto->Mean[i],
1327  sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
1328  if (DistributionOK (NormalBuckets))
1329  continue;
1330 
1331  if (RandomBuckets == NULL)
1332  RandomBuckets =
1333  GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
1334  MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
1335  FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1336  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1337  if (DistributionOK (RandomBuckets))
1338  continue;
1339 
1340  if (UniformBuckets == NULL)
1341  UniformBuckets =
1342  GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
1343  MakeDimUniform(i, Proto, Statistics);
1344  FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1345  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1346  if (DistributionOK (UniformBuckets))
1347  continue;
1348  break;
1349  }
1350  // if any dimension failed to match a distribution, discard the proto
1351  if (i < Clusterer->SampleSize) {
1352  FreePrototype(Proto);
1353  Proto = NULL;
1354  }
1355  return (Proto);
1356 } // MakeMixedProto
PROTOTYPE * NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1598
void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
Definition: cluster.cpp:1369
unsigned SampleCount
Definition: cluster.h:35
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 * Mean
Definition: cluster.h:78
FLOAT32 * Elliptical
Definition: cluster.h:64
inT8 NonEssential
Definition: ocrfeatures.h:48
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2159
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2015
PARAM_DESC * ParamDesc
Definition: cluster.h:88
void FreePrototype(void *arg)
Definition: cluster.cpp:586
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1713
#define NULL
Definition: host.h:144
void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics)
Definition: cluster.cpp:1394
inT16 SampleSize
Definition: cluster.h:87
double FLOAT64
Definition: host.h:112
CLUSTER * MakeNewCluster ( CLUSTERER Clusterer,
TEMPCLUSTER TempCluster 
)

This routine creates a new permanent cluster from the clusters specified in TempCluster. The 2 clusters in TempCluster are marked as "clustered" and deleted from the kd-tree. The new cluster is then added to the kd-tree.

Parameters
Clusterercurrent clustering environment
TempClusterpotential cluster to make permanent
Returns
Pointer to the new permanent cluster
Note
Exceptions: none
History: 5/29/89, DSJ, Created. 7/13/89, DSJ, Removed visibility of kd-tree node data struct

Definition at line 846 of file cluster.cpp.

846  {
847  CLUSTER *Cluster;
848 
849  // allocate the new cluster and initialize it
850  Cluster = (CLUSTER *) Emalloc(
851  sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32));
852  Cluster->Clustered = FALSE;
853  Cluster->Prototype = FALSE;
854  Cluster->Left = TempCluster->Cluster;
855  Cluster->Right = TempCluster->Neighbor;
856  Cluster->CharID = -1;
857 
858  // mark the old clusters as "clustered" and delete them from the kd-tree
859  Cluster->Left->Clustered = TRUE;
860  Cluster->Right->Clustered = TRUE;
861  KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
862  KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
863 
864  // compute the mean and sample count for the new cluster
865  Cluster->SampleCount =
866  MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
867  Cluster->Left->SampleCount, Cluster->Right->SampleCount,
868  Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
869 
870  // add the new cluster to the KD tree
871  KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
872  return Cluster;
873 } // MakeNewCluster
struct sample * Left
Definition: cluster.h:36
float FLOAT32
Definition: host.h:111
unsigned Clustered
Definition: cluster.h:33
unsigned SampleCount
Definition: cluster.h:35
struct sample * Right
Definition: cluster.h:37
KDTREE * KDTree
Definition: cluster.h:90
CLUSTER * Neighbor
Definition: cluster.cpp:165
FLOAT32 Mean[1]
Definition: cluster.h:39
void * Emalloc(int Size)
Definition: emalloc.cpp:47
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:218
unsigned Prototype
Definition: cluster.h:34
#define FALSE
Definition: capi.h:29
PARAM_DESC * ParamDesc
Definition: cluster.h:88
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
Definition: cluster.h:32
#define TRUE
Definition: capi.h:28
CLUSTER * Cluster
Definition: cluster.cpp:164
inT32 CharID
Definition: cluster.h:38
void KDDelete(KDTREE *Tree, FLOAT32 Key[], void *Data)
Definition: kdtree.cpp:265
inT16 SampleSize
Definition: cluster.h:87
void MakePotentialClusters ( ClusteringContext context,
CLUSTER Cluster,
inT32  Level 
)

This routine is designed to be used in concert with the KDWalk routine. It will create a potential cluster for each sample in the kd-tree that is being walked. This potential cluster will then be pushed on the heap.

Parameters
contextClusteringContext (see definition above)
Clustercurrent cluster being visited in kd-tree walk
Levellevel of this cluster in the kd-tree

Definition at line 773 of file cluster.cpp.

774  {
775  ClusterPair HeapEntry;
776  int next = context->next;
777  context->candidates[next].Cluster = Cluster;
778  HeapEntry.data = &(context->candidates[next]);
779  context->candidates[next].Neighbor =
780  FindNearestNeighbor(context->tree,
781  context->candidates[next].Cluster,
782  &HeapEntry.key);
783  if (context->candidates[next].Neighbor != NULL) {
784  context->heap->Push(&HeapEntry);
785  context->next++;
786  }
787 } // MakePotentialClusters
CLUSTER * Neighbor
Definition: cluster.cpp:165
ClusterHeap * heap
Definition: cluster.cpp:197
void Push(Pair *entry)
Definition: genericheap.h:95
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
Definition: cluster.cpp:807
CLUSTER * Cluster
Definition: cluster.cpp:164
#define NULL
Definition: host.h:144
TEMPCLUSTER * candidates
Definition: cluster.cpp:198
PROTOTYPE * MakePrototype ( CLUSTERER Clusterer,
CLUSTERCONFIG Config,
CLUSTER Cluster 
)

This routine attempts to create a prototype from the specified cluster that conforms to the distribution specified in Config. If there are too few samples in the cluster to perform a statistical analysis, then a prototype is generated but labelled as insignificant. If the dimensions of the cluster are not independent, no prototype is generated and NULL is returned. If a prototype can be found that matches the desired distribution then a pointer to it is returned, otherwise NULL is returned.

Parameters
Clustererdata structure holding cluster tree
Configparameters used to control prototype generation
Clustercluster to be made into a prototype
Returns
Pointer to new prototype or NULL
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 982 of file cluster.cpp.

984  {
985  STATISTICS *Statistics;
986  PROTOTYPE *Proto;
987  BUCKETS *Buckets;
988 
989  // filter out clusters which contain samples from the same character
990  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
991  return NULL;
992 
993  // compute the covariance matrix and ranges for the cluster
994  Statistics =
995  ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
996 
997  // check for degenerate clusters which need not be analyzed further
998  // note that the MinSamples test assumes that all clusters with multiple
999  // character samples have been removed (as above)
1000  Proto = MakeDegenerateProto(
1001  Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
1002  (inT32) (Config->MinSamples * Clusterer->NumChar));
1003  if (Proto != NULL) {
1004  FreeStatistics(Statistics);
1005  return Proto;
1006  }
1007  // check to ensure that all dimensions are independent
1008  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
1009  Statistics->CoVariance, Config->Independence)) {
1010  FreeStatistics(Statistics);
1011  return NULL;
1012  }
1013 
1014  if (HOTELLING && Config->ProtoStyle == elliptical) {
1015  Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
1016  if (Proto != NULL) {
1017  FreeStatistics(Statistics);
1018  return Proto;
1019  }
1020  }
1021 
1022  // create a histogram data structure used to evaluate distributions
1023  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
1024  Config->Confidence);
1025 
1026  // create a prototype based on the statistics and test it
1027  switch (Config->ProtoStyle) {
1028  case spherical:
1029  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1030  break;
1031  case elliptical:
1032  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1033  break;
1034  case mixed:
1035  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1036  Config->Confidence);
1037  break;
1038  case automatic:
1039  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1040  if (Proto != NULL)
1041  break;
1042  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1043  if (Proto != NULL)
1044  break;
1045  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1046  Config->Confidence);
1047  break;
1048  }
1049  FreeStatistics(Statistics);
1050  return Proto;
1051 } // MakePrototype
PROTOTYPE * MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1264
BOOL8 Independent(PARAM_DESC ParamDesc[], inT16 N, FLOAT32 *CoVariance, FLOAT32 Independence)
Definition: cluster.cpp:1665
PROTOTYPE * TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1119
BOOL8 MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, FLOAT32 MaxIllegal)
Definition: cluster.cpp:2512
unsigned SampleCount
Definition: cluster.h:35
PROTOTYPE * MakeDegenerateProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, inT32 MinSamples)
Definition: cluster.cpp:1077
Definition: cluster.h:59
FLOAT32 Independence
Definition: cluster.h:53
FLOAT32 MaxIllegal
Definition: cluster.h:51
#define HOTELLING
Definition: cluster.cpp:30
FLOAT64 Confidence
Definition: cluster.h:54
FLOAT32 MinSamples
Definition: cluster.h:50
PROTOSTYLE ProtoStyle
Definition: cluster.h:49
inT32 NumChar
Definition: cluster.h:93
Definition: cluster.h:45
PARAM_DESC * ParamDesc
Definition: cluster.h:88
STATISTICS * ComputeStatistics(inT16 N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
Definition: cluster.cpp:1431
PROTOTYPE * MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1226
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1713
#define NULL
Definition: host.h:144
PROTOTYPE * MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, FLOAT64 Confidence)
Definition: cluster.cpp:1307
inT16 SampleSize
Definition: cluster.h:87
void FreeStatistics(STATISTICS *Statistics)
Definition: cluster.cpp:2188
FLOAT32 * CoVariance
Definition: cluster.cpp:173
int inT32
Definition: host.h:102
SAMPLE* MakeSample ( CLUSTERER Clusterer,
const FLOAT32 Feature,
inT32  CharID 
)

This routine creates a new sample data structure to hold the specified feature. This sample is added to the clusterer data structure (so that it knows which samples are to be clustered later), and a pointer to the sample is returned to the caller.

Parameters
Clustererclusterer data structure to add sample to
Featurefeature to be added to clusterer
CharIDunique ident. of char that sample came from
Returns
Pointer to the new sample data structure
Note
Exceptions: ALREADYCLUSTERED MakeSample can't be called after ClusterSamples has been called
History: 5/29/89, DSJ, Created.

Definition at line 457 of file cluster.cpp.

458  {
459  SAMPLE *Sample;
460  int i;
461 
462  // see if the samples have already been clustered - if so trap an error
463  if (Clusterer->Root != NULL)
465  "Can't add samples after they have been clustered");
466 
467  // allocate the new sample and initialize it
468  Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
469  (Clusterer->SampleSize -
470  1) * sizeof (FLOAT32));
471  Sample->Clustered = FALSE;
472  Sample->Prototype = FALSE;
473  Sample->SampleCount = 1;
474  Sample->Left = NULL;
475  Sample->Right = NULL;
476  Sample->CharID = CharID;
477 
478  for (i = 0; i < Clusterer->SampleSize; i++)
479  Sample->Mean[i] = Feature[i];
480 
481  // add the sample to the KD tree - keep track of the total # of samples
482  Clusterer->NumberOfSamples++;
483  KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
484  if (CharID >= Clusterer->NumChar)
485  Clusterer->NumChar = CharID + 1;
486 
487  // execute hook for monitoring clustering operation
488  // (*SampleCreationHook)( Sample );
489 
490  return (Sample);
491 } // MakeSample
struct sample * Left
Definition: cluster.h:36
float FLOAT32
Definition: host.h:111
unsigned Clustered
Definition: cluster.h:33
unsigned SampleCount
Definition: cluster.h:35
struct sample * Right
Definition: cluster.h:37
KDTREE * KDTree
Definition: cluster.h:90
inT32 NumberOfSamples
Definition: cluster.h:89
FLOAT32 Mean[1]
Definition: cluster.h:39
CLUSTER * Root
Definition: cluster.h:91
void * Emalloc(int Size)
Definition: emalloc.cpp:47
#define ALREADYCLUSTERED
Definition: cluster.h:133
inT32 NumChar
Definition: cluster.h:93
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:218
unsigned Prototype
Definition: cluster.h:34
#define FALSE
Definition: capi.h:29
Definition: cluster.h:32
void DoError(int Error, const char *Message)
Definition: danerror.cpp:42
inT32 CharID
Definition: cluster.h:38
#define NULL
Definition: host.h:144
inT16 SampleSize
Definition: cluster.h:87
PROTOTYPE * MakeSphericalProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS Buckets 
)

This routine tests the specified cluster to see if it can be approximated by a spherical normal distribution. If it can be, then a new prototype is formed and returned to the caller. If it can't be, then NULL is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Clustercluster to be made into a spherical prototype
Statisticsstatistical info about cluster
Bucketshistogram struct used to analyze distribution
Returns
Pointer to new spherical prototype or NULL.
Note
Exceptions: None
History: 6/1/89, DSJ, Created.

Definition at line 1226 of file cluster.cpp.

1229  {
1230  PROTOTYPE *Proto = NULL;
1231  int i;
1232 
1233  // check that each dimension is a normal distribution
1234  for (i = 0; i < Clusterer->SampleSize; i++) {
1235  if (Clusterer->ParamDesc[i].NonEssential)
1236  continue;
1237 
1238  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1239  Cluster->Mean[i],
1240  sqrt ((FLOAT64) (Statistics->AvgVariance)));
1241  if (!DistributionOK (Buckets))
1242  break;
1243  }
1244  // if all dimensions matched a normal distribution, make a proto
1245  if (i >= Clusterer->SampleSize)
1246  Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
1247  return (Proto);
1248 } // MakeSphericalProto
FLOAT32 AvgVariance
Definition: cluster.cpp:172
FLOAT32 Mean[1]
Definition: cluster.h:39
inT8 NonEssential
Definition: ocrfeatures.h:48
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2159
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2015
PARAM_DESC * ParamDesc
Definition: cluster.h:88
PROTOTYPE * NewSphericalProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1519
#define NULL
Definition: host.h:144
inT16 SampleSize
Definition: cluster.h:87
double FLOAT64
Definition: host.h:112
FLOAT32 Mean ( PROTOTYPE Proto,
uinT16  Dimension 
)

This routine returns the mean of the specified prototype in the indicated dimension.

Parameters
Protoprototype to return mean of
Dimensiondimension whose mean is to be returned
Returns
Mean of Prototype in Dimension
Note
Exceptions: none
History: 7/6/89, DSJ, Created.

Definition at line 650 of file cluster.cpp.

650  {
651  return (Proto->Mean[Dimension]);
652 } // Mean
FLOAT32 * Mean
Definition: cluster.h:78
inT32 MergeClusters ( inT16  N,
register PARAM_DESC  ParamDesc[],
register inT32  n1,
register inT32  n2,
register FLOAT32  m[],
register FLOAT32  m1[],
register FLOAT32  m2[] 
)
inT32 MergeClusters ( inT16  N,
PARAM_DESC  ParamDesc[],
inT32  n1,
inT32  n2,
FLOAT32  m[],
FLOAT32  m1[],
FLOAT32  m2[] 
)

This routine merges two clusters into one larger cluster. To do this it computes the number of samples in the new cluster and the mean of the new cluster. The ParamDesc information is used to ensure that circular dimensions are handled correctly.

Parameters
N# of dimensions (size of arrays)
ParamDescarray of dimension descriptions
n1,n2number of samples in each old cluster
marray to hold mean of new cluster
m1,m2arrays containing means of old clusters
Returns
The number of samples in the new cluster.
Note
Exceptions: None
History: 5/31/89, DSJ, Created.

Definition at line 891 of file cluster.cpp.

896  {
897  inT32 i, n;
898 
899  n = n1 + n2;
900  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
901  if (ParamDesc->Circular) {
902  // if distance between means is greater than allowed
903  // reduce upper point by one "rotation" to compute mean
904  // then normalize the mean back into the accepted range
905  if ((*m2 - *m1) > ParamDesc->HalfRange) {
906  *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
907  if (*m < ParamDesc->Min)
908  *m += ParamDesc->Range;
909  }
910  else if ((*m1 - *m2) > ParamDesc->HalfRange) {
911  *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
912  if (*m < ParamDesc->Min)
913  *m += ParamDesc->Range;
914  }
915  else
916  *m = (n1 * *m1 + n2 * *m2) / n;
917  }
918  else
919  *m = (n1 * *m1 + n2 * *m2) / n;
920  }
921  return n;
922 } // MergeClusters
int inT32
Definition: host.h:102
BOOL8 MultipleCharSamples ( CLUSTERER Clusterer,
CLUSTER Cluster,
FLOAT32  MaxIllegal 
)

This routine looks at all samples in the specified cluster. It computes a running estimate of the percentage of the charaters which have more than 1 sample in the cluster. When this percentage exceeds MaxIllegal, TRUE is returned. Otherwise FALSE is returned. The CharID fields must contain integers which identify the training characters which were used to generate the sample. One integer is used for each sample. The NumChar field in the Clusterer must contain the number of characters in the training set. All CharID fields must be between 0 and NumChar-1. The main function of this routine is to help identify clusters which need to be split further, i.e. if numerous training characters have 2 or more features which are contained in the same cluster, then the cluster should be split.

Parameters
Clustererdata structure holding cluster tree
Clustercluster containing samples to be tested
MaxIllegalmax percentage of samples allowed to have more than 1 feature in the cluster
Returns
TRUE if the cluster should be split, FALSE otherwise.
Note
Exceptions: none
History: Wed Aug 30 11:13:05 1989, DSJ, Created. 2/22/90, DSJ, Added MaxIllegal control rather than always splitting illegal clusters.

Definition at line 2512 of file cluster.cpp.

2515 {
2516  static BOOL8 *CharFlags = NULL;
2517  static inT32 NumFlags = 0;
2518  int i;
2519  LIST SearchState;
2520  SAMPLE *Sample;
2521  inT32 CharID;
2522  inT32 NumCharInCluster;
2523  inT32 NumIllegalInCluster;
2524  FLOAT32 PercentIllegal;
2525 
2526  // initial estimate assumes that no illegal chars exist in the cluster
2527  NumCharInCluster = Cluster->SampleCount;
2528  NumIllegalInCluster = 0;
2529 
2530  if (Clusterer->NumChar > NumFlags) {
2531  if (CharFlags != NULL)
2532  memfree(CharFlags);
2533  NumFlags = Clusterer->NumChar;
2534  CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
2535  }
2536 
2537  for (i = 0; i < NumFlags; i++)
2538  CharFlags[i] = FALSE;
2539 
2540  // find each sample in the cluster and check if we have seen it before
2541  InitSampleSearch(SearchState, Cluster);
2542  while ((Sample = NextSample (&SearchState)) != NULL) {
2543  CharID = Sample->CharID;
2544  if (CharFlags[CharID] == FALSE) {
2545  CharFlags[CharID] = TRUE;
2546  }
2547  else {
2548  if (CharFlags[CharID] == TRUE) {
2549  NumIllegalInCluster++;
2550  CharFlags[CharID] = ILLEGAL_CHAR;
2551  }
2552  NumCharInCluster--;
2553  PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
2554  if (PercentIllegal > MaxIllegal) {
2555  destroy(SearchState);
2556  return (TRUE);
2557  }
2558  }
2559  }
2560  return (FALSE);
2561 
2562 } // MultipleCharSamples
void memfree(void *element)
Definition: freelist.cpp:30
float FLOAT32
Definition: host.h:111
#define ILLEGAL_CHAR
unsigned SampleCount
Definition: cluster.h:35
unsigned char BOOL8
Definition: host.h:113
void * Emalloc(int Size)
Definition: emalloc.cpp:47
inT32 NumChar
Definition: cluster.h:93
LIST destroy(LIST list)
Definition: oldlist.cpp:187
#define FALSE
Definition: capi.h:29
Definition: cluster.h:32
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:625
#define InitSampleSearch(S, C)
Definition: cluster.h:105
#define TRUE
Definition: capi.h:28
inT32 CharID
Definition: cluster.h:38
#define NULL
Definition: host.h:144
int inT32
Definition: host.h:102
CHISTRUCT * NewChiStruct ( uinT16  DegreesOfFreedom,
FLOAT64  Alpha 
)

This routine allocates a new data structure which is used to hold a chi-squared value along with its associated number of degrees of freedom and alpha value.

Parameters
DegreesOfFreedomdegrees of freedom for new chi value
Alphaconfidence level for new chi value
Returns
none
Note
Exceptions: none
History: Fri Aug 4 11:04:59 1989, DSJ, Created.

Definition at line 2370 of file cluster.cpp.

2370  {
2372 
2373  NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
2374  NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
2375  NewChiStruct->Alpha = Alpha;
2376  return (NewChiStruct);
2377 
2378 } // NewChiStruct
CHISTRUCT * NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:2370
void * Emalloc(int Size)
Definition: emalloc.cpp:47
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2243
uinT16 DegreesOfFreedom
Definition: cluster.cpp:190
FLOAT64 Alpha
Definition: cluster.cpp:191
PROTOTYPE * NewEllipticalProto ( inT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine creates an elliptical prototype data structure to approximate the samples in the specified cluster. Elliptical prototypes have a variance for each dimension. All dimensions are normally distributed and independent.

Parameters
Nnumber of dimensions
Clustercluster to be made into an elliptical prototype
Statisticsstatistical info about samples in cluster
Returns
Pointer to a new elliptical prototype data structure
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 1553 of file cluster.cpp.

1555  {
1556  PROTOTYPE *Proto;
1557  FLOAT32 *CoVariance;
1558  int i;
1559 
1560  Proto = NewSimpleProto (N, Cluster);
1561  Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1562  Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1563  Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1564 
1565  CoVariance = Statistics->CoVariance;
1566  Proto->TotalMagnitude = 1.0;
1567  for (i = 0; i < N; i++, CoVariance += N + 1) {
1568  Proto->Variance.Elliptical[i] = *CoVariance;
1569  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1570  Proto->Variance.Elliptical[i] = MINVARIANCE;
1571 
1572  Proto->Magnitude.Elliptical[i] =
1573  1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
1574  Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
1575  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1576  }
1577  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1578  Proto->Style = elliptical;
1579  return (Proto);
1580 } // NewEllipticalProto
#define MINVARIANCE
Definition: cluster.cpp:142
float FLOAT32
Definition: host.h:111
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Variance
Definition: cluster.h:81
FLOATUNION Weight
Definition: cluster.h:83
FLOAT32 TotalMagnitude
Definition: cluster.h:79
void * Emalloc(int Size)
Definition: emalloc.cpp:47
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 * Elliptical
Definition: cluster.h:64
#define PI
Definition: const.h:19
unsigned Style
Definition: cluster.h:74
FLOAT32 * CoVariance
Definition: cluster.cpp:173
PROTOTYPE * NewSimpleProto(inT16 N, CLUSTER *Cluster)
Definition: cluster.cpp:1623
PROTOTYPE * NewMixedProto ( inT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine creates a mixed prototype data structure to approximate the samples in the specified cluster. Mixed prototypes can have different distributions for each dimension. All dimensions are independent. The structure is initially filled in as though it were an elliptical prototype. The actual distributions of the dimensions can be altered by other routines.

Parameters
Nnumber of dimensions
Clustercluster to be made into a mixed prototype
Statisticsstatistical info about samples in cluster
Returns
Pointer to a new mixed prototype data structure
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 1598 of file cluster.cpp.

1598  {
1599  PROTOTYPE *Proto;
1600  int i;
1601 
1602  Proto = NewEllipticalProto (N, Cluster, Statistics);
1603  Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
1604 
1605  for (i = 0; i < N; i++) {
1606  Proto->Distrib[i] = normal;
1607  }
1608  Proto->Style = mixed;
1609  return (Proto);
1610 } // NewMixedProto
DISTRIBUTION * Distrib
Definition: cluster.h:77
DISTRIBUTION
Definition: cluster.h:58
Definition: cluster.h:59
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1553
void * Emalloc(int Size)
Definition: emalloc.cpp:47
Definition: cluster.h:45
unsigned Style
Definition: cluster.h:74
PROTOTYPE * NewSimpleProto ( inT16  N,
CLUSTER Cluster 
)

This routine allocates memory to hold a simple prototype data structure, i.e. one without independent distributions and variances for each dimension.

Parameters
Nnumber of dimensions
Clustercluster to be made into a prototype
Returns
Pointer to new simple prototype
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 1623 of file cluster.cpp.

1623  {
1624  PROTOTYPE *Proto;
1625  int i;
1626 
1627  Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
1628  Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1629 
1630  for (i = 0; i < N; i++)
1631  Proto->Mean[i] = Cluster->Mean[i];
1632  Proto->Distrib = NULL;
1633 
1634  Proto->Significant = TRUE;
1635  Proto->Merged = FALSE;
1636  Proto->Style = spherical;
1637  Proto->NumSamples = Cluster->SampleCount;
1638  Proto->Cluster = Cluster;
1639  Proto->Cluster->Prototype = TRUE;
1640  return (Proto);
1641 } // NewSimpleProto
float FLOAT32
Definition: host.h:111
DISTRIBUTION * Distrib
Definition: cluster.h:77
unsigned SampleCount
Definition: cluster.h:35
FLOAT32 * Mean
Definition: cluster.h:78
unsigned Significant
Definition: cluster.h:68
unsigned NumSamples
Definition: cluster.h:75
FLOAT32 Mean[1]
Definition: cluster.h:39
void * Emalloc(int Size)
Definition: emalloc.cpp:47
CLUSTER * Cluster
Definition: cluster.h:76
unsigned Prototype
Definition: cluster.h:34
#define FALSE
Definition: capi.h:29
#define TRUE
Definition: capi.h:28
unsigned Style
Definition: cluster.h:74
#define NULL
Definition: host.h:144
unsigned Merged
Definition: cluster.h:69
PROTOTYPE * NewSphericalProto ( uinT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine creates a spherical prototype data structure to approximate the samples in the specified cluster. Spherical prototypes have a single variance which is common across all dimensions. All dimensions are normally distributed and independent.

Parameters
Nnumber of dimensions
Clustercluster to be made into a spherical prototype
Statisticsstatistical info about samples in cluster
Returns
Pointer to a new spherical prototype data structure
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 1519 of file cluster.cpp.

1521  {
1522  PROTOTYPE *Proto;
1523 
1524  Proto = NewSimpleProto (N, Cluster);
1525 
1526  Proto->Variance.Spherical = Statistics->AvgVariance;
1527  if (Proto->Variance.Spherical < MINVARIANCE)
1528  Proto->Variance.Spherical = MINVARIANCE;
1529 
1530  Proto->Magnitude.Spherical =
1531  1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
1532  Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
1533  (double) N);
1534  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
1535  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1536 
1537  return (Proto);
1538 } // NewSphericalProto
#define MINVARIANCE
Definition: cluster.cpp:142
FLOAT32 AvgVariance
Definition: cluster.cpp:172
FLOAT32 Spherical
Definition: cluster.h:63
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Variance
Definition: cluster.h:81
FLOATUNION Weight
Definition: cluster.h:83
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FLOATUNION Magnitude
Definition: cluster.h:82
#define PI
Definition: const.h:19
PROTOTYPE * NewSimpleProto(inT16 N, CLUSTER *Cluster)
Definition: cluster.cpp:1623
CLUSTER* NextSample ( LIST SearchState)

This routine is used to find all of the samples which belong to a cluster. It starts by removing the top cluster on the cluster list (SearchState). If this cluster is a leaf it is returned. Otherwise, the right subcluster is pushed on the list and we continue the search in the left subcluster. This continues until a leaf is found. If all samples have been found, NULL is returned. InitSampleSearch() must be called before NextSample() to initialize the search.

Parameters
SearchStateptr to list containing clusters to be searched
Returns
Pointer to the next leaf cluster (sample) or NULL.
Note
Exceptions: None
History: 6/16/89, DSJ, Created.

Definition at line 625 of file cluster.cpp.

625  {
626  CLUSTER *Cluster;
627 
628  if (*SearchState == NIL_LIST)
629  return (NULL);
630  Cluster = (CLUSTER *) first_node (*SearchState);
631  *SearchState = pop (*SearchState);
632  while (TRUE) {
633  if (Cluster->Left == NULL)
634  return (Cluster);
635  *SearchState = push (*SearchState, Cluster->Right);
636  Cluster = Cluster->Left;
637  }
638 } // NextSample
struct sample * Left
Definition: cluster.h:36
#define NIL_LIST
Definition: oldlist.h:126
struct sample * Right
Definition: cluster.h:37
#define first_node(l)
Definition: oldlist.h:139
LIST pop(LIST list)
Definition: oldlist.cpp:305
Definition: cluster.h:32
#define TRUE
Definition: capi.h:28
#define NULL
Definition: host.h:144
LIST push(LIST list, void *element)
Definition: oldlist.cpp:323
uinT16 NormalBucket ( PARAM_DESC ParamDesc,
FLOAT32  x,
FLOAT32  Mean,
FLOAT32  StdDev 
)

This routine determines which bucket x falls into in the discrete normal distribution defined by kNormalMean and kNormalStdDev. x values which exceed the range of the discrete distribution are clipped.

Parameters
ParamDescused to identify circular dimensions
xvalue to be normalized
Meanmean of normal distribution
StdDevstandard deviation of normal distribution
Returns
Bucket number into which x falls
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 2088 of file cluster.cpp.

2091  {
2092  FLOAT32 X;
2093 
2094  // wraparound circular parameters if necessary
2095  if (ParamDesc->Circular) {
2096  if (x - Mean > ParamDesc->HalfRange)
2097  x -= ParamDesc->Range;
2098  else if (x - Mean < -ParamDesc->HalfRange)
2099  x += ParamDesc->Range;
2100  }
2101 
2102  X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
2103  if (X < 0)
2104  return 0;
2105  if (X > BUCKETTABLESIZE - 1)
2106  return ((uinT16) (BUCKETTABLESIZE - 1));
2107  return (uinT16) floor((FLOAT64) X);
2108 } // NormalBucket
float FLOAT32
Definition: host.h:111
#define BUCKETTABLESIZE
Definition: cluster.cpp:160
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
inT8 Circular
Definition: ocrfeatures.h:47
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:650
FLOAT32 Range
Definition: ocrfeatures.h:51
unsigned short uinT16
Definition: host.h:101
double FLOAT64
Definition: host.h:112
FLOAT64 NormalDensity ( inT32  x)

This routine computes the probability density function of a discrete normal distribution defined by the global variables kNormalMean, kNormalVariance, and kNormalMagnitude. Normal magnitude could, of course, be computed in terms of the normal variance but it is precomputed for efficiency.

Parameters
xnumber to compute the normal probability density for
Note
Globals: kNormalMean mean of a discrete normal distribution kNormalVariance variance of a discrete normal distribution kNormalMagnitude magnitude of a discrete normal distribution
Returns
The value of the normal distribution at x.
Note
Exceptions: None
History: 6/4/89, DSJ, Created.

Definition at line 1951 of file cluster.cpp.

1951  {
1952  FLOAT64 Distance;
1953 
1954  Distance = x - kNormalMean;
1955  return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
1956 } // NormalDensity
double FLOAT64
Definition: host.h:112
int NumBucketsMatch ( void *  arg1,
void *  arg2 
)

This routine is used to search a list of histogram data structures to find one with the specified number of buckets. It is called by the list search routines.

Parameters
arg1current histogram being tested for a match
arg2match key
Returns
TRUE if arg1 matches arg2
Note
Exceptions: none
History: Thu Aug 3 14:17:33 1989, DSJ, Created.

Definition at line 2266 of file cluster.cpp.

2267  { // uinT16 *DesiredNumberOfBuckets)
2268  BUCKETS *Histogram = (BUCKETS *) arg1;
2269  uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
2270 
2271  return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
2272 
2273 } // NumBucketsMatch
uinT16 NumberOfBuckets
Definition: cluster.cpp:183
unsigned short uinT16
Definition: host.h:101
uinT16 OptimumNumberOfBuckets ( uinT32  SampleCount)

This routine computes the optimum number of histogram buckets that should be used in a chi-squared goodness of fit test for the specified number of samples. The optimum number is computed based on Table 4.1 on pg. 147 of "Measurement and Analysis of Random Data" by Bendat & Piersol. Linear interpolation is used to interpolate between table values. The table is intended for a 0.05 level of significance (alpha). This routine assumes that it is equally valid for other alpha's, which may not be true.

Parameters
SampleCountnumber of samples to be tested
Returns
Optimum number of histogram buckets
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 1859 of file cluster.cpp.

1859  {
1860  uinT8 Last, Next;
1861  FLOAT32 Slope;
1862 
1863  if (SampleCount < kCountTable[0])
1864  return kBucketsTable[0];
1865 
1866  for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
1867  if (SampleCount <= kCountTable[Next]) {
1868  Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) /
1869  (FLOAT32) (kCountTable[Next] - kCountTable[Last]);
1870  return ((uinT16) (kBucketsTable[Last] +
1871  Slope * (SampleCount - kCountTable[Last])));
1872  }
1873  }
1874  return kBucketsTable[Last];
1875 } // OptimumNumberOfBuckets
float FLOAT32
Definition: host.h:111
#define LOOKUPTABLESIZE
Definition: cluster.cpp:228
unsigned short uinT16
Definition: host.h:101
unsigned char uinT8
Definition: host.h:99
FLOAT64 Solve ( SOLVEFUNC  Function,
void *  FunctionParams,
FLOAT64  InitialGuess,
FLOAT64  Accuracy 
)

This routine attempts to find an x value at which Function goes to zero (i.e. a root of the function ). It will only work correctly if a solution actually exists and there are no extrema between the solution and the InitialGuess. The algorithms used are extremely primitive.

Parameters
Functionfunction whose zero is to be found
FunctionParamsarbitrary data to pass to function
InitialGuesspoint to start solution search at
Accuracymaximum allowed error
Returns
Solution of function ( x for which f(x) = 0 ).
Note
Exceptions: none
History: Fri Aug 4 11:08:59 1989, DSJ, Created.

Definition at line 2397 of file cluster.cpp.

2401 {
2402  FLOAT64 x;
2403  FLOAT64 f;
2404  FLOAT64 Slope;
2405  FLOAT64 Delta;
2406  FLOAT64 NewDelta;
2407  FLOAT64 xDelta;
2408  FLOAT64 LastPosX, LastNegX;
2409 
2410  x = InitialGuess;
2411  Delta = INITIALDELTA;
2412  LastPosX = MAX_FLOAT32;
2413  LastNegX = -MAX_FLOAT32;
2414  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2415  while (Abs (LastPosX - LastNegX) > Accuracy) {
2416  // keep track of outer bounds of current estimate
2417  if (f < 0)
2418  LastNegX = x;
2419  else
2420  LastPosX = x;
2421 
2422  // compute the approx. slope of f(x) at the current point
2423  Slope =
2424  ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
2425 
2426  // compute the next solution guess */
2427  xDelta = f / Slope;
2428  x -= xDelta;
2429 
2430  // reduce the delta used for computing slope to be a fraction of
2431  //the amount moved to get to the new guess
2432  NewDelta = Abs (xDelta) * DELTARATIO;
2433  if (NewDelta < Delta)
2434  Delta = NewDelta;
2435 
2436  // compute the value of the function at the new guess
2437  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2438  }
2439  return (x);
2440 
2441 } // Solve
#define INITIALDELTA
#define Abs(N)
Definition: cluster.cpp:208
#define MAX_FLOAT32
Definition: host.h:124
#define DELTARATIO
double FLOAT64
Definition: host.h:112
FLOAT32 StandardDeviation ( PROTOTYPE Proto,
uinT16  Dimension 
)

This routine returns the standard deviation of the prototype in the indicated dimension.

Parameters
Protoprototype to return standard deviation of
Dimensiondimension whose stddev is to be returned
Returns
Standard deviation of Prototype in Dimension
Note
Exceptions: none
History: 7/6/89, DSJ, Created.

Definition at line 664 of file cluster.cpp.

664  {
665  switch (Proto->Style) {
666  case spherical:
667  return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
668  case elliptical:
669  return ((FLOAT32)
670  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
671  case mixed:
672  switch (Proto->Distrib[Dimension]) {
673  case normal:
674  return ((FLOAT32)
675  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
676  case uniform:
677  case D_random:
678  return (Proto->Variance.Elliptical[Dimension]);
679  case DISTRIBUTION_COUNT:
680  ASSERT_HOST(!"Distribution count not allowed!");
681  }
682  }
683  return 0.0f;
684 } // StandardDeviation
float FLOAT32
Definition: host.h:111
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 Spherical
Definition: cluster.h:63
FLOATUNION Variance
Definition: cluster.h:81
Definition: cluster.h:59
#define ASSERT_HOST(x)
Definition: errcode.h:84
FLOAT32 * Elliptical
Definition: cluster.h:64
Definition: cluster.h:45
unsigned Style
Definition: cluster.h:74
PROTOTYPE * TestEllipticalProto ( CLUSTERER Clusterer,
CLUSTERCONFIG Config,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine tests the specified cluster to see if ** there is a statistically significant difference between the sub-clusters that would be made if the cluster were to be split. If not, then a new prototype is formed and returned to the caller. If there is, then NULL is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Configprovides the magic number of samples that make a good cluster
Clustercluster to be made into an elliptical prototype
Statisticsstatistical info about cluster
Returns
Pointer to new elliptical prototype or NULL.

Definition at line 1119 of file cluster.cpp.

1122  {
1123  // Fraction of the number of samples used as a range around 1 within
1124  // which a cluster has the magic size that allows a boost to the
1125  // FTable by kFTableBoostMargin, thus allowing clusters near the
1126  // magic size (equal to the number of sample characters) to be more
1127  // likely to stay together.
1128  const double kMagicSampleMargin = 0.0625;
1129  const double kFTableBoostMargin = 2.0;
1130 
1131  int N = Clusterer->SampleSize;
1132  CLUSTER* Left = Cluster->Left;
1133  CLUSTER* Right = Cluster->Right;
1134  if (Left == NULL || Right == NULL)
1135  return NULL;
1136  int TotalDims = Left->SampleCount + Right->SampleCount;
1137  if (TotalDims < N + 1 || TotalDims < 2)
1138  return NULL;
1139  const int kMatrixSize = N * N * sizeof(FLOAT32);
1140  FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1141  FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1142  FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
1143  // Compute a new covariance matrix that only uses essential features.
1144  for (int i = 0; i < N; ++i) {
1145  int row_offset = i * N;
1146  if (!Clusterer->ParamDesc[i].NonEssential) {
1147  for (int j = 0; j < N; ++j) {
1148  if (!Clusterer->ParamDesc[j].NonEssential)
1149  Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
1150  else
1151  Covariance[j + row_offset] = 0.0f;
1152  }
1153  } else {
1154  for (int j = 0; j < N; ++j) {
1155  if (i == j)
1156  Covariance[j + row_offset] = 1.0f;
1157  else
1158  Covariance[j + row_offset] = 0.0f;
1159  }
1160  }
1161  }
1162  double err = InvertMatrix(Covariance, N, Inverse);
1163  if (err > 1) {
1164  tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
1165  }
1166  int EssentialN = 0;
1167  for (int dim = 0; dim < N; ++dim) {
1168  if (!Clusterer->ParamDesc[dim].NonEssential) {
1169  Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
1170  ++EssentialN;
1171  } else {
1172  Delta[dim] = 0.0f;
1173  }
1174  }
1175  // Compute Hotelling's T-squared.
1176  double Tsq = 0.0;
1177  for (int x = 0; x < N; ++x) {
1178  double temp = 0.0;
1179  for (int y = 0; y < N; ++y) {
1180  temp += Inverse[y + N*x] * Delta[y];
1181  }
1182  Tsq += Delta[x] * temp;
1183  }
1184  memfree(Covariance);
1185  memfree(Inverse);
1186  memfree(Delta);
1187  // Changed this function to match the formula in
1188  // Statistical Methods in Medical Research p 473
1189  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
1190  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
1191  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
1192  int Fx = EssentialN;
1193  if (Fx > FTABLE_X)
1194  Fx = FTABLE_X;
1195  --Fx;
1196  int Fy = TotalDims - EssentialN - 1;
1197  if (Fy > FTABLE_Y)
1198  Fy = FTABLE_Y;
1199  --Fy;
1200  double FTarget = FTable[Fy][Fx];
1201  if (Config->MagicSamples > 0 &&
1202  TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
1203  TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
1204  // Give magic-sized clusters a magic FTable boost.
1205  FTarget += kFTableBoostMargin;
1206  }
1207  if (F < FTarget) {
1208  return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1209  }
1210  return NULL;
1211 }
void memfree(void *element)
Definition: freelist.cpp:30
#define FTABLE_Y
Definition: cluster.cpp:32
struct sample * Left
Definition: cluster.h:36
float FLOAT32
Definition: host.h:111
#define FTABLE_X
Definition: cluster.cpp:31
unsigned SampleCount
Definition: cluster.h:35
#define tprintf(...)
Definition: tprintf.h:31
double InvertMatrix(const float *input, int size, float *inv)
Definition: cluster.cpp:2569
struct sample * Right
Definition: cluster.h:37
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1553
int MagicSamples
Definition: cluster.h:55
FLOAT32 Mean[1]
Definition: cluster.h:39
void * Emalloc(int Size)
Definition: emalloc.cpp:47
const double FTable[FTABLE_Y][FTABLE_X]
Definition: cluster.cpp:35
inT8 NonEssential
Definition: ocrfeatures.h:48
PARAM_DESC * ParamDesc
Definition: cluster.h:88
Definition: cluster.h:32
#define NULL
Definition: host.h:144
inT16 SampleSize
Definition: cluster.h:87
FLOAT32 * CoVariance
Definition: cluster.cpp:173
uinT16 UniformBucket ( PARAM_DESC ParamDesc,
FLOAT32  x,
FLOAT32  Mean,
FLOAT32  StdDev 
)

This routine determines which bucket x falls into in the discrete uniform distribution defined by BUCKETTABLESIZE. x values which exceed the range of the discrete distribution are clipped.

Parameters
ParamDescused to identify circular dimensions
xvalue to be normalized
Meancenter of range of uniform distribution
StdDev1/2 the range of the uniform distribution
Returns
Bucket number into which x falls
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 2124 of file cluster.cpp.

2127  {
2128  FLOAT32 X;
2129 
2130  // wraparound circular parameters if necessary
2131  if (ParamDesc->Circular) {
2132  if (x - Mean > ParamDesc->HalfRange)
2133  x -= ParamDesc->Range;
2134  else if (x - Mean < -ParamDesc->HalfRange)
2135  x += ParamDesc->Range;
2136  }
2137 
2138  X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2139  if (X < 0)
2140  return 0;
2141  if (X > BUCKETTABLESIZE - 1)
2142  return (uinT16) (BUCKETTABLESIZE - 1);
2143  return (uinT16) floor((FLOAT64) X);
2144 } // UniformBucket
float FLOAT32
Definition: host.h:111
#define BUCKETTABLESIZE
Definition: cluster.cpp:160
FLOAT32 HalfRange
Definition: ocrfeatures.h:52
inT8 Circular
Definition: ocrfeatures.h:47
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:650
FLOAT32 Range
Definition: ocrfeatures.h:51
unsigned short uinT16
Definition: host.h:101
double FLOAT64
Definition: host.h:112
FLOAT64 UniformDensity ( inT32  x)

This routine computes the probability density function of a uniform distribution at the specified point. The range of the distribution is from 0 to BUCKETTABLESIZE.

Parameters
xnumber to compute the uniform probability density for
Returns
The value of the uniform distribution at x.
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 1968 of file cluster.cpp.

1968  {
1969  static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
1970 
1971  if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
1972  return UniformDistributionDensity;
1973  else
1974  return (FLOAT64) 0.0;
1975 } // UniformDensity
#define BUCKETTABLESIZE
Definition: cluster.cpp:160
double FLOAT64
Definition: host.h:112

Variable Documentation

const double FTable[FTABLE_Y][FTABLE_X]

Definition at line 35 of file cluster.cpp.