18#define _USE_MATH_DEFINES
1230 {6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}};
1236#define MINVARIANCE 0.0004
1244#define MINSAMPLESPERBUCKET 5
1245#define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
1246#define MINSAMPLESNEEDED 1
1254#define BUCKETTABLESIZE 1024
1255#define NORMALEXTENT 3.0
1314#define Odd(N) ((N) % 2)
1315#define Mirror(N, R) ((R) - (N)-1)
1316#define Abs(N) (((N) < 0) ? (-(N)) : (N))
1326#define SqrtOf2Pi 2.506628275
1328static const double kNormalVariance =
1335#define LOOKUPTABLESIZE 8
1336#define MAXDEGREESOFFREEDOM MAXBUCKETS
1347static void CreateClusterTree(CLUSTERER *Clusterer);
1349static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level);
1351static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster,
float *Distance);
1353static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
1355static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config);
1357static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config, CLUSTER *Cluster);
1359static PROTOTYPE *MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics,
1362static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config, CLUSTER *Cluster,
1363 STATISTICS *Statistics);
1365static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
1368static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
1369 STATISTICS *Statistics, BUCKETS *Buckets);
1371static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
1372 BUCKETS *NormalBuckets,
double Confidence);
1374static void MakeDimRandom(uint16_t
i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
1376static void MakeDimUniform(uint16_t
i, PROTOTYPE *Proto, STATISTICS *Statistics);
1378static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster);
1380static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics);
1382static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);
1384static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);
1386static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster);
1388static bool Independent(PARAM_DESC *ParamDesc, int16_t N,
float *CoVariance,
float Independence);
1390static BUCKETS *GetBuckets(CLUSTERER *clusterer,
DISTRIBUTION Distribution, uint32_t SampleCount,
1393static BUCKETS *MakeBuckets(
DISTRIBUTION Distribution, uint32_t SampleCount,
double Confidence);
1395static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount);
1397static double ComputeChiSquared(uint16_t DegreesOfFreedom,
double Alpha);
1399static double NormalDensity(int32_t
x);
1401static double UniformDensity(int32_t
x);
1403static double Integral(
double f1,
double f2,
double Dx);
1405static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
1406 float Mean,
float StdDev);
1408static uint16_t NormalBucket(PARAM_DESC *ParamDesc,
float x,
float Mean,
float StdDev);
1410static uint16_t UniformBucket(PARAM_DESC *ParamDesc,
float x,
float Mean,
float StdDev);
1412static bool DistributionOK(BUCKETS *Buckets);
1414static uint16_t DegreesOfFreedom(
DISTRIBUTION Distribution, uint16_t HistogramBuckets);
1416static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount);
1418static void InitBuckets(BUCKETS *Buckets);
1420static int AlphaMatch(
void *arg1,
1423static double Solve(
SOLVEFUNC Function,
void *FunctionParams,
double InitialGuess,
double Accuracy);
1425static double ChiArea(CHISTRUCT *ChiParams,
double x);
1427static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster,
float MaxIllegal);
1429static double InvertMatrix(
const float *input,
int size,
float *inv);
1446 Clusterer->NumberOfSamples = 0;
1447 Clusterer->NumChar = 0;
1450 Clusterer->Root =
nullptr;
1454 Clusterer->ParamDesc =
new PARAM_DESC[SampleSize];
1455 for (
i = 0;
i < SampleSize;
i++) {
1457 Clusterer->ParamDesc[
i].NonEssential = ParamDesc[
i].
NonEssential;
1458 Clusterer->ParamDesc[
i].Min = ParamDesc[
i].
Min;
1459 Clusterer->ParamDesc[
i].Max = ParamDesc[
i].
Max;
1460 Clusterer->ParamDesc[
i].Range = ParamDesc[
i].
Max - ParamDesc[
i].
Min;
1461 Clusterer->ParamDesc[
i].HalfRange = Clusterer->ParamDesc[
i].Range / 2;
1462 Clusterer->ParamDesc[
i].MidRange = (ParamDesc[
i].
Max + ParamDesc[
i].
Min) / 2;
1466 Clusterer->KDTree =
MakeKDTree(SampleSize, ParamDesc);
1469 for (
auto &d : Clusterer->bucket_cache) {
1500 Sample->Clustered =
false;
1501 Sample->Prototype =
false;
1502 Sample->SampleCount = 1;
1503 Sample->Left =
nullptr;
1504 Sample->Right =
nullptr;
1505 Sample->CharID = CharID;
1508 Sample->Mean[
i] = Feature[
i];
1514 if (CharID >= Clusterer->
NumChar) {
1515 Clusterer->
NumChar = CharID + 1;
1545 if (Clusterer->
Root ==
nullptr) {
1546 CreateClusterTree(Clusterer);
1554 ComputePrototypes(Clusterer,
Config);
1576 if (Clusterer !=
nullptr) {
1578 delete Clusterer->
KDTree;
1579 delete Clusterer->
Root;
1609 auto *Prototype =
static_cast<PROTOTYPE *
>(arg);
1612 if (Prototype->Cluster !=
nullptr) {
1618 delete[] Prototype->Variance.Elliptical;
1619 delete[] Prototype->Magnitude.Elliptical;
1620 delete[] Prototype->Weight.Elliptical;
1644 Cluster =
reinterpret_cast<CLUSTER *
>((*SearchState)->first_node());
1645 *SearchState =
pop(*SearchState);
1647 if (
Cluster->Left ==
nullptr) {
1650 *SearchState =
push(*SearchState,
Cluster->Right);
1663 return (Proto->
Mean[Dimension]);
1674 switch (Proto->
Style) {
1680 switch (Proto->
Distrib[Dimension]) {
1709static void CreateClusterTree(CLUSTERER *Clusterer) {
1710 ClusteringContext context;
1715 context.tree = Clusterer->KDTree;
1716 context.candidates =
new TEMPCLUSTER[Clusterer->NumberOfSamples];
1718 context.heap =
new ClusterHeap(Clusterer->NumberOfSamples);
1719 KDWalk(context.tree, MakePotentialClusters, &context);
1722 while (context.heap->Pop(&HeapEntry)) {
1723 TEMPCLUSTER *PotentialCluster = HeapEntry.data();
1727 if (PotentialCluster->Cluster->Clustered) {
1733 else if (PotentialCluster->Neighbor->Clustered) {
1734 PotentialCluster->Neighbor =
1735 FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
1736 if (PotentialCluster->Neighbor !=
nullptr) {
1737 context.heap->Push(&HeapEntry);
1743 PotentialCluster->Cluster = MakeNewCluster(Clusterer, PotentialCluster);
1744 PotentialCluster->Neighbor =
1745 FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
1746 if (PotentialCluster->Neighbor !=
nullptr) {
1747 context.heap->Push(&HeapEntry);
1753 Clusterer->Root =
static_cast<CLUSTER *
> RootOf(Clusterer->KDTree);
1756 delete context.tree;
1757 Clusterer->KDTree =
nullptr;
1758 delete context.heap;
1759 delete[] context.candidates;
1771static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t ) {
1773 int next = context->next;
1774 context->candidates[
next].Cluster = Cluster;
1775 HeapEntry.data() = &(context->candidates[
next]);
1776 context->candidates[
next].Neighbor =
1777 FindNearestNeighbor(context->tree, context->candidates[
next].Cluster, &HeapEntry.key());
1778 if (context->candidates[
next].Neighbor !=
nullptr) {
1779 context->heap->Push(&HeapEntry);
1797static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster,
float *Distance)
1798#define MAXNEIGHBORS 2
1799#define MAXDISTANCE FLT_MAX
1803 int NumberOfNeighbors;
1805 CLUSTER *BestNeighbor;
1809 reinterpret_cast<void **
>(Neighbor), Dist);
1813 BestNeighbor =
nullptr;
1814 for (
i = 0;
i < NumberOfNeighbors;
i++) {
1815 if ((Dist[
i] < *Distance) && (Neighbor[
i] != Cluster)) {
1816 *Distance = Dist[
i];
1817 BestNeighbor = Neighbor[
i];
1820 return BestNeighbor;
1832static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
1834 auto Cluster =
new CLUSTER(Clusterer->SampleSize);
1835 Cluster->Clustered =
false;
1836 Cluster->Prototype =
false;
1837 Cluster->Left = TempCluster->Cluster;
1838 Cluster->Right = TempCluster->Neighbor;
1839 Cluster->CharID = -1;
1842 Cluster->Left->Clustered =
true;
1843 Cluster->Right->Clustered =
true;
1844 KDDelete(Clusterer->KDTree, &Cluster->Left->Mean[0], Cluster->Left);
1845 KDDelete(Clusterer->KDTree, &Cluster->Right->Mean[0], Cluster->Right);
1848 Cluster->SampleCount =
MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
1849 Cluster->Left->SampleCount, Cluster->Right->SampleCount,
1850 &Cluster->Mean[0], &Cluster->Left->Mean[0], &Cluster->Right->Mean[0]);
1853 KDStore(Clusterer->KDTree, &Cluster->Mean[0], Cluster);
1871 float m1[],
float m2[]) {
1875 for (
i = N;
i > 0;
i--, ParamDesc++, m++, m1++, m2++) {
1880 if ((*m2 - *m1) > ParamDesc->
HalfRange) {
1881 *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->
Range)) / n;
1882 if (*m < ParamDesc->Min) {
1883 *m += ParamDesc->
Range;
1885 }
else if ((*m1 - *m2) > ParamDesc->
HalfRange) {
1886 *m = (n1 * (*m1 - ParamDesc->
Range) + n2 * *m2) / n;
1887 if (*m < ParamDesc->Min) {
1888 *m += ParamDesc->
Range;
1891 *m = (n1 * *m1 + n2 * *m2) / n;
1894 *m = (n1 * *m1 + n2 * *m2) / n;
1908static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config) {
1911 PROTOTYPE *Prototype;
1915 if (Clusterer->Root !=
nullptr) {
1924 Cluster =
reinterpret_cast<CLUSTER *
>(ClusterStack->first_node());
1925 ClusterStack =
pop(ClusterStack);
1926 Prototype = MakePrototype(Clusterer,
Config, Cluster);
1927 if (Prototype !=
nullptr) {
1928 Clusterer->ProtoList =
push(Clusterer->ProtoList, Prototype);
1930 ClusterStack =
push(ClusterStack, Cluster->Right);
1931 ClusterStack =
push(ClusterStack, Cluster->Left);
1951static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config, CLUSTER *Cluster) {
1961 auto Statistics = ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
1966 Proto = MakeDegenerateProto(Clusterer->SampleSize, Cluster, Statistics,
Config->
ProtoStyle,
1968 if (Proto !=
nullptr) {
1973 if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, &Statistics->CoVariance[0],
1980 Proto = TestEllipticalProto(Clusterer,
Config, Cluster, Statistics);
1981 if (Proto !=
nullptr) {
1993 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1996 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1999 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
Config->
Confidence);
2002 Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
2003 if (Proto !=
nullptr) {
2006 Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
2007 if (Proto !=
nullptr) {
2010 Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
Config->
Confidence);
2036static PROTOTYPE *MakeDegenerateProto(
2037 uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics,
PROTOSTYLE Style, int32_t MinSamples) {
2038 PROTOTYPE *Proto =
nullptr;
2044 if (Cluster->SampleCount < MinSamples) {
2047 Proto = NewSphericalProto(N, Cluster, Statistics);
2051 Proto = NewEllipticalProto(N, Cluster, Statistics);
2054 Proto = NewMixedProto(N, Cluster, Statistics);
2057 Proto->Significant =
false;
2075static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *
Config, CLUSTER *Cluster,
2076 STATISTICS *Statistics) {
2082 const double kMagicSampleMargin = 0.0625;
2083 const double kFTableBoostMargin = 2.0;
2085 int N = Clusterer->SampleSize;
2086 CLUSTER *Left = Cluster->Left;
2087 CLUSTER *Right = Cluster->Right;
2088 if (Left ==
nullptr || Right ==
nullptr) {
2091 int TotalDims = Left->SampleCount + Right->SampleCount;
2092 if (TotalDims < N + 1 || TotalDims < 2) {
2095 std::vector<float> Covariance(
static_cast<size_t>(N) * N);
2096 std::vector<float> Inverse(
static_cast<size_t>(N) * N);
2097 std::vector<float> Delta(N);
2099 for (
int i = 0;
i < N; ++
i) {
2100 int row_offset =
i * N;
2101 if (!Clusterer->ParamDesc[
i].NonEssential) {
2102 for (
int j = 0; j < N; ++j) {
2103 if (!Clusterer->ParamDesc[j].NonEssential) {
2104 Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
2106 Covariance[j + row_offset] = 0.0f;
2110 for (
int j = 0; j < N; ++j) {
2112 Covariance[j + row_offset] = 1.0f;
2114 Covariance[j + row_offset] = 0.0f;
2119 double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
2121 tprintf(
"Clustering error: Matrix inverse failed with error %g\n", err);
2124 for (
int dim = 0; dim < N; ++dim) {
2125 if (!Clusterer->ParamDesc[dim].NonEssential) {
2126 Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
2134 for (
int x = 0;
x < N; ++
x) {
2136 for (
int y = 0;
y < N; ++
y) {
2137 temp +=
static_cast<double>(Inverse[
y + N *
x]) * Delta[
y];
2139 Tsq += Delta[
x] * temp;
2145 double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2) * EssentialN);
2146 int Fx = EssentialN;
2151 int Fy = TotalDims - EssentialN - 1;
2156 double FTarget =
FTable[Fy][Fx];
2160 FTarget += kFTableBoostMargin;
2163 return NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
2179static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
2181 PROTOTYPE *Proto =
nullptr;
2185 for (
i = 0;
i < Clusterer->SampleSize;
i++) {
2186 if (Clusterer->ParamDesc[
i].NonEssential) {
2190 FillBuckets(Buckets, Cluster,
i, &(Clusterer->ParamDesc[
i]), Cluster->Mean[
i],
2191 sqrt(
static_cast<double>(Statistics->AvgVariance)));
2192 if (!DistributionOK(Buckets)) {
2197 if (
i >= Clusterer->SampleSize) {
2198 Proto = NewSphericalProto(Clusterer->SampleSize, Cluster, Statistics);
2214static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
2215 STATISTICS *Statistics, BUCKETS *Buckets) {
2216 PROTOTYPE *Proto =
nullptr;
2220 for (
i = 0;
i < Clusterer->SampleSize;
i++) {
2221 if (Clusterer->ParamDesc[
i].NonEssential) {
2225 FillBuckets(Buckets, Cluster,
i, &(Clusterer->ParamDesc[
i]), Cluster->Mean[
i],
2226 sqrt(
static_cast<double>(Statistics->CoVariance[
i * (Clusterer->SampleSize + 1)])));
2227 if (!DistributionOK(Buckets)) {
2232 if (
i >= Clusterer->SampleSize) {
2233 Proto = NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
2253static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
2254 BUCKETS *NormalBuckets,
double Confidence) {
2257 BUCKETS *UniformBuckets =
nullptr;
2258 BUCKETS *RandomBuckets =
nullptr;
2261 Proto = NewMixedProto(Clusterer->SampleSize, Cluster, Statistics);
2264 for (
i = 0;
i < Clusterer->SampleSize;
i++) {
2265 if (Clusterer->ParamDesc[
i].NonEssential) {
2269 FillBuckets(NormalBuckets, Cluster,
i, &(Clusterer->ParamDesc[
i]), Proto->Mean[
i],
2270 std::sqrt(Proto->Variance.Elliptical[
i]));
2271 if (DistributionOK(NormalBuckets)) {
2275 if (RandomBuckets ==
nullptr) {
2276 RandomBuckets = GetBuckets(Clusterer,
D_random, Cluster->SampleCount, Confidence);
2278 MakeDimRandom(
i, Proto, &(Clusterer->ParamDesc[
i]));
2279 FillBuckets(RandomBuckets, Cluster,
i, &(Clusterer->ParamDesc[
i]), Proto->Mean[
i],
2280 Proto->Variance.Elliptical[
i]);
2281 if (DistributionOK(RandomBuckets)) {
2285 if (UniformBuckets ==
nullptr) {
2286 UniformBuckets = GetBuckets(Clusterer,
uniform, Cluster->SampleCount, Confidence);
2288 MakeDimUniform(
i, Proto, Statistics);
2289 FillBuckets(UniformBuckets, Cluster,
i, &(Clusterer->ParamDesc[
i]), Proto->Mean[
i],
2290 Proto->Variance.Elliptical[
i]);
2291 if (DistributionOK(UniformBuckets)) {
2297 if (i < Clusterer->SampleSize) {
2311static void MakeDimRandom(uint16_t
i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
2313 Proto->Mean[
i] = ParamDesc->MidRange;
2314 Proto->Variance.Elliptical[
i] = ParamDesc->HalfRange;
2317 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[
i];
2318 Proto->Magnitude.Elliptical[
i] = 1.0 / ParamDesc->Range;
2319 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[
i];
2320 Proto->LogMagnitude = log(
static_cast<double>(Proto->TotalMagnitude));
2332static void MakeDimUniform(uint16_t
i, PROTOTYPE *Proto, STATISTICS *Statistics) {
2334 Proto->Mean[
i] = Proto->Cluster->Mean[
i] + (Statistics->Min[
i] + Statistics->Max[
i]) / 2;
2335 Proto->Variance.Elliptical[
i] = (Statistics->Max[
i] - Statistics->Min[
i]) / 2;
2341 Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[
i];
2342 Proto->Magnitude.Elliptical[
i] = 1.0 / (2.0 * Proto->Variance.Elliptical[
i]);
2343 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[
i];
2344 Proto->LogMagnitude = log(
static_cast<double>(Proto->TotalMagnitude));
2363static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster) {
2367 uint32_t SampleCountAdjustedForBias;
2370 auto Statistics =
new STATISTICS(N);
2373 std::vector<float> Distance(N);
2377 while ((Sample =
NextSample(&SearchState)) !=
nullptr) {
2378 for (
i = 0;
i < N;
i++) {
2379 Distance[
i] = Sample->Mean[
i] - Cluster->Mean[
i];
2380 if (ParamDesc[
i].Circular) {
2381 if (Distance[
i] > ParamDesc[
i].HalfRange) {
2382 Distance[
i] -= ParamDesc[
i].Range;
2384 if (Distance[
i] < -ParamDesc[
i].HalfRange) {
2385 Distance[
i] += ParamDesc[
i].Range;
2388 if (Distance[
i] < Statistics->Min[
i]) {
2389 Statistics->Min[
i] = Distance[
i];
2391 if (Distance[
i] > Statistics->Max[
i]) {
2392 Statistics->Max[
i] = Distance[
i];
2395 auto CoVariance = &Statistics->CoVariance[0];
2396 for (
i = 0;
i < N;
i++) {
2397 for (j = 0; j < N; j++, CoVariance++) {
2398 *CoVariance += Distance[
i] * Distance[j];
2406 if (Cluster->SampleCount > 1) {
2407 SampleCountAdjustedForBias = Cluster->SampleCount - 1;
2409 SampleCountAdjustedForBias = 1;
2411 auto CoVariance = &Statistics->CoVariance[0];
2412 for (
i = 0;
i < N;
i++) {
2413 for (j = 0; j < N; j++, CoVariance++) {
2414 *CoVariance /= SampleCountAdjustedForBias;
2419 Statistics->AvgVariance *= *CoVariance;
2423 Statistics->AvgVariance =
2424 static_cast<float>(pow(
static_cast<double>(Statistics->AvgVariance), 1.0 / N));
2440static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2443 Proto = NewSimpleProto(N, Cluster);
2445 Proto->Variance.Spherical = Statistics->AvgVariance;
2450 Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
2451 Proto->TotalMagnitude =
static_cast<float>(
2452 pow(
static_cast<double>(Proto->Magnitude.Spherical),
static_cast<double>(N)));
2453 Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
2454 Proto->LogMagnitude = log(
static_cast<double>(Proto->TotalMagnitude));
2469static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2473 Proto = NewSimpleProto(N, Cluster);
2474 Proto->Variance.Elliptical =
new float[N];
2475 Proto->Magnitude.Elliptical =
new float[N];
2476 Proto->Weight.Elliptical =
new float[N];
2478 auto CoVariance = &Statistics->CoVariance[0];
2479 Proto->TotalMagnitude = 1.0;
2480 for (
i = 0;
i < N;
i++, CoVariance += N + 1) {
2481 Proto->Variance.Elliptical[
i] = *CoVariance;
2486 Proto->Magnitude.Elliptical[
i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[
i]);
2487 Proto->Weight.Elliptical[
i] = 1.0f / Proto->Variance.Elliptical[
i];
2488 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[
i];
2490 Proto->LogMagnitude = log(
static_cast<double>(Proto->TotalMagnitude));
2508static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
2509 auto Proto = NewEllipticalProto(N, Cluster, Statistics);
2510 Proto->Distrib.clear();
2511 Proto->Distrib.resize(N,
normal);
2512 Proto->Style =
mixed;
2524static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) {
2525 auto Proto =
new PROTOTYPE;
2526 Proto->
Mean = Cluster->Mean;
2527 Proto->Distrib.clear();
2528 Proto->Significant =
true;
2529 Proto->Merged =
false;
2531 Proto->NumSamples = Cluster->SampleCount;
2532 Proto->Cluster = Cluster;
2533 Proto->Cluster->Prototype =
true;
2555static bool Independent(PARAM_DESC *ParamDesc, int16_t N,
float *CoVariance,
float Independence) {
2559 float CorrelationCoeff;
2562 for (
i = 0;
i < N;
i++, VARii += N + 1) {
2563 if (ParamDesc[
i].NonEssential) {
2567 VARjj = VARii + N + 1;
2568 CoVariance = VARii + 1;
2569 for (j =
i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
2570 if (ParamDesc[j].NonEssential) {
2574 if ((*VARii == 0.0) || (*VARjj == 0.0)) {
2575 CorrelationCoeff = 0.0;
2577 CorrelationCoeff = sqrt(std::sqrt(*CoVariance * *CoVariance / (*VARii * *VARjj)));
2579 if (CorrelationCoeff > Independence) {
2602static BUCKETS *GetBuckets(CLUSTERER *clusterer,
DISTRIBUTION Distribution, uint32_t SampleCount,
2603 double Confidence) {
2605 uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
2606 BUCKETS *Buckets = clusterer->bucket_cache[Distribution][NumberOfBuckets -
MINBUCKETS];
2609 if (Buckets ==
nullptr) {
2610 Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
2611 clusterer->bucket_cache[Distribution][NumberOfBuckets -
MINBUCKETS] = Buckets;
2614 if (SampleCount != Buckets->SampleCount) {
2615 AdjustBuckets(Buckets, SampleCount);
2617 if (Confidence != Buckets->Confidence) {
2618 Buckets->Confidence = Confidence;
2619 Buckets->ChiSquared =
2620 ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
2622 InitBuckets(Buckets);
2643static BUCKETS *MakeBuckets(
DISTRIBUTION Distribution, uint32_t SampleCount,
double Confidence) {
2644 const DENSITYFUNC DensityFunction[] = {NormalDensity, UniformDensity, UniformDensity};
2646 double BucketProbability;
2647 double NextBucketBoundary;
2649 double ProbabilityDelta;
2650 double LastProbDensity;
2652 uint16_t CurrentBucket;
2656 auto Buckets =
new BUCKETS(OptimumNumberOfBuckets(SampleCount));
2657 Buckets->SampleCount = SampleCount;
2658 Buckets->Confidence = Confidence;
2661 Buckets->Distribution = Distribution;
2665 Buckets->ChiSquared =
2666 ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
2670 BucketProbability = 1.0 /
static_cast<double>(Buckets->NumberOfBuckets);
2673 CurrentBucket = Buckets->NumberOfBuckets / 2;
2674 if (
Odd(Buckets->NumberOfBuckets)) {
2675 NextBucketBoundary = BucketProbability / 2;
2677 NextBucketBoundary = BucketProbability;
2681 LastProbDensity = (*DensityFunction[
static_cast<int>(Distribution)])(
BUCKETTABLESIZE / 2);
2683 ProbDensity = (*DensityFunction[
static_cast<int>(Distribution)])(
i + 1);
2684 ProbabilityDelta = Integral(LastProbDensity, ProbDensity, 1.0);
2685 Probability += ProbabilityDelta;
2686 if (Probability > NextBucketBoundary) {
2687 if (CurrentBucket < Buckets->NumberOfBuckets - 1) {
2690 NextBucketBoundary += BucketProbability;
2692 Buckets->Bucket[
i] = CurrentBucket;
2693 Buckets->ExpectedCount[CurrentBucket] +=
static_cast<float>(ProbabilityDelta * SampleCount);
2694 LastProbDensity = ProbDensity;
2697 Buckets->ExpectedCount[CurrentBucket] +=
static_cast<float>((0.5 - Probability) * SampleCount);
2701 Buckets->Bucket[
i] =
Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
2705 for (
i = 0, j = Buckets->NumberOfBuckets - 1;
i <= j;
i++, j--) {
2706 Buckets->ExpectedCount[
i] += Buckets->ExpectedCount[j];
2725static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) {
2729 if (SampleCount < kCountTable[0]) {
2730 return kBucketsTable[0];
2734 if (SampleCount <= kCountTable[Next]) {
2735 Slope =
static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) /
2736 static_cast<float>(kCountTable[Next] - kCountTable[Last]);
2738 static_cast<uint16_t
>(kBucketsTable[Last] + Slope * (SampleCount - kCountTable[Last])));
2741 return kBucketsTable[Last];
2760static double ComputeChiSquared(uint16_t DegreesOfFreedom,
double Alpha)
2761#define CHIACCURACY 0.01
2762#define MINALPHA (1e-200)
2769 if (
Odd(DegreesOfFreedom)) {
2776 CHISTRUCT SearchKey(0.0, Alpha);
2777 auto *found =
search(ChiWith[DegreesOfFreedom], &SearchKey, AlphaMatch);
2778 auto OldChiSquared =
reinterpret_cast<CHISTRUCT *
>(found ? found->first_node() :
nullptr);
2780 if (OldChiSquared ==
nullptr) {
2781 OldChiSquared =
new CHISTRUCT(DegreesOfFreedom, Alpha);
2782 OldChiSquared->ChiSquared =
2783 Solve(ChiArea, OldChiSquared,
static_cast<double>(DegreesOfFreedom),
CHIACCURACY);
2784 ChiWith[DegreesOfFreedom] =
push(ChiWith[DegreesOfFreedom], OldChiSquared);
2789 return (OldChiSquared->ChiSquared);
2806static double NormalDensity(int32_t
x) {
2809 Distance =
x - kNormalMean;
2810 return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
2820static double UniformDensity(int32_t
x) {
2824 return UniformDistributionDensity;
2838static double Integral(
double f1,
double f2,
double Dx) {
2839 return (f1 + f2) * Dx / 2.0;
2862static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
2863 float Mean,
float StdDev) {
2870 for (
i = 0;
i < Buckets->NumberOfBuckets;
i++) {
2871 Buckets->Count[
i] = 0;
2874 if (StdDev == 0.0) {
2883 while ((Sample =
NextSample(&SearchState)) !=
nullptr) {
2884 if (Sample->Mean[Dim] >
Mean) {
2885 BucketID = Buckets->NumberOfBuckets - 1;
2886 }
else if (Sample->Mean[Dim] <
Mean) {
2891 Buckets->Count[BucketID] += 1;
2893 if (
i >= Buckets->NumberOfBuckets) {
2900 while ((Sample =
NextSample(&SearchState)) !=
nullptr) {
2901 switch (Buckets->Distribution) {
2903 BucketID = NormalBucket(ParamDesc, Sample->Mean[Dim],
Mean, StdDev);
2907 BucketID = UniformBucket(ParamDesc, Sample->Mean[Dim],
Mean, StdDev);
2912 Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2928static uint16_t NormalBucket(PARAM_DESC *ParamDesc,
float x,
float Mean,
float StdDev) {
2932 if (ParamDesc->Circular) {
2933 if (
x -
Mean > ParamDesc->HalfRange) {
2934 x -= ParamDesc->Range;
2935 }
else if (
x - Mean < -ParamDesc->HalfRange) {
2936 x += ParamDesc->Range;
2940 X = ((
x -
Mean) / StdDev) * kNormalStdDev + kNormalMean;
2947 return static_cast<uint16_t
>(floor(
static_cast<double>(X)));
2961static uint16_t UniformBucket(PARAM_DESC *ParamDesc,
float x,
float Mean,
float StdDev) {
2965 if (ParamDesc->Circular) {
2966 if (
x -
Mean > ParamDesc->HalfRange) {
2967 x -= ParamDesc->Range;
2968 }
else if (
x - Mean < -ParamDesc->HalfRange) {
2969 x += ParamDesc->Range;
2980 return static_cast<uint16_t
>(floor(
static_cast<double>(X)));
2993static bool DistributionOK(BUCKETS *Buckets) {
2994 float FrequencyDifference;
2995 float TotalDifference;
2999 TotalDifference = 0.0;
3000 for (
i = 0;
i < Buckets->NumberOfBuckets;
i++) {
3001 FrequencyDifference = Buckets->Count[
i] - Buckets->ExpectedCount[
i];
3002 TotalDifference += (FrequencyDifference * FrequencyDifference) / Buckets->ExpectedCount[
i];
3006 if (TotalDifference > Buckets->ChiSquared) {
3025static uint16_t DegreesOfFreedom(
DISTRIBUTION Distribution, uint16_t HistogramBuckets) {
3026 static uint8_t DegreeOffsets[] = {3, 3, 1};
3028 uint16_t AdjustedNumBuckets;
3030 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[
static_cast<int>(Distribution)];
3031 if (
Odd(AdjustedNumBuckets)) {
3032 AdjustedNumBuckets++;
3034 return (AdjustedNumBuckets);
3045static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) {
3047 double AdjustFactor;
3050 ((
static_cast<double>(NewSampleCount)) / (
static_cast<double>(Buckets->SampleCount)));
3052 for (
i = 0;
i < Buckets->NumberOfBuckets;
i++) {
3053 Buckets->ExpectedCount[
i] *= AdjustFactor;
3056 Buckets->SampleCount = NewSampleCount;
3065static void InitBuckets(BUCKETS *Buckets) {
3068 for (
i = 0;
i < Buckets->NumberOfBuckets;
i++) {
3069 Buckets->Count[
i] = 0;
3086static int AlphaMatch(
void *arg1,
3088 auto *ChiStruct =
static_cast<CHISTRUCT *
>(arg1);
3089 auto *SearchKey =
static_cast<CHISTRUCT *
>(arg2);
3091 return (ChiStruct->Alpha == SearchKey->Alpha);
3108static double Solve(
SOLVEFUNC Function,
void *FunctionParams,
double InitialGuess,
double Accuracy)
3109#define INITIALDELTA 0.1
3110#define DELTARATIO 0.1
3118 double LastPosX, LastNegX;
3123 LastNegX = -FLT_MAX;
3124 f = (*Function)(
static_cast<CHISTRUCT *
>(FunctionParams),
x);
3125 while (
Abs(LastPosX - LastNegX) > Accuracy) {
3134 Slope = ((*Function)(
static_cast<CHISTRUCT *
>(FunctionParams),
x + Delta) - f) / Delta;
3143 if (NewDelta < Delta) {
3148 f = (*Function)(
static_cast<CHISTRUCT *
>(FunctionParams),
x);
3172static double ChiArea(CHISTRUCT *ChiParams,
double x) {
3178 N = ChiParams->DegreesOfFreedom / 2 - 1;
3182 for (
i = 1;
i <= N;
i++) {
3183 Denominator *= 2 *
i;
3185 SeriesTotal += PowerOfx / Denominator;
3187 return ((SeriesTotal * exp(-0.5 *
x)) - ChiParams->Alpha);
3214static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster,
float MaxIllegal)
3215#define ILLEGAL_CHAR 2
3217 static std::vector<uint8_t> CharFlags;
3221 int32_t NumCharInCluster;
3222 int32_t NumIllegalInCluster;
3223 float PercentIllegal;
3227 NumIllegalInCluster = 0;
3229 if (Clusterer->NumChar > CharFlags.size()) {
3230 CharFlags.resize(Clusterer->NumChar);
3233 for (
auto &CharFlag : CharFlags) {
3239 while ((Sample =
NextSample(&SearchState)) !=
nullptr) {
3240 CharID = Sample->CharID;
3241 if (CharFlags[CharID] ==
false) {
3242 CharFlags[CharID] =
true;
3244 if (CharFlags[CharID] ==
true) {
3245 NumIllegalInCluster++;
3249 PercentIllegal =
static_cast<float>(NumIllegalInCluster) / NumCharInCluster;
3250 if (PercentIllegal > MaxIllegal) {
3265static double InvertMatrix(
const float *input,
int size,
float *inv) {
3267 GENERIC_2D_ARRAY<double> U(size, size, 0.0);
3268 GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
3269 GENERIC_2D_ARRAY<double> L(size, size, 0.0);
3274 for (row = 0; row < size; row++) {
3275 for (col = 0; col < size; col++) {
3276 U[row][col] = input[row * size + col];
3277 L[row][col] = row == col ? 1.0 : 0.0;
3278 U_inv[row][col] = 0.0;
3283 for (col = 0; col < size; ++col) {
3286 double best_pivot = -1.0;
3287 for (row = col; row < size; ++row) {
3288 if (
Abs(U[row][col]) > best_pivot) {
3289 best_pivot =
Abs(U[row][col]);
3294 if (best_row != col) {
3295 for (
int k = 0; k < size; ++k) {
3296 double tmp = U[best_row][k];
3297 U[best_row][k] = U[col][k];
3299 tmp = L[best_row][k];
3300 L[best_row][k] = L[col][k];
3305 for (row = col + 1; row < size; ++row) {
3306 double ratio = -U[row][col] / U[col][col];
3307 for (
int j = col; j < size; ++j) {
3308 U[row][j] += U[col][j] * ratio;
3310 for (
int k = 0; k < size; ++k) {
3311 L[row][k] += L[col][k] * ratio;
3316 for (col = 0; col < size; ++col) {
3317 U_inv[col][col] = 1.0 / U[col][col];
3318 for (row = col - 1; row >= 0; --row) {
3320 for (
int k = col; k > row; --k) {
3321 total += U[row][k] * U_inv[k][col];
3323 U_inv[row][col] = -total / U[row][row];
3327 for (row = 0; row < size; row++) {
3328 for (col = 0; col < size; col++) {
3330 for (
int k = row; k < size; ++k) {
3331 sum += U_inv[row][k] * L[k][col];
3333 inv[row * size + col] = sum;
3337 double error_sum = 0.0;
3338 for (row = 0; row < size; row++) {
3339 for (col = 0; col < size; col++) {
3341 for (
int k = 0; k < size; ++k) {
3342 sum +=
static_cast<double>(input[row * size + k]) * inv[k * size + col];
3345 error_sum +=
Abs(sum);
#define MAXDEGREESOFFREEDOM
#define InitSampleSearch(S, C)
void KDDelete(KDTREE *Tree, float Key[], void *Data)
void KDWalk(KDTREE *Tree, kdwalk_proc action, ClusteringContext *context)
void tprintf(const char *format,...)
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
tesseract::GenericHeap< ClusterPair > ClusterHeap
CLUSTER * NextSample(LIST *SearchState)
double(*)(int32_t) DENSITYFUNC
tesseract::KDPairInc< float, TEMPCLUSTER * > ClusterPair
KDTREE * MakeKDTree(int16_t KeySize, const PARAM_DESC KeyDesc[])
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
double(*)(CHISTRUCT *, double) SOLVEFUNC
const double FTable[FTABLE_Y][FTABLE_X]
void FreeProtoList(LIST *ProtoList)
void FreePrototype(void *arg)
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension)
void FreeClusterer(CLUSTERER *Clusterer)
LIST search(LIST list, void *key, int_compare is_equal)
void KDNearestNeighborSearch(KDTREE *Tree, float Query[], int QuerySize, float MaxDistance, int *NumberOfResults, void **NBuffer, float DBuffer[])
LIST push(LIST list, void *element)
void destroy_nodes(LIST list, void_dest destructor)
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID)
void KDStore(KDTREE *Tree, float *Key, CLUSTER *Data)
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
std::vector< float > CoVariance
uint16_t Bucket[BUCKETTABLESIZE]
DISTRIBUTION Distribution
std::vector< uint32_t > Count
std::vector< float > ExpectedCount
uint16_t DegreesOfFreedom
CHISTRUCT(uint16_t degreesOfFreedom, double alpha)
std::vector< float > Mean
std::vector< DISTRIBUTION > Distrib
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]