tesseract v5.3.3.20231005
cluster.cpp
/******************************************************************************
 ** Filename: cluster.cpp
 ** Purpose: Routines for clustering points in N-D space
 ** Author: Dan Johnson
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *****************************************************************************/

#define _USE_MATH_DEFINES // for M_PI

#include "cluster.h"

#include "genericheap.h"
#include "kdpair.h"
#include "matrix.h"
#include "tprintf.h"

#include "helpers.h"

#include <cfloat> // for FLT_MAX
#include <cmath>  // for M_PI
#include <vector> // for std::vector

namespace tesseract {

#define HOTELLING 1 // If true use Hotelling's test to decide where to split.
#define FTABLE_X 10 // Size of FTable.
#define FTABLE_Y 100 // Size of FTable.

// Table of values approximating the cumulative F-distribution for a confidence
// of 1%.
const double FTable[FTABLE_Y][FTABLE_X] = {
    {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85},
    {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399},
    {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229},
    {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546},
    {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051},
    {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874},
    {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620},
    {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814},
    {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257},
    {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849},
    {9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539},
    {9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296},
    {9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100},
    {8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939},
    {8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805},
    {8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691},
    {8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593},
    {8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508},
    {8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434},
    {8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368},
    {8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310},
    {7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258},
    {7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211},
    {7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168},
    {7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129},
    {7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094},
    {7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062},
    {7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032},
    {7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005},
    {7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979},
    {7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955},
    {7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934},
    {7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913},
    {7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894},
    {7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876},
    {7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859},
    {7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843},
    {7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828},
    {7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814},
    {7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801},
    {7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788},
    {7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776},
    {7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764},
    {7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754},
    {7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743},
    {7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733},
    {7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724},
    {7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715},
    {7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706},
    {7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698},
    {7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690},
    {7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683},
    {7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675},
    {7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668},
    {7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662},
    {7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655},
    {7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649},
    {7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643},
    {7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637},
    {7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632},
    {7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626},
    {7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621},
    {7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616},
    {7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611},
    {7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607},
    {7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602},
    {7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598},
    {7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593},
    {7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589},
    {7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585},
    {7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581},
    {7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578},
    {6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574},
    {6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570},
    {6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567},
    {6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563},
    {6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560},
    {6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557},
    {6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554},
    {6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551},
    {6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548},
    {6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545},
    {6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542},
    {6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539},
    {6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537},
    {6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534},
    {6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532},
    {6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529},
    {6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527},
    {6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524},
    {6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522},
    {6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520},
    {6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518},
    {6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515},
    {6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513},
    {6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511},
    {6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509},
    {6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507},
    {6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505},
    {6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}};
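
// FTable[Fy][Fx] is indexed as Fx = min(numerator dof, FTABLE_X) - 1 and
// Fy = min(denominator dof, FTABLE_Y) - 1; see the lookup in
// TestEllipticalProto below.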

#define MINVARIANCE 0.0004

#define MINSAMPLESPERBUCKET 5
#define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
#define MINSAMPLESNEEDED 1

#define BUCKETTABLESIZE 1024
#define NORMALEXTENT 3.0

struct TEMPCLUSTER {
  CLUSTER *Cluster;
  CLUSTER *Neighbor;
};

using ClusterPair = KDPairInc<float, TEMPCLUSTER *>;
using ClusterHeap = GenericHeap<ClusterPair>;

struct STATISTICS {
  STATISTICS(size_t n) : CoVariance(n * n), Min(n), Max(n) {
  }
  float AvgVariance = 1.0f;
  std::vector<float> CoVariance;
  std::vector<float> Min; // largest negative distance from the mean
  std::vector<float> Max; // largest positive distance from the mean
};

struct BUCKETS {
  BUCKETS(size_t n) : NumberOfBuckets(n), Count(n), ExpectedCount(n) {
  }
  ~BUCKETS() {
  }
  DISTRIBUTION Distribution = normal; // distribution being tested for
  uint32_t SampleCount = 0; // # of samples in histogram
  double Confidence = 0.0; // confidence level of test
  double ChiSquared = 0.0; // test threshold
  uint16_t NumberOfBuckets; // number of cells in histogram
  uint16_t Bucket[BUCKETTABLESIZE]; // mapping to histogram buckets
  std::vector<uint32_t> Count; // frequency of occurrence histogram
  std::vector<float> ExpectedCount; // expected histogram
};

struct CHISTRUCT {
  CHISTRUCT(uint16_t degreesOfFreedom, double alpha) : DegreesOfFreedom(degreesOfFreedom), Alpha(alpha) {
  }
  uint16_t DegreesOfFreedom = 0;
  double Alpha = 0.0;
  double ChiSquared = 0.0;
};

// For use with KDWalk / MakePotentialClusters
struct ClusteringContext {
  ClusterHeap *heap;       // heap used to hold temp clusters, "best" on top
  TEMPCLUSTER *candidates; // array of potential clusters
  KDTREE *tree;            // kd-tree to be searched for neighbors
  int32_t next;            // next candidate to be used
};

using DENSITYFUNC = double (*)(int32_t);
using SOLVEFUNC = double (*)(CHISTRUCT *, double);

#define Odd(N) ((N) % 2)
#define Mirror(N, R) ((R) - (N)-1)
#define Abs(N) (((N) < 0) ? (-(N)) : (N))

//--------------Global Data Definitions and Declarations----------------------
#define SqrtOf2Pi 2.506628275
static const double kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
static const double kNormalVariance =
    (BUCKETTABLESIZE * BUCKETTABLESIZE) / (4.0 * NORMALEXTENT * NORMALEXTENT);
static const double kNormalMagnitude = (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
static const double kNormalMean = BUCKETTABLESIZE / 2;
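// The table covers +/-NORMALEXTENT standard deviations of a normal
// distribution in BUCKETTABLESIZE cells, so one standard deviation spans
// BUCKETTABLESIZE / (2.0 * NORMALEXTENT) cells, kNormalVariance is the
// square of that width, and kNormalMagnitude is the peak density
// 1 / (sqrt(2 pi) * stddev) expressed in table units.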

#define LOOKUPTABLESIZE 8
#define MAXDEGREESOFFREEDOM MAXBUCKETS

static const uint32_t kCountTable[LOOKUPTABLESIZE] = {MINSAMPLES, 200, 400, 600, 800,
                                                      1000, 1500, 2000}; // number of samples

static const uint16_t kBucketsTable[LOOKUPTABLESIZE] = {
    MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS}; // number of buckets

/*-------------------------------------------------------------------------
          Private Function Prototypes
--------------------------------------------------------------------------*/
static void CreateClusterTree(CLUSTERER *Clusterer);

static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t Level);

static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance);

static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);

static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);

static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster);

static PROTOTYPE *MakeDegenerateProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics,
                                      PROTOSTYLE Style, int32_t MinSamples);

static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
                                      STATISTICS *Statistics);

static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                     BUCKETS *Buckets);

static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
                                      STATISTICS *Statistics, BUCKETS *Buckets);

static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                 BUCKETS *NormalBuckets, double Confidence);

static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);

static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics);

static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster);

static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics);

static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster);

static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence);

static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount,
                           double Confidence);

static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence);

static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount);

static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha);

static double NormalDensity(int32_t x);

static double UniformDensity(int32_t x);

static double Integral(double f1, double f2, double Dx);

static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
                        float Mean, float StdDev);

static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);

static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev);

static bool DistributionOK(BUCKETS *Buckets);

static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets);

static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount);

static void InitBuckets(BUCKETS *Buckets);

static int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct,
                      void *arg2); // CHISTRUCT *SearchKey);

static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy);

static double ChiArea(CHISTRUCT *ChiParams, double x);

static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal);

static double InvertMatrix(const float *input, int size, float *inv);

//--------------------------Public Code--------------------------------------
CLUSTERER *MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[]) {
  int i;

  // allocate main clusterer data structure and init simple fields
  auto Clusterer = new CLUSTERER;
  Clusterer->SampleSize = SampleSize;
  Clusterer->NumberOfSamples = 0;
  Clusterer->NumChar = 0;

  // init fields which will not be used initially
  Clusterer->Root = nullptr;
  Clusterer->ProtoList = NIL_LIST;

  // maintain a copy of param descriptors in the clusterer data structure
  Clusterer->ParamDesc = new PARAM_DESC[SampleSize];
  for (i = 0; i < SampleSize; i++) {
    Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
    Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
    Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
    Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
    Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
    Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
    Clusterer->ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
  }

  // allocate a kd tree to hold the samples
  Clusterer->KDTree = MakeKDTree(SampleSize, ParamDesc);

  // Initialize cache of histogram buckets to minimize recomputing them.
  for (auto &d : Clusterer->bucket_cache) {
    for (auto &c : d) {
      c = nullptr;
    }
  }

  return Clusterer;
} // MakeClusterer

SAMPLE *MakeSample(CLUSTERER *Clusterer, const float *Feature, uint32_t CharID) {
  int i;

  // see if the samples have already been clustered - if so trap an error
  // Can't add samples after they have been clustered.
  ASSERT_HOST(Clusterer->Root == nullptr);

  // allocate the new sample and initialize it
  auto Sample = new SAMPLE(Clusterer->SampleSize);
  Sample->Clustered = false;
  Sample->Prototype = false;
  Sample->SampleCount = 1;
  Sample->Left = nullptr;
  Sample->Right = nullptr;
  Sample->CharID = CharID;

  for (i = 0; i < Clusterer->SampleSize; i++) {
    Sample->Mean[i] = Feature[i];
  }

  // add the sample to the KD tree - keep track of the total # of samples
  Clusterer->NumberOfSamples++;
  KDStore(Clusterer->KDTree, &Sample->Mean[0], Sample);
  if (CharID >= Clusterer->NumChar) {
    Clusterer->NumChar = CharID + 1;
  }

  // execute hook for monitoring clustering operation
  // (*SampleCreationHook)(Sample);

  return (Sample);
} // MakeSample

LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
  // only create cluster tree if samples have never been clustered before
  if (Clusterer->Root == nullptr) {
    CreateClusterTree(Clusterer);
  }

  // deallocate the old prototype list if one exists
  FreeProtoList(&Clusterer->ProtoList);
  Clusterer->ProtoList = NIL_LIST;

  // compute prototypes starting at the root node in the tree
  ComputePrototypes(Clusterer, Config);
  // We don't need the cluster pointers in the protos any more, so null them
  // out, which makes it safe to delete the clusterer.
  LIST proto_list = Clusterer->ProtoList;
  iterate(proto_list) {
    auto *proto = reinterpret_cast<PROTOTYPE *>(proto_list->first_node());
    proto->Cluster = nullptr;
  }
  return Clusterer->ProtoList;
} // ClusterSamples
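
// A minimal sketch of the intended call sequence, assembled from the public
// functions in this file (the CLUSTERCONFIG values below are illustrative
// placeholders, not recommended defaults):
//
//   CLUSTERER *clusterer = MakeClusterer(SampleSize, param_descs);
//   for (uint32_t char_id = 0; char_id < num_chars; ++char_id) {
//     MakeSample(clusterer, features[char_id], char_id);
//   }
//   CLUSTERCONFIG config;
//   config.ProtoStyle = elliptical;
//   config.MinSamples = 0.5f;    // fraction of NumChar, see MakePrototype
//   config.MaxIllegal = 0.05f;   // see MultipleCharSamples
//   config.Independence = 0.75f; // threshold used by Independent()
//   config.Confidence = 0.05;    // confidence for the chi-squared fit tests
//   config.MagicSamples = 0;     // 0 disables the magic-size FTable boost
//   LIST protos = ClusterSamples(clusterer, &config);
//   ...
//   FreeProtoList(&protos);
//   FreeClusterer(clusterer);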

void FreeClusterer(CLUSTERER *Clusterer) {
  if (Clusterer != nullptr) {
    delete[] Clusterer->ParamDesc;
    delete Clusterer->KDTree;
    delete Clusterer->Root;
    // Free up all used buckets structures.
    for (auto &d : Clusterer->bucket_cache) {
      for (auto &c : d) {
        delete c;
      }
    }

    delete Clusterer;
  }
} // FreeClusterer

void FreeProtoList(LIST *ProtoList) {
  destroy_nodes(*ProtoList, FreePrototype);
} // FreeProtoList

void FreePrototype(void *arg) { // PROTOTYPE *Prototype)
  auto *Prototype = static_cast<PROTOTYPE *>(arg);

  // unmark the corresponding cluster (if there is one)
  if (Prototype->Cluster != nullptr) {
    Prototype->Cluster->Prototype = false;
  }

  // deallocate the prototype statistics and then the prototype itself
  if (Prototype->Style != spherical) {
    delete[] Prototype->Variance.Elliptical;
    delete[] Prototype->Magnitude.Elliptical;
    delete[] Prototype->Weight.Elliptical;
  }
  delete Prototype;
} // FreePrototype

CLUSTER *NextSample(LIST *SearchState) {
  CLUSTER *Cluster;

  if (*SearchState == NIL_LIST) {
    return (nullptr);
  }
  Cluster = reinterpret_cast<CLUSTER *>((*SearchState)->first_node());
  *SearchState = pop(*SearchState);
  for (;;) {
    if (Cluster->Left == nullptr) {
      return (Cluster);
    }
    *SearchState = push(*SearchState, Cluster->Right);
    Cluster = Cluster->Left;
  }
} // NextSample

float Mean(PROTOTYPE *Proto, uint16_t Dimension) {
  return (Proto->Mean[Dimension]);
} // Mean

float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension) {
  switch (Proto->Style) {
    case spherical:
      return std::sqrt(Proto->Variance.Spherical);
    case elliptical:
      return std::sqrt(Proto->Variance.Elliptical[Dimension]);
    case mixed:
      switch (Proto->Distrib[Dimension]) {
        case normal:
          return std::sqrt(Proto->Variance.Elliptical[Dimension]);
        case uniform:
        case D_random:
          return Proto->Variance.Elliptical[Dimension];
        case DISTRIBUTION_COUNT:
          ASSERT_HOST(!"Distribution count not allowed!");
      }
  }
  return 0.0f;
} // StandardDeviation

/*---------------------------------------------------------------------------
            Private Code
----------------------------------------------------------------------------*/
static void CreateClusterTree(CLUSTERER *Clusterer) {
  ClusteringContext context;
  ClusterPair HeapEntry;

  // each sample and its nearest neighbor form a "potential" cluster
  // save these in a heap with the "best" potential clusters on top
  context.tree = Clusterer->KDTree;
  context.candidates = new TEMPCLUSTER[Clusterer->NumberOfSamples];
  context.next = 0;
  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
  KDWalk(context.tree, MakePotentialClusters, &context);

  // form potential clusters into actual clusters - always do "best" first
  while (context.heap->Pop(&HeapEntry)) {
    TEMPCLUSTER *PotentialCluster = HeapEntry.data();

    // if main cluster of potential cluster is already in another cluster
    // then we don't need to worry about it
    if (PotentialCluster->Cluster->Clustered) {
      continue;
    }

    // if main cluster is not yet clustered, but its nearest neighbor is
    // then we must find a new nearest neighbor
    else if (PotentialCluster->Neighbor->Clustered) {
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }

    // if neither cluster is already clustered, form permanent cluster
    else {
      PotentialCluster->Cluster = MakeNewCluster(Clusterer, PotentialCluster);
      PotentialCluster->Neighbor =
          FindNearestNeighbor(context.tree, PotentialCluster->Cluster, &HeapEntry.key());
      if (PotentialCluster->Neighbor != nullptr) {
        context.heap->Push(&HeapEntry);
      }
    }
  }

  // the root node in the cluster tree is now the only node in the kd-tree
  Clusterer->Root = static_cast<CLUSTER *> RootOf(Clusterer->KDTree);

  // free up the memory used by the K-D tree, heap, and temp clusters
  delete context.tree;
  Clusterer->KDTree = nullptr;
  delete context.heap;
  delete[] context.candidates;
} // CreateClusterTree

static void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, int32_t /*Level*/) {
  ClusterPair HeapEntry;
  int next = context->next;
  context->candidates[next].Cluster = Cluster;
  HeapEntry.data() = &(context->candidates[next]);
  context->candidates[next].Neighbor =
      FindNearestNeighbor(context->tree, context->candidates[next].Cluster, &HeapEntry.key());
  if (context->candidates[next].Neighbor != nullptr) {
    context->heap->Push(&HeapEntry);
    context->next++;
  }
} // MakePotentialClusters

static CLUSTER *FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, float *Distance)
#define MAXNEIGHBORS 2
#define MAXDISTANCE FLT_MAX
{
  CLUSTER *Neighbor[MAXNEIGHBORS];
  float Dist[MAXNEIGHBORS];
  int NumberOfNeighbors;
  int32_t i;
  CLUSTER *BestNeighbor;

  // find the 2 nearest neighbors of the cluster
  KDNearestNeighborSearch(Tree, &Cluster->Mean[0], MAXNEIGHBORS, MAXDISTANCE, &NumberOfNeighbors,
                          reinterpret_cast<void **>(Neighbor), Dist);

  // search for the nearest neighbor that is not the cluster itself
  *Distance = MAXDISTANCE;
  BestNeighbor = nullptr;
  for (i = 0; i < NumberOfNeighbors; i++) {
    if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
      *Distance = Dist[i];
      BestNeighbor = Neighbor[i];
    }
  }
  return BestNeighbor;
} // FindNearestNeighbor

static CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
  // allocate the new cluster and initialize it
  auto Cluster = new CLUSTER(Clusterer->SampleSize);
  Cluster->Clustered = false;
  Cluster->Prototype = false;
  Cluster->Left = TempCluster->Cluster;
  Cluster->Right = TempCluster->Neighbor;
  Cluster->CharID = -1;

  // mark the old clusters as "clustered" and delete them from the kd-tree
  Cluster->Left->Clustered = true;
  Cluster->Right->Clustered = true;
  KDDelete(Clusterer->KDTree, &Cluster->Left->Mean[0], Cluster->Left);
  KDDelete(Clusterer->KDTree, &Cluster->Right->Mean[0], Cluster->Right);

  // compute the mean and sample count for the new cluster
  Cluster->SampleCount = MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
                                       Cluster->Left->SampleCount, Cluster->Right->SampleCount,
                                       &Cluster->Mean[0], &Cluster->Left->Mean[0], &Cluster->Right->Mean[0]);

  // add the new cluster to the KD tree
  KDStore(Clusterer->KDTree, &Cluster->Mean[0], Cluster);
  return Cluster;
} // MakeNewCluster

int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[],
                      float m1[], float m2[]) {
  int32_t i, n;

  n = n1 + n2;
  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
    if (ParamDesc->Circular) {
      // if distance between means is greater than allowed
      // reduce upper point by one "rotation" to compute mean
      // then normalize the mean back into the accepted range
      if ((*m2 - *m1) > ParamDesc->HalfRange) {
        *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
        if (*m < ParamDesc->Min) {
          *m += ParamDesc->Range;
        }
      } else if ((*m1 - *m2) > ParamDesc->HalfRange) {
        *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
        if (*m < ParamDesc->Min) {
          *m += ParamDesc->Range;
        }
      } else {
        *m = (n1 * *m1 + n2 * *m2) / n;
      }
    } else {
      *m = (n1 * *m1 + n2 * *m2) / n;
    }
  }
  return n;
} // MergeClusters
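
// Worked example for the circular case: merging equal-weight clusters with
// means 350 and 10 on a dimension with Min = 0 and Range = 360 (HalfRange =
// 180) takes the (*m1 - *m2) > HalfRange branch, giving
// ((350 - 360) + 10) / 2 = 0, the correct circular mean, instead of the
// naive (350 + 10) / 2 = 180.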

static void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) {
  LIST ClusterStack = NIL_LIST;
  CLUSTER *Cluster;
  PROTOTYPE *Prototype;

  // use a stack to keep track of clusters waiting to be processed
  // initially the only cluster on the stack is the root cluster
  if (Clusterer->Root != nullptr) {
    ClusterStack = push(NIL_LIST, Clusterer->Root);
  }

  // loop until we have analyzed all clusters which are potential prototypes
  while (ClusterStack != NIL_LIST) {
    // remove the next cluster to be analyzed from the stack
    // try to make a prototype from the cluster
    // if successful, put it on the proto list, else split the cluster
    Cluster = reinterpret_cast<CLUSTER *>(ClusterStack->first_node());
    ClusterStack = pop(ClusterStack);
    Prototype = MakePrototype(Clusterer, Config, Cluster);
    if (Prototype != nullptr) {
      Clusterer->ProtoList = push(Clusterer->ProtoList, Prototype);
    } else {
      ClusterStack = push(ClusterStack, Cluster->Right);
      ClusterStack = push(ClusterStack, Cluster->Left);
    }
  }
} // ComputePrototypes

static PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster) {
  PROTOTYPE *Proto;
  BUCKETS *Buckets;

  // filter out clusters which contain samples from the same character
  if (MultipleCharSamples(Clusterer, Cluster, Config->MaxIllegal)) {
    return nullptr;
  }

  // compute the covariance matrix and ranges for the cluster
  auto Statistics = ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);

  // check for degenerate clusters which need not be analyzed further
  // note that the MinSamples test assumes that all clusters with multiple
  // character samples have been removed (as above)
  Proto = MakeDegenerateProto(Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
                              static_cast<int32_t>(Config->MinSamples * Clusterer->NumChar));
  if (Proto != nullptr) {
    delete Statistics;
    return Proto;
  }
  // check to ensure that all dimensions are independent
  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize, &Statistics->CoVariance[0],
                   Config->Independence)) {
    delete Statistics;
    return nullptr;
  }

  if (HOTELLING && Config->ProtoStyle == elliptical) {
    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
    if (Proto != nullptr) {
      delete Statistics;
      return Proto;
    }
  }

  // create a histogram data structure used to evaluate distributions
  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount, Config->Confidence);

  // create a prototype based on the statistics and test it
  switch (Config->ProtoStyle) {
    case spherical:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case elliptical:
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      break;
    case mixed:
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
    case automatic:
      Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
      if (Proto != nullptr) {
        break;
      }
      Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets, Config->Confidence);
      break;
  }
  delete Statistics;
  return Proto;
} // MakePrototype

static PROTOTYPE *MakeDegenerateProto( // this was MinSample
    uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, int32_t MinSamples) {
  PROTOTYPE *Proto = nullptr;

  if (MinSamples < MINSAMPLESNEEDED) {
    MinSamples = MINSAMPLESNEEDED;
  }

  if (Cluster->SampleCount < MinSamples) {
    switch (Style) {
      case spherical:
        Proto = NewSphericalProto(N, Cluster, Statistics);
        break;
      case elliptical:
      case automatic:
        Proto = NewEllipticalProto(N, Cluster, Statistics);
        break;
      case mixed:
        Proto = NewMixedProto(N, Cluster, Statistics);
        break;
    }
    Proto->Significant = false;
  }
  return (Proto);
} // MakeDegenerateProto

static PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster,
                                      STATISTICS *Statistics) {
  // Fraction of the number of samples used as a range around 1 within
  // which a cluster has the magic size that allows a boost to the
  // FTable by kFTableBoostMargin, thus allowing clusters near the
  // magic size (equal to the number of sample characters) to be more
  // likely to stay together.
  const double kMagicSampleMargin = 0.0625;
  const double kFTableBoostMargin = 2.0;

  int N = Clusterer->SampleSize;
  CLUSTER *Left = Cluster->Left;
  CLUSTER *Right = Cluster->Right;
  if (Left == nullptr || Right == nullptr) {
    return nullptr;
  }
  int TotalDims = Left->SampleCount + Right->SampleCount;
  if (TotalDims < N + 1 || TotalDims < 2) {
    return nullptr;
  }
  std::vector<float> Covariance(static_cast<size_t>(N) * N);
  std::vector<float> Inverse(static_cast<size_t>(N) * N);
  std::vector<float> Delta(N);
  // Compute a new covariance matrix that only uses essential features.
  for (int i = 0; i < N; ++i) {
    int row_offset = i * N;
    if (!Clusterer->ParamDesc[i].NonEssential) {
      for (int j = 0; j < N; ++j) {
        if (!Clusterer->ParamDesc[j].NonEssential) {
          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    } else {
      for (int j = 0; j < N; ++j) {
        if (i == j) {
          Covariance[j + row_offset] = 1.0f;
        } else {
          Covariance[j + row_offset] = 0.0f;
        }
      }
    }
  }
  double err = InvertMatrix(&Covariance[0], N, &Inverse[0]);
  if (err > 1) {
    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
  }
  int EssentialN = 0;
  for (int dim = 0; dim < N; ++dim) {
    if (!Clusterer->ParamDesc[dim].NonEssential) {
      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
      ++EssentialN;
    } else {
      Delta[dim] = 0.0f;
    }
  }
  // Compute Hotelling's T-squared.
  double Tsq = 0.0;
  for (int x = 0; x < N; ++x) {
    double temp = 0.0;
    for (int y = 0; y < N; ++y) {
      temp += static_cast<double>(Inverse[y + N * x]) * Delta[y];
    }
    Tsq += Delta[x] * temp;
  }
  // Changed this function to match the formula in
  // Statistical Methods in Medical Research p 473
  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
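  // With p = EssentialN essential dimensions and n = TotalDims samples, the
  // two-sample T-squared converts to an F statistic with (p, n - p - 1)
  // degrees of freedom via F = T^2 * (n - p - 1) / ((n - 2) * p), which is
  // what the next line computes.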
  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2) * EssentialN);
  int Fx = EssentialN;
  if (Fx > FTABLE_X) {
    Fx = FTABLE_X;
  }
  --Fx;
  int Fy = TotalDims - EssentialN - 1;
  if (Fy > FTABLE_Y) {
    Fy = FTABLE_Y;
  }
  --Fy;
  double FTarget = FTable[Fy][Fx];
  if (Config->MagicSamples > 0 && TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
    // Give magic-sized clusters a magic FTable boost.
    FTarget += kFTableBoostMargin;
  }
  if (F < FTarget) {
    return NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
  }
  return nullptr;
}

static PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                     BUCKETS *Buckets) {
  PROTOTYPE *Proto = nullptr;
  int i;

  // check that each dimension is a normal distribution
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential) {
      continue;
    }

    FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
                sqrt(static_cast<double>(Statistics->AvgVariance)));
    if (!DistributionOK(Buckets)) {
      break;
    }
  }
  // if all dimensions matched a normal distribution, make a proto
  if (i >= Clusterer->SampleSize) {
    Proto = NewSphericalProto(Clusterer->SampleSize, Cluster, Statistics);
  }
  return (Proto);
} // MakeSphericalProto

static PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster,
                                      STATISTICS *Statistics, BUCKETS *Buckets) {
  PROTOTYPE *Proto = nullptr;
  int i;

  // check that each dimension is a normal distribution
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential) {
      continue;
    }

    FillBuckets(Buckets, Cluster, i, &(Clusterer->ParamDesc[i]), Cluster->Mean[i],
                sqrt(static_cast<double>(Statistics->CoVariance[i * (Clusterer->SampleSize + 1)])));
    if (!DistributionOK(Buckets)) {
      break;
    }
  }
  // if all dimensions matched a normal distribution, make a proto
  if (i >= Clusterer->SampleSize) {
    Proto = NewEllipticalProto(Clusterer->SampleSize, Cluster, Statistics);
  }
  return (Proto);
} // MakeEllipticalProto

static PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics,
                                 BUCKETS *NormalBuckets, double Confidence) {
  PROTOTYPE *Proto;
  int i;
  BUCKETS *UniformBuckets = nullptr;
  BUCKETS *RandomBuckets = nullptr;

  // create a mixed proto to work on - initially assume all dimensions normal
  Proto = NewMixedProto(Clusterer->SampleSize, Cluster, Statistics);

  // find the proper distribution for each dimension
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential) {
      continue;
    }

    FillBuckets(NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                std::sqrt(Proto->Variance.Elliptical[i]));
    if (DistributionOK(NormalBuckets)) {
      continue;
    }

    if (RandomBuckets == nullptr) {
      RandomBuckets = GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
    }
    MakeDimRandom(i, Proto, &(Clusterer->ParamDesc[i]));
    FillBuckets(RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                Proto->Variance.Elliptical[i]);
    if (DistributionOK(RandomBuckets)) {
      continue;
    }

    if (UniformBuckets == nullptr) {
      UniformBuckets = GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
    }
    MakeDimUniform(i, Proto, Statistics);
    FillBuckets(UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]), Proto->Mean[i],
                Proto->Variance.Elliptical[i]);
    if (DistributionOK(UniformBuckets)) {
      continue;
    }
    break;
  }
  // if any dimension failed to match a distribution, discard the proto
  if (i < Clusterer->SampleSize) {
    FreePrototype(Proto);
    Proto = nullptr;
  }
  return (Proto);
} // MakeMixedProto

static void MakeDimRandom(uint16_t i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
  Proto->Distrib[i] = D_random;
  Proto->Mean[i] = ParamDesc->MidRange;
  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;

  // subtract out the previous magnitude of this dimension from the total
  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));

  // note that the proto Weight is irrelevant for D_random protos
} // MakeDimRandom

static void MakeDimUniform(uint16_t i, PROTOTYPE *Proto, STATISTICS *Statistics) {
  Proto->Distrib[i] = uniform;
  Proto->Mean[i] = Proto->Cluster->Mean[i] + (Statistics->Min[i] + Statistics->Max[i]) / 2;
  Proto->Variance.Elliptical[i] = (Statistics->Max[i] - Statistics->Min[i]) / 2;
  if (Proto->Variance.Elliptical[i] < MINVARIANCE) {
    Proto->Variance.Elliptical[i] = MINVARIANCE;
  }

  // subtract out the previous magnitude of this dimension from the total
  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
  Proto->Magnitude.Elliptical[i] = 1.0 / (2.0 * Proto->Variance.Elliptical[i]);
  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));

  // note that the proto Weight is irrelevant for uniform protos
} // MakeDimUniform

static STATISTICS *ComputeStatistics(int16_t N, PARAM_DESC ParamDesc[], CLUSTER *Cluster) {
  int i, j;
  LIST SearchState;
  SAMPLE *Sample;
  uint32_t SampleCountAdjustedForBias;

  // allocate memory to hold the statistics results
  auto Statistics = new STATISTICS(N);

  // allocate temporary memory to hold the sample to mean distances
  std::vector<float> Distance(N);

  // find each sample in the cluster and merge it into the statistics
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample(&SearchState)) != nullptr) {
    for (i = 0; i < N; i++) {
      Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
      if (ParamDesc[i].Circular) {
        if (Distance[i] > ParamDesc[i].HalfRange) {
          Distance[i] -= ParamDesc[i].Range;
        }
        if (Distance[i] < -ParamDesc[i].HalfRange) {
          Distance[i] += ParamDesc[i].Range;
        }
      }
      if (Distance[i] < Statistics->Min[i]) {
        Statistics->Min[i] = Distance[i];
      }
      if (Distance[i] > Statistics->Max[i]) {
        Statistics->Max[i] = Distance[i];
      }
    }
    auto CoVariance = &Statistics->CoVariance[0];
    for (i = 0; i < N; i++) {
      for (j = 0; j < N; j++, CoVariance++) {
        *CoVariance += Distance[i] * Distance[j];
      }
    }
  }
  // normalize the variances by the total number of samples
  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
  // also compute the geometric mean of the diagonal variances
  // ensure that clusters with only 1 sample are handled correctly
  if (Cluster->SampleCount > 1) {
    SampleCountAdjustedForBias = Cluster->SampleCount - 1;
  } else {
    SampleCountAdjustedForBias = 1;
  }
  auto CoVariance = &Statistics->CoVariance[0];
  for (i = 0; i < N; i++) {
    for (j = 0; j < N; j++, CoVariance++) {
      *CoVariance /= SampleCountAdjustedForBias;
      if (j == i) {
        if (*CoVariance < MINVARIANCE) {
          *CoVariance = MINVARIANCE;
        }
        Statistics->AvgVariance *= *CoVariance;
      }
    }
  }
  Statistics->AvgVariance =
      static_cast<float>(pow(static_cast<double>(Statistics->AvgVariance), 1.0 / N));

  return Statistics;
} // ComputeStatistics
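
// In matrix terms, ComputeStatistics accumulates
// CoVariance[i][j] = sum((x_i - mean_i) * (x_j - mean_j)) / (SampleCount - 1),
// clips each diagonal element to at least MINVARIANCE, and sets AvgVariance
// to the geometric mean of the diagonal, i.e. the Nth root of the product of
// the per-dimension variances.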

static PROTOTYPE *NewSphericalProto(uint16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
  PROTOTYPE *Proto;

  Proto = NewSimpleProto(N, Cluster);

  Proto->Variance.Spherical = Statistics->AvgVariance;
  if (Proto->Variance.Spherical < MINVARIANCE) {
    Proto->Variance.Spherical = MINVARIANCE;
  }

  Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
  Proto->TotalMagnitude = static_cast<float>(
      pow(static_cast<double>(Proto->Magnitude.Spherical), static_cast<double>(N)));
  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));

  return (Proto);
} // NewSphericalProto

static PROTOTYPE *NewEllipticalProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
  PROTOTYPE *Proto;
  int i;

  Proto = NewSimpleProto(N, Cluster);
  Proto->Variance.Elliptical = new float[N];
  Proto->Magnitude.Elliptical = new float[N];
  Proto->Weight.Elliptical = new float[N];

  auto CoVariance = &Statistics->CoVariance[0];
  Proto->TotalMagnitude = 1.0;
  for (i = 0; i < N; i++, CoVariance += N + 1) {
    Proto->Variance.Elliptical[i] = *CoVariance;
    if (Proto->Variance.Elliptical[i] < MINVARIANCE) {
      Proto->Variance.Elliptical[i] = MINVARIANCE;
    }

    Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]);
    Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i];
    Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
  }
  Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
  Proto->Style = elliptical;
  return (Proto);
} // NewEllipticalProto

static PROTOTYPE *NewMixedProto(int16_t N, CLUSTER *Cluster, STATISTICS *Statistics) {
  auto Proto = NewEllipticalProto(N, Cluster, Statistics);
  Proto->Distrib.clear();
  Proto->Distrib.resize(N, normal);
  Proto->Style = mixed;
  return Proto;
} // NewMixedProto

static PROTOTYPE *NewSimpleProto(int16_t N, CLUSTER *Cluster) {
  auto Proto = new PROTOTYPE;
  Proto->Mean = Cluster->Mean;
  Proto->Distrib.clear();
  Proto->Significant = true;
  Proto->Merged = false;
  Proto->Style = spherical;
  Proto->NumSamples = Cluster->SampleCount;
  Proto->Cluster = Cluster;
  Proto->Cluster->Prototype = true;
  return Proto;
} // NewSimpleProto

static bool Independent(PARAM_DESC *ParamDesc, int16_t N, float *CoVariance, float Independence) {
  int i, j;
  float *VARii; // points to ith on-diagonal element
  float *VARjj; // points to jth on-diagonal element
  float CorrelationCoeff;

  VARii = CoVariance;
  for (i = 0; i < N; i++, VARii += N + 1) {
    if (ParamDesc[i].NonEssential) {
      continue;
    }

    VARjj = VARii + N + 1;
    CoVariance = VARii + 1;
    for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
      if (ParamDesc[j].NonEssential) {
        continue;
      }

      if ((*VARii == 0.0) || (*VARjj == 0.0)) {
        CorrelationCoeff = 0.0;
      } else {
        CorrelationCoeff = sqrt(std::sqrt(*CoVariance * *CoVariance / (*VARii * *VARjj)));
      }
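      // cov^2 / (var_i * var_j) is the squared correlation r^2, so after the
      // two square roots CorrelationCoeff is sqrt(|r|); the Independence
      // threshold is therefore compared against sqrt(|r|), not |r| itself.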
      if (CorrelationCoeff > Independence) {
        return false;
      }
    }
  }
  return true;
} // Independent

static BUCKETS *GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uint32_t SampleCount,
                           double Confidence) {
  // Get an old bucket structure with the same number of buckets.
  uint16_t NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
  BUCKETS *Buckets = clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];

  // If a matching bucket structure is not found, make one and save it.
  if (Buckets == nullptr) {
    Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
    clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] = Buckets;
  } else {
    // Just adjust the existing buckets.
    if (SampleCount != Buckets->SampleCount) {
      AdjustBuckets(Buckets, SampleCount);
    }
    if (Confidence != Buckets->Confidence) {
      Buckets->Confidence = Confidence;
      Buckets->ChiSquared =
          ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
    }
    InitBuckets(Buckets);
  }
  return Buckets;
} // GetBuckets

static BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uint32_t SampleCount, double Confidence) {
  const DENSITYFUNC DensityFunction[] = {NormalDensity, UniformDensity, UniformDensity};
  int i, j;
  double BucketProbability;
  double NextBucketBoundary;
  double Probability;
  double ProbabilityDelta;
  double LastProbDensity;
  double ProbDensity;
  uint16_t CurrentBucket;
  bool Symmetrical;

  // allocate memory needed for data structure
  auto Buckets = new BUCKETS(OptimumNumberOfBuckets(SampleCount));
  Buckets->SampleCount = SampleCount;
  Buckets->Confidence = Confidence;

  // initialize simple fields
  Buckets->Distribution = Distribution;

  // all currently defined distributions are symmetrical
  Symmetrical = true;
  Buckets->ChiSquared =
      ComputeChiSquared(DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);

  if (Symmetrical) {
    // allocate buckets so that all have approx. equal probability
    BucketProbability = 1.0 / static_cast<double>(Buckets->NumberOfBuckets);

    // distribution is symmetric so fill in upper half then copy
    CurrentBucket = Buckets->NumberOfBuckets / 2;
    if (Odd(Buckets->NumberOfBuckets)) {
      NextBucketBoundary = BucketProbability / 2;
    } else {
      NextBucketBoundary = BucketProbability;
    }

    Probability = 0.0;
    LastProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(BUCKETTABLESIZE / 2);
    for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
      ProbDensity = (*DensityFunction[static_cast<int>(Distribution)])(i + 1);
      ProbabilityDelta = Integral(LastProbDensity, ProbDensity, 1.0);
      Probability += ProbabilityDelta;
      if (Probability > NextBucketBoundary) {
        if (CurrentBucket < Buckets->NumberOfBuckets - 1) {
          CurrentBucket++;
        }
        NextBucketBoundary += BucketProbability;
      }
      Buckets->Bucket[i] = CurrentBucket;
      Buckets->ExpectedCount[CurrentBucket] += static_cast<float>(ProbabilityDelta * SampleCount);
      LastProbDensity = ProbDensity;
    }
    // place any leftover probability into the last bucket
    Buckets->ExpectedCount[CurrentBucket] += static_cast<float>((0.5 - Probability) * SampleCount);

    // copy upper half of distribution to lower half
    for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--) {
      Buckets->Bucket[i] = Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
    }

    // copy upper half of expected counts to lower half
    for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--) {
      Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
    }
  }
  return Buckets;
} // MakeBuckets

static uint16_t OptimumNumberOfBuckets(uint32_t SampleCount) {
  uint8_t Last, Next;
  float Slope;

  if (SampleCount < kCountTable[0]) {
    return kBucketsTable[0];
  }

  for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
    if (SampleCount <= kCountTable[Next]) {
      Slope = static_cast<float>(kBucketsTable[Next] - kBucketsTable[Last]) /
              static_cast<float>(kCountTable[Next] - kCountTable[Last]);
      return (
          static_cast<uint16_t>(kBucketsTable[Last] + Slope * (SampleCount - kCountTable[Last])));
    }
  }
  return kBucketsTable[Last];
} // OptimumNumberOfBuckets
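
// Example: 300 samples fall between the (200, 16) and (400, 20) table
// entries, so the interpolated result is
// 16 + (20 - 16) / (400 - 200) * (300 - 200) = 18 buckets.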

static double ComputeChiSquared(uint16_t DegreesOfFreedom, double Alpha)
#define CHIACCURACY 0.01
#define MINALPHA (1e-200)
{
  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];

  // limit the minimum alpha that can be used - if alpha is too small
  // it may not be possible to compute chi-squared.
  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
  if (Odd(DegreesOfFreedom)) {
    DegreesOfFreedom++;
  }

  /* find the list of chi-squared values which have already been computed
     for the specified number of degrees of freedom. Search the list for
     the desired chi-squared. */
  CHISTRUCT SearchKey(0.0, Alpha);
  auto *found = search(ChiWith[DegreesOfFreedom], &SearchKey, AlphaMatch);
  auto OldChiSquared = reinterpret_cast<CHISTRUCT *>(found ? found->first_node() : nullptr);

  if (OldChiSquared == nullptr) {
    OldChiSquared = new CHISTRUCT(DegreesOfFreedom, Alpha);
    OldChiSquared->ChiSquared =
        Solve(ChiArea, OldChiSquared, static_cast<double>(DegreesOfFreedom), CHIACCURACY);
    ChiWith[DegreesOfFreedom] = push(ChiWith[DegreesOfFreedom], OldChiSquared);
  } else {
    // further optimization might move OldChiSquared to front of list
  }

  return (OldChiSquared->ChiSquared);

} // ComputeChiSquared

static double NormalDensity(int32_t x) {
  double Distance;

  Distance = x - kNormalMean;
  return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
} // NormalDensity

static double UniformDensity(int32_t x) {
  constexpr auto UniformDistributionDensity = 1.0 / BUCKETTABLESIZE;

  if ((x >= 0) && (x <= BUCKETTABLESIZE)) {
    return UniformDistributionDensity;
  } else {
    return 0.0;
  }
} // UniformDensity

static double Integral(double f1, double f2, double Dx) {
  return (f1 + f2) * Dx / 2.0;
} // Integral

static void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uint16_t Dim, PARAM_DESC *ParamDesc,
                        float Mean, float StdDev) {
  uint16_t BucketID;
  int i;
  LIST SearchState;
  SAMPLE *Sample;

  // initialize the histogram bucket counts to 0
  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
    Buckets->Count[i] = 0;
  }

  if (StdDev == 0.0) {
    /* if the standard deviation is zero, then we can't statistically
       analyze the cluster. Use a pseudo-analysis: samples exactly on
       the mean are distributed evenly across all buckets. Samples greater
       than the mean are placed in the last bucket; samples less than the
       mean are placed in the first bucket. */

    InitSampleSearch(SearchState, Cluster);
    i = 0;
    while ((Sample = NextSample(&SearchState)) != nullptr) {
      if (Sample->Mean[Dim] > Mean) {
        BucketID = Buckets->NumberOfBuckets - 1;
      } else if (Sample->Mean[Dim] < Mean) {
        BucketID = 0;
      } else {
        BucketID = i;
      }
      Buckets->Count[BucketID] += 1;
      i++;
      if (i >= Buckets->NumberOfBuckets) {
        i = 0;
      }
    }
  } else {
    // search for all samples in the cluster and add to histogram buckets
    InitSampleSearch(SearchState, Cluster);
    while ((Sample = NextSample(&SearchState)) != nullptr) {
      switch (Buckets->Distribution) {
        case normal:
          BucketID = NormalBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
          break;
        case D_random:
        case uniform:
          BucketID = UniformBucket(ParamDesc, Sample->Mean[Dim], Mean, StdDev);
          break;
        default:
          BucketID = 0;
      }
      Buckets->Count[Buckets->Bucket[BucketID]] += 1;
    }
  }
} // FillBuckets
2916
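// NormalBucket converts a sample value into a bucket-table index by rescaling
// its z-score into the table's coordinate system:
//   X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean
// Circular parameters are first wrapped to within +/- HalfRange of the mean,
// and the result is clipped to [0, BUCKETTABLESIZE - 1].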
2928static uint16_t NormalBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) {
2929 float X;
2930
2931 // wraparound circular parameters if necessary
2932 if (ParamDesc->Circular) {
2933 if (x - Mean > ParamDesc->HalfRange) {
2934 x -= ParamDesc->Range;
2935 } else if (x - Mean < -ParamDesc->HalfRange) {
2936 x += ParamDesc->Range;
2937 }
2938 }
2939
2940 X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
2941 if (X < 0) {
2942 return 0;
2943 }
2944 if (X > BUCKETTABLESIZE - 1) {
2945 return (static_cast<uint16_t>(BUCKETTABLESIZE - 1));
2946 }
2947 return static_cast<uint16_t>(floor(static_cast<double>(X)));
2948} // NormalBucket
2949
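// UniformBucket performs the analogous mapping for uniform and random
// distributions: the sample's offset from the mean, measured in units of
// 2 * StdDev, is spread across the table and centered on bucket
// BUCKETTABLESIZE / 2, again with circular wraparound and clipping at the
// table ends.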
2961static uint16_t UniformBucket(PARAM_DESC *ParamDesc, float x, float Mean, float StdDev) {
2962 float X;
2963
2964 // wraparound circular parameters if necessary
2965 if (ParamDesc->Circular) {
2966 if (x - Mean > ParamDesc->HalfRange) {
2967 x -= ParamDesc->Range;
2968 } else if (x - Mean < -ParamDesc->HalfRange) {
2969 x += ParamDesc->Range;
2970 }
2971 }
2972
2973 X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2974 if (X < 0) {
2975 return 0;
2976 }
2977 if (X > BUCKETTABLESIZE - 1) {
2978 return static_cast<uint16_t>(BUCKETTABLESIZE - 1);
2979 }
2980 return static_cast<uint16_t>(floor(static_cast<double>(X)));
2981} // UniformBucket
2982
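// DistributionOK is a chi-squared goodness-of-fit test. It computes
//   X^2 = sum over i of (Count[i] - ExpectedCount[i])^2 / ExpectedCount[i]
// across the histogram buckets and accepts the hypothesized distribution iff
// X^2 does not exceed the precomputed critical value Buckets->ChiSquared.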
2993static bool DistributionOK(BUCKETS *Buckets) {
2994 float FrequencyDifference;
2995 float TotalDifference;
2996 int i;
2997
2998 // compute how well the histogram matches the expected histogram
2999 TotalDifference = 0.0;
3000 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3001 FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
3002 TotalDifference += (FrequencyDifference * FrequencyDifference) / Buckets->ExpectedCount[i];
3003 }
3004
3005 // test to see if the difference is more than expected
3006 return TotalDifference <= Buckets->ChiSquared;
3011} // DistributionOK
3012
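// DegreesOfFreedom computes the degrees of freedom for the chi-squared test:
// the bucket count minus a per-distribution offset (DegreeOffsets, indexed by
// DISTRIBUTION) that accounts for the normalization constraint and the
// parameters estimated from the data, rounded up to an even value to match
// the even-degree series evaluated by ChiArea.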
3025static uint16_t DegreesOfFreedom(DISTRIBUTION Distribution, uint16_t HistogramBuckets) {
3026 static uint8_t DegreeOffsets[] = {3, 3, 1};
3027
3028 uint16_t AdjustedNumBuckets;
3029
3030 AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[static_cast<int>(Distribution)];
3031 if (Odd(AdjustedNumBuckets)) {
3032 AdjustedNumBuckets++;
3033 }
3034 return (AdjustedNumBuckets);
3035
3036} // DegreesOfFreedom
3037
3045static void AdjustBuckets(BUCKETS *Buckets, uint32_t NewSampleCount) {
3046 int i;
3047 double AdjustFactor;
3048
3049 AdjustFactor =
3050 ((static_cast<double>(NewSampleCount)) / (static_cast<double>(Buckets->SampleCount)));
3051
3052 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3053 Buckets->ExpectedCount[i] *= AdjustFactor;
3054 }
3055
3056 Buckets->SampleCount = NewSampleCount;
3057
3058} // AdjustBuckets
3059
3065static void InitBuckets(BUCKETS *Buckets) {
3066 int i;
3067
3068 for (i = 0; i < Buckets->NumberOfBuckets; i++) {
3069 Buckets->Count[i] = 0;
3070 }
3071
3072} // InitBuckets
3073
3086static int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct,
3087 void *arg2) { // CHISTRUCT *SearchKey)
3088 auto *ChiStruct = static_cast<CHISTRUCT *>(arg1);
3089 auto *SearchKey = static_cast<CHISTRUCT *>(arg2);
3090
3091 return (ChiStruct->Alpha == SearchKey->Alpha);
3092
3093} // AlphaMatch
3094
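// Solve finds a root of Function by Newton iteration with a finite-difference
// slope: the derivative at x is approximated over a step of width Delta, and
//   x' = x - f(x) / ((f(x + Delta) - f(x)) / Delta)
// Delta is shrunk to DELTARATIO times the last step whenever that is smaller,
// so the slope estimate tightens as the iterate converges. LastPosX/LastNegX
// track the most recent points where f was positive/negative, and iteration
// stops once that bracket is narrower than Accuracy.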
3108static double Solve(SOLVEFUNC Function, void *FunctionParams, double InitialGuess, double Accuracy)
3109#define INITIALDELTA 0.1
3110#define DELTARATIO 0.1
3111{
3112 double x;
3113 double f;
3114 double Slope;
3115 double Delta;
3116 double NewDelta;
3117 double xDelta;
3118 double LastPosX, LastNegX;
3119
3120 x = InitialGuess;
3121 Delta = INITIALDELTA;
3122 LastPosX = FLT_MAX;
3123 LastNegX = -FLT_MAX;
3124 f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);
3125 while (Abs(LastPosX - LastNegX) > Accuracy) {
3126 // keep track of outer bounds of current estimate
3127 if (f < 0) {
3128 LastNegX = x;
3129 } else {
3130 LastPosX = x;
3131 }
3132
3133 // compute the approx. slope of f(x) at the current point
3134 Slope = ((*Function)(static_cast<CHISTRUCT *>(FunctionParams), x + Delta) - f) / Delta;
3135
3136 // compute the next solution guess
3137 xDelta = f / Slope;
3138 x -= xDelta;
3139
3140 // reduce the delta used for computing slope to be a fraction of
3141 // the amount moved to get to the new guess
3142 NewDelta = Abs(xDelta) * DELTARATIO;
3143 if (NewDelta < Delta) {
3144 Delta = NewDelta;
3145 }
3146
3147 // compute the value of the function at the new guess
3148 f = (*Function)(static_cast<CHISTRUCT *>(FunctionParams), x);
3149 }
3150 return (x);
3151
3152} // Solve
3153
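// ChiArea uses the closed form of the chi-squared upper tail for an even
// number of degrees of freedom d = 2 * (N + 1):
//   P(X > x) = exp(-x / 2) * sum_{i=0}^{N} (x / 2)^i / i!
// The loop builds the sum incrementally (PowerOfx = x^i and Denominator =
// 2^i * i!, so each term is (x / 2)^i / i!), and Alpha is subtracted so that
// the root located by Solve is the chi-squared value whose tail probability
// equals Alpha.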
3172static double ChiArea(CHISTRUCT *ChiParams, double x) {
3173 int i, N;
3174 double SeriesTotal;
3175 double Denominator;
3176 double PowerOfx;
3177
3178 N = ChiParams->DegreesOfFreedom / 2 - 1;
3179 SeriesTotal = 1;
3180 Denominator = 1;
3181 PowerOfx = 1;
3182 for (i = 1; i <= N; i++) {
3183 Denominator *= 2 * i;
3184 PowerOfx *= x;
3185 SeriesTotal += PowerOfx / Denominator;
3186 }
3187 return ((SeriesTotal * exp(-0.5 * x)) - ChiParams->Alpha);
3188
3189} // ChiArea
3190
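// MultipleCharSamples flags a cluster that should not become a single
// prototype because too many characters contribute more than one sample to
// it. CharFlags records, per CharID, one of three states: unseen (false),
// seen once (true), or seen several times (ILLEGAL_CHAR). Each repeat lowers
// the distinct-character count, and the cluster is rejected as soon as the
// ratio of multiply-sampled characters to distinct characters exceeds
// MaxIllegal.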
3214static bool MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, float MaxIllegal)
3215#define ILLEGAL_CHAR 2
3216{
3217 static std::vector<uint8_t> CharFlags;
3218 LIST SearchState;
3219 SAMPLE *Sample;
3220 int32_t CharID;
3221 int32_t NumCharInCluster;
3222 int32_t NumIllegalInCluster;
3223 float PercentIllegal;
3224
3225 // initial estimate assumes that no illegal chars exist in the cluster
3226 NumCharInCluster = Cluster->SampleCount;
3227 NumIllegalInCluster = 0;
3228
3229 if (Clusterer->NumChar > CharFlags.size()) {
3230 CharFlags.resize(Clusterer->NumChar);
3231 }
3232
3233 for (auto &CharFlag : CharFlags) {
3234 CharFlag = false;
3235 }
3236
3237 // find each sample in the cluster and check if we have seen it before
3238 InitSampleSearch(SearchState, Cluster);
3239 while ((Sample = NextSample(&SearchState)) != nullptr) {
3240 CharID = Sample->CharID;
3241 if (CharFlags[CharID] == false) {
3242 CharFlags[CharID] = true;
3243 } else {
3244 if (CharFlags[CharID] == true) {
3245 NumIllegalInCluster++;
3246 CharFlags[CharID] = ILLEGAL_CHAR;
3247 }
3248 NumCharInCluster--;
3249 PercentIllegal = static_cast<float>(NumIllegalInCluster) / NumCharInCluster;
3250 if (PercentIllegal > MaxIllegal) {
3251 destroy(SearchState);
3252 return true;
3253 }
3254 }
3255 }
3256 return false;
3257
3258} // MultipleCharSamples
3259
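// InvertMatrix inverts the size x size matrix 'input' via LU decomposition
// with partial pivoting. Row operations (with row exchanges to pick the
// largest available pivot) reduce U, initialized from 'input', to upper
// triangular form while the same operations accumulate in L starting from
// the identity, so that L * input = U and hence input^-1 = U^-1 * L. U is
// then back-substituted column by column to form U_inv, the product
// U_inv * L is written to 'inv', and the sum of absolute off-diagonal
// entries of input * inv is returned as a residual error measure (near zero
// when the inversion succeeded).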
3265static double InvertMatrix(const float *input, int size, float *inv) {
3266 // Allocate memory for the 2D arrays.
3267 GENERIC_2D_ARRAY<double> U(size, size, 0.0);
3268 GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
3269 GENERIC_2D_ARRAY<double> L(size, size, 0.0);
3270
3271 // Initialize the working matrices. U starts as input, L as I and U_inv as O.
3272 int row;
3273 int col;
3274 for (row = 0; row < size; row++) {
3275 for (col = 0; col < size; col++) {
3276 U[row][col] = input[row * size + col];
3277 L[row][col] = row == col ? 1.0 : 0.0;
3278 U_inv[row][col] = 0.0;
3279 }
3280 }
3281
3282 // Compute forward matrix by inversion by LU decomposition of input.
3283 for (col = 0; col < size; ++col) {
3284 // Find best pivot
3285 int best_row = 0;
3286 double best_pivot = -1.0;
3287 for (row = col; row < size; ++row) {
3288 if (Abs(U[row][col]) > best_pivot) {
3289 best_pivot = Abs(U[row][col]);
3290 best_row = row;
3291 }
3292 }
3293 // Exchange pivot rows.
3294 if (best_row != col) {
3295 for (int k = 0; k < size; ++k) {
3296 double tmp = U[best_row][k];
3297 U[best_row][k] = U[col][k];
3298 U[col][k] = tmp;
3299 tmp = L[best_row][k];
3300 L[best_row][k] = L[col][k];
3301 L[col][k] = tmp;
3302 }
3303 }
3304 // Now do the pivot itself.
3305 for (row = col + 1; row < size; ++row) {
3306 double ratio = -U[row][col] / U[col][col];
3307 for (int j = col; j < size; ++j) {
3308 U[row][j] += U[col][j] * ratio;
3309 }
3310 for (int k = 0; k < size; ++k) {
3311 L[row][k] += L[col][k] * ratio;
3312 }
3313 }
3314 }
3315 // Next invert U.
3316 for (col = 0; col < size; ++col) {
3317 U_inv[col][col] = 1.0 / U[col][col];
3318 for (row = col - 1; row >= 0; --row) {
3319 double total = 0.0;
3320 for (int k = col; k > row; --k) {
3321 total += U[row][k] * U_inv[k][col];
3322 }
3323 U_inv[row][col] = -total / U[row][row];
3324 }
3325 }
3326 // Now the answer is U_inv.L.
3327 for (row = 0; row < size; row++) {
3328 for (col = 0; col < size; col++) {
3329 double sum = 0.0;
3330 for (int k = row; k < size; ++k) {
3331 sum += U_inv[row][k] * L[k][col];
3332 }
3333 inv[row * size + col] = sum;
3334 }
3335 }
3336 // Check matrix product.
3337 double error_sum = 0.0;
3338 for (row = 0; row < size; row++) {
3339 for (col = 0; col < size; col++) {
3340 double sum = 0.0;
3341 for (int k = 0; k < size; ++k) {
3342 sum += static_cast<double>(input[row * size + k]) * inv[k * size + col];
3343 }
3344 if (row != col) {
3345 error_sum += Abs(sum);
3346 }
3347 }
3348 }
3349 return error_sum;
3350}
3351
3352} // namespace tesseract