tesseract v5.3.3.20231005
clusttool.cpp
Go to the documentation of this file.
1/******************************************************************************
2 ** Filename: clusttool.cpp
3 ** Purpose: Misc. tools for use with the clustering routines
4 ** Author: Dan Johnson
5 **
6 ** (c) Copyright Hewlett-Packard Company, 1988.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *****************************************************************************/
17
18#define _USE_MATH_DEFINES // for M_PI
19
20#include "clusttool.h"
21
22#include <cmath> // for M_PI, std::isnan
23#include <locale> // for std::locale::classic
24#include <sstream> // for std::stringstream
25
26namespace tesseract {
27
28//---------------Global Data Definitions and Declarations--------------------
// Max size of tokens read from an input file (used to size the token buffers
// and, times four, the line buffers in the readers below).
29#define TOKENSIZE 80
// String-literal field width spliced into the sscanf "%s" conversions in
// ReadPrototype; 79 leaves room for the terminating NUL in a TOKENSIZE buffer.
30#define QUOTED_TOKENSIZE "79"
// Max num of dimensions in feature space; upper bound enforced by
// ReadSampleSize.
31#define MAXSAMPLESIZE 65535
32
45static bool ReadNFloats(TFile *fp, uint16_t N, float Buffer[]) {
46 const int kMaxLineSize = 1024;
47 char line[kMaxLineSize];
48 if (fp->FGets(line, kMaxLineSize) == nullptr) {
49 tprintf("Hit EOF in ReadNFloats!\n");
50 return false;
51 }
52
53 std::stringstream stream(line);
54 // Use "C" locale (needed for float values Buffer[i]).
55 stream.imbue(std::locale::classic());
56 for (uint16_t i = 0; i < N; i++) {
57 float f = NAN;
58 stream >> f;
59 if (std::isnan(f)) {
60 tprintf("Read of %u floats failed!\n", N);
61 return false;
62 }
63 Buffer[i] = f;
64 }
65 return true;
66}
67
/**
 * Write the first N floats of Array to File on a single line. Each value is
 * preceded by a space and printed with "%9.6f"; the line ends with a newline.
 *
 * @param File   open text file to write to
 * @param N      number of floats to write
 * @param Array  values to write
 */
static void WriteNFloats(FILE *File, uint16_t N, float Array[]) {
  const float *const last = Array + N;
  for (const float *value = Array; value != last; ++value) {
    fprintf(File, " %9.6f", *value);
  }
  fputs("\n", File);
}
81
89static void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle) {
90 switch (ProtoStyle) {
91 case spherical:
92 fprintf(File, "spherical");
93 break;
94 case elliptical:
95 fprintf(File, "elliptical");
96 break;
97 case mixed:
98 fprintf(File, "mixed");
99 break;
100 case automatic:
101 fprintf(File, "automatic");
102 break;
103 }
104}
105
114uint16_t ReadSampleSize(TFile *fp) {
115 int SampleSize = 0;
116
117 const int kMaxLineSize = 100;
118 char line[kMaxLineSize];
119 ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);
120 ASSERT_HOST(sscanf(line, "%d", &SampleSize) == 1);
121 ASSERT_HOST(SampleSize >= 0 && SampleSize <= MAXSAMPLESIZE);
122 return SampleSize;
123}
124
134PARAM_DESC *ReadParamDesc(TFile *fp, uint16_t N) {
135 auto ParamDesc = new PARAM_DESC[N];
136 for (int i = 0; i < N; i++) {
137 const int kMaxLineSize = TOKENSIZE * 4;
138 char line[kMaxLineSize];
139 ASSERT_HOST(fp->FGets(line, kMaxLineSize) != nullptr);
140 std::istringstream stream(line);
141 // Use "C" locale (needed for float values Min, Max).
142 stream.imbue(std::locale::classic());
143 std::string linear_token;
144 stream >> linear_token;
145 std::string essential_token;
146 stream >> essential_token;
147 stream >> ParamDesc[i].Min;
148 stream >> ParamDesc[i].Max;
149 ASSERT_HOST(!stream.fail());
150 ParamDesc[i].Circular = (linear_token[0] == 'c');
151 ParamDesc[i].NonEssential = (essential_token[0] != 'e');
152 ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
153 ParamDesc[i].HalfRange = ParamDesc[i].Range / 2;
154 ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
155 }
156 return (ParamDesc);
157}
158
168PROTOTYPE *ReadPrototype(TFile *fp, uint16_t N) {
169 char sig_token[TOKENSIZE], shape_token[TOKENSIZE];
170 int SampleCount;
171 int i;
172
173 const int kMaxLineSize = TOKENSIZE * 4;
174 char line[kMaxLineSize];
175 if (fp->FGets(line, kMaxLineSize) == nullptr ||
176 sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %d", sig_token, shape_token,
177 &SampleCount) != 3) {
178 tprintf("Invalid prototype: %s\n", line);
179 return nullptr;
180 }
181 auto Proto = new PROTOTYPE;
182 Proto->Cluster = nullptr;
183 Proto->Significant = (sig_token[0] == 's');
184
185 switch (shape_token[0]) {
186 case 's':
187 Proto->Style = spherical;
188 break;
189 case 'e':
190 Proto->Style = elliptical;
191 break;
192 case 'a':
193 Proto->Style = automatic;
194 break;
195 default:
196 tprintf("Invalid prototype style specification:%s\n", shape_token);
197 Proto->Style = elliptical;
198 }
199
200 ASSERT_HOST(SampleCount >= 0);
201 Proto->NumSamples = SampleCount;
202
203 Proto->Mean.resize(N);
204 ReadNFloats(fp, N, &Proto->Mean[0]);
205
206 switch (Proto->Style) {
207 case spherical:
208 ReadNFloats(fp, 1, &(Proto->Variance.Spherical));
209 Proto->Magnitude.Spherical = 1.0 / sqrt(2.0 * M_PI * Proto->Variance.Spherical);
210 Proto->TotalMagnitude = std::pow(Proto->Magnitude.Spherical, static_cast<float>(N));
211 Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
212 Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
213 Proto->Distrib.clear();
214 break;
215 case elliptical:
216 Proto->Variance.Elliptical = new float[N];
217 ReadNFloats(fp, N, Proto->Variance.Elliptical);
218 Proto->Magnitude.Elliptical = new float[N];
219 Proto->Weight.Elliptical = new float[N];
220 Proto->TotalMagnitude = 1.0;
221 for (i = 0; i < N; i++) {
222 Proto->Magnitude.Elliptical[i] = 1.0f / sqrt(2.0f * M_PI * Proto->Variance.Elliptical[i]);
223 Proto->Weight.Elliptical[i] = 1.0f / Proto->Variance.Elliptical[i];
224 Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
225 }
226 Proto->LogMagnitude = log(static_cast<double>(Proto->TotalMagnitude));
227 Proto->Distrib.clear();
228 break;
229 default:
230 delete Proto;
231 tprintf("Invalid prototype style\n");
232 return nullptr;
233 }
234 return Proto;
235}
236
244void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[]) {
245 int i;
246
247 for (i = 0; i < N; i++) {
248 if (ParamDesc[i].Circular) {
249 fprintf(File, "circular ");
250 } else {
251 fprintf(File, "linear ");
252 }
253
254 if (ParamDesc[i].NonEssential) {
255 fprintf(File, "non-essential ");
256 } else {
257 fprintf(File, "essential ");
258 }
259
260 fprintf(File, "%10.6f %10.6f\n", ParamDesc[i].Min, ParamDesc[i].Max);
261 }
262}
263
271void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto) {
272 int i;
273
274 if (Proto->Significant) {
275 fprintf(File, "significant ");
276 } else {
277 fprintf(File, "insignificant ");
278 }
279 WriteProtoStyle(File, static_cast<PROTOSTYLE>(Proto->Style));
280 fprintf(File, "%6d\n\t", Proto->NumSamples);
281 WriteNFloats(File, N, &Proto->Mean[0]);
282 fprintf(File, "\t");
283
284 switch (Proto->Style) {
285 case spherical:
286 WriteNFloats(File, 1, &(Proto->Variance.Spherical));
287 break;
288 case elliptical:
289 WriteNFloats(File, N, Proto->Variance.Elliptical);
290 break;
291 case mixed:
292 for (i = 0; i < N; i++) {
293 switch (Proto->Distrib[i]) {
294 case normal:
295 fprintf(File, " %9s", "normal");
296 break;
297 case uniform:
298 fprintf(File, " %9s", "uniform");
299 break;
300 case D_random:
301 fprintf(File, " %9s", "random");
302 break;
304 ASSERT_HOST(!"Distribution count not allowed!");
305 }
306 }
307 fprintf(File, "\n\t");
308 WriteNFloats(File, N, Proto->Variance.Elliptical);
309 }
310}
311
312} // namespace tesseract
#define ASSERT_HOST(x)
Definition: errcode.h:54
#define QUOTED_TOKENSIZE
Definition: clusttool.cpp:30
#define MAXSAMPLESIZE
max num of dimensions in feature space
Definition: clusttool.cpp:31
#define TOKENSIZE
max size of tokens read from an input file
Definition: clusttool.cpp:29
uint16_t ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:114
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
void WriteParamDesc(FILE *File, uint16_t N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:244
void WritePrototype(FILE *File, uint16_t N, PROTOTYPE *Proto)
Definition: clusttool.cpp:271
PROTOSTYLE
Definition: cluster.h:53
@ spherical
Definition: cluster.h:53
@ mixed
Definition: cluster.h:53
@ elliptical
Definition: cluster.h:53
@ automatic
Definition: cluster.h:53
PROTOTYPE * ReadPrototype(TFile *fp, uint16_t N)
Definition: clusttool.cpp:168
PARAM_DESC * ReadParamDesc(TFile *fp, uint16_t N)
Definition: clusttool.cpp:134
@ D_random
Definition: cluster.h:65
@ DISTRIBUTION_COUNT
Definition: cluster.h:65
@ uniform
Definition: cluster.h:65
@ normal
Definition: cluster.h:65
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:195
float * Elliptical
Definition: cluster.h:69
unsigned Style
Definition: cluster.h:79
std::vector< float > Mean
Definition: cluster.h:83
CLUSTER * Cluster
Definition: cluster.h:81
FLOATUNION Variance
Definition: cluster.h:86
unsigned NumSamples
Definition: cluster.h:80
std::vector< DISTRIBUTION > Distrib
Definition: cluster.h:82