tesseract v5.3.3.20231005
normmatch.cpp
Go to the documentation of this file.
1/******************************************************************************
2 ** Filename: normmatch.cpp
3 ** Purpose: Simple matcher based on character normalization features.
4 ** Author: Dan Johnson
5 **
6 ** (c) Copyright Hewlett-Packard Company, 1988.
7 ** Licensed under the Apache License, Version 2.0 (the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 ******************************************************************************/
17/*----------------------------------------------------------------------------
18 Include Files and Type Defines
19----------------------------------------------------------------------------*/
20#include "normmatch.h"
21
22#include "classify.h"
23#include "clusttool.h"
24#include "helpers.h"
25#include "normfeat.h"
26#include "params.h"
27#include "unicharset.h"
28
29#include <cmath>
30#include <cstdio>
31#include <sstream> // for std::istringstream
32
33namespace tesseract {
34
// NOTE(review): the line that opens this struct and some of its member
// declarations (e.g. NumProtos, which the constructor initializes) are not
// present in this listing - confirm against the original source.
// Constructs a table for n classes: stores n in NumProtos and creates n
// value-initialized entries in Protos.
36 NORM_PROTOS(size_t n) : NumProtos(n), Protos(n) {
37 }
// Number of parameters per prototype; handed to ReadPrototype when loading.
38 int NumParams = 0;
// One prototype list per class, indexed by class / unichar id.
41 std::vector<LIST> Protos;
42};
43
44/*----------------------------------------------------------------------------
45 Private Code
46----------------------------------------------------------------------------*/
47
55static double NormEvidenceOf(double NormAdj) {
57
58 if (classify_norm_adj_curl == 3) {
59 NormAdj = NormAdj * NormAdj * NormAdj;
60 } else if (classify_norm_adj_curl == 2) {
61 NormAdj = NormAdj * NormAdj;
62 } else {
63 NormAdj = pow(NormAdj, classify_norm_adj_curl);
64 }
65 return (1.0 / (1.0 + NormAdj));
66}
67
68/*----------------------------------------------------------------------------
69 Variables
70----------------------------------------------------------------------------*/
71
// Midpoint of the norm adjustment (default 32.0). ComputeNormMatch's debug
// output reports the total distance divided by this value as "scaled".
73double_VAR(classify_norm_adj_midpoint, 32.0, "Norm adjust midpoint ...");
// Exponent ("curl") applied to the distance in NormEvidenceOf (default 2.0).
74double_VAR(classify_norm_adj_curl, 2.0, "Norm adjust curl ...");
// Factor applied to the width (CharNormRy) term in ComputeNormMatch, so the
// width error contributes only 1/8 of its raw weighted value.
76const double kWidthErrorWeighting = 0.125;
77
78/*----------------------------------------------------------------------------
79 Public Code
80----------------------------------------------------------------------------*/
94float Classify::ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch) {
95 if (ClassId >= NormProtos->NumProtos) {
96 ClassId = NO_CLASS;
97 }
98
99 /* handle requests for classification as noise */
100 if (ClassId == NO_CLASS) {
101 /* kludge - clean up constants and make into control knobs later */
102 float Match = (feature.Params[CharNormLength] * feature.Params[CharNormLength] * 500.0f +
103 feature.Params[CharNormRx] * feature.Params[CharNormRx] * 8000.0f +
104 feature.Params[CharNormRy] * feature.Params[CharNormRy] * 8000.0f);
105 return (1.0f - NormEvidenceOf(Match));
106 }
107
108 float BestMatch = FLT_MAX;
109 LIST Protos = NormProtos->Protos[ClassId];
110
111 if (DebugMatch) {
112 tprintf("\nChar norm for class %s\n", unicharset.id_to_unichar(ClassId));
113 }
114
115 int ProtoId = 0;
116 iterate(Protos) {
117 auto Proto = reinterpret_cast<PROTOTYPE *>(Protos->first_node());
118 float Delta = feature.Params[CharNormY] - Proto->Mean[CharNormY];
119 float Match = Delta * Delta * Proto->Weight.Elliptical[CharNormY];
120 if (DebugMatch) {
121 tprintf("YMiddle: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormY], Delta,
122 Proto->Weight.Elliptical[CharNormY], Match);
123 }
124 Delta = feature.Params[CharNormRx] - Proto->Mean[CharNormRx];
125 Match += Delta * Delta * Proto->Weight.Elliptical[CharNormRx];
126 if (DebugMatch) {
127 tprintf("Height: Proto=%g, Delta=%g, Var=%g, Dist=%g\n", Proto->Mean[CharNormRx], Delta,
128 Proto->Weight.Elliptical[CharNormRx], Match);
129 }
130 // Ry is width! See intfx.cpp.
131 Delta = feature.Params[CharNormRy] - Proto->Mean[CharNormRy];
132 if (DebugMatch) {
133 tprintf("Width: Proto=%g, Delta=%g, Var=%g\n", Proto->Mean[CharNormRy], Delta,
134 Proto->Weight.Elliptical[CharNormRy]);
135 }
136 Delta = Delta * Delta * Proto->Weight.Elliptical[CharNormRy];
137 Delta *= kWidthErrorWeighting;
138 Match += Delta;
139 if (DebugMatch) {
140 tprintf("Total Dist=%g, scaled=%g, sigmoid=%g, penalty=%g\n", Match,
141 Match / classify_norm_adj_midpoint, NormEvidenceOf(Match),
142 256 * (1 - NormEvidenceOf(Match)));
143 }
144
145 if (Match < BestMatch) {
146 BestMatch = Match;
147 }
148
149 ProtoId++;
150 }
151 return 1.0 - NormEvidenceOf(BestMatch);
152} /* ComputeNormMatch */
153
// Releases the normalization prototypes if any are currently loaded.
// NOTE(review): the function header and the body of the per-class loop are
// missing from this listing; presumably the loop frees each class's
// prototype list - confirm against the original source.
155 if (NormProtos != nullptr) {
156 for (int i = 0; i < NormProtos->NumProtos; i++) {
158 }
// Free the parameter-description array, then the container itself, and
// reset the pointer so that a repeated call does nothing.
159 delete[] NormProtos->ParamDesc;
160 delete NormProtos;
161 NormProtos = nullptr;
162 }
163}
164
// Reads per-class normalization prototypes from fp into NormProtos and
// returns NormProtos.
// NOTE(review): the function header, the allocation of NormProtos and the
// file-header reads are missing from this listing - confirm against the
// original source.
174 char unichar[2 * UNICHAR_LEN + 1];
175 UNICHAR_ID unichar_id;
176 LIST Protos;
177 int NumProtos;
178
179 /* allocate and initialize the data structure */
181
182 /* read file header and save in data structure */
185
186 /* read protos for each class into a separate list */
187 const int kMaxLineSize = 100;
188 char line[kMaxLineSize];
// Each class starts with a line of the form "<unichar> <NumProtos>".
189 while (fp->FGets(line, kMaxLineSize) != nullptr) {
190 std::istringstream stream(line);
// Use the classic "C" locale so parsing does not depend on the global locale.
191 stream.imbue(std::locale::classic());
// NOTE(review): extraction into a raw char array is not length-limited
// before C++20; if 2 * UNICHAR_LEN + 1 is smaller than kMaxLineSize, an
// over-long token could overflow `unichar` - verify, or bound the read.
192 stream >> unichar >> NumProtos;
// Lines that do not parse as "<unichar> <count>" are skipped.
193 if (stream.fail()) {
194 continue;
195 }
196 if (unicharset.contains_unichar(unichar)) {
// Known character: append its NumProtos prototypes to the list stored
// at the character's id.
197 unichar_id = unicharset.unichar_to_id(unichar);
198 Protos = NormProtos->Protos[unichar_id];
199 for (int i = 0; i < NumProtos; i++) {
200 Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams));
201 }
202 NormProtos->Protos[unichar_id] = Protos;
203 } else {
// Unknown character: report it.
// NOTE(review): the loop body is missing from this listing; presumably it
// reads and discards the NumProtos prototypes so the file position stays
// in sync - confirm against the original source.
204 tprintf("Error: unichar %s in normproto file is not in unichar set.\n", unichar);
205 for (int i = 0; i < NumProtos; i++) {
207 }
208 }
209 }
210 return NormProtos;
211} /* ReadNormProtos */
212
213} // namespace tesseract
#define UNICHAR_LEN
Definition: unichar.h:31
#define double_VAR(name, val, comment)
Definition: params.h:366
#define NO_CLASS
Definition: matchdefs.h:35
#define iterate(l)
Definition: oldlist.h:91
uint16_t ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:114
double classify_norm_adj_curl
Definition: normmatch.cpp:74
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
const double kWidthErrorWeighting
Definition: normmatch.cpp:76
double classify_norm_adj_midpoint
Definition: normmatch.cpp:73
@ CharNormLength
Definition: normfeat.h:30
@ CharNormRy
Definition: normfeat.h:30
@ CharNormY
Definition: normfeat.h:30
@ CharNormRx
Definition: normfeat.h:30
int UNICHAR_ID
Definition: unichar.h:34
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:1597
void FreePrototype(void *arg)
Definition: cluster.cpp:1608
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:192
PROTOTYPE * ReadPrototype(TFile *fp, uint16_t N)
Definition: clusttool.cpp:168
UNICHAR_ID CLASS_ID
Definition: matchdefs.h:34
PARAM_DESC * ReadParamDesc(TFile *fp, uint16_t N)
Definition: clusttool.cpp:134
UNICHARSET unicharset
Definition: ccutil.h:61
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:195
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:695
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
size_t size() const
Definition: unicharset.h:355
float ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch)
Definition: normmatch.cpp:94
NORM_PROTOS * ReadNormProtos(TFile *fp)
Definition: normmatch.cpp:173
NORM_PROTOS * NormProtos
Definition: classify.h:432
std::vector< LIST > Protos
Definition: normmatch.cpp:41
PARAM_DESC * ParamDesc
Definition: normmatch.cpp:40
std::vector< float > Params
Definition: ocrfeatures.h:66
list_rec * first_node()
Definition: oldlist.h:107