tesseract v5.3.3.20231005
ambigs.h
Go to the documentation of this file.
1
2// File: ambigs.h
3// Description: Constants, flags, functions for dealing with
4// ambiguities (training and recognition).
5// Author: Daria Antonova
6//
7// (C) Copyright 2008, Google Inc.
8// Licensed under the Apache License, Version 2.0 (the "License");
9// you may not use this file except in compliance with the License.
10// You may obtain a copy of the License at
11// http://www.apache.org/licenses/LICENSE-2.0
12// Unless required by applicable law or agreed to in writing, software
13// distributed under the License is distributed on an "AS IS" BASIS,
14// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15// See the License for the specific language governing permissions and
16// limitations under the License.
17//
19
20#ifndef TESSERACT_CCUTIL_AMBIGS_H_
21#define TESSERACT_CCUTIL_AMBIGS_H_
22
23#ifdef HAVE_CONFIG_H
24# include "config_auto.h" // DISABLED_LEGACY_ENGINE
25#endif
26
27#if !defined(DISABLED_LEGACY_ENGINE)
28
29# include <tesseract/unichar.h>
30# include "elst.h"
31# include "tprintf.h"
32# include "unicharset.h"
33
34# define MAX_AMBIG_SIZE 10
35
36namespace tesseract {
37
38using UnicharIdVector = std::vector<UNICHAR_ID>;
39
41 NOT_AMBIG, // the ngram pair is not ambiguous
42 REPLACE_AMBIG, // ocred ngram should always be substituted with correct
43 DEFINITE_AMBIG, // add correct ngram to the classifier results (1-1)
44 SIMILAR_AMBIG, // use pairwise classifier for ocred/correct pair (1-1)
45 CASE_AMBIG, // this is a case ambiguity (1-1)
46
47 AMBIG_TYPE_COUNT // number of enum entries
48};
49
50// A collection of utility functions for arrays of UNICHAR_IDs that are
51// terminated by INVALID_UNICHAR_ID.
53public:
54 // Compares two arrays of unichar ids. Returns -1 if the length of array1 is
55 // less than length of array2, if any array1[i] is less than array2[i].
56 // Returns 0 if the arrays are equal, 1 otherwise.
57 // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
58 static inline int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2) {
59 for (;;) {
60 const UNICHAR_ID val1 = *ptr1++;
61 const UNICHAR_ID val2 = *ptr2++;
62 if (val1 != val2) {
63 if (val1 == INVALID_UNICHAR_ID) {
64 return -1;
65 }
66 if (val2 == INVALID_UNICHAR_ID) {
67 return 1;
68 }
69 if (val1 < val2) {
70 return -1;
71 }
72 return 1;
73 }
74 if (val1 == INVALID_UNICHAR_ID) {
75 return 0;
76 }
77 }
78 }
79
80 // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
81 // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
82 // and that dst has enough space for all the elements from src.
83 static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
84 int i = 0;
85 do {
86 dst[i] = src[i];
87 } while (dst[i++] != INVALID_UNICHAR_ID);
88 return i - 1;
89 }
90
91 // Prints unichars corresponding to the unichar_ids in the given array.
92 // The function assumes that array is terminated by INVALID_UNICHAR_ID.
93 static inline void print(const UNICHAR_ID array[], const UNICHARSET &unicharset) {
94 const UNICHAR_ID *ptr = array;
95 if (*ptr == INVALID_UNICHAR_ID) {
96 tprintf("[Empty]");
97 }
98 while (*ptr != INVALID_UNICHAR_ID) {
99 tprintf("%s ", unicharset.id_to_unichar(*ptr++));
100 }
101 tprintf("( ");
102 ptr = array;
103 while (*ptr != INVALID_UNICHAR_ID) {
104 tprintf("%d ", *ptr++);
105 }
106 tprintf(")\n");
107 }
108};
109
110 // An AmbigSpec_LIST stores a list of dangerous ambigs that
111// start with the same unichar (e.g. r->t rn->m rr1->m).
112class AmbigSpec : public ELIST_LINK {
113public:
114 AmbigSpec();
115 ~AmbigSpec() = default;
116
117 // Comparator function for sorting AmbigSpec_LISTs. The lists will
118 // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
119 // in a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9 8 1].
120 static int compare_ambig_specs(const void *spec1, const void *spec2) {
121 const AmbigSpec *s1 = *static_cast<const AmbigSpec *const *>(spec1);
122 const AmbigSpec *s2 = *static_cast<const AmbigSpec *const *>(spec2);
124 if (result != 0) {
125 return result;
126 }
128 }
129
135};
137
138// AMBIG_TABLE[i] stores a set of ambiguities whose
139// wrong ngram starts with unichar id i.
140using UnicharAmbigsVector = std::vector<AmbigSpec_LIST *>;
141
143public:
144 UnicharAmbigs() = default;
146 for (auto data : replace_ambigs_) {
147 delete data;
148 }
149 for (auto data : dang_ambigs_) {
150 delete data;
151 }
152 for (auto data : one_to_one_definite_ambigs_) {
153 delete data;
154 }
155 }
156
158 return dang_ambigs_;
159 }
161 return replace_ambigs_;
162 }
163
164 // Initializes the ambigs by adding a null pointer to each table.
165 void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption);
166
167 // Loads the universal ambigs that are useful for any language.
168 void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset);
169
170 // Fills in two ambiguity tables (replaceable and dangerous) with information
171 // read from the ambigs file. An ambiguity table is an array of lists.
172 // The array is indexed by a class id. Each entry in the table provides
173 // a list of potential ambiguities which can start with the corresponding
174 // character. For example the ambiguity "rn -> m", would be located in the
175 // table at index of unicharset.unichar_to_id('r').
176 // 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
177 // one_to_one_definite_ambigs_. This vector is also indexed by the class id
178 // of the wrong part of the ambiguity and each entry contains a vector of
179 // unichar ids that are ambiguous to it.
180 // encoder_set is used to encode the ambiguity strings, undisturbed by new
181 // unichar_ids that may be created by adding the ambigs.
182 void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level,
183 bool use_ambigs_for_adaption, UNICHARSET *unicharset);
184
185 // Returns definite 1-1 ambigs for the given unichar id.
186 inline const UnicharIdVector *OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const {
187 if (one_to_one_definite_ambigs_.empty()) {
188 return nullptr;
189 }
190 return one_to_one_definite_ambigs_[unichar_id];
191 }
192
193 // Returns a pointer to the vector with all unichar ids that appear in the
194 // 'correct' part of the ambiguity pair when the given unichar id appears
195 // in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consist of
196 // m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of
197 // m will return a pointer to a vector with unichar ids of r,n,i.
198 inline const UnicharIdVector *AmbigsForAdaption(UNICHAR_ID unichar_id) const {
199 if (ambigs_for_adaption_.empty()) {
200 return nullptr;
201 }
202 return ambigs_for_adaption_[unichar_id];
203 }
204
205 // Similar to the above, but return the vector of unichar ids for which
206 // the given unichar_id is an ambiguity (appears in the 'wrong' part of
207 // some ambiguity pair).
208 inline const UnicharIdVector *ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const {
209 if (reverse_ambigs_for_adaption_.empty()) {
210 return nullptr;
211 }
212 return reverse_ambigs_for_adaption_[unichar_id];
213 }
214
215private:
216 bool ParseAmbiguityLine(int line_num, int version, int debug_level, const UNICHARSET &unicharset,
217 char *buffer, int *test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
218 int *replacement_ambig_part_size, char *replacement_string, int *type);
219 bool InsertIntoTable(UnicharAmbigsVector &table, int test_ambig_part_size,
220 UNICHAR_ID *test_unichar_ids, int replacement_ambig_part_size,
221 const char *replacement_string, int type, AmbigSpec *ambig_spec,
222 UNICHARSET *unicharset);
223
224 UnicharAmbigsVector dang_ambigs_;
225 UnicharAmbigsVector replace_ambigs_;
226 std::vector<UnicharIdVector *> one_to_one_definite_ambigs_;
227 std::vector<UnicharIdVector *> ambigs_for_adaption_;
228 std::vector<UnicharIdVector *> reverse_ambigs_for_adaption_;
229};
230
231} // namespace tesseract
232
233#endif // !defined(DISABLED_LEGACY_ENGINE)
234
235#endif // TESSERACT_CCUTIL_AMBIGS_H_
#define ELISTIZEH(CLASSNAME)
Definition: elst.h:803
#define MAX_AMBIG_SIZE
Definition: ambigs.h:34
STL namespace.
std::vector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:140
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int UNICHAR_ID
Definition: unichar.h:34
AmbigType
Definition: ambigs.h:40
@ CASE_AMBIG
Definition: ambigs.h:45
@ DEFINITE_AMBIG
Definition: ambigs.h:43
@ REPLACE_AMBIG
Definition: ambigs.h:42
@ AMBIG_TYPE_COUNT
Definition: ambigs.h:47
@ SIMILAR_AMBIG
Definition: ambigs.h:44
@ NOT_AMBIG
Definition: ambigs.h:41
std::vector< UNICHAR_ID > UnicharIdVector
Definition: ambigs.h:38
type
Definition: upload.py:458
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:93
static int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2)
Definition: ambigs.h:58
static int copy(const UNICHAR_ID src[], UNICHAR_ID dst[])
Definition: ambigs.h:83
UNICHAR_ID correct_ngram_id
Definition: ambigs.h:132
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:130
static int compare_ambig_specs(const void *spec1, const void *spec2)
Definition: ambigs.h:120
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:131
AmbigType type
Definition: ambigs.h:133
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:160
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:157
const UnicharIdVector * AmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:198
const UnicharIdVector * OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const
Definition: ambigs.h:186
const UnicharIdVector * ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:208
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:279