All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
ambigs.h
Go to the documentation of this file.
1 // File: ambigs.h
3 // Description: Constants, flags, functions for dealing with
4 // ambiguities (training and recognition).
5 // Author: Daria Antonova
6 // Created: Mon Aug 23 11:26:43 PDT 2008
7 //
8 // (C) Copyright 2008, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef TESSERACT_CCUTIL_AMBIGS_H_
22 #define TESSERACT_CCUTIL_AMBIGS_H_
23 
24 #include "elst.h"
25 #include "tprintf.h"
26 #include "unichar.h"
27 #include "unicharset.h"
28 #include "genericvector.h"
29 
30 #define MAX_AMBIG_SIZE 10
31 
32 namespace tesseract {
33 
35 
// Buffer size constant; presumably the size of a scratch buffer used when
// reading unigram ambiguities (the use site is not in this header).
static const int kUnigramAmbigsBufferSize = 1000;
// A single space, stored as a NUL-terminated character array.
static const char kAmbigNgramSeparator[] = " ";
// Tab and space.
static const char kAmbigDelimiters[] = "\t ";
// printf-style diagnostic; takes one int argument (a line number).
static const char kIllegalMsg[] = "Illegal ambiguity specification on line %d\n";
// printf-style diagnostic; takes one string argument (the offending unichar).
static const char kIllegalUnicharMsg[] = "Illegal unichar %s in ambiguity specification\n";
43 
// Classification of an (ocred ngram, correct ngram) pair.
enum AmbigType {
  NOT_AMBIG = 0,    // The ngram pair is not ambiguous.
  REPLACE_AMBIG,    // The ocred ngram should always be replaced by the
                    // correct ngram.
  DEFINITE_AMBIG,   // Add the correct ngram to the classifier results (1-1).
  SIMILAR_AMBIG,    // Use the pairwise classifier for the ocred/correct
                    // pair (1-1).
  CASE_AMBIG,       // The pair is a case ambiguity (1-1).

  AMBIG_TYPE_COUNT  // Number of enum entries.
};
53 
54 // A collection of utility functions for arrays of UNICHAR_IDs that are
55 // terminated by INVALID_UNICHAR_ID.
57  public:
58  // Compares two arrays of unichar ids. Returns -1 if the length of array1 is
59  // less than length of array2, if any array1[i] is less than array2[i].
60  // Returns 0 if the arrays are equal, 1 otherwise.
61  // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID.
62  static inline int compare(const UNICHAR_ID array1[],
63  const UNICHAR_ID array2[]) {
64  const UNICHAR_ID *ptr1 = array1;
65  const UNICHAR_ID *ptr2 = array2;
66  while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) {
67  if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1;
68  ++ptr1;
69  ++ptr2;
70  }
71  if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0;
72  return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1;
73  }
74 
75  // Look uid in the vector of uids. If found, the index of the matched
76  // element is returned. Otherwise, it returns -1.
77  static inline int find_in(const UnicharIdVector& uid_vec,
78  const UNICHAR_ID uid) {
79  for (int i = 0; i < uid_vec.size(); ++i)
80  if (uid_vec[i] == uid) return i;
81  return -1;
82  }
83 
84  // Copies UNICHAR_IDs from dst to src. Returns the number of ids copied.
85  // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID
86  // and that dst has enough space for all the elements from src.
87  static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
88  int i = 0;
89  do {
90  dst[i] = src[i];
91  } while (dst[i++] != INVALID_UNICHAR_ID);
92  return i - 1;
93  }
94 
95  // Prints unichars corresponding to the unichar_ids in the given array.
96  // The function assumes that array is terminated by INVALID_UNICHAR_ID.
97  static inline void print(const UNICHAR_ID array[],
98  const UNICHARSET &unicharset) {
99  const UNICHAR_ID *ptr = array;
100  if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
101  while (*ptr != INVALID_UNICHAR_ID) {
102  tprintf("%s ", unicharset.id_to_unichar(*ptr++));
103  }
104  tprintf("( ");
105  ptr = array;
106  while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
107  tprintf(")\n");
108  }
109 };
110 
111 // AMBIG_SPEC_LIST stores a list of dangerous ambigs that
112 // start with the same unichar (e.g. r->t rn->m rr1->m).
113 class AmbigSpec : public ELIST_LINK {
114  public:
115  AmbigSpec();
117 
118  // Comparator function for sorting AmbigSpec_LISTs. The lists will
119  // be sorted by their wrong_ngram arrays. Example of wrong_ngram vectors
120  // in a sorted AmbigSpec_LIST: [9 1 3], [9 3 4], [9 8], [9 8 1].
121  static int compare_ambig_specs(const void *spec1, const void *spec2) {
122  const AmbigSpec *s1 =
123  *reinterpret_cast<const AmbigSpec * const *>(spec1);
124  const AmbigSpec *s2 =
125  *reinterpret_cast<const AmbigSpec * const *>(spec2);
127  if (result != 0) return result;
129  s2->correct_fragments);
130  }
131 
137 };
139 
140 // AMBIG_TABLE[i] stores a set of ambiguities whose
141 // wrong ngram starts with unichar id i.
143 
145  public:
148  replace_ambigs_.delete_data_pointers();
149  dang_ambigs_.delete_data_pointers();
150  one_to_one_definite_ambigs_.delete_data_pointers();
151  }
152 
153  const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
154  const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }
155 
156  // Initializes the ambigs by adding a NULL pointer to each table.
157  void InitUnicharAmbigs(const UNICHARSET& unicharset,
158  bool use_ambigs_for_adaption);
159 
160  // Loads the universal ambigs that are useful for any language.
161  void LoadUniversal(const UNICHARSET& encoder_set, UNICHARSET* unicharset);
162 
163  // Fills in two ambiguity tables (replaceable and dangerous) with information
164  // read from the ambigs file. An ambiguity table is an array of lists.
165  // The array is indexed by a class id. Each entry in the table provides
166  // a list of potential ambiguities which can start with the corresponding
167  // character. For example the ambiguity "rn -> m", would be located in the
168  // table at index of unicharset.unichar_to_id('r').
169  // 1-1 ambiguities (e.g. s -> S, 1 -> I) are recorded in
170  // one_to_one_definite_ambigs_. This vector is also indexed by the class id
171  // of the wrong part of the ambiguity and each entry contains a vector of
172  // unichar ids that are ambiguous to it.
173  // encoder_set is used to encode the ambiguity strings, undisturbed by new
174  // unichar_ids that may be created by adding the ambigs.
175  void LoadUnicharAmbigs(const UNICHARSET& encoder_set,
176  TFile *ambigs_file, int debug_level,
177  bool use_ambigs_for_adaption, UNICHARSET *unicharset);
178 
179  // Returns definite 1-1 ambigs for the given unichar id.
180  inline const UnicharIdVector *OneToOneDefiniteAmbigs(
181  UNICHAR_ID unichar_id) const {
182  if (one_to_one_definite_ambigs_.empty()) return NULL;
183  return one_to_one_definite_ambigs_[unichar_id];
184  }
185 
186  // Returns a pointer to the vector with all unichar ids that appear in the
187  // 'correct' part of the ambiguity pair when the given unichar id appears
188  // in the 'wrong' part of the ambiguity. E.g. if DangAmbigs file consist of
189  // m->rn,rn->m,m->iii, UnicharAmbigsForAdaption() called with unichar id of
190  // m will return a pointer to a vector with unichar ids of r,n,i.
191  inline const UnicharIdVector *AmbigsForAdaption(
192  UNICHAR_ID unichar_id) const {
193  if (ambigs_for_adaption_.empty()) return NULL;
194  return ambigs_for_adaption_[unichar_id];
195  }
196 
197  // Similar to the above, but return the vector of unichar ids for which
198  // the given unichar_id is an ambiguity (appears in the 'wrong' part of
199  // some ambiguity pair).
200  inline const UnicharIdVector *ReverseAmbigsForAdaption(
201  UNICHAR_ID unichar_id) const {
202  if (reverse_ambigs_for_adaption_.empty()) return NULL;
203  return reverse_ambigs_for_adaption_[unichar_id];
204  }
205 
206  private:
207  bool ParseAmbiguityLine(int line_num, int version, int debug_level,
208  const UNICHARSET &unicharset, char *buffer,
209  int *test_ambig_part_size,
210  UNICHAR_ID *test_unichar_ids,
211  int *replacement_ambig_part_size,
212  char *replacement_string, int *type);
213  bool InsertIntoTable(UnicharAmbigsVector &table,
214  int test_ambig_part_size, UNICHAR_ID *test_unichar_ids,
215  int replacement_ambig_part_size,
216  const char *replacement_string, int type,
217  AmbigSpec *ambig_spec, UNICHARSET *unicharset);
218 
219  UnicharAmbigsVector dang_ambigs_;
220  UnicharAmbigsVector replace_ambigs_;
221  GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
222  GenericVector<UnicharIdVector *> ambigs_for_adaption_;
223  GenericVector<UnicharIdVector *> reverse_ambigs_for_adaption_;
224 };
225 
226 } // namespace tesseract
227 
228 #endif // TESSERACT_CCUTIL_AMBIGS_H_
AmbigType
Definition: ambigs.h:44
const UnicharIdVector * OneToOneDefiniteAmbigs(UNICHAR_ID unichar_id) const
Definition: ambigs.h:180
int size() const
Definition: genericvector.h:72
static int compare(const UNICHAR_ID array1[], const UNICHAR_ID array2[])
Definition: ambigs.h:62
static int compare_ambig_specs(const void *spec1, const void *spec2)
Definition: ambigs.h:121
#define MAX_AMBIG_SIZE
Definition: ambigs.h:30
#define tprintf(...)
Definition: tprintf.h:31
ELISTIZEH(AmbigSpec)
const UnicharAmbigsVector & replace_ambigs() const
Definition: ambigs.h:154
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:132
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:67
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:142
static int copy(const UNICHAR_ID src[], UNICHAR_ID dst[])
Definition: ambigs.h:87
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:53
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:74
void delete_data_pointers()
const char *const id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
static int find_in(const UnicharIdVector &uid_vec, const UNICHAR_ID uid)
Definition: ambigs.h:77
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:153
int UNICHAR_ID
Definition: unichar.h:33
UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:133
UNICHAR_ID correct_ngram_id
Definition: ambigs.h:134
const UnicharIdVector * ReverseAmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:200
static void print(const UNICHAR_ID array[], const UNICHARSET &unicharset)
Definition: ambigs.h:97
const UnicharIdVector * AmbigsForAdaption(UNICHAR_ID unichar_id) const
Definition: ambigs.h:191
AmbigType type
Definition: ambigs.h:135
GenericVector< UNICHAR_ID > UnicharIdVector
Definition: ambigs.h:34
#define NULL
Definition: host.h:144