All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
dawg.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: dawg.c (Formerly dawg.c)
5  * Description: Use a Directed Accyclic Word Graph
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Wed Jul 24 16:59:16 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 
29 #ifdef _MSC_VER
30 #pragma warning(disable:4244) // Conversion warnings
31 #pragma warning(disable:4800) // int/bool warnings
32 #endif
33 #include "dawg.h"
34 
35 #include "cutil.h"
36 #include "dict.h"
37 #include "emalloc.h"
38 #include "freelist.h"
39 #include "helpers.h"
40 #include "strngs.h"
41 #include "tesscallback.h"
42 #include "tprintf.h"
43 
44 /*----------------------------------------------------------------------
45  F u n c t i o n s f o r D a w g
46 ----------------------------------------------------------------------*/
47 namespace tesseract {
48 
50  bool requires_complete) const {
51  if (word.length() == 0) return !requires_complete;
52  NODE_REF node = 0;
53  int end_index = word.length() - 1;
54  for (int i = 0; i < end_index; i++) {
55  EDGE_REF edge = edge_char_of(node, word.unichar_id(i), false);
56  if (edge == NO_EDGE) {
57  return false;
58  }
59  if ((node = next_node(edge)) == 0) {
60  // This only happens if all words following this edge terminate --
61  // there are no larger words. See Trie::add_word_to_dawg()
62  return false;
63  }
64  }
65  // Now check the last character.
66  return edge_char_of(node, word.unichar_id(end_index), requires_complete) !=
67  NO_EDGE;
68 }
69 
70 bool Dawg::word_in_dawg(const WERD_CHOICE &word) const {
71  return prefix_in_dawg(word, true);
72 }
73 
75  const UNICHARSET &unicharset,
76  bool enable_wildcard) const {
77  if (filename == NULL) return 0;
78 
79  FILE *word_file;
80  char string [CHARS_PER_LINE];
81  int misses = 0;
82  UNICHAR_ID wildcard = unicharset.unichar_to_id(kWildcard);
83 
84  word_file = open_file (filename, "r");
85 
86  while (fgets (string, CHARS_PER_LINE, word_file) != NULL) {
87  chomp_string(string); // remove newline
88  WERD_CHOICE word(string, unicharset);
89  if (word.length() > 0 &&
90  !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
91  if (!match_words(&word, 0, 0,
92  enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
93  tprintf("Missing word: %s\n", string);
94  ++misses;
95  }
96  } else {
97  tprintf("Failed to create a valid word from %s\n", string);
98  }
99  }
100  fclose (word_file);
101  // Make sure the user sees this with fprintf instead of tprintf.
102  if (debug_level_) tprintf("Number of lost words=%d\n", misses);
103  return misses;
104 }
105 
106 void Dawg::iterate_words(const UNICHARSET &unicharset,
108  WERD_CHOICE word(&unicharset);
109  iterate_words_rec(word, 0, cb);
110 }
111 
113  STRING s;
114  wc->string_and_lengths(&s, NULL);
115  cb->Run(s.string());
116 }
117 
118 void Dawg::iterate_words(const UNICHARSET &unicharset,
119  TessCallback1<const char *> *cb) const {
122  WERD_CHOICE word(&unicharset);
123  iterate_words_rec(word, 0, shim);
124  delete shim;
125 }
126 
127 void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
128  NODE_REF to_explore,
130  NodeChildVector children;
131  this->unichar_ids_of(to_explore, &children, false);
132  for (int i = 0; i < children.size(); i++) {
133  WERD_CHOICE next_word(word_so_far);
134  next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
135  if (this->end_of_word(children[i].edge_ref)) {
136  cb->Run(&next_word);
137  }
138  NODE_REF next = next_node(children[i].edge_ref);
139  if (next != 0) {
140  iterate_words_rec(next_word, next, cb);
141  }
142  }
143 }
144 
146  NODE_REF node, UNICHAR_ID wildcard) const {
147  EDGE_REF edge;
148  inT32 word_end;
149 
150  if (wildcard != INVALID_UNICHAR_ID && word->unichar_id(index) == wildcard) {
151  bool any_matched = false;
152  NodeChildVector vec;
153  this->unichar_ids_of(node, &vec, false);
154  for (int i = 0; i < vec.size(); ++i) {
155  word->set_unichar_id(vec[i].unichar_id, index);
156  if (match_words(word, index, node, wildcard))
157  any_matched = true;
158  }
159  word->set_unichar_id(wildcard, index);
160  return any_matched;
161  } else {
162  word_end = index == word->length() - 1;
163  edge = edge_char_of(node, word->unichar_id(index), word_end);
164  if (edge != NO_EDGE) { // normal edge in DAWG
165  node = next_node(edge);
166  if (word_end) {
167  if (debug_level_ > 1) word->print("match_words() found: ");
168  return true;
169  } else if (node != 0) {
170  return match_words(word, index+1, node, wildcard);
171  }
172  }
173  }
174  return false;
175 }
176 
177 void Dawg::init(DawgType type, const STRING &lang,
178  PermuterType perm, int unicharset_size, int debug_level) {
179  type_ = type;
180  lang_ = lang;
181  perm_ = perm;
182  ASSERT_HOST(unicharset_size > 0);
183  unicharset_size_ = unicharset_size;
184  // Set bit masks. We will use the value unicharset_size_ as a null char, so
185  // the actual number of unichars is unicharset_size_ + 1.
186  flag_start_bit_ = ceil(log(unicharset_size_ + 1.0) / log(2.0));
188  letter_mask_ = ~(~0ull << flag_start_bit_);
191 
192  debug_level_ = debug_level;
193 }
194 
195 
196 /*----------------------------------------------------------------------
197  F u n c t i o n s f o r S q u i s h e d D a w g
198 ----------------------------------------------------------------------*/
199 
201 
203  UNICHAR_ID unichar_id,
204  bool word_end) const {
205  EDGE_REF edge = node;
206  if (node == 0) { // binary search
207  EDGE_REF start = 0;
208  EDGE_REF end = num_forward_edges_in_node0 - 1;
209  int compare;
210  while (start <= end) {
211  edge = (start + end) >> 1; // (start + end) / 2
212  compare = given_greater_than_edge_rec(NO_EDGE, word_end,
213  unichar_id, edges_[edge]);
214  if (compare == 0) { // given == vec[k]
215  return edge;
216  } else if (compare == 1) { // given > vec[k]
217  start = edge + 1;
218  } else { // given < vec[k]
219  end = edge - 1;
220  }
221  }
222  } else { // linear search
223  if (edge != NO_EDGE && edge_occupied(edge)) {
224  do {
225  if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
226  (!word_end || end_of_word_from_edge_rec(edges_[edge])))
227  return (edge);
228  } while (!last_edge(edge++));
229  }
230  }
231  return (NO_EDGE); // not found
232 }
233 
234 inT32 SquishedDawg::num_forward_edges(NODE_REF node) const {
235  EDGE_REF edge = node;
236  inT32 num = 0;
237 
238  if (forward_edge (edge)) {
239  do {
240  num++;
241  } while (!last_edge(edge++));
242  }
243 
244  return (num);
245 }
246 
247 void SquishedDawg::print_node(NODE_REF node, int max_num_edges) const {
248  if (node == NO_EDGE) return; // nothing to print
249 
250  EDGE_REF edge = node;
251  const char *forward_string = "FORWARD";
252  const char *backward_string = " ";
253 
254  const char *last_string = "LAST";
255  const char *not_last_string = " ";
256 
257  const char *eow_string = "EOW";
258  const char *not_eow_string = " ";
259 
260  const char *direction;
261  const char *is_last;
262  const char *eow;
263 
264  UNICHAR_ID unichar_id;
265 
266  if (edge_occupied(edge)) {
267  do {
268  direction =
269  forward_edge(edge) ? forward_string : backward_string;
270  is_last = last_edge(edge) ? last_string : not_last_string;
271  eow = end_of_word(edge) ? eow_string : not_eow_string;
272 
273  unichar_id = edge_letter(edge);
274  tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
275  edge, next_node(edge), unichar_id,
276  direction, is_last, eow);
277 
278  if (edge - node > max_num_edges) return;
279  } while (!last_edge(edge++));
280 
281  if (edge < num_edges_ &&
282  edge_occupied(edge) && backward_edge(edge)) {
283  do {
284  direction =
285  forward_edge(edge) ? forward_string : backward_string;
286  is_last = last_edge(edge) ? last_string : not_last_string;
287  eow = end_of_word(edge) ? eow_string : not_eow_string;
288 
289  unichar_id = edge_letter(edge);
290  tprintf(REFFORMAT " : next = " REFFORMAT
291  ", unichar_id = %d, %s %s %s\n",
292  edge, next_node(edge), unichar_id,
293  direction, is_last, eow);
294 
295  if (edge - node > MAX_NODE_EDGES_DISPLAY) return;
296  } while (!last_edge(edge++));
297  }
298  }
299  else {
300  tprintf(REFFORMAT " : no edges in this node\n", node);
301  }
302  tprintf("\n");
303 }
304 
305 void SquishedDawg::print_edge(EDGE_REF edge) const {
306  if (edge == NO_EDGE) {
307  tprintf("NO_EDGE\n");
308  } else {
309  tprintf(REFFORMAT " : next = " REFFORMAT
310  ", unichar_id = '%d', %s %s %s\n", edge,
311  next_node(edge), edge_letter(edge),
312  (forward_edge(edge) ? "FORWARD" : " "),
313  (last_edge(edge) ? "LAST" : " "),
314  (end_of_word(edge) ? "EOW" : ""));
315  }
316 }
317 
318 void SquishedDawg::read_squished_dawg(FILE *file,
319  DawgType type,
320  const STRING &lang,
321  PermuterType perm,
322  int debug_level) {
323  if (debug_level) tprintf("Reading squished dawg\n");
324 
325  // Read the magic number and if it does not match kDawgMagicNumber
326  // set swap to true to indicate that we need to switch endianness.
327  inT16 magic;
328  fread(&magic, sizeof(inT16), 1, file);
329  bool swap = (magic != kDawgMagicNumber);
330 
331  int unicharset_size;
332  fread(&unicharset_size, sizeof(inT32), 1, file);
333  fread(&num_edges_, sizeof(inT32), 1, file);
334 
335  if (swap) {
336  ReverseN(&unicharset_size, sizeof(unicharset_size));
337  ReverseN(&num_edges_, sizeof(num_edges_));
338  }
339  ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
340  Dawg::init(type, lang, perm, unicharset_size, debug_level);
341 
342  edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_);
343  fread(&edges_[0], sizeof(EDGE_RECORD), num_edges_, file);
344  EDGE_REF edge;
345  if (swap) {
346  for (edge = 0; edge < num_edges_; ++edge) {
347  ReverseN(&edges_[edge], sizeof(edges_[edge]));
348  }
349  }
350  if (debug_level > 2) {
351  tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
352  type_, lang_.string(), perm_, unicharset_size_, num_edges_);
353  for (edge = 0; edge < num_edges_; ++edge)
354  print_edge(edge);
355  }
356 }
357 
358 NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {
359  EDGE_REF edge;
360  NODE_MAP node_map;
361  inT32 node_counter;
362  inT32 num_edges;
363 
364  node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_);
365 
366  for (edge = 0; edge < num_edges_; edge++) // init all slots
367  node_map [edge] = -1;
368 
369  node_counter = num_forward_edges(0);
370 
371  *num_nodes = 0;
372  for (edge = 0; edge < num_edges_; edge++) { // search all slots
373 
374  if (forward_edge(edge)) {
375  (*num_nodes)++; // count nodes links
376  node_map[edge] = (edge ? node_counter : 0);
377  num_edges = num_forward_edges(edge);
378  if (edge != 0) node_counter += num_edges;
379  edge += num_edges;
380  if (edge >= num_edges_) break;
381  if (backward_edge(edge)) while (!last_edge(edge++));
382  edge--;
383  }
384  }
385  return (node_map);
386 }
387 
389  EDGE_REF edge;
390  inT32 num_edges;
391  inT32 node_count = 0;
392  NODE_MAP node_map;
393  EDGE_REF old_index;
394  EDGE_RECORD temp_record;
395 
396  if (debug_level_) tprintf("write_squished_dawg\n");
397 
398  node_map = build_node_map(&node_count);
399 
400  // Write the magic number to help detecting a change in endianness.
401  inT16 magic = kDawgMagicNumber;
402  fwrite(&magic, sizeof(inT16), 1, file);
403  fwrite(&unicharset_size_, sizeof(inT32), 1, file);
404 
405  // Count the number of edges in this Dawg.
406  num_edges = 0;
407  for (edge=0; edge < num_edges_; edge++)
408  if (forward_edge(edge))
409  num_edges++;
410 
411  fwrite(&num_edges, sizeof(inT32), 1, file); // write edge count to file
412 
413  if (debug_level_) {
414  tprintf("%d nodes in DAWG\n", node_count);
415  tprintf("%d edges in DAWG\n", num_edges);
416  }
417 
418  for (edge = 0; edge < num_edges_; edge++) {
419  if (forward_edge(edge)) { // write forward edges
420  do {
421  old_index = next_node_from_edge_rec(edges_[edge]);
422  set_next_node(edge, node_map[old_index]);
423  temp_record = edges_[edge];
424  fwrite(&(temp_record), sizeof(EDGE_RECORD), 1, file);
425  set_next_node(edge, old_index);
426  } while (!last_edge(edge++));
427 
428  if (edge >= num_edges_) break;
429  if (backward_edge(edge)) // skip back links
430  while (!last_edge(edge++));
431 
432  edge--;
433  }
434  }
435  free(node_map);
436 }
437 
438 } // namespace tesseract
virtual void Run(A1)=0
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:446
int next_node_start_bit_
Definition: dawg.h:299
const STRING & lang() const
Definition: dawg.h:128
void memfree(void *element)
Definition: freelist.cpp:30
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:356
int size() const
Definition: genericvector.h:72
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:213
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
EDGE_RECORD * EDGE_ARRAY
Definition: dawg.h:53
virtual bool end_of_word(EDGE_REF edge_ref) const =0
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:304
int length() const
Definition: ratngs.h:300
void print_node(NODE_REF node, int max_num_edges) const
Definition: dawg.cpp:247
EDGE_REF * NODE_MAP
Definition: dawg.h:56
NODE_REF next_node(EDGE_REF edge) const
Definition: dawg.h:459
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const WERD_CHOICE * > *cb) const
Definition: dawg.cpp:127
int * memalloc(int size)
Definition: freelist.cpp:22
#define tprintf(...)
Definition: tprintf.h:31
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
PermuterType
Definition: ratngs.h:240
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Definition: dawg.h:470
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:200
#define ASSERT_HOST(x)
Definition: errcode.h:84
void CallWithUTF8(TessCallback1< const char * > *cb, const WERD_CHOICE *wc)
Definition: dawg.cpp:112
bool end_of_word(EDGE_REF edge_ref) const
Definition: dawg.h:465
EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const
Returns the edge that corresponds to the letter out of this node.
Definition: dawg.cpp:202
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
Definition: dawg.cpp:74
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:70
void write_squished_dawg(FILE *file)
Writes the squished/reduced Dawg to a file.
Definition: dawg.cpp:388
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
void iterate_words(const UNICHARSET &unicharset, TessCallback1< const WERD_CHOICE * > *cb) const
Definition: dawg.cpp:106
uinT64 next_node_mask_
Definition: dawg.h:300
uinT64 letter_mask_
Definition: dawg.h:302
void chomp_string(char *str)
Definition: helpers.h:75
int unicharset_size_
Definition: dawg.h:297
static const inT16 kDawgMagicNumber
Magic number to determine endianness when reading the Dawg from file.
Definition: dawg.h:121
uinT64 EDGE_RECORD
Definition: dawg.h:50
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
Definition: dawg.cpp:49
bool match_words(WERD_CHOICE *word, inT32 index, NODE_REF node, UNICHAR_ID wildcard) const
Definition: dawg.cpp:145
int debug_level_
Definition: dawg.h:304
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:86
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:217
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
#define NUM_FLAG_BITS
Definition: dawg.h:91
int UNICHAR_ID
Definition: unichar.h:33
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:238
STRING lang_
Definition: dawg.h:290
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const =0
int flag_start_bit_
Definition: dawg.h:298
#define REFFORMAT
Definition: dawg.h:92
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:177
DawgType type() const
Definition: dawg.h:127
DawgType
Definition: dawg.h:71
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82
#define CHARS_PER_LINE
Definition: cutil.h:57
void print() const
Definition: ratngs.h:563
inT64 EDGE_REF
Definition: dawg.h:54
Definition: strngs.h:44
#define NULL
Definition: host.h:144
DawgType type_
Definition: dawg.h:289
inT64 NODE_REF
Definition: dawg.h:55
const char * string() const
Definition: strngs.cpp:193
void init(DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
Definition: dawg.cpp:177
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:292
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
uinT64 flags_mask_
Definition: dawg.h:301
short inT16
Definition: host.h:100
int inT32
Definition: host.h:102