All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract::Trie Class Reference

#include <trie.h>

Inheritance diagram for tesseract::Trie:
tesseract::Dawg

Public Types

enum  RTLReversePolicy { RRP_DO_NO_REVERSE, RRP_REVERSE_IF_HAS_RTL, RRP_FORCE_REVERSE }
 

Public Member Functions

 Trie (DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
 
virtual ~Trie ()
 
void clear ()
 
EDGE_REF edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
 
void unichar_ids_of (NODE_REF node, NodeChildVector *vec, bool word_end) const
 
NODE_REF next_node (EDGE_REF edge_ref) const
 
bool end_of_word (EDGE_REF edge_ref) const
 
UNICHAR_ID edge_letter (EDGE_REF edge_ref) const
 
void KillEdge (EDGE_RECORD *edge_rec) const
 
bool DeadEdge (const EDGE_RECORD &edge_rec) const
 
void print_node (NODE_REF node, int max_num_edges) const
 
SquishedDawgtrie_to_dawg ()
 
bool read_and_add_word_list (const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
 
bool read_word_list (const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy, GenericVector< STRING > *words)
 
bool add_word_list (const GenericVector< STRING > &words, const UNICHARSET &unicharset)
 
bool read_pattern_list (const char *filename, const UNICHARSET &unicharset)
 
void initialize_patterns (UNICHARSET *unicharset)
 
void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
 
virtual EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
 
bool add_word_to_dawg (const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
 
bool add_word_to_dawg (const WERD_CHOICE &word)
 
- Public Member Functions inherited from tesseract::Dawg
DawgType type () const
 
const STRINGlang () const
 
PermuterType permuter () const
 
virtual ~Dawg ()
 
bool word_in_dawg (const WERD_CHOICE &word) const
 Returns true if the given word is in the Dawg. More...
 
bool prefix_in_dawg (const WERD_CHOICE &prefix, bool requires_complete) const
 
int check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
 
void iterate_words (const UNICHARSET &unicharset, TessCallback1< const WERD_CHOICE * > *cb) const
 
void iterate_words (const UNICHARSET &unicharset, TessCallback1< const char * > *cb) const
 

Static Public Member Functions

static const char * get_reverse_policy_name (RTLReversePolicy reverse_policy)
 

Static Public Attributes

static const int kSaneNumConcreteChars = 0
 
static const char kAlphaPatternUnicode [] = "\u2000"
 
static const char kDigitPatternUnicode [] = "\u2001"
 
static const char kAlphanumPatternUnicode [] = "\u2002"
 
static const char kPuncPatternUnicode [] = "\u2003"
 
static const char kLowerPatternUnicode [] = "\u2004"
 
static const char kUpperPatternUnicode [] = "\u2005"
 
- Static Public Attributes inherited from tesseract::Dawg
static const inT16 kDawgMagicNumber = 42
 Magic number to determine endianness when reading the Dawg from file. More...
 
static const UNICHAR_ID kPatternUnicharID = 0
 

Protected Member Functions

EDGE_RECORDderef_edge_ref (EDGE_REF edge_ref) const
 
EDGE_REF make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const
 
void link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
 
void print_edge_rec (const EDGE_RECORD &edge_rec) const
 
bool can_be_eliminated (const EDGE_RECORD &edge_rec)
 
void print_all (const char *msg, int max_num_edges)
 
bool edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const
 
bool add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
 
bool add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
 
void add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
 
NODE_REF new_dawg_node ()
 
void remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
 
void remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
 
bool eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
 
bool reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
 
void sort_edges (EDGE_VECTOR *edges)
 
void reduce_node_input (NODE_REF node, NODE_MARKER reduced_nodes)
 
UNICHAR_ID character_class_to_pattern (char ch)
 
- Protected Member Functions inherited from tesseract::Dawg
 Dawg ()
 
NODE_REF next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the next node visited by following this edge. More...
 
bool marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the marker flag of this edge. More...
 
int direction_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the direction flag of this edge. More...
 
bool end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns true if this edge marks the end of a word. More...
 
UNICHAR_ID unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns UNICHAR_ID recorded in this edge. More...
 
void set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value)
 Sets the next node link for this edge in the Dawg. More...
 
void set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec)
 Sets this edge record to be the last one in a sequence of edges. More...
 
int given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
 
bool edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
 
void init (DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
 
bool match_words (WERD_CHOICE *word, inT32 index, NODE_REF node, UNICHAR_ID wildcard) const
 
void iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const WERD_CHOICE * > *cb) const
 

Protected Attributes

TRIE_NODES nodes_
 
uinT64 num_edges_
 
uinT64 deref_direction_mask_
 
uinT64 deref_node_index_mask_
 
GenericVector< EDGE_INDEXroot_back_freelist_
 
bool initialized_patterns_
 
UNICHAR_ID alpha_pattern_
 
UNICHAR_ID digit_pattern_
 
UNICHAR_ID alphanum_pattern_
 
UNICHAR_ID punc_pattern_
 
UNICHAR_ID lower_pattern_
 
UNICHAR_ID upper_pattern_
 
- Protected Attributes inherited from tesseract::Dawg
DawgType type_
 
STRING lang_
 
PermuterType perm_
 Permuter code that should be used if the word is found in this Dawg. More...
 
int unicharset_size_
 
int flag_start_bit_
 
int next_node_start_bit_
 
uinT64 next_node_mask_
 
uinT64 flags_mask_
 
uinT64 letter_mask_
 
int debug_level_
 

Detailed Description

Concrete class for Trie data structure that allows to store a list of words (extends Dawg base class) as well as dynamically add new words. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward and backward edges.

Definition at line 62 of file trie.h.

Member Enumeration Documentation

Enumerator
RRP_DO_NO_REVERSE 
RRP_REVERSE_IF_HAS_RTL 
RRP_FORCE_REVERSE 

Definition at line 64 of file trie.h.

Constructor & Destructor Documentation

tesseract::Trie::Trie ( DawgType  type,
const STRING lang,
PermuterType  perm,
int  unicharset_size,
int  debug_level 
)
inline

Definition at line 89 of file trie.h.

90  {
91  init(type, lang, perm, unicharset_size, debug_level);
92  num_edges_ = 0;
94  new_dawg_node(); // need to allocate node 0
95  initialized_patterns_ = false;
96  }
uinT64 num_edges_
Definition: trie.h:421
NODE_REF new_dawg_node()
Definition: trie.cpp:277
uinT64 deref_node_index_mask_
Definition: trie.h:423
bool initialized_patterns_
Definition: trie.h:428
uinT64 letter_mask_
Definition: dawg.h:302
DawgType type() const
Definition: dawg.h:127
void init(DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
Definition: dawg.cpp:177
virtual tesseract::Trie::~Trie ( )
inlinevirtual

Definition at line 97 of file trie.h.

TRIE_NODES nodes_
Definition: trie.h:420
void delete_data_pointers()

Member Function Documentation

bool tesseract::Trie::add_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 125 of file trie.cpp.

127  {
128  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
129  &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
130  int search_index;
131  if (node1 == 0 && direction == FORWARD_EDGE) {
132  search_index = 0; // find the index to make the add sorted
133  while (search_index < vec->size() &&
134  given_greater_than_edge_rec(node2, word_end, unichar_id,
135  (*vec)[search_index]) == 1) {
136  search_index++;
137  }
138  } else {
139  search_index = vec->size(); // add is unsorted, so index does not matter
140  }
141  EDGE_RECORD edge_rec;
142  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
143  if (node1 == 0 && direction == BACKWARD_EDGE &&
145  EDGE_INDEX edge_index = root_back_freelist_.pop_back();
146  (*vec)[edge_index] = edge_rec;
147  } else if (search_index < vec->size()) {
148  vec->insert(edge_rec, search_index);
149  } else {
150  vec->push_back(edge_rec);
151  }
152  if (debug_level_ > 1) {
153  tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
154  print_edge_rec(edge_rec);
155  tprintf("\n");
156  }
157  num_edges_++;
158  return true;
159 }
int size() const
Definition: genericvector.h:72
uinT64 num_edges_
Definition: trie.h:421
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:318
TRIE_NODES nodes_
Definition: trie.h:420
#define FORWARD_EDGE
Definition: dawg.h:84
void insert(T t, int index)
uinT64 EDGE_RECORD
Definition: dawg.h:50
#define BACKWARD_EDGE
Definition: dawg.h:85
int debug_level_
Definition: dawg.h:304
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:238
bool empty() const
Definition: genericvector.h:84
#define REFFORMAT
Definition: dawg.h:92
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:425
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:307
inT64 EDGE_INDEX
Definition: trie.h:32
bool tesseract::Trie::add_new_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Definition at line 357 of file trie.h.

358  {
359  return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
360  word_end, unichar_id) &&
361  add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
362  word_end, unichar_id));
363  }
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:125
#define FORWARD_EDGE
Definition: dawg.h:84
#define BACKWARD_EDGE
Definition: dawg.h:85
void tesseract::Trie::add_word_ending ( EDGE_RECORD edge,
NODE_REF  the_next_node,
bool  repeats,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 161 of file trie.cpp.

164  {
165  EDGE_RECORD *back_edge_ptr;
166  EDGE_INDEX back_edge_index;
167  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
168  unichar_id, &back_edge_ptr, &back_edge_index));
169  if (marker_flag) {
170  *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
171  *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
172  }
173  // Mark both directions as end of word.
174  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
175  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
176 }
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:103
#define ASSERT_HOST(x)
Definition: errcode.h:84
#define WERD_END_FLAG
Definition: dawg.h:89
uinT64 EDGE_RECORD
Definition: dawg.h:50
#define BACKWARD_EDGE
Definition: dawg.h:85
int flag_start_bit_
Definition: dawg.h:298
#define MARKER_FLAG
Definition: dawg.h:87
inT64 EDGE_INDEX
Definition: trie.h:32
bool tesseract::Trie::add_word_list ( const GenericVector< STRING > &  words,
const UNICHARSET unicharset 
)

Definition at line 336 of file trie.cpp.

337  {
338  for (int i = 0; i < words.size(); ++i) {
339  WERD_CHOICE word(words[i].string(), unicharset);
340  if (!word_in_dawg(word)) {
341  add_word_to_dawg(word);
342  if (!word_in_dawg(word)) {
343  tprintf("Error: word '%s' not in DAWG after adding it\n",
344  words[i].string());
345  return false;
346  }
347  }
348  }
349  return true;
350 }
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:70
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:178
bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE word,
const GenericVector< bool > *  repetitions 
)

Definition at line 178 of file trie.cpp.

179  {
180  if (word.length() <= 0) return false; // can't add empty words
181  if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length());
182  // Make sure the word does not contain invalid unchar ids.
183  for (int i = 0; i < word.length(); ++i) {
184  if (word.unichar_id(i) < 0 ||
185  word.unichar_id(i) >= unicharset_size_) return false;
186  }
187 
188  EDGE_RECORD *edge_ptr;
189  NODE_REF last_node = 0;
190  NODE_REF the_next_node;
191  bool marker_flag = false;
192  EDGE_INDEX edge_index;
193  int i;
194  inT32 still_finding_chars = true;
195  inT32 word_end = false;
196  bool add_failed = false;
197  bool found;
198 
199  if (debug_level_ > 1) word.print("\nAdding word: ");
200 
201  UNICHAR_ID unichar_id;
202  for (i = 0; i < word.length() - 1; ++i) {
203  unichar_id = word.unichar_id(i);
204  marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
205  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
206  if (still_finding_chars) {
207  found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
208  unichar_id, &edge_ptr, &edge_index);
209  if (found && debug_level_ > 1) {
210  tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
211  edge_index, last_node);
212  }
213  if (!found) {
214  still_finding_chars = false;
215  } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
216  // We hit the end of an existing word, but the new word is longer.
217  // In this case we have to disconnect the existing word from the
218  // backwards root node, mark the current position as end-of-word
219  // and add new nodes for the increased length. Disconnecting the
220  // existing word from the backwards root node requires a linear
221  // search, so it is much faster to add the longest words first,
222  // to avoid having to come here.
223  word_end = true;
224  still_finding_chars = false;
225  remove_edge(last_node, 0, word_end, unichar_id);
226  } else {
227  // We have to add a new branch here for the new word.
228  if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
229  last_node = next_node_from_edge_rec(*edge_ptr);
230  }
231  }
232  if (!still_finding_chars) {
233  the_next_node = new_dawg_node();
234  if (debug_level_ > 1)
235  tprintf("adding node " REFFORMAT "\n", the_next_node);
236  if (the_next_node == 0) {
237  add_failed = true;
238  break;
239  }
240  if (!add_new_edge(last_node, the_next_node,
241  marker_flag, word_end, unichar_id)) {
242  add_failed = true;
243  break;
244  }
245  word_end = false;
246  last_node = the_next_node;
247  }
248  }
249  the_next_node = 0;
250  unichar_id = word.unichar_id(i);
251  marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
252  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
253  if (still_finding_chars &&
254  edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
255  unichar_id, &edge_ptr, &edge_index)) {
256  // An extension of this word already exists in the trie, so we
257  // only have to add the ending flags in both directions.
258  add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
259  marker_flag, unichar_id);
260  } else {
261  // Add a link to node 0. All leaves connect to node 0 so the back links can
262  // be used in reduction to a dawg. This root backward node has one edge
263  // entry for every word, (except prefixes of longer words) so it is huge.
264  if (!add_failed &&
265  !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
266  add_failed = true;
267  }
268  if (add_failed) {
269  tprintf("Re-initializing document dictionary...\n");
270  clear();
271  return false;
272  } else {
273  return true;
274  }
275 }
int size() const
Definition: genericvector.h:72
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:228
int length() const
Definition: ratngs.h:300
#define tprintf(...)
Definition: tprintf.h:31
NODE_REF new_dawg_node()
Definition: trie.cpp:277
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:200
void clear()
Definition: trie.cpp:66
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:357
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:103
#define ASSERT_HOST(x)
Definition: errcode.h:84
#define FORWARD_EDGE
Definition: dawg.h:84
int unicharset_size_
Definition: dawg.h:297
uinT64 EDGE_RECORD
Definition: dawg.h:50
const UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:312
int debug_level_
Definition: dawg.h:304
int UNICHAR_ID
Definition: unichar.h:33
#define REFFORMAT
Definition: dawg.h:92
void print() const
Definition: ratngs.h:563
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
Definition: trie.cpp:161
#define NULL
Definition: host.h:144
inT64 NODE_REF
Definition: dawg.h:55
inT64 EDGE_INDEX
Definition: trie.h:32
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:382
int inT32
Definition: host.h:102
bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE word)
inline

Definition at line 270 of file trie.h.

270  {
271  return add_word_to_dawg(word, NULL);
272  }
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:178
#define NULL
Definition: host.h:144
bool tesseract::Trie::can_be_eliminated ( const EDGE_RECORD edge_rec)
inlineprotected

Definition at line 327 of file trie.h.

327  {
328  NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
329  return (node_ref != NO_EDGE &&
330  nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
331  }
int size() const
Definition: genericvector.h:72
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:200
TRIE_NODES nodes_
Definition: trie.h:420
inT64 NODE_REF
Definition: dawg.h:55
UNICHAR_ID tesseract::Trie::character_class_to_pattern ( char  ch)
protected

Definition at line 391 of file trie.cpp.

391  {
392  if (ch == 'c') {
393  return alpha_pattern_;
394  } else if (ch == 'd') {
395  return digit_pattern_;
396  } else if (ch == 'n') {
397  return alphanum_pattern_;
398  } else if (ch == 'p') {
399  return punc_pattern_;
400  } else if (ch == 'a') {
401  return lower_pattern_;
402  } else if (ch == 'A') {
403  return upper_pattern_;
404  } else {
405  return INVALID_UNICHAR_ID;
406  }
407 }
UNICHAR_ID alphanum_pattern_
Definition: trie.h:431
UNICHAR_ID digit_pattern_
Definition: trie.h:430
UNICHAR_ID lower_pattern_
Definition: trie.h:433
UNICHAR_ID upper_pattern_
Definition: trie.h:434
UNICHAR_ID punc_pattern_
Definition: trie.h:432
UNICHAR_ID alpha_pattern_
Definition: trie.h:429
void tesseract::Trie::clear ( )

Definition at line 66 of file trie.cpp.

66  {
68  nodes_.clear();
70  num_edges_ = 0;
71  new_dawg_node(); // Need to allocate node 0.
72 }
uinT64 num_edges_
Definition: trie.h:421
NODE_REF new_dawg_node()
Definition: trie.cpp:277
TRIE_NODES nodes_
Definition: trie.h:420
void delete_data_pointers()
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:425
bool tesseract::Trie::DeadEdge ( const EDGE_RECORD edge_rec) const
inline

Definition at line 157 of file trie.h.

157  {
158  return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
159  }
int unicharset_size_
Definition: dawg.h:297
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:217
EDGE_RECORD* tesseract::Trie::deref_edge_ref ( EDGE_REF  edge_ref) const
inlineprotected

Definition at line 292 of file trie.h.

292  {
293  int edge_index = static_cast<int>(
294  (edge_ref & letter_mask_) >> LETTER_START_BIT);
295  int node_index = static_cast<int>(
296  (edge_ref & deref_node_index_mask_) >> flag_start_bit_);
297  TRIE_NODE_RECORD *node_rec = nodes_[node_index];
298  return &(node_rec->forward_edges[edge_index]);
299  }
#define LETTER_START_BIT
Definition: dawg.h:90
uinT64 deref_node_index_mask_
Definition: trie.h:423
TRIE_NODES nodes_
Definition: trie.h:420
uinT64 letter_mask_
Definition: dawg.h:302
int flag_start_bit_
Definition: dawg.h:298
EDGE_VECTOR forward_edges
Definition: trie.h:49
EDGE_REF tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inlinevirtual

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

Definition at line 103 of file trie.h.

104  {
105  EDGE_RECORD *edge_ptr;
106  EDGE_INDEX edge_index;
107  if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
108  &edge_ptr, &edge_index)) return NO_EDGE;
109  return make_edge_ref(node_ref, edge_index);
110  }
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
Definition: trie.h:301
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:103
#define FORWARD_EDGE
Definition: dawg.h:84
uinT64 EDGE_RECORD
Definition: dawg.h:50
inT64 EDGE_INDEX
Definition: trie.h:32
bool tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
NODE_REF  next_node,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id,
EDGE_RECORD **  edge_ptr,
EDGE_INDEX edge_index 
) const
protected

Definition at line 74 of file trie.cpp.

76  {
77  if (debug_level_ == 3) {
78  tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
79  " direction %d word_end %d unichar_id %d, exploring node:\n",
80  node_ref, next_node, direction, word_end, unichar_id);
81  if (node_ref != NO_EDGE) {
82  print_node(node_ref, nodes_[node_ref]->forward_edges.size());
83  }
84  }
85  if (node_ref == NO_EDGE) return false;
86  assert(node_ref < nodes_.size());
87  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ?
88  nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges;
89  int vec_size = vec.size();
90  if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search
91  EDGE_INDEX start = 0;
92  EDGE_INDEX end = vec_size - 1;
93  EDGE_INDEX k;
94  int compare;
95  while (start <= end) {
96  k = (start + end) >> 1; // (start + end) / 2
97  compare = given_greater_than_edge_rec(next_node, word_end,
98  unichar_id, vec[k]);
99  if (compare == 0) { // given == vec[k]
100  *edge_ptr = &(vec[k]);
101  *edge_index = k;
102  return true;
103  } else if (compare == 1) { // given > vec[k]
104  start = k + 1;
105  } else { // given < vec[k]
106  end = k - 1;
107  }
108  }
109  } else { // linear search
110  for (int i = 0; i < vec_size; ++i) {
111  EDGE_RECORD &edge_rec = vec[i];
112  if (edge_rec_match(next_node, word_end, unichar_id,
113  next_node_from_edge_rec(edge_rec),
114  end_of_word_from_edge_rec(edge_rec),
115  unichar_id_from_edge_rec(edge_rec))) {
116  *edge_ptr = &(edge_rec);
117  *edge_index = i;
118  return true;
119  }
120  }
121  }
122  return false; // not found
123 }
int size() const
Definition: genericvector.h:72
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:213
void print_node(NODE_REF node, int max_num_edges) const
Definition: trie.cpp:713
#define tprintf(...)
Definition: tprintf.h:31
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:200
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
Definition: dawg.h:259
TRIE_NODES nodes_
Definition: trie.h:420
#define FORWARD_EDGE
Definition: dawg.h:84
uinT64 EDGE_RECORD
Definition: dawg.h:50
int debug_level_
Definition: dawg.h:304
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:217
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:238
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:132
#define REFFORMAT
Definition: dawg.h:92
inT64 EDGE_INDEX
Definition: trie.h:32
UNICHAR_ID tesseract::Trie::edge_letter ( EDGE_REF  edge_ref) const
inlinevirtual

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 147 of file trie.h.

147  {
148  if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
149  return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
150  }
uinT64 num_edges_
Definition: trie.h:421
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:292
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:217
bool tesseract::Trie::eliminate_redundant_edges ( NODE_REF  node,
const EDGE_RECORD edge1,
const EDGE_RECORD edge2 
)
protected

Definition at line 574 of file trie.cpp.

576  {
577  if (debug_level_ > 1) {
578  tprintf("\nCollapsing node %d:\n", node);
580  tprintf("Candidate edges: ");
581  print_edge_rec(edge1);
582  tprintf(", ");
583  print_edge_rec(edge2);
584  tprintf("\n\n");
585  }
586  NODE_REF next_node1 = next_node_from_edge_rec(edge1);
587  NODE_REF next_node2 = next_node_from_edge_rec(edge2);
588  TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
589  // Translate all edges going to/from next_node2 to go to/from next_node1.
590  EDGE_RECORD *edge_ptr = NULL;
591  EDGE_INDEX edge_index;
592  int i;
593  // The backward link in node to next_node2 will be zeroed out by the caller.
594  // Copy all the backward links in next_node2 to node next_node1
595  for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
596  const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
597  NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
598  UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
599  int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
600  bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
601  add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
602  curr_word_end, curr_unichar_id);
603  // Relocate the corresponding forward edge in curr_next_node
604  ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
605  curr_word_end, curr_unichar_id,
606  &edge_ptr, &edge_index));
607  set_next_node_in_edge_rec(edge_ptr, next_node1);
608  }
609  int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
610  next_node2_ptr->backward_edges.size());
611  if (debug_level_ > 1) {
612  tprintf("removed %d edges from node " REFFORMAT "\n",
613  next_node2_num_edges, next_node2);
614  }
615  next_node2_ptr->forward_edges.clear();
616  next_node2_ptr->backward_edges.clear();
617  num_edges_ -= next_node2_num_edges;
618  return true;
619 }
int size() const
Definition: genericvector.h:72
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:213
EDGE_VECTOR backward_edges
Definition: trie.h:50
uinT64 num_edges_
Definition: trie.h:421
void print_node(NODE_REF node, int max_num_edges) const
Definition: trie.cpp:713
#define tprintf(...)
Definition: tprintf.h:31
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:318
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:200
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:103
#define ASSERT_HOST(x)
Definition: errcode.h:84
TRIE_NODES nodes_
Definition: trie.h:420
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:125
#define FORWARD_EDGE
Definition: dawg.h:84
uinT64 EDGE_RECORD
Definition: dawg.h:50
#define BACKWARD_EDGE
Definition: dawg.h:85
int debug_level_
Definition: dawg.h:304
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:86
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:217
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
Definition: dawg.h:222
int UNICHAR_ID
Definition: unichar.h:33
#define REFFORMAT
Definition: dawg.h:92
EDGE_VECTOR forward_edges
Definition: trie.h:49
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:204
#define NULL
Definition: host.h:144
inT64 NODE_REF
Definition: dawg.h:55
inT64 EDGE_INDEX
Definition: trie.h:32
bool tesseract::Trie::end_of_word ( EDGE_REF  edge_ref) const
inlinevirtual

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

Definition at line 141 of file trie.h.

141  {
142  if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
143  return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
144  }
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:213
uinT64 num_edges_
Definition: trie.h:421
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:292
const char * tesseract::Trie::get_reverse_policy_name ( RTLReversePolicy  reverse_policy)
static

Definition at line 61 of file trie.cpp.

61  {
62  return RTLReversePolicyNames[reverse_policy];
63 }
const char *const RTLReversePolicyNames[]
Definition: trie.cpp:48
void tesseract::Trie::initialize_patterns ( UNICHARSET unicharset)

Definition at line 352 of file trie.cpp.

352  {
359  unicharset->unichar_insert(kPuncPatternUnicode);
365  initialized_patterns_ = true;
366  unicharset_size_ = unicharset->size();
367 }
static const char kAlphaPatternUnicode[]
Definition: trie.h:75
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
static const char kDigitPatternUnicode[]
Definition: trie.h:76
UNICHAR_ID alphanum_pattern_
Definition: trie.h:431
static const char kPuncPatternUnicode[]
Definition: trie.h:78
UNICHAR_ID digit_pattern_
Definition: trie.h:430
static const char kUpperPatternUnicode[]
Definition: trie.h:80
UNICHAR_ID lower_pattern_
Definition: trie.h:433
bool initialized_patterns_
Definition: trie.h:428
UNICHAR_ID upper_pattern_
Definition: trie.h:434
static const char kLowerPatternUnicode[]
Definition: trie.h:79
UNICHAR_ID punc_pattern_
Definition: trie.h:432
UNICHAR_ID alpha_pattern_
Definition: trie.h:429
int unicharset_size_
Definition: dawg.h:297
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
static const char kAlphanumPatternUnicode[]
Definition: trie.h:77
int size() const
Definition: unicharset.h:297
void tesseract::Trie::KillEdge ( EDGE_RECORD edge_rec) const
inline

Definition at line 153 of file trie.h.

153  {
154  *edge_rec &= ~letter_mask_;
155  *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
156  }
#define LETTER_START_BIT
Definition: dawg.h:90
uinT64 letter_mask_
Definition: dawg.h:302
int unicharset_size_
Definition: dawg.h:297
void tesseract::Trie::link_edge ( EDGE_RECORD edge,
NODE_REF  nxt,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Sets up this edge record to the requested values.

Definition at line 307 of file trie.h.

308  {
309  EDGE_RECORD flags = 0;
310  if (repeats) flags |= MARKER_FLAG;
311  if (word_end) flags |= WERD_END_FLAG;
312  if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
313  *edge = ((nxt << next_node_start_bit_) |
314  (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
315  (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
316  }
int next_node_start_bit_
Definition: dawg.h:299
#define LETTER_START_BIT
Definition: dawg.h:90
#define DIRECTION_FLAG
Definition: dawg.h:88
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
#define WERD_END_FLAG
Definition: dawg.h:89
uinT64 EDGE_RECORD
Definition: dawg.h:50
#define BACKWARD_EDGE
Definition: dawg.h:85
int flag_start_bit_
Definition: dawg.h:298
#define MARKER_FLAG
Definition: dawg.h:87
EDGE_REF tesseract::Trie::make_edge_ref ( NODE_REF  node_index,
EDGE_INDEX  edge_index 
) const
inlineprotected

Constructs EDGE_REF from the given node_index and edge_index.

Definition at line 301 of file trie.h.

302  {
303  return ((node_index << flag_start_bit_) |
304  (edge_index << LETTER_START_BIT));
305  }
#define LETTER_START_BIT
Definition: dawg.h:90
int flag_start_bit_
Definition: dawg.h:298
NODE_REF tesseract::Trie::new_dawg_node ( )
protected

Definition at line 277 of file trie.cpp.

277  {
278  TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD();
279  if (node == NULL) return 0; // failed to create new node
280  nodes_.push_back(node);
281  return nodes_.length() - 1;
282 }
int length() const
Definition: genericvector.h:79
int push_back(T object)
TRIE_NODES nodes_
Definition: trie.h:420
#define NULL
Definition: host.h:144
NODE_REF tesseract::Trie::next_node ( EDGE_REF  edge_ref) const
inlinevirtual

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 132 of file trie.h.

132  {
133  if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
134  return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
135  }
uinT64 num_edges_
Definition: trie.h:421
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:200
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:292
virtual EDGE_REF tesseract::Trie::pattern_loop_edge ( EDGE_REF  edge_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inlinevirtual

Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD, returns NO_EDGE otherwise.

Reimplemented from tesseract::Dawg.

Definition at line 248 of file trie.h.

250  {
251  if (edge_ref == NO_EDGE) return NO_EDGE;
252  EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
253  return (marker_flag_from_edge_rec(*edge_rec) &&
254  unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
255  word_end == end_of_word_from_edge_rec(*edge_rec)) ?
256  edge_ref : NO_EDGE;
257  }
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:213
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:292
uinT64 EDGE_RECORD
Definition: dawg.h:50
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:217
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:204
void tesseract::Trie::print_all ( const char *  msg,
int  max_num_edges 
)
inlineprotected

Definition at line 335 of file trie.h.

335  {
336  tprintf("\n__________________________\n%s\n", msg);
337  for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
338  tprintf("__________________________\n");
339  }
int size() const
Definition: genericvector.h:72
void print_node(NODE_REF node, int max_num_edges) const
Definition: trie.cpp:713
#define tprintf(...)
Definition: tprintf.h:31
TRIE_NODES nodes_
Definition: trie.h:420
void tesseract::Trie::print_edge_rec ( const EDGE_RECORD edge_rec) const
inlineprotected

Prints the given EDGE_RECORD.

Definition at line 318 of file trie.h.

318  {
319  tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
320  marker_flag_from_edge_rec(edge_rec) ? "R," : "",
321  (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
322  end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
323  unichar_id_from_edge_rec(edge_rec));
324  }
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:213
#define tprintf(...)
Definition: tprintf.h:31
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:200
#define FORWARD_EDGE
Definition: dawg.h:84
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:217
int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the direction flag of this edge.
Definition: dawg.h:208
#define REFFORMAT
Definition: dawg.h:92
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:204
void tesseract::Trie::print_node ( NODE_REF  node,
int  max_num_edges 
) const
virtual

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

Definition at line 713 of file trie.cpp.

713  {
714  if (node == NO_EDGE) return; // nothing to print
715  TRIE_NODE_RECORD *node_ptr = nodes_[node];
716  int num_fwd = node_ptr->forward_edges.size();
717  int num_bkw = node_ptr->backward_edges.size();
718  EDGE_VECTOR *vec;
719  for (int dir = 0; dir < 2; ++dir) {
720  if (dir == 0) {
721  vec = &(node_ptr->forward_edges);
722  tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
723  } else {
724  vec = &(node_ptr->backward_edges);
725  tprintf("\t");
726  }
727  int i;
728  for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
729  i < max_num_edges; ++i) {
730  if (DeadEdge((*vec)[i])) continue;
731  print_edge_rec((*vec)[i]);
732  tprintf(" ");
733  }
734  if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
735  tprintf("\n");
736  }
737 }
int size() const
Definition: genericvector.h:72
EDGE_VECTOR backward_edges
Definition: trie.h:50
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:157
#define tprintf(...)
Definition: tprintf.h:31
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:318
TRIE_NODES nodes_
Definition: trie.h:420
#define REFFORMAT
Definition: dawg.h:92
EDGE_VECTOR forward_edges
Definition: trie.h:49
bool tesseract::Trie::read_and_add_word_list ( const char *  filename,
const UNICHARSET unicharset,
Trie::RTLReversePolicy  reverse 
)

Definition at line 291 of file trie.cpp.

293  {
294  GenericVector<STRING> word_list;
295  if (!read_word_list(filename, unicharset, reverse_policy, &word_list))
296  return false;
297  word_list.sort(sort_strings_by_dec_length);
298  return add_word_list(word_list, unicharset);
299 }
bool read_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy, GenericVector< STRING > *words)
Definition: trie.cpp:301
bool add_word_list(const GenericVector< STRING > &words, const UNICHARSET &unicharset)
Definition: trie.cpp:336
bool tesseract::Trie::read_pattern_list ( const char *  filename,
const UNICHARSET unicharset 
)

Definition at line 409 of file trie.cpp.

410  {
411  if (!initialized_patterns_) {
412  tprintf("please call initialize_patterns() before read_pattern_list()\n");
413  return false;
414  }
415 
416  FILE *pattern_file = fopen(filename, "rb");
417  if (pattern_file == NULL) {
418  tprintf("Error opening pattern file %s\n", filename);
419  return false;
420  }
421 
422  int pattern_count = 0;
423  char string[CHARS_PER_LINE];
424  while (fgets(string, CHARS_PER_LINE, pattern_file) != NULL) {
425  chomp_string(string); // remove newline
426  // Parse the pattern and construct a unichar id vector.
427  // Record the number of repetitions of each unichar in the parallel vector.
428  WERD_CHOICE word(&unicharset);
429  GenericVector<bool> repetitions_vec;
430  const char *str_ptr = string;
431  int step = unicharset.step(str_ptr);
432  bool failed = false;
433  while (step > 0) {
434  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
435  if (step == 1 && *str_ptr == '\\') {
436  ++str_ptr;
437  if (*str_ptr == '\\') { // regular '\' unichar that was escaped
438  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
439  } else {
440  if (word.length() < kSaneNumConcreteChars) {
441  tprintf("Please provide at least %d concrete characters at the"
442  " beginning of the pattern\n", kSaneNumConcreteChars);
443  failed = true;
444  break;
445  }
446  // Parse character class from expression.
447  curr_unichar_id = character_class_to_pattern(*str_ptr);
448  }
449  } else {
450  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
451  }
452  if (curr_unichar_id == INVALID_UNICHAR_ID) {
453  failed = true;
454  break; // failed to parse this pattern
455  }
456  word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
457  repetitions_vec.push_back(false);
458  str_ptr += step;
459  step = unicharset.step(str_ptr);
460  // Check if there is a repetition pattern specified after this unichar.
461  if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
462  repetitions_vec[repetitions_vec.size()-1] = true;
463  str_ptr += 2;
464  step = unicharset.step(str_ptr);
465  }
466  }
467  if (failed) {
468  tprintf("Invalid user pattern %s\n", string);
469  continue;
470  }
471  // Insert the pattern into the trie.
472  if (debug_level_ > 2) {
473  tprintf("Inserting expanded user pattern %s\n",
474  word.debug_string().string());
475  }
476  if (!this->word_in_dawg(word)) {
477  this->add_word_to_dawg(word, &repetitions_vec);
478  if (!this->word_in_dawg(word)) {
479  tprintf("Error: failed to insert pattern '%s'\n", string);
480  }
481  }
482  ++pattern_count;
483  }
484  if (debug_level_) {
485  tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
486  }
487  fclose(pattern_file);
488  return true;
489 }
int size() const
Definition: genericvector.h:72
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
bool initialized_patterns_
Definition: trie.h:428
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:70
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:178
void chomp_string(char *str)
Definition: helpers.h:75
UNICHAR_ID character_class_to_pattern(char ch)
Definition: trie.cpp:391
int debug_level_
Definition: dawg.h:304
int UNICHAR_ID
Definition: unichar.h:33
int step(const char *str) const
Definition: unicharset.cpp:211
#define CHARS_PER_LINE
Definition: cutil.h:57
#define NULL
Definition: host.h:144
static const int kSaneNumConcreteChars
Definition: trie.h:71
bool tesseract::Trie::read_word_list ( const char *  filename,
const UNICHARSET unicharset,
Trie::RTLReversePolicy  reverse_policy,
GenericVector< STRING > *  words 
)

Definition at line 301 of file trie.cpp.

304  {
305  FILE *word_file;
306  char string[CHARS_PER_LINE];
307  int word_count = 0;
308 
309  word_file = fopen(filename, "rb");
310  if (word_file == NULL) return false;
311 
312  while (fgets(string, CHARS_PER_LINE, word_file) != NULL) {
313  chomp_string(string); // remove newline
314  WERD_CHOICE word(string, unicharset);
315  if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
316  word.has_rtl_unichar_id()) ||
317  reverse_policy == RRP_FORCE_REVERSE) {
318  word.reverse_and_mirror_unichar_ids();
319  }
320  ++word_count;
321  if (debug_level_ && word_count % 10000 == 0)
322  tprintf("Read %d words so far\n", word_count);
323  if (word.length() != 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
324  words->push_back(word.unichar_string());
325  } else if (debug_level_) {
326  tprintf("Skipping invalid word %s\n", string);
327  if (debug_level_ >= 3) word.print();
328  }
329  }
330  if (debug_level_)
331  tprintf("Read %d words total.\n", word_count);
332  fclose(word_file);
333  return true;
334 }
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
void chomp_string(char *str)
Definition: helpers.h:75
int debug_level_
Definition: dawg.h:304
#define CHARS_PER_LINE
Definition: cutil.h:57
#define NULL
Definition: host.h:144
bool tesseract::Trie::reduce_lettered_edges ( EDGE_INDEX  edge_index,
UNICHAR_ID  unichar_id,
NODE_REF  node,
EDGE_VECTOR backward_edges,
NODE_MARKER  reduced_nodes 
)
protected

Definition at line 621 of file trie.cpp.

625  {
626  if (debug_level_ > 1)
627  tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
628  // Compare each of the edge pairs with the given unichar_id.
629  bool did_something = false;
630  for (int i = edge_index; i < backward_edges->size() - 1; ++i) {
631  // Find the first edge that can be eliminated.
632  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
633  while (i < backward_edges->size()) {
634  if (!DeadEdge((*backward_edges)[i])) {
635  curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
636  if (curr_unichar_id != unichar_id) return did_something;
637  if (can_be_eliminated((*backward_edges)[i])) break;
638  }
639  ++i;
640  }
641  if (i == backward_edges->size()) break;
642  const EDGE_RECORD &edge_rec = (*backward_edges)[i];
643  // Compare it to the rest of the edges with the given unichar_id.
644  for (int j = i + 1; j < backward_edges->size(); ++j) {
645  const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
646  if (DeadEdge(next_edge_rec)) continue;
647  UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
648  if (next_id != unichar_id) break;
649  if (end_of_word_from_edge_rec(next_edge_rec) ==
650  end_of_word_from_edge_rec(edge_rec) &&
651  can_be_eliminated(next_edge_rec) &&
652  eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
653  reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0;
654  did_something = true;
655  KillEdge(&(*backward_edges)[j]);
656  }
657  }
658  }
659  return did_something;
660 }
int size() const
Definition: genericvector.h:72
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:213
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
Definition: trie.cpp:574
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:157
#define tprintf(...)
Definition: tprintf.h:31
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:200
uinT64 EDGE_RECORD
Definition: dawg.h:50
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:153
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:327
int debug_level_
Definition: dawg.h:304
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:217
int UNICHAR_ID
Definition: unichar.h:33
#define REFFORMAT
Definition: dawg.h:92
void tesseract::Trie::reduce_node_input ( NODE_REF  node,
NODE_MARKER  reduced_nodes 
)
protected

Eliminates any redundant edges from this node in the Trie.

Definition at line 676 of file trie.cpp.

677  {
678  EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
679  sort_edges(&backward_edges);
680  if (debug_level_ > 1) {
681  tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
683  }
684 
685  EDGE_INDEX edge_index = 0;
686  while (edge_index < backward_edges.size()) {
687  if (DeadEdge(backward_edges[edge_index])) continue;
688  UNICHAR_ID unichar_id =
689  unichar_id_from_edge_rec(backward_edges[edge_index]);
690  while (reduce_lettered_edges(edge_index, unichar_id, node,
691  &backward_edges, reduced_nodes));
692  while (++edge_index < backward_edges.size()) {
693  UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
694  if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) break;
695  }
696  }
697  reduced_nodes[node] = true; // mark as reduced
698 
699  if (debug_level_ > 1) {
700  tprintf("Node " REFFORMAT " after reduction:\n", node);
702  }
703 
704  for (int i = 0; i < backward_edges.size(); ++i) {
705  if (DeadEdge(backward_edges[i])) continue;
706  NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]);
707  if (next_node != 0 && !reduced_nodes[next_node]) {
708  reduce_node_input(next_node, reduced_nodes);
709  }
710  }
711 }
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
Definition: trie.cpp:621
int size() const
Definition: genericvector.h:72
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:157
void print_node(NODE_REF node, int max_num_edges) const
Definition: trie.cpp:713
#define tprintf(...)
Definition: tprintf.h:31
void sort_edges(EDGE_VECTOR *edges)
Definition: trie.cpp:662
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:200
TRIE_NODES nodes_
Definition: trie.h:420
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes)
Definition: trie.cpp:676
int debug_level_
Definition: dawg.h:304
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:86
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:217
int UNICHAR_ID
Definition: unichar.h:33
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:132
#define REFFORMAT
Definition: dawg.h:92
inT64 NODE_REF
Definition: dawg.h:55
inT64 EDGE_INDEX
Definition: trie.h:32
void tesseract::Trie::remove_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Definition at line 382 of file trie.h.

383  {
384  remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
385  remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
386  }
#define FORWARD_EDGE
Definition: dawg.h:84
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:491
#define BACKWARD_EDGE
Definition: dawg.h:85
void tesseract::Trie::remove_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 491 of file trie.cpp.

492  {
493  EDGE_RECORD *edge_ptr = NULL;
494  EDGE_INDEX edge_index = 0;
495  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
496  unichar_id, &edge_ptr, &edge_index));
497  if (debug_level_ > 1) {
498  tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
499  print_edge_rec(*edge_ptr);
500  tprintf("\n");
501  }
502  if (direction == FORWARD_EDGE) {
503  nodes_[node1]->forward_edges.remove(edge_index);
504  } else if (node1 == 0) {
505  KillEdge(&nodes_[node1]->backward_edges[edge_index]);
506  root_back_freelist_.push_back(edge_index);
507  } else {
508  nodes_[node1]->backward_edges.remove(edge_index);
509  }
510  --num_edges_;
511 }
uinT64 num_edges_
Definition: trie.h:421
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:318
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:103
#define ASSERT_HOST(x)
Definition: errcode.h:84
TRIE_NODES nodes_
Definition: trie.h:420
#define FORWARD_EDGE
Definition: dawg.h:84
uinT64 EDGE_RECORD
Definition: dawg.h:50
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:153
int debug_level_
Definition: dawg.h:304
#define REFFORMAT
Definition: dawg.h:92
void remove(int index)
#define NULL
Definition: host.h:144
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:425
inT64 EDGE_INDEX
Definition: trie.h:32
void tesseract::Trie::sort_edges ( EDGE_VECTOR edges)
protected

Order num_edges of consequtive EDGE_RECORDS in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since number of edges in each node is usually quite small, selection sort is used.

Definition at line 662 of file trie.cpp.

662  {
663  int num_edges = edges->size();
664  if (num_edges <= 1) return;
666  sort_vec.reserve(num_edges);
667  for (int i = 0; i < num_edges; ++i) {
668  sort_vec.push_back(KDPairInc<UNICHAR_ID, EDGE_RECORD>(
669  unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
670  }
671  sort_vec.sort();
672  for (int i = 0; i < num_edges; ++i)
673  (*edges)[i] = sort_vec[i].data;
674 }
int size() const
Definition: genericvector.h:72
int push_back(T object)
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:217
void reserve(int size)
SquishedDawg * tesseract::Trie::trie_to_dawg ( )

Definition at line 526 of file trie.cpp.

526  {
527  root_back_freelist_.clear(); // Will be invalided by trie_to_dawg.
528  if (debug_level_ > 2) {
529  print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
530  }
531  NODE_MARKER reduced_nodes = new bool[nodes_.size()];
532  for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0;
533  this->reduce_node_input(0, reduced_nodes);
534  delete[] reduced_nodes;
535 
536  if (debug_level_ > 2) {
537  print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
538  }
539  // Build a translation map from node indices in nodes_ vector to
540  // their target indices in EDGE_ARRAY.
541  NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1];
542  int i, j;
543  node_ref_map[0] = 0;
544  for (i = 0; i < nodes_.size(); ++i) {
545  node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
546  }
547  int num_forward_edges = node_ref_map[i];
548 
549  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
550  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
551  EDGE_ARRAY edge_array =
552  (EDGE_ARRAY)memalloc(num_forward_edges * sizeof(EDGE_RECORD));
553  EDGE_ARRAY edge_array_ptr = edge_array;
554  for (i = 0; i < nodes_.size(); ++i) {
555  TRIE_NODE_RECORD *node_ptr = nodes_[i];
556  int end = node_ptr->forward_edges.size();
557  for (j = 0; j < end; ++j) {
558  EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
559  NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
560  ASSERT_HOST(node_ref < nodes_.size());
561  UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
562  link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
563  end_of_word_from_edge_rec(edge_rec), unichar_id);
564  if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
565  ++edge_array_ptr;
566  }
567  }
568  delete[] node_ref_map;
569 
570  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
572 }
int size() const
Definition: genericvector.h:72
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:213
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:228
EDGE_RECORD * EDGE_ARRAY
Definition: dawg.h:53
bool * NODE_MARKER
Definition: trie.h:45
int * memalloc(int size)
Definition: freelist.cpp:22
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:200
#define ASSERT_HOST(x)
Definition: errcode.h:84
TRIE_NODES nodes_
Definition: trie.h:420
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:335
#define FORWARD_EDGE
Definition: dawg.h:84
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes)
Definition: trie.cpp:676
int unicharset_size_
Definition: dawg.h:297
uinT64 EDGE_RECORD
Definition: dawg.h:50
int debug_level_
Definition: dawg.h:304
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:86
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:217
int UNICHAR_ID
Definition: unichar.h:33
STRING lang_
Definition: dawg.h:290
EDGE_VECTOR forward_edges
Definition: trie.h:49
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:425
DawgType type_
Definition: dawg.h:289
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:307
inT64 NODE_REF
Definition: dawg.h:55
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:292
void tesseract::Trie::unichar_id_to_patterns ( UNICHAR_ID  unichar_id,
const UNICHARSET unicharset,
GenericVector< UNICHAR_ID > *  vec 
) const
virtual

Fills vec with unichar ids that represent the character classes of the given unichar_id.

Reimplemented from tesseract::Dawg.

Definition at line 369 of file trie.cpp.

371  {
372  bool is_alpha = unicharset.get_isalpha(unichar_id);
373  if (is_alpha) {
376  if (unicharset.get_islower(unichar_id)) {
378  } else if (unicharset.get_isupper(unichar_id)) {
380  }
381  }
382  if (unicharset.get_isdigit(unichar_id)) {
384  if (!is_alpha) vec->push_back(alphanum_pattern_);
385  }
386  if (unicharset.get_ispunctuation(unichar_id)) {
387  vec->push_back(punc_pattern_);
388  }
389 }
UNICHAR_ID alphanum_pattern_
Definition: trie.h:431
int push_back(T object)
UNICHAR_ID digit_pattern_
Definition: trie.h:430
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
UNICHAR_ID lower_pattern_
Definition: trie.h:433
UNICHAR_ID upper_pattern_
Definition: trie.h:434
UNICHAR_ID punc_pattern_
Definition: trie.h:432
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:470
UNICHAR_ID alpha_pattern_
Definition: trie.h:429
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:477
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:449
void tesseract::Trie::unichar_ids_of ( NODE_REF  node,
NodeChildVector vec,
bool  word_end 
) const
inlinevirtual

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.

Definition at line 116 of file trie.h.

117  {
118  const EDGE_VECTOR &forward_edges =
119  nodes_[static_cast<int>(node)]->forward_edges;
120  for (int i = 0; i < forward_edges.size(); ++i) {
121  if (!word_end || end_of_word_from_edge_rec(forward_edges[i])) {
122  vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
123  make_edge_ref(node, i)));
124  }
125  }
126  }
int size() const
Definition: genericvector.h:72
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:213
int push_back(T object)
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
Definition: trie.h:301
TRIE_NODES nodes_
Definition: trie.h:420
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:217

Member Data Documentation

UNICHAR_ID tesseract::Trie::alpha_pattern_
protected

Definition at line 429 of file trie.h.

UNICHAR_ID tesseract::Trie::alphanum_pattern_
protected

Definition at line 431 of file trie.h.

uinT64 tesseract::Trie::deref_direction_mask_
protected

Definition at line 422 of file trie.h.

uinT64 tesseract::Trie::deref_node_index_mask_
protected

Definition at line 423 of file trie.h.

UNICHAR_ID tesseract::Trie::digit_pattern_
protected

Definition at line 430 of file trie.h.

bool tesseract::Trie::initialized_patterns_
protected

Definition at line 428 of file trie.h.

const char tesseract::Trie::kAlphanumPatternUnicode = "\u2002"
static

Definition at line 77 of file trie.h.

const char tesseract::Trie::kAlphaPatternUnicode = "\u2000"
static

Definition at line 75 of file trie.h.

const char tesseract::Trie::kDigitPatternUnicode = "\u2001"
static

Definition at line 76 of file trie.h.

const char tesseract::Trie::kLowerPatternUnicode = "\u2004"
static

Definition at line 79 of file trie.h.

const char tesseract::Trie::kPuncPatternUnicode = "\u2003"
static

Definition at line 78 of file trie.h.

const int tesseract::Trie::kSaneNumConcreteChars = 0
static

Definition at line 71 of file trie.h.

const char tesseract::Trie::kUpperPatternUnicode = "\u2005"
static

Definition at line 80 of file trie.h.

UNICHAR_ID tesseract::Trie::lower_pattern_
protected

Definition at line 433 of file trie.h.

TRIE_NODES tesseract::Trie::nodes_
protected

Definition at line 420 of file trie.h.

uinT64 tesseract::Trie::num_edges_
protected

Definition at line 421 of file trie.h.

UNICHAR_ID tesseract::Trie::punc_pattern_
protected

Definition at line 432 of file trie.h.

GenericVector<EDGE_INDEX> tesseract::Trie::root_back_freelist_
protected

Definition at line 425 of file trie.h.

UNICHAR_ID tesseract::Trie::upper_pattern_
protected

Definition at line 434 of file trie.h.


The documentation for this class was generated from the following files: