tesseract  4.00.00dev
tesseract::Trie Class Reference

#include <trie.h>

Inheritance diagram for tesseract::Trie:
tesseract::Dawg

Public Types

enum  RTLReversePolicy { RRP_DO_NO_REVERSE, RRP_REVERSE_IF_HAS_RTL, RRP_FORCE_REVERSE }
 

Public Member Functions

 Trie (DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
 
virtual ~Trie ()
 
void clear ()
 
EDGE_REF edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
 
void unichar_ids_of (NODE_REF node, NodeChildVector *vec, bool word_end) const
 
NODE_REF next_node (EDGE_REF edge_ref) const
 
bool end_of_word (EDGE_REF edge_ref) const
 
UNICHAR_ID edge_letter (EDGE_REF edge_ref) const
 
void KillEdge (EDGE_RECORD *edge_rec) const
 
bool DeadEdge (const EDGE_RECORD &edge_rec) const
 
void print_node (NODE_REF node, int max_num_edges) const
 
SquishedDawgtrie_to_dawg ()
 
bool read_and_add_word_list (const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
 
bool read_word_list (const char *filename, GenericVector< STRING > *words)
 
bool add_word_list (const GenericVector< STRING > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
 
bool read_pattern_list (const char *filename, const UNICHARSET &unicharset)
 
void initialize_patterns (UNICHARSET *unicharset)
 
void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
 
virtual EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
 
bool add_word_to_dawg (const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
 
bool add_word_to_dawg (const WERD_CHOICE &word)
 
- Public Member Functions inherited from tesseract::Dawg
DawgType type () const
 
const STRINGlang () const
 
PermuterType permuter () const
 
virtual ~Dawg ()
 
bool word_in_dawg (const WERD_CHOICE &word) const
 Returns true if the given word is in the Dawg. More...
 
bool prefix_in_dawg (const WERD_CHOICE &prefix, bool requires_complete) const
 
int check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
 
void iterate_words (const UNICHARSET &unicharset, TessCallback1< const WERD_CHOICE *> *cb) const
 
void iterate_words (const UNICHARSET &unicharset, TessCallback1< const char *> *cb) const
 

Static Public Member Functions

static const char * get_reverse_policy_name (RTLReversePolicy reverse_policy)
 

Static Public Attributes

static const int kSaneNumConcreteChars = 0
 
static const char kAlphaPatternUnicode [] = "\u2000"
 
static const char kDigitPatternUnicode [] = "\u2001"
 
static const char kAlphanumPatternUnicode [] = "\u2002"
 
static const char kPuncPatternUnicode [] = "\u2003"
 
static const char kLowerPatternUnicode [] = "\u2004"
 
static const char kUpperPatternUnicode [] = "\u2005"
 
- Static Public Attributes inherited from tesseract::Dawg
static const inT16 kDawgMagicNumber = 42
 Magic number to determine endianness when reading the Dawg from file. More...
 
static const UNICHAR_ID kPatternUnicharID = 0
 

Protected Member Functions

EDGE_RECORDderef_edge_ref (EDGE_REF edge_ref) const
 
EDGE_REF make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const
 
void link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
 
void print_edge_rec (const EDGE_RECORD &edge_rec) const
 
bool can_be_eliminated (const EDGE_RECORD &edge_rec)
 
void print_all (const char *msg, int max_num_edges)
 
bool edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const
 
bool add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
 
bool add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
 
void add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
 
NODE_REF new_dawg_node ()
 
void remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
 
void remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
 
bool eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
 
bool reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
 
void sort_edges (EDGE_VECTOR *edges)
 
void reduce_node_input (NODE_REF node, NODE_MARKER reduced_nodes)
 
UNICHAR_ID character_class_to_pattern (char ch)
 
- Protected Member Functions inherited from tesseract::Dawg
 Dawg (DawgType type, const STRING &lang, PermuterType perm, int debug_level)
 
NODE_REF next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the next node visited by following this edge. More...
 
bool marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the marker flag of this edge. More...
 
int direction_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the direction flag of this edge. More...
 
bool end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns true if this edge marks the end of a word. More...
 
UNICHAR_ID unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns UNICHAR_ID recorded in this edge. More...
 
void set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value)
 Sets the next node link for this edge in the Dawg. More...
 
void set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec)
 Sets this edge record to be the last one in a sequence of edges. More...
 
int given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
 
bool edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
 
void init (int unicharset_size)
 
bool match_words (WERD_CHOICE *word, inT32 index, NODE_REF node, UNICHAR_ID wildcard) const
 
void iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const WERD_CHOICE *> *cb) const
 

Protected Attributes

TRIE_NODES nodes_
 
uinT64 num_edges_
 
uinT64 deref_direction_mask_
 
uinT64 deref_node_index_mask_
 
GenericVector< EDGE_INDEXroot_back_freelist_
 
bool initialized_patterns_
 
UNICHAR_ID alpha_pattern_
 
UNICHAR_ID digit_pattern_
 
UNICHAR_ID alphanum_pattern_
 
UNICHAR_ID punc_pattern_
 
UNICHAR_ID lower_pattern_
 
UNICHAR_ID upper_pattern_
 
- Protected Attributes inherited from tesseract::Dawg
DawgType type_
 
STRING lang_
 
PermuterType perm_
 Permuter code that should be used if the word is found in this Dawg. More...
 
int unicharset_size_
 
int flag_start_bit_
 
int next_node_start_bit_
 
uinT64 next_node_mask_
 
uinT64 flags_mask_
 
uinT64 letter_mask_
 
int debug_level_
 

Detailed Description

Concrete class for Trie data structure that allows to store a list of words (extends Dawg base class) as well as dynamically add new words. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward and backward edges.

Definition at line 62 of file trie.h.

Member Enumeration Documentation

◆ RTLReversePolicy

Enumerator
RRP_DO_NO_REVERSE 
RRP_REVERSE_IF_HAS_RTL 
RRP_FORCE_REVERSE 

Definition at line 64 of file trie.h.

Constructor & Destructor Documentation

◆ Trie()

tesseract::Trie::Trie ( DawgType  type,
const STRING lang,
PermuterType  perm,
int  unicharset_size,
int  debug_level 
)
inline

Definition at line 89 of file trie.h.

91  : Dawg(type, lang, perm, debug_level) {
92  init(unicharset_size);
93  num_edges_ = 0;
95  new_dawg_node(); // need to allocate node 0
96  initialized_patterns_ = false;
97  }
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:205
DawgType type() const
Definition: dawg.h:128
uinT64 num_edges_
Definition: trie.h:420
void init(int unicharset_size)
Definition: dawg.cpp:176
uinT64 letter_mask_
Definition: dawg.h:314
uinT64 deref_node_index_mask_
Definition: trie.h:422
bool initialized_patterns_
Definition: trie.h:427
NODE_REF new_dawg_node()
Definition: trie.cpp:276

◆ ~Trie()

virtual tesseract::Trie::~Trie ( )
inlinevirtual

Definition at line 98 of file trie.h.

TRIE_NODES nodes_
Definition: trie.h:419
void delete_data_pointers()

Member Function Documentation

◆ add_edge_linkage()

bool tesseract::Trie::add_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 124 of file trie.cpp.

126  {
127  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
128  &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
129  int search_index;
130  if (node1 == 0 && direction == FORWARD_EDGE) {
131  search_index = 0; // find the index to make the add sorted
132  while (search_index < vec->size() &&
133  given_greater_than_edge_rec(node2, word_end, unichar_id,
134  (*vec)[search_index]) == 1) {
135  search_index++;
136  }
137  } else {
138  search_index = vec->size(); // add is unsorted, so index does not matter
139  }
140  EDGE_RECORD edge_rec;
141  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
142  if (node1 == 0 && direction == BACKWARD_EDGE &&
144  EDGE_INDEX edge_index = root_back_freelist_.pop_back();
145  (*vec)[edge_index] = edge_rec;
146  } else if (search_index < vec->size()) {
147  vec->insert(edge_rec, search_index);
148  } else {
149  vec->push_back(edge_rec);
150  }
151  if (debug_level_ > 1) {
152  tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
153  print_edge_rec(edge_rec);
154  tprintf("\n");
155  }
156  num_edges_++;
157  return true;
158 }
bool empty() const
Definition: genericvector.h:91
#define REFFORMAT
Definition: dawg.h:93
#define FORWARD_EDGE
Definition: dawg.h:85
TRIE_NODES nodes_
Definition: trie.h:419
int size() const
Definition: genericvector.h:72
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:424
#define tprintf(...)
Definition: tprintf.h:31
inT64 EDGE_INDEX
Definition: trie.h:32
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:306
#define BACKWARD_EDGE
Definition: dawg.h:86
int push_back(T object)
uinT64 num_edges_
Definition: trie.h:420
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:251
uinT64 EDGE_RECORD
Definition: dawg.h:51
int debug_level_
Definition: dawg.h:316
void insert(T t, int index)
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:317

◆ add_new_edge()

bool tesseract::Trie::add_new_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Definition at line 356 of file trie.h.

357  {
358  return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
359  word_end, unichar_id) &&
360  add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
361  word_end, unichar_id));
362  }
#define FORWARD_EDGE
Definition: dawg.h:85
#define BACKWARD_EDGE
Definition: dawg.h:86
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:124

◆ add_word_ending()

void tesseract::Trie::add_word_ending ( EDGE_RECORD edge,
NODE_REF  the_next_node,
bool  repeats,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 160 of file trie.cpp.

163  {
164  EDGE_RECORD *back_edge_ptr;
165  EDGE_INDEX back_edge_index;
166  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
167  unichar_id, &back_edge_ptr, &back_edge_index));
168  if (marker_flag) {
169  *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
170  *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
171  }
172  // Mark both directions as end of word.
173  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
174  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
175 }
#define WERD_END_FLAG
Definition: dawg.h:90
inT64 EDGE_INDEX
Definition: trie.h:32
#define BACKWARD_EDGE
Definition: dawg.h:86
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:104
#define ASSERT_HOST(x)
Definition: errcode.h:84
uinT64 EDGE_RECORD
Definition: dawg.h:51
#define MARKER_FLAG
Definition: dawg.h:88
int flag_start_bit_
Definition: dawg.h:310

◆ add_word_list()

bool tesseract::Trie::add_word_list ( const GenericVector< STRING > &  words,
const UNICHARSET unicharset,
Trie::RTLReversePolicy  reverse_policy 
)

Definition at line 321 of file trie.cpp.

323  {
324  for (int i = 0; i < words.size(); ++i) {
325  WERD_CHOICE word(words[i].string(), unicharset);
326  if (word.length() == 0 || word.contains_unichar_id(INVALID_UNICHAR_ID))
327  continue;
328  if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
329  word.has_rtl_unichar_id()) ||
330  reverse_policy == RRP_FORCE_REVERSE) {
331  word.reverse_and_mirror_unichar_ids();
332  }
333  if (!word_in_dawg(word)) {
334  add_word_to_dawg(word);
335  if (!word_in_dawg(word)) {
336  tprintf("Error: word '%s' not in DAWG after adding it\n",
337  words[i].string());
338  return false;
339  }
340  }
341  }
342  return true;
343 }
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:177
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:69

◆ add_word_to_dawg() [1/2]

bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE word,
const GenericVector< bool > *  repetitions 
)

Definition at line 177 of file trie.cpp.

178  {
179  if (word.length() <= 0) return false; // can't add empty words
180  if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length());
181  // Make sure the word does not contain invalid unchar ids.
182  for (int i = 0; i < word.length(); ++i) {
183  if (word.unichar_id(i) < 0 ||
184  word.unichar_id(i) >= unicharset_size_) return false;
185  }
186 
187  EDGE_RECORD *edge_ptr;
188  NODE_REF last_node = 0;
189  NODE_REF the_next_node;
190  bool marker_flag = false;
191  EDGE_INDEX edge_index;
192  int i;
193  inT32 still_finding_chars = true;
194  inT32 word_end = false;
195  bool add_failed = false;
196  bool found;
197 
198  if (debug_level_ > 1) word.print("\nAdding word: ");
199 
200  UNICHAR_ID unichar_id;
201  for (i = 0; i < word.length() - 1; ++i) {
202  unichar_id = word.unichar_id(i);
203  marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
204  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
205  if (still_finding_chars) {
206  found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
207  unichar_id, &edge_ptr, &edge_index);
208  if (found && debug_level_ > 1) {
209  tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
210  edge_index, last_node);
211  }
212  if (!found) {
213  still_finding_chars = false;
214  } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
215  // We hit the end of an existing word, but the new word is longer.
216  // In this case we have to disconnect the existing word from the
217  // backwards root node, mark the current position as end-of-word
218  // and add new nodes for the increased length. Disconnecting the
219  // existing word from the backwards root node requires a linear
220  // search, so it is much faster to add the longest words first,
221  // to avoid having to come here.
222  word_end = true;
223  still_finding_chars = false;
224  remove_edge(last_node, 0, word_end, unichar_id);
225  } else {
226  // We have to add a new branch here for the new word.
227  if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
228  last_node = next_node_from_edge_rec(*edge_ptr);
229  }
230  }
231  if (!still_finding_chars) {
232  the_next_node = new_dawg_node();
233  if (debug_level_ > 1)
234  tprintf("adding node " REFFORMAT "\n", the_next_node);
235  if (the_next_node == 0) {
236  add_failed = true;
237  break;
238  }
239  if (!add_new_edge(last_node, the_next_node,
240  marker_flag, word_end, unichar_id)) {
241  add_failed = true;
242  break;
243  }
244  word_end = false;
245  last_node = the_next_node;
246  }
247  }
248  the_next_node = 0;
249  unichar_id = word.unichar_id(i);
250  marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
251  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
252  if (still_finding_chars &&
253  edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
254  unichar_id, &edge_ptr, &edge_index)) {
255  // An extension of this word already exists in the trie, so we
256  // only have to add the ending flags in both directions.
257  add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
258  marker_flag, unichar_id);
259  } else {
260  // Add a link to node 0. All leaves connect to node 0 so the back links can
261  // be used in reduction to a dawg. This root backward node has one edge
262  // entry for every word, (except prefixes of longer words) so it is huge.
263  if (!add_failed &&
264  !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
265  add_failed = true;
266  }
267  if (add_failed) {
268  tprintf("Re-initializing document dictionary...\n");
269  clear();
270  return false;
271  } else {
272  return true;
273  }
274 }
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:381
#define REFFORMAT
Definition: dawg.h:93
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213
int unicharset_size_
Definition: dawg.h:309
#define FORWARD_EDGE
Definition: dawg.h:85
inT64 NODE_REF
Definition: dawg.h:56
int size() const
Definition: genericvector.h:72
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:311
void print() const
Definition: ratngs.h:576
void clear()
Definition: trie.cpp:65
#define tprintf(...)
Definition: tprintf.h:31
inT64 EDGE_INDEX
Definition: trie.h:32
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:241
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:104
int32_t inT32
Definition: host.h:38
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
Definition: trie.cpp:160
#define ASSERT_HOST(x)
Definition: errcode.h:84
uinT64 EDGE_RECORD
Definition: dawg.h:51
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:356
int debug_level_
Definition: dawg.h:316
int length() const
Definition: ratngs.h:299
NODE_REF new_dawg_node()
Definition: trie.cpp:276
int UNICHAR_ID
Definition: unichar.h:35

◆ add_word_to_dawg() [2/2]

bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE word)
inline

Definition at line 269 of file trie.h.

269  {
270  return add_word_to_dawg(word, NULL);
271  }
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:177

◆ can_be_eliminated()

bool tesseract::Trie::can_be_eliminated ( const EDGE_RECORD edge_rec)
inlineprotected

Definition at line 326 of file trie.h.

326  {
327  NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
328  return (node_ref != NO_EDGE &&
329  nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
330  }
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213
TRIE_NODES nodes_
Definition: trie.h:419
inT64 NODE_REF
Definition: dawg.h:56
int size() const
Definition: genericvector.h:72

◆ character_class_to_pattern()

UNICHAR_ID tesseract::Trie::character_class_to_pattern ( char  ch)
protected

Definition at line 384 of file trie.cpp.

384  {
385  if (ch == 'c') {
386  return alpha_pattern_;
387  } else if (ch == 'd') {
388  return digit_pattern_;
389  } else if (ch == 'n') {
390  return alphanum_pattern_;
391  } else if (ch == 'p') {
392  return punc_pattern_;
393  } else if (ch == 'a') {
394  return lower_pattern_;
395  } else if (ch == 'A') {
396  return upper_pattern_;
397  } else {
398  return INVALID_UNICHAR_ID;
399  }
400 }
UNICHAR_ID alphanum_pattern_
Definition: trie.h:430
UNICHAR_ID punc_pattern_
Definition: trie.h:431
UNICHAR_ID alpha_pattern_
Definition: trie.h:428
UNICHAR_ID lower_pattern_
Definition: trie.h:432
UNICHAR_ID upper_pattern_
Definition: trie.h:433
UNICHAR_ID digit_pattern_
Definition: trie.h:429

◆ clear()

void tesseract::Trie::clear ( )

Definition at line 65 of file trie.cpp.

65  {
67  nodes_.clear();
69  num_edges_ = 0;
70  new_dawg_node(); // Need to allocate node 0.
71 }
TRIE_NODES nodes_
Definition: trie.h:419
void delete_data_pointers()
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:424
uinT64 num_edges_
Definition: trie.h:420
NODE_REF new_dawg_node()
Definition: trie.cpp:276

◆ DeadEdge()

bool tesseract::Trie::DeadEdge ( const EDGE_RECORD edge_rec) const
inline

Definition at line 158 of file trie.h.

158  {
159  return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
160  }
int unicharset_size_
Definition: dawg.h:309
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230

◆ deref_edge_ref()

EDGE_RECORD* tesseract::Trie::deref_edge_ref ( EDGE_REF  edge_ref) const
inlineprotected

Definition at line 291 of file trie.h.

291  {
292  int edge_index = static_cast<int>(
293  (edge_ref & letter_mask_) >> LETTER_START_BIT);
294  int node_index = static_cast<int>(
295  (edge_ref & deref_node_index_mask_) >> flag_start_bit_);
296  TRIE_NODE_RECORD *node_rec = nodes_[node_index];
297  return &(node_rec->forward_edges[edge_index]);
298  }
TRIE_NODES nodes_
Definition: trie.h:419
EDGE_VECTOR forward_edges
Definition: trie.h:49
uinT64 letter_mask_
Definition: dawg.h:314
#define LETTER_START_BIT
Definition: dawg.h:91
uinT64 deref_node_index_mask_
Definition: trie.h:422
int flag_start_bit_
Definition: dawg.h:310

◆ edge_char_of() [1/2]

EDGE_REF tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inlinevirtual

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

Definition at line 104 of file trie.h.

105  {
106  EDGE_RECORD *edge_ptr;
107  EDGE_INDEX edge_index;
108  if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
109  &edge_ptr, &edge_index)) return NO_EDGE;
110  return make_edge_ref(node_ref, edge_index);
111  }
#define FORWARD_EDGE
Definition: dawg.h:85
inT64 EDGE_INDEX
Definition: trie.h:32
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:104
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
Definition: trie.h:300
uinT64 EDGE_RECORD
Definition: dawg.h:51

◆ edge_char_of() [2/2]

bool tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
NODE_REF  next_node,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id,
EDGE_RECORD **  edge_ptr,
EDGE_INDEX edge_index 
) const
protected

Definition at line 73 of file trie.cpp.

75  {
76  if (debug_level_ == 3) {
77  tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
78  " direction %d word_end %d unichar_id %d, exploring node:\n",
79  node_ref, next_node, direction, word_end, unichar_id);
80  if (node_ref != NO_EDGE) {
81  print_node(node_ref, nodes_[node_ref]->forward_edges.size());
82  }
83  }
84  if (node_ref == NO_EDGE) return false;
85  assert(node_ref < nodes_.size());
86  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ?
87  nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges;
88  int vec_size = vec.size();
89  if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search
90  EDGE_INDEX start = 0;
91  EDGE_INDEX end = vec_size - 1;
92  EDGE_INDEX k;
93  int compare;
94  while (start <= end) {
95  k = (start + end) >> 1; // (start + end) / 2
96  compare = given_greater_than_edge_rec(next_node, word_end,
97  unichar_id, vec[k]);
98  if (compare == 0) { // given == vec[k]
99  *edge_ptr = &(vec[k]);
100  *edge_index = k;
101  return true;
102  } else if (compare == 1) { // given > vec[k]
103  start = k + 1;
104  } else { // given < vec[k]
105  end = k - 1;
106  }
107  }
108  } else { // linear search
109  for (int i = 0; i < vec_size; ++i) {
110  EDGE_RECORD &edge_rec = vec[i];
111  if (edge_rec_match(next_node, word_end, unichar_id,
112  next_node_from_edge_rec(edge_rec),
113  end_of_word_from_edge_rec(edge_rec),
114  unichar_id_from_edge_rec(edge_rec))) {
115  *edge_ptr = &(edge_rec);
116  *edge_index = i;
117  return true;
118  }
119  }
120  }
121  return false; // not found
122 }
#define REFFORMAT
Definition: dawg.h:93
void print_node(NODE_REF node, int max_num_edges) const
Definition: trie.cpp:705
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
Definition: dawg.h:272
#define FORWARD_EDGE
Definition: dawg.h:85
TRIE_NODES nodes_
Definition: trie.h:419
int size() const
Definition: genericvector.h:72
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
#define tprintf(...)
Definition: tprintf.h:31
inT64 EDGE_INDEX
Definition: trie.h:32
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:133
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:251
uinT64 EDGE_RECORD
Definition: dawg.h:51
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230
int debug_level_
Definition: dawg.h:316

◆ edge_letter()

UNICHAR_ID tesseract::Trie::edge_letter ( EDGE_REF  edge_ref) const
inlinevirtual

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 148 of file trie.h.

148  {
149  if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
150  return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
151  }
uinT64 num_edges_
Definition: trie.h:420
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:291
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230

◆ eliminate_redundant_edges()

bool tesseract::Trie::eliminate_redundant_edges ( NODE_REF  node,
const EDGE_RECORD edge1,
const EDGE_RECORD edge2 
)
protected

Definition at line 566 of file trie.cpp.

568  {
569  if (debug_level_ > 1) {
570  tprintf("\nCollapsing node %" PRIi64 ":\n", node);
572  tprintf("Candidate edges: ");
573  print_edge_rec(edge1);
574  tprintf(", ");
575  print_edge_rec(edge2);
576  tprintf("\n\n");
577  }
578  NODE_REF next_node1 = next_node_from_edge_rec(edge1);
579  NODE_REF next_node2 = next_node_from_edge_rec(edge2);
580  TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
581  // Translate all edges going to/from next_node2 to go to/from next_node1.
582  EDGE_RECORD *edge_ptr = NULL;
583  EDGE_INDEX edge_index;
584  int i;
585  // The backward link in node to next_node2 will be zeroed out by the caller.
586  // Copy all the backward links in next_node2 to node next_node1
587  for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
588  const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
589  NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
590  UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
591  int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
592  bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
593  add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
594  curr_word_end, curr_unichar_id);
595  // Relocate the corresponding forward edge in curr_next_node
596  ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
597  curr_word_end, curr_unichar_id,
598  &edge_ptr, &edge_index));
599  set_next_node_in_edge_rec(edge_ptr, next_node1);
600  }
601  int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
602  next_node2_ptr->backward_edges.size());
603  if (debug_level_ > 1) {
604  tprintf("removed %d edges from node " REFFORMAT "\n",
605  next_node2_num_edges, next_node2);
606  }
607  next_node2_ptr->forward_edges.clear();
608  next_node2_ptr->backward_edges.clear();
609  num_edges_ -= next_node2_num_edges;
610  return true;
611 }
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:87
#define REFFORMAT
Definition: dawg.h:93
void print_node(NODE_REF node, int max_num_edges) const
Definition: trie.cpp:705
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
Definition: dawg.h:235
EDGE_VECTOR backward_edges
Definition: trie.h:50
#define FORWARD_EDGE
Definition: dawg.h:85
TRIE_NODES nodes_
Definition: trie.h:419
inT64 NODE_REF
Definition: dawg.h:56
int size() const
Definition: genericvector.h:72
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:217
#define tprintf(...)
Definition: tprintf.h:31
inT64 EDGE_INDEX
Definition: trie.h:32
#define BACKWARD_EDGE
Definition: dawg.h:86
uinT64 num_edges_
Definition: trie.h:420
EDGE_VECTOR forward_edges
Definition: trie.h:49
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:104
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:124
#define ASSERT_HOST(x)
Definition: errcode.h:84
uinT64 EDGE_RECORD
Definition: dawg.h:51
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230
int debug_level_
Definition: dawg.h:316
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:317
int UNICHAR_ID
Definition: unichar.h:35

◆ end_of_word()

bool tesseract::Trie::end_of_word ( EDGE_REF  edge_ref) const
inlinevirtual

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

Definition at line 142 of file trie.h.

142  {
143  if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
144  return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
145  }
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226
uinT64 num_edges_
Definition: trie.h:420
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:291

◆ get_reverse_policy_name()

const char * tesseract::Trie::get_reverse_policy_name ( RTLReversePolicy  reverse_policy)
static

Definition at line 60 of file trie.cpp.

60  {
61  return RTLReversePolicyNames[reverse_policy];
62 }
const char *const RTLReversePolicyNames[]
Definition: trie.cpp:47

◆ initialize_patterns()

void tesseract::Trie::initialize_patterns ( UNICHARSET unicharset)

Definition at line 345 of file trie.cpp.

345  {
352  unicharset->unichar_insert(kPuncPatternUnicode);
358  initialized_patterns_ = true;
359  unicharset_size_ = unicharset->size();
360 }
int unicharset_size_
Definition: dawg.h:309
static const char kPuncPatternUnicode[]
Definition: trie.h:78
UNICHAR_ID alphanum_pattern_
Definition: trie.h:430
UNICHAR_ID punc_pattern_
Definition: trie.h:431
static const char kUpperPatternUnicode[]
Definition: trie.h:80
UNICHAR_ID alpha_pattern_
Definition: trie.h:428
static const char kAlphanumPatternUnicode[]
Definition: trie.h:77
static const char kDigitPatternUnicode[]
Definition: trie.h:76
int size() const
Definition: unicharset.h:338
UNICHAR_ID lower_pattern_
Definition: trie.h:432
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:623
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
static const char kAlphaPatternUnicode[]
Definition: trie.h:75
static const char kLowerPatternUnicode[]
Definition: trie.h:79
bool initialized_patterns_
Definition: trie.h:427
UNICHAR_ID upper_pattern_
Definition: trie.h:433
UNICHAR_ID digit_pattern_
Definition: trie.h:429

◆ KillEdge()

void tesseract::Trie::KillEdge ( EDGE_RECORD edge_rec) const
inline

Definition at line 154 of file trie.h.

154  {
155  *edge_rec &= ~letter_mask_;
156  *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
157  }
int unicharset_size_
Definition: dawg.h:309
uinT64 letter_mask_
Definition: dawg.h:314
#define LETTER_START_BIT
Definition: dawg.h:91

◆ link_edge()

void tesseract::Trie::link_edge ( EDGE_RECORD edge,
NODE_REF  nxt,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Sets up this edge record to the requested values.

Definition at line 306 of file trie.h.

307  {
308  EDGE_RECORD flags = 0;
309  if (repeats) flags |= MARKER_FLAG;
310  if (word_end) flags |= WERD_END_FLAG;
311  if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
312  *edge = ((nxt << next_node_start_bit_) |
313  (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
314  (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
315  }
#define WERD_END_FLAG
Definition: dawg.h:90
#define DIRECTION_FLAG
Definition: dawg.h:89
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
int next_node_start_bit_
Definition: dawg.h:311
#define BACKWARD_EDGE
Definition: dawg.h:86
uinT64 EDGE_RECORD
Definition: dawg.h:51
#define LETTER_START_BIT
Definition: dawg.h:91
#define MARKER_FLAG
Definition: dawg.h:88
int flag_start_bit_
Definition: dawg.h:310

◆ make_edge_ref()

EDGE_REF tesseract::Trie::make_edge_ref ( NODE_REF  node_index,
EDGE_INDEX  edge_index 
) const
inlineprotected

Constructs EDGE_REF from the given node_index and edge_index.

Definition at line 300 of file trie.h.

301  {
302  return ((node_index << flag_start_bit_) |
303  (edge_index << LETTER_START_BIT));
304  }
#define LETTER_START_BIT
Definition: dawg.h:91
int flag_start_bit_
Definition: dawg.h:310

◆ new_dawg_node()

NODE_REF tesseract::Trie::new_dawg_node ( )
protected

Definition at line 276 of file trie.cpp.

276  {
277  TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD();
278  nodes_.push_back(node);
279  return nodes_.length() - 1;
280 }
TRIE_NODES nodes_
Definition: trie.h:419
int push_back(T object)
int length() const
Definition: genericvector.h:86

◆ next_node()

NODE_REF tesseract::Trie::next_node ( EDGE_REF  edge_ref) const
inlinevirtual

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 133 of file trie.h.

133  {
134  if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
135  return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
136  }
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213
uinT64 num_edges_
Definition: trie.h:420
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:291

◆ pattern_loop_edge()

virtual EDGE_REF tesseract::Trie::pattern_loop_edge ( EDGE_REF  edge_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inlinevirtual

Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD, returns NO_EDGE otherwise.

Reimplemented from tesseract::Dawg.

Definition at line 247 of file trie.h.

249  {
250  if (edge_ref == NO_EDGE) return NO_EDGE;
251  EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
252  return (marker_flag_from_edge_rec(*edge_rec) &&
253  unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
254  word_end == end_of_word_from_edge_rec(*edge_rec)) ?
255  edge_ref : NO_EDGE;
256  }
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:217
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:291
uinT64 EDGE_RECORD
Definition: dawg.h:51
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230

◆ print_all()

void tesseract::Trie::print_all ( const char *  msg,
int  max_num_edges 
)
inlineprotected

Definition at line 334 of file trie.h.

334  {
335  tprintf("\n__________________________\n%s\n", msg);
336  for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
337  tprintf("__________________________\n");
338  }
void print_node(NODE_REF node, int max_num_edges) const
Definition: trie.cpp:705
TRIE_NODES nodes_
Definition: trie.h:419
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31

◆ print_edge_rec()

void tesseract::Trie::print_edge_rec ( const EDGE_RECORD edge_rec) const
inlineprotected

Prints the given EDGE_RECORD.

Definition at line 317 of file trie.h.

317  {
318  tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
319  marker_flag_from_edge_rec(edge_rec) ? "R," : "",
320  (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
321  end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
322  unichar_id_from_edge_rec(edge_rec));
323  }
#define REFFORMAT
Definition: dawg.h:93
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226
#define FORWARD_EDGE
Definition: dawg.h:85
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:217
#define tprintf(...)
Definition: tprintf.h:31
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230
int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the direction flag of this edge.
Definition: dawg.h:221

◆ print_node()

void tesseract::Trie::print_node ( NODE_REF  node,
int  max_num_edges 
) const
virtual

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

Definition at line 705 of file trie.cpp.

705  {
706  if (node == NO_EDGE) return; // nothing to print
707  TRIE_NODE_RECORD *node_ptr = nodes_[node];
708  int num_fwd = node_ptr->forward_edges.size();
709  int num_bkw = node_ptr->backward_edges.size();
710  EDGE_VECTOR *vec;
711  for (int dir = 0; dir < 2; ++dir) {
712  if (dir == 0) {
713  vec = &(node_ptr->forward_edges);
714  tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
715  } else {
716  vec = &(node_ptr->backward_edges);
717  tprintf("\t");
718  }
719  int i;
720  for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
721  i < max_num_edges; ++i) {
722  if (DeadEdge((*vec)[i])) continue;
723  print_edge_rec((*vec)[i]);
724  tprintf(" ");
725  }
726  if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
727  tprintf("\n");
728  }
729 }
#define REFFORMAT
Definition: dawg.h:93
EDGE_VECTOR backward_edges
Definition: trie.h:50
TRIE_NODES nodes_
Definition: trie.h:419
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31
EDGE_VECTOR forward_edges
Definition: trie.h:49
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:317
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:158

◆ read_and_add_word_list()

bool tesseract::Trie::read_and_add_word_list ( const char *  filename,
const UNICHARSET unicharset,
Trie::RTLReversePolicy  reverse 
)

Definition at line 289 of file trie.cpp.

291  {
292  GenericVector<STRING> word_list;
293  if (!read_word_list(filename, &word_list)) return false;
294  word_list.sort(sort_strings_by_dec_length);
295  return add_word_list(word_list, unicharset, reverse_policy);
296 }
bool add_word_list(const GenericVector< STRING > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
Definition: trie.cpp:321
bool read_word_list(const char *filename, GenericVector< STRING > *words)
Definition: trie.cpp:298

◆ read_pattern_list()

bool tesseract::Trie::read_pattern_list ( const char *  filename,
const UNICHARSET unicharset 
)

Definition at line 402 of file trie.cpp.

403  {
404  if (!initialized_patterns_) {
405  tprintf("please call initialize_patterns() before read_pattern_list()\n");
406  return false;
407  }
408 
409  FILE *pattern_file = fopen(filename, "rb");
410  if (pattern_file == NULL) {
411  tprintf("Error opening pattern file %s\n", filename);
412  return false;
413  }
414 
415  int pattern_count = 0;
416  char string[CHARS_PER_LINE];
417  while (fgets(string, CHARS_PER_LINE, pattern_file) != NULL) {
418  chomp_string(string); // remove newline
419  // Parse the pattern and construct a unichar id vector.
420  // Record the number of repetitions of each unichar in the parallel vector.
421  WERD_CHOICE word(&unicharset);
422  GenericVector<bool> repetitions_vec;
423  const char *str_ptr = string;
424  int step = unicharset.step(str_ptr);
425  bool failed = false;
426  while (step > 0) {
427  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
428  if (step == 1 && *str_ptr == '\\') {
429  ++str_ptr;
430  if (*str_ptr == '\\') { // regular '\' unichar that was escaped
431  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
432  } else {
433  if (word.length() < kSaneNumConcreteChars) {
434  tprintf("Please provide at least %d concrete characters at the"
435  " beginning of the pattern\n", kSaneNumConcreteChars);
436  failed = true;
437  break;
438  }
439  // Parse character class from expression.
440  curr_unichar_id = character_class_to_pattern(*str_ptr);
441  }
442  } else {
443  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
444  }
445  if (curr_unichar_id == INVALID_UNICHAR_ID) {
446  failed = true;
447  break; // failed to parse this pattern
448  }
449  word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
450  repetitions_vec.push_back(false);
451  str_ptr += step;
452  step = unicharset.step(str_ptr);
453  // Check if there is a repetition pattern specified after this unichar.
454  if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
455  repetitions_vec[repetitions_vec.size()-1] = true;
456  str_ptr += 2;
457  step = unicharset.step(str_ptr);
458  }
459  }
460  if (failed) {
461  tprintf("Invalid user pattern %s\n", string);
462  continue;
463  }
464  // Insert the pattern into the trie.
465  if (debug_level_ > 2) {
466  tprintf("Inserting expanded user pattern %s\n",
467  word.debug_string().string());
468  }
469  if (!this->word_in_dawg(word)) {
470  this->add_word_to_dawg(word, &repetitions_vec);
471  if (!this->word_in_dawg(word)) {
472  tprintf("Error: failed to insert pattern '%s'\n", string);
473  }
474  }
475  ++pattern_count;
476  }
477  if (debug_level_) {
478  tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
479  }
480  fclose(pattern_file);
481  return true;
482 }
static const int kSaneNumConcreteChars
Definition: trie.h:71
int size() const
Definition: genericvector.h:72
void chomp_string(char *str)
Definition: helpers.h:82
#define tprintf(...)
Definition: tprintf.h:31
int push_back(T object)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:177
UNICHAR_ID character_class_to_pattern(char ch)
Definition: trie.cpp:384
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:69
int debug_level_
Definition: dawg.h:316
bool initialized_patterns_
Definition: trie.h:427
int step(const char *str) const
Definition: unicharset.cpp:230
#define CHARS_PER_LINE
Definition: cutil.h:57
int UNICHAR_ID
Definition: unichar.h:35

◆ read_word_list()

bool tesseract::Trie::read_word_list ( const char *  filename,
GenericVector< STRING > *  words 
)

Definition at line 298 of file trie.cpp.

299  {
300  FILE *word_file;
301  char line_str[CHARS_PER_LINE];
302  int word_count = 0;
303 
304  word_file = fopen(filename, "rb");
305  if (word_file == NULL) return false;
306 
307  while (fgets(line_str, sizeof(line_str), word_file) != NULL) {
308  chomp_string(line_str); // remove newline
309  STRING word_str(line_str);
310  ++word_count;
311  if (debug_level_ && word_count % 10000 == 0)
312  tprintf("Read %d words so far\n", word_count);
313  words->push_back(word_str);
314  }
315  if (debug_level_)
316  tprintf("Read %d words total.\n", word_count);
317  fclose(word_file);
318  return true;
319 }
void chomp_string(char *str)
Definition: helpers.h:82
#define tprintf(...)
Definition: tprintf.h:31
int push_back(T object)
Definition: strngs.h:45
int debug_level_
Definition: dawg.h:316
#define CHARS_PER_LINE
Definition: cutil.h:57

◆ reduce_lettered_edges()

bool tesseract::Trie::reduce_lettered_edges ( EDGE_INDEX  edge_index,
UNICHAR_ID  unichar_id,
NODE_REF  node,
EDGE_VECTOR backward_edges,
NODE_MARKER  reduced_nodes 
)
protected

Definition at line 613 of file trie.cpp.

617  {
618  if (debug_level_ > 1)
619  tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
620  // Compare each of the edge pairs with the given unichar_id.
621  bool did_something = false;
622  for (int i = edge_index; i < backward_edges->size() - 1; ++i) {
623  // Find the first edge that can be eliminated.
624  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
625  while (i < backward_edges->size()) {
626  if (!DeadEdge((*backward_edges)[i])) {
627  curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
628  if (curr_unichar_id != unichar_id) return did_something;
629  if (can_be_eliminated((*backward_edges)[i])) break;
630  }
631  ++i;
632  }
633  if (i == backward_edges->size()) break;
634  const EDGE_RECORD &edge_rec = (*backward_edges)[i];
635  // Compare it to the rest of the edges with the given unichar_id.
636  for (int j = i + 1; j < backward_edges->size(); ++j) {
637  const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
638  if (DeadEdge(next_edge_rec)) continue;
639  UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
640  if (next_id != unichar_id) break;
641  if (end_of_word_from_edge_rec(next_edge_rec) ==
642  end_of_word_from_edge_rec(edge_rec) &&
643  can_be_eliminated(next_edge_rec) &&
644  eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
645  reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0;
646  did_something = true;
647  KillEdge(&(*backward_edges)[j]);
648  }
649  }
650  }
651  return did_something;
652 }
#define REFFORMAT
Definition: dawg.h:93
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:154
int size() const
Definition: genericvector.h:72
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:326
#define tprintf(...)
Definition: tprintf.h:31
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
Definition: trie.cpp:566
uinT64 EDGE_RECORD
Definition: dawg.h:51
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230
int debug_level_
Definition: dawg.h:316
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:158
int UNICHAR_ID
Definition: unichar.h:35

◆ reduce_node_input()

void tesseract::Trie::reduce_node_input ( NODE_REF  node,
NODE_MARKER  reduced_nodes 
)
protected

Eliminates any redundant edges from this node in the Trie.

Definition at line 668 of file trie.cpp.

669  {
670  EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
671  sort_edges(&backward_edges);
672  if (debug_level_ > 1) {
673  tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
675  }
676 
677  EDGE_INDEX edge_index = 0;
678  while (edge_index < backward_edges.size()) {
679  if (DeadEdge(backward_edges[edge_index])) continue;
680  UNICHAR_ID unichar_id =
681  unichar_id_from_edge_rec(backward_edges[edge_index]);
682  while (reduce_lettered_edges(edge_index, unichar_id, node,
683  &backward_edges, reduced_nodes));
684  while (++edge_index < backward_edges.size()) {
685  UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
686  if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) break;
687  }
688  }
689  reduced_nodes[node] = true; // mark as reduced
690 
691  if (debug_level_ > 1) {
692  tprintf("Node " REFFORMAT " after reduction:\n", node);
694  }
695 
696  for (int i = 0; i < backward_edges.size(); ++i) {
697  if (DeadEdge(backward_edges[i])) continue;
698  NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]);
699  if (next_node != 0 && !reduced_nodes[next_node]) {
700  reduce_node_input(next_node, reduced_nodes);
701  }
702  }
703 }
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:87
#define REFFORMAT
Definition: dawg.h:93
void print_node(NODE_REF node, int max_num_edges) const
Definition: trie.cpp:705
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes)
Definition: trie.cpp:668
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213
inT64 NODE_REF
Definition: dawg.h:56
TRIE_NODES nodes_
Definition: trie.h:419
int size() const
Definition: genericvector.h:72
#define tprintf(...)
Definition: tprintf.h:31
inT64 EDGE_INDEX
Definition: trie.h:32
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:133
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
Definition: trie.cpp:613
void sort_edges(EDGE_VECTOR *edges)
Definition: trie.cpp:654
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230
int debug_level_
Definition: dawg.h:316
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:158
int UNICHAR_ID
Definition: unichar.h:35

◆ remove_edge()

void tesseract::Trie::remove_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Definition at line 381 of file trie.h.

382  {
383  remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
384  remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
385  }
#define FORWARD_EDGE
Definition: dawg.h:85
#define BACKWARD_EDGE
Definition: dawg.h:86
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:484

◆ remove_edge_linkage()

void tesseract::Trie::remove_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 484 of file trie.cpp.

485  {
486  EDGE_RECORD *edge_ptr = NULL;
487  EDGE_INDEX edge_index = 0;
488  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
489  unichar_id, &edge_ptr, &edge_index));
490  if (debug_level_ > 1) {
491  tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
492  print_edge_rec(*edge_ptr);
493  tprintf("\n");
494  }
495  if (direction == FORWARD_EDGE) {
496  nodes_[node1]->forward_edges.remove(edge_index);
497  } else if (node1 == 0) {
498  KillEdge(&nodes_[node1]->backward_edges[edge_index]);
499  root_back_freelist_.push_back(edge_index);
500  } else {
501  nodes_[node1]->backward_edges.remove(edge_index);
502  }
503  --num_edges_;
504 }
#define REFFORMAT
Definition: dawg.h:93
void remove(int index)
#define FORWARD_EDGE
Definition: dawg.h:85
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:154
TRIE_NODES nodes_
Definition: trie.h:419
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:424
#define tprintf(...)
Definition: tprintf.h:31
inT64 EDGE_INDEX
Definition: trie.h:32
int push_back(T object)
uinT64 num_edges_
Definition: trie.h:420
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:104
#define ASSERT_HOST(x)
Definition: errcode.h:84
uinT64 EDGE_RECORD
Definition: dawg.h:51
int debug_level_
Definition: dawg.h:316
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:317

◆ sort_edges()

void tesseract::Trie::sort_edges ( EDGE_VECTOR edges)
protected

Order num_edges of consequtive EDGE_RECORDS in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since number of edges in each node is usually quite small, selection sort is used.

Definition at line 654 of file trie.cpp.

654  {
655  int num_edges = edges->size();
656  if (num_edges <= 1) return;
658  sort_vec.reserve(num_edges);
659  for (int i = 0; i < num_edges; ++i) {
660  sort_vec.push_back(KDPairInc<UNICHAR_ID, EDGE_RECORD>(
661  unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
662  }
663  sort_vec.sort();
664  for (int i = 0; i < num_edges; ++i)
665  (*edges)[i] = sort_vec[i].data;
666 }
void reserve(int size)
int size() const
Definition: genericvector.h:72
int push_back(T object)
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230

◆ trie_to_dawg()

SquishedDawg * tesseract::Trie::trie_to_dawg ( )

Definition at line 519 of file trie.cpp.

519  {
520  root_back_freelist_.clear(); // Will be invalided by trie_to_dawg.
521  if (debug_level_ > 2) {
522  print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
523  }
524  NODE_MARKER reduced_nodes = new bool[nodes_.size()];
525  for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0;
526  this->reduce_node_input(0, reduced_nodes);
527  delete[] reduced_nodes;
528 
529  if (debug_level_ > 2) {
530  print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
531  }
532  // Build a translation map from node indices in nodes_ vector to
533  // their target indices in EDGE_ARRAY.
534  NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1];
535  int i, j;
536  node_ref_map[0] = 0;
537  for (i = 0; i < nodes_.size(); ++i) {
538  node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
539  }
540  int num_forward_edges = node_ref_map[i];
541 
542  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
543  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
544  EDGE_ARRAY edge_array = new EDGE_RECORD[num_forward_edges];
545  EDGE_ARRAY edge_array_ptr = edge_array;
546  for (i = 0; i < nodes_.size(); ++i) {
547  TRIE_NODE_RECORD *node_ptr = nodes_[i];
548  int end = node_ptr->forward_edges.size();
549  for (j = 0; j < end; ++j) {
550  EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
551  NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
552  ASSERT_HOST(node_ref < nodes_.size());
553  UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
554  link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
555  end_of_word_from_edge_rec(edge_rec), unichar_id);
556  if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
557  ++edge_array_ptr;
558  }
559  }
560  delete[] node_ref_map;
561 
562  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
564 }
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:87
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes)
Definition: trie.cpp:668
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226
int unicharset_size_
Definition: dawg.h:309
#define FORWARD_EDGE
Definition: dawg.h:85
inT64 NODE_REF
Definition: dawg.h:56
TRIE_NODES nodes_
Definition: trie.h:419
int size() const
Definition: genericvector.h:72
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:334
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:424
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:304
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:306
EDGE_VECTOR forward_edges
Definition: trie.h:49
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:241
EDGE_RECORD * EDGE_ARRAY
Definition: dawg.h:54
STRING lang_
Definition: dawg.h:302
#define ASSERT_HOST(x)
Definition: errcode.h:84
uinT64 EDGE_RECORD
Definition: dawg.h:51
bool * NODE_MARKER
Definition: trie.h:45
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230
int debug_level_
Definition: dawg.h:316
DawgType type_
Definition: dawg.h:301
int UNICHAR_ID
Definition: unichar.h:35

◆ unichar_id_to_patterns()

void tesseract::Trie::unichar_id_to_patterns ( UNICHAR_ID  unichar_id,
const UNICHARSET unicharset,
GenericVector< UNICHAR_ID > *  vec 
) const
virtual

Fills vec with unichar ids that represent the character classes of the given unichar_id.

Reimplemented from tesseract::Dawg.

Definition at line 362 of file trie.cpp.

364  {
365  bool is_alpha = unicharset.get_isalpha(unichar_id);
366  if (is_alpha) {
369  if (unicharset.get_islower(unichar_id)) {
371  } else if (unicharset.get_isupper(unichar_id)) {
373  }
374  }
375  if (unicharset.get_isdigit(unichar_id)) {
377  if (!is_alpha) vec->push_back(alphanum_pattern_);
378  }
379  if (unicharset.get_ispunctuation(unichar_id)) {
380  vec->push_back(punc_pattern_);
381  }
382 }
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:518
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:511
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:490
UNICHAR_ID alphanum_pattern_
Definition: trie.h:430
UNICHAR_ID punc_pattern_
Definition: trie.h:431
UNICHAR_ID alpha_pattern_
Definition: trie.h:428
int push_back(T object)
UNICHAR_ID lower_pattern_
Definition: trie.h:432
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:497
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:504
UNICHAR_ID upper_pattern_
Definition: trie.h:433
UNICHAR_ID digit_pattern_
Definition: trie.h:429

◆ unichar_ids_of()

void tesseract::Trie::unichar_ids_of ( NODE_REF  node,
NodeChildVector vec,
bool  word_end 
) const
inlinevirtual

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.

Definition at line 117 of file trie.h.

118  {
119  const EDGE_VECTOR &forward_edges =
120  nodes_[static_cast<int>(node)]->forward_edges;
121  for (int i = 0; i < forward_edges.size(); ++i) {
122  if (!word_end || end_of_word_from_edge_rec(forward_edges[i])) {
123  vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
124  make_edge_ref(node, i)));
125  }
126  }
127  }
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226
TRIE_NODES nodes_
Definition: trie.h:419
int size() const
Definition: genericvector.h:72
int push_back(T object)
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
Definition: trie.h:300
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230

Member Data Documentation

◆ alpha_pattern_

UNICHAR_ID tesseract::Trie::alpha_pattern_
protected

Definition at line 428 of file trie.h.

◆ alphanum_pattern_

UNICHAR_ID tesseract::Trie::alphanum_pattern_
protected

Definition at line 430 of file trie.h.

◆ deref_direction_mask_

uinT64 tesseract::Trie::deref_direction_mask_
protected

Definition at line 421 of file trie.h.

◆ deref_node_index_mask_

uinT64 tesseract::Trie::deref_node_index_mask_
protected

Definition at line 422 of file trie.h.

◆ digit_pattern_

UNICHAR_ID tesseract::Trie::digit_pattern_
protected

Definition at line 429 of file trie.h.

◆ initialized_patterns_

bool tesseract::Trie::initialized_patterns_
protected

Definition at line 427 of file trie.h.

◆ kAlphanumPatternUnicode

const char tesseract::Trie::kAlphanumPatternUnicode = "\u2002"
static

Definition at line 77 of file trie.h.

◆ kAlphaPatternUnicode

const char tesseract::Trie::kAlphaPatternUnicode = "\u2000"
static

Definition at line 75 of file trie.h.

◆ kDigitPatternUnicode

const char tesseract::Trie::kDigitPatternUnicode = "\u2001"
static

Definition at line 76 of file trie.h.

◆ kLowerPatternUnicode

const char tesseract::Trie::kLowerPatternUnicode = "\u2004"
static

Definition at line 79 of file trie.h.

◆ kPuncPatternUnicode

const char tesseract::Trie::kPuncPatternUnicode = "\u2003"
static

Definition at line 78 of file trie.h.

◆ kSaneNumConcreteChars

const int tesseract::Trie::kSaneNumConcreteChars = 0
static

Definition at line 71 of file trie.h.

◆ kUpperPatternUnicode

const char tesseract::Trie::kUpperPatternUnicode = "\u2005"
static

Definition at line 80 of file trie.h.

◆ lower_pattern_

UNICHAR_ID tesseract::Trie::lower_pattern_
protected

Definition at line 432 of file trie.h.

◆ nodes_

TRIE_NODES tesseract::Trie::nodes_
protected

Definition at line 419 of file trie.h.

◆ num_edges_

uinT64 tesseract::Trie::num_edges_
protected

Definition at line 420 of file trie.h.

◆ punc_pattern_

UNICHAR_ID tesseract::Trie::punc_pattern_
protected

Definition at line 431 of file trie.h.

◆ root_back_freelist_

GenericVector<EDGE_INDEX> tesseract::Trie::root_back_freelist_
protected

Definition at line 424 of file trie.h.

◆ upper_pattern_

UNICHAR_ID tesseract::Trie::upper_pattern_
protected

Definition at line 433 of file trie.h.


The documentation for this class was generated from the following files: