tesseract v5.3.3.20231005
tesseract::Trie Class Reference

#include <trie.h>

Inheritance diagram for tesseract::Trie:
tesseract::Dawg

Public Types

enum  RTLReversePolicy { RRP_DO_NO_REVERSE , RRP_REVERSE_IF_HAS_RTL , RRP_FORCE_REVERSE }
 

Public Member Functions

 Trie (DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)
 
 ~Trie () override
 
void clear ()
 
EDGE_REF edge_char_of (NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override
 
void unichar_ids_of (NODE_REF node, NodeChildVector *vec, bool word_end) const override
 
NODE_REF next_node (EDGE_REF edge_ref) const override
 
bool end_of_word (EDGE_REF edge_ref) const override
 
UNICHAR_ID edge_letter (EDGE_REF edge_ref) const override
 
void KillEdge (EDGE_RECORD *edge_rec) const
 
bool DeadEdge (const EDGE_RECORD &edge_rec) const
 
void print_node (NODE_REF node, int max_num_edges) const override
 
SquishedDawgtrie_to_dawg ()
 
bool read_and_add_word_list (const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
 
bool read_word_list (const char *filename, std::vector< std::string > *words)
 
bool add_word_list (const std::vector< std::string > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
 
bool read_pattern_list (const char *filename, const UNICHARSET &unicharset)
 
void initialize_patterns (UNICHARSET *unicharset)
 
void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, std::vector< UNICHAR_ID > *vec) const override
 
EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const override
 
bool add_word_to_dawg (const WERD_CHOICE &word, const std::vector< bool > *repetitions)
 
bool add_word_to_dawg (const WERD_CHOICE &word)
 
- Public Member Functions inherited from tesseract::Dawg
DawgType type () const
 
const std::string & lang () const
 
PermuterType permuter () const
 
virtual ~Dawg ()
 
bool word_in_dawg (const WERD_CHOICE &word) const
 Returns true if the given word is in the Dawg. More...
 
bool prefix_in_dawg (const WERD_CHOICE &prefix, bool requires_complete) const
 
int check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
 
void iterate_words (const UNICHARSET &unicharset, std::function< void(const WERD_CHOICE *)> cb) const
 
void iterate_words (const UNICHARSET &unicharset, const std::function< void(const char *)> &cb) const
 
virtual EDGE_REF edge_char_of (NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
 Returns the edge that corresponds to the letter out of this node. More...
 
virtual void unichar_ids_of (NODE_REF node, NodeChildVector *vec, bool word_end) const =0
 
virtual NODE_REF next_node (EDGE_REF edge_ref) const =0
 
virtual bool end_of_word (EDGE_REF edge_ref) const =0
 
virtual UNICHAR_ID edge_letter (EDGE_REF edge_ref) const =0
 Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. More...
 
virtual void print_node (NODE_REF node, int max_num_edges) const =0
 
virtual void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, std::vector< UNICHAR_ID > *vec) const
 
virtual EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
 

Static Public Member Functions

static const char * get_reverse_policy_name (RTLReversePolicy reverse_policy)
 

Static Public Attributes

static const int kSaneNumConcreteChars = 0
 
static const char kAlphaPatternUnicode [] = "\u2000"
 
static const char kDigitPatternUnicode [] = "\u2001"
 
static const char kAlphanumPatternUnicode [] = "\u2002"
 
static const char kPuncPatternUnicode [] = "\u2003"
 
static const char kLowerPatternUnicode [] = "\u2004"
 
static const char kUpperPatternUnicode [] = "\u2005"
 
- Static Public Attributes inherited from tesseract::Dawg
static const int16_t kDawgMagicNumber = 42
 Magic number to determine endianness when reading the Dawg from file. More...
 
static const UNICHAR_ID kPatternUnicharID = 0
 

Protected Member Functions

EDGE_RECORDderef_edge_ref (EDGE_REF edge_ref) const
 
EDGE_REF make_edge_ref (NODE_REF node_index, EDGE_INDEX edge_index) const
 
void link_edge (EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
 
void print_edge_rec (const EDGE_RECORD &edge_rec) const
 
bool can_be_eliminated (const EDGE_RECORD &edge_rec)
 
void print_all (const char *msg, int max_num_edges)
 
bool edge_char_of (NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end, UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const
 
bool add_edge_linkage (NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
 
bool add_new_edge (NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
 
void add_word_ending (EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
 
NODE_REF new_dawg_node ()
 
void remove_edge_linkage (NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
 
void remove_edge (NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
 
bool eliminate_redundant_edges (NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
 
bool reduce_lettered_edges (EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, std::vector< bool > &reduced_nodes)
 
void sort_edges (EDGE_VECTOR *edges)
 
void reduce_node_input (NODE_REF node, std::vector< bool > &reduced_nodes)
 
UNICHAR_ID character_class_to_pattern (char ch)
 
- Protected Member Functions inherited from tesseract::Dawg
 Dawg (DawgType type, const std::string &lang, PermuterType perm, int debug_level)
 
NODE_REF next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the next node visited by following this edge. More...
 
bool marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the marker flag of this edge. More...
 
int direction_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the direction flag of this edge. More...
 
bool end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns true if this edge marks the end of a word. More...
 
UNICHAR_ID unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns UNICHAR_ID recorded in this edge. More...
 
void set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value)
 Sets the next node link for this edge in the Dawg. More...
 
void set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec)
 Sets this edge record to be the last one in a sequence of edges. More...
 
int given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
 
bool edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
 
void init (int unicharset_size)
 
bool match_words (WERD_CHOICE *word, uint32_t index, NODE_REF node, UNICHAR_ID wildcard) const
 
void iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, const std::function< void(const WERD_CHOICE *)> &cb) const
 

Protected Attributes

TRIE_NODES nodes_
 
std::vector< EDGE_INDEXroot_back_freelist_
 
uint64_t num_edges_ = 0
 
uint64_t deref_direction_mask_ = 0
 
uint64_t deref_node_index_mask_ = 0
 
UNICHAR_ID alpha_pattern_ = 0
 
UNICHAR_ID digit_pattern_ = 0
 
UNICHAR_ID alphanum_pattern_ = 0
 
UNICHAR_ID punc_pattern_ = 0
 
UNICHAR_ID lower_pattern_ = 0
 
UNICHAR_ID upper_pattern_ = 0
 
bool initialized_patterns_ = false
 
- Protected Attributes inherited from tesseract::Dawg
std::string lang_
 
DawgType type_
 
PermuterType perm_
 Permuter code that should be used if the word is found in this Dawg. More...
 
uint64_t next_node_mask_ = 0
 
uint64_t flags_mask_ = 0
 
uint64_t letter_mask_ = 0
 
int unicharset_size_
 
int flag_start_bit_ = 0
 
int next_node_start_bit_ = 0
 
int debug_level_
 

Detailed Description

Concrete class for Trie data structure that allows to store a list of words (extends Dawg base class) as well as dynamically add new words. This class stores a vector of pointers to TRIE_NODE_RECORDs, each of which has a vector of forward and backward edges.

Definition at line 53 of file trie.h.

Member Enumeration Documentation

◆ RTLReversePolicy

Enumerator
RRP_DO_NO_REVERSE 
RRP_REVERSE_IF_HAS_RTL 
RRP_FORCE_REVERSE 

Definition at line 55 of file trie.h.

55 {
59 };
@ RRP_DO_NO_REVERSE
Definition: trie.h:56
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:57
@ RRP_FORCE_REVERSE
Definition: trie.h:58

Constructor & Destructor Documentation

◆ Trie()

tesseract::Trie::Trie ( DawgType  type,
const std::string &  lang,
PermuterType  perm,
int  unicharset_size,
int  debug_level 
)
inline

Definition at line 79 of file trie.h.

80 : Dawg(type, lang, perm, debug_level) {
81 init(unicharset_size);
82 deref_node_index_mask_ = ~letter_mask_;
83 new_dawg_node(); // need to allocate node 0
84 }
Dawg(DawgType type, const std::string &lang, PermuterType perm, int debug_level)
Definition: dawg.h:201
const std::string & lang() const
Definition: dawg.h:122
void init(int unicharset_size)
Definition: dawg.cpp:178
DawgType type() const
Definition: dawg.h:119
uint64_t deref_node_index_mask_
Definition: trie.h:407
NODE_REF new_dawg_node()
Definition: trie.cpp:267

◆ ~Trie()

tesseract::Trie::~Trie ( )
inlineoverride

Definition at line 85 of file trie.h.

85 {
86 for (auto node : nodes_) {
87 delete node;
88 }
89 }
TRIE_NODES nodes_
Definition: trie.h:402

Member Function Documentation

◆ add_edge_linkage()

bool tesseract::Trie::add_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 110 of file trie.cpp.

111 {
112 EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ? &(nodes_[node1]->forward_edges)
113 : &(nodes_[node1]->backward_edges);
114 unsigned search_index;
115 if (node1 == 0 && direction == FORWARD_EDGE) {
116 search_index = 0; // find the index to make the add sorted
117 while (search_index < vec->size() &&
118 given_greater_than_edge_rec(node2, word_end, unichar_id, (*vec)[search_index]) == 1) {
119 search_index++;
120 }
121 } else {
122 search_index = vec->size(); // add is unsorted, so index does not matter
123 }
124 EDGE_RECORD edge_rec;
125 link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
126 if (node1 == 0 && direction == BACKWARD_EDGE && !root_back_freelist_.empty()) {
127 EDGE_INDEX edge_index = root_back_freelist_.back();
128 root_back_freelist_.pop_back();
129 (*vec)[edge_index] = edge_rec;
130 } else if (search_index < vec->size()) {
131 vec->insert(vec->begin() + search_index, edge_rec);
132 } else {
133 vec->push_back(edge_rec);
134 }
135 if (debug_level_ > 1) {
136 tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
137 print_edge_rec(edge_rec);
138 tprintf("\n");
139 }
140 num_edges_++;
141 return true;
142}
#define BACKWARD_EDGE
Definition: dawg.h:78
#define FORWARD_EDGE
Definition: dawg.h:77
#define REFFORMAT
Definition: dawg.h:85
uint64_t EDGE_RECORD
Definition: dawg.h:47
std::vector< EDGE_RECORD > EDGE_VECTOR
Definition: trie.h:39
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int64_t EDGE_INDEX
Definition: trie.h:38
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:247
int debug_level_
Definition: dawg.h:317
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:295
std::vector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:404
uint64_t num_edges_
Definition: trie.h:405
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:311

◆ add_new_edge()

bool tesseract::Trie::add_new_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  repeats,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Definition at line 348 of file trie.h.

349 {
350 return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE, word_end, unichar_id) &&
351 add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE, word_end, unichar_id));
352 }
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:110

◆ add_word_ending()

void tesseract::Trie::add_word_ending ( EDGE_RECORD edge,
NODE_REF  the_next_node,
bool  repeats,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 144 of file trie.cpp.

145 {
146 EDGE_RECORD *back_edge_ptr;
147 EDGE_INDEX back_edge_index;
148 ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false, unichar_id, &back_edge_ptr,
149 &back_edge_index));
150 if (marker_flag) {
151 *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
152 *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
153 }
154 // Mark both directions as end of word.
155 *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
156 *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
157}
#define ASSERT_HOST(x)
Definition: errcode.h:54
#define WERD_END_FLAG
Definition: dawg.h:82
#define MARKER_FLAG
Definition: dawg.h:80
int flag_start_bit_
Definition: dawg.h:314
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override
Definition: trie.h:95

◆ add_word_list()

bool tesseract::Trie::add_word_list ( const std::vector< std::string > &  words,
const UNICHARSET unicharset,
Trie::RTLReversePolicy  reverse_policy 
)

Definition at line 310 of file trie.cpp.

311 {
312 for (const auto &i : words) {
313 WERD_CHOICE word(i.c_str(), unicharset);
314 if (word.empty() || word.contains_unichar_id(INVALID_UNICHAR_ID)) {
315 continue;
316 }
317 if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL && word.has_rtl_unichar_id()) ||
318 reverse_policy == RRP_FORCE_REVERSE) {
319 word.reverse_and_mirror_unichar_ids();
320 }
321 if (!word_in_dawg(word)) {
322 add_word_to_dawg(word);
323 if (!word_in_dawg(word)) {
324 tprintf("Error: word '%s' not in DAWG after adding it\n", i.c_str());
325 return false;
326 }
327 }
328 }
329 return true;
330}
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:64
bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector< bool > *repetitions)
Definition: trie.cpp:159

◆ add_word_to_dawg() [1/2]

bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE word)
inline

Definition at line 262 of file trie.h.

262 {
263 return add_word_to_dawg(word, nullptr);
264 }

◆ add_word_to_dawg() [2/2]

bool tesseract::Trie::add_word_to_dawg ( const WERD_CHOICE word,
const std::vector< bool > *  repetitions 
)

Definition at line 159 of file trie.cpp.

159 {
160 if (word.length() <= 0) {
161 return false; // can't add empty words
162 }
163 if (repetitions != nullptr) {
164 ASSERT_HOST(repetitions->size() == word.length());
165 }
166 // Make sure the word does not contain invalid unchar ids.
167 for (unsigned i = 0; i < word.length(); ++i) {
168 if (word.unichar_id(i) < 0 || word.unichar_id(i) >= unicharset_size_) {
169 return false;
170 }
171 }
172
173 EDGE_RECORD *edge_ptr;
174 NODE_REF last_node = 0;
175 NODE_REF the_next_node;
176 bool marker_flag = false;
177 EDGE_INDEX edge_index;
178 int32_t still_finding_chars = true;
179 int32_t word_end = false;
180 bool add_failed = false;
181 bool found;
182
183 if (debug_level_ > 1) {
184 word.print("\nAdding word: ");
185 }
186
187 UNICHAR_ID unichar_id;
188 unsigned i;
189 for (i = 0; i < word.length() - 1; ++i) {
190 unichar_id = word.unichar_id(i);
191 marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
192 if (debug_level_ > 1) {
193 tprintf("Adding letter %d\n", unichar_id);
194 }
195 if (still_finding_chars) {
196 found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr,
197 &edge_index);
198 if (found && debug_level_ > 1) {
199 tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n", edge_index, last_node);
200 }
201 if (!found) {
202 still_finding_chars = false;
203 } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
204 // We hit the end of an existing word, but the new word is longer.
205 // In this case we have to disconnect the existing word from the
206 // backwards root node, mark the current position as end-of-word
207 // and add new nodes for the increased length. Disconnecting the
208 // existing word from the backwards root node requires a linear
209 // search, so it is much faster to add the longest words first,
210 // to avoid having to come here.
211 word_end = true;
212 still_finding_chars = false;
213 remove_edge(last_node, 0, word_end, unichar_id);
214 } else {
215 // We have to add a new branch here for the new word.
216 if (marker_flag) {
218 }
219 last_node = next_node_from_edge_rec(*edge_ptr);
220 }
221 }
222 if (!still_finding_chars) {
223 the_next_node = new_dawg_node();
224 if (debug_level_ > 1) {
225 tprintf("adding node " REFFORMAT "\n", the_next_node);
226 }
227 if (the_next_node == 0) {
228 add_failed = true;
229 break;
230 }
231 if (!add_new_edge(last_node, the_next_node, marker_flag, word_end, unichar_id)) {
232 add_failed = true;
233 break;
234 }
235 word_end = false;
236 last_node = the_next_node;
237 }
238 }
239 the_next_node = 0;
240 unichar_id = word.unichar_id(i);
241 marker_flag = (repetitions != nullptr) ? (*repetitions)[i] : false;
242 if (debug_level_ > 1) {
243 tprintf("Adding letter %d\n", unichar_id);
244 }
245 if (still_finding_chars &&
246 edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false, unichar_id, &edge_ptr, &edge_index)) {
247 // An extension of this word already exists in the trie, so we
248 // only have to add the ending flags in both directions.
249 add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr), marker_flag, unichar_id);
250 } else {
251 // Add a link to node 0. All leaves connect to node 0 so the back links can
252 // be used in reduction to a dawg. This root backward node has one edge
253 // entry for every word, (except prefixes of longer words) so it is huge.
254 if (!add_failed && !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id)) {
255 add_failed = true;
256 }
257 }
258 if (add_failed) {
259 tprintf("Re-initializing document dictionary...\n");
260 clear();
261 return false;
262 } else {
263 return true;
264 }
265}
int64_t NODE_REF
Definition: dawg.h:50
int UNICHAR_ID
Definition: unichar.h:34
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:210
int unicharset_size_
Definition: dawg.h:313
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:237
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:369
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:348
void clear()
Definition: trie.cpp:50
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
Definition: trie.cpp:144

◆ can_be_eliminated()

bool tesseract::Trie::can_be_eliminated ( const EDGE_RECORD edge_rec)
inlineprotected

Definition at line 319 of file trie.h.

319 {
320 NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
321 return (node_ref != NO_EDGE && nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
322 }

◆ character_class_to_pattern()

UNICHAR_ID tesseract::Trie::character_class_to_pattern ( char  ch)
protected

Definition at line 372 of file trie.cpp.

372 {
373 if (ch == 'c') {
374 return alpha_pattern_;
375 } else if (ch == 'd') {
376 return digit_pattern_;
377 } else if (ch == 'n') {
378 return alphanum_pattern_;
379 } else if (ch == 'p') {
380 return punc_pattern_;
381 } else if (ch == 'a') {
382 return lower_pattern_;
383 } else if (ch == 'A') {
384 return upper_pattern_;
385 } else {
386 return INVALID_UNICHAR_ID;
387 }
388}
UNICHAR_ID alpha_pattern_
Definition: trie.h:410
UNICHAR_ID digit_pattern_
Definition: trie.h:411
UNICHAR_ID lower_pattern_
Definition: trie.h:414
UNICHAR_ID upper_pattern_
Definition: trie.h:415
UNICHAR_ID punc_pattern_
Definition: trie.h:413
UNICHAR_ID alphanum_pattern_
Definition: trie.h:412

◆ clear()

void tesseract::Trie::clear ( )

Definition at line 50 of file trie.cpp.

50 {
51 for (auto node : nodes_) {
52 delete node;
53 }
54 nodes_.clear();
55 root_back_freelist_.clear();
56 num_edges_ = 0;
57 new_dawg_node(); // Need to allocate node 0.
58}

◆ DeadEdge()

bool tesseract::Trie::DeadEdge ( const EDGE_RECORD edge_rec) const
inline

Definition at line 154 of file trie.h.

154 {
155 return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
156 }
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:227

◆ deref_edge_ref()

EDGE_RECORD * tesseract::Trie::deref_edge_ref ( EDGE_REF  edge_ref) const
inlineprotected

Definition at line 284 of file trie.h.

284 {
285 int edge_index = static_cast<int>((edge_ref & letter_mask_) >> LETTER_START_BIT);
286 int node_index = static_cast<int>((edge_ref & deref_node_index_mask_) >> flag_start_bit_);
287 TRIE_NODE_RECORD *node_rec = nodes_[node_index];
288 return &(node_rec->forward_edges[edge_index]);
289 }
#define LETTER_START_BIT
Definition: dawg.h:83
uint64_t letter_mask_
Definition: dawg.h:312

◆ edge_char_of() [1/2]

bool tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
NODE_REF  next_node,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id,
EDGE_RECORD **  edge_ptr,
EDGE_INDEX edge_index 
) const
protected

Definition at line 60 of file trie.cpp.

62 {
63 if (debug_level_ == 3) {
64 tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
65 " direction %d word_end %d unichar_id %d, exploring node:\n",
66 node_ref, next_node, direction, word_end, unichar_id);
67 if (node_ref != NO_EDGE) {
68 print_node(node_ref, nodes_[node_ref]->forward_edges.size());
69 }
70 }
71 if (node_ref == NO_EDGE) {
72 return false;
73 }
74 assert(static_cast<size_t>(node_ref) < nodes_.size());
75 EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ? nodes_[node_ref]->forward_edges
76 : nodes_[node_ref]->backward_edges;
77 int vec_size = vec.size();
78 if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search
79 EDGE_INDEX start = 0;
80 EDGE_INDEX end = vec_size - 1;
81 EDGE_INDEX k;
82 int compare;
83 while (start <= end) {
84 k = (start + end) >> 1; // (start + end) / 2
85 compare = given_greater_than_edge_rec(next_node, word_end, unichar_id, vec[k]);
86 if (compare == 0) { // given == vec[k]
87 *edge_ptr = &(vec[k]);
88 *edge_index = k;
89 return true;
90 } else if (compare == 1) { // given > vec[k]
91 start = k + 1;
92 } else { // given < vec[k]
93 end = k - 1;
94 }
95 }
96 } else { // linear search
97 for (int i = 0; i < vec_size; ++i) {
98 EDGE_RECORD &edge_rec = vec[i];
99 if (edge_rec_match(next_node, word_end, unichar_id, next_node_from_edge_rec(edge_rec),
101 *edge_ptr = &(edge_rec);
102 *edge_index = i;
103 return true;
104 }
105 }
106 }
107 return false; // not found
108}
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
Definition: dawg.h:275
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:223
NODE_REF next_node(EDGE_REF edge_ref) const override
Definition: trie.h:123
void print_node(NODE_REF node, int max_num_edges) const override
Definition: trie.cpp:702

◆ edge_char_of() [2/2]

EDGE_REF tesseract::Trie::edge_char_of ( NODE_REF  node_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inlineoverridevirtual

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

Definition at line 95 of file trie.h.

95 {
96 EDGE_RECORD *edge_ptr;
97 EDGE_INDEX edge_index;
98 if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr,
99 &edge_index)) {
100 return NO_EDGE;
101 }
102 return make_edge_ref(node_ref, edge_index);
103 }
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
Definition: trie.h:291

◆ edge_letter()

UNICHAR_ID tesseract::Trie::edge_letter ( EDGE_REF  edge_ref) const
inlineoverridevirtual

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 142 of file trie.h.

142 {
143 if (edge_ref == NO_EDGE || num_edges_ == 0) {
144 return INVALID_UNICHAR_ID;
145 }
146 return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
147 }
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:284

◆ eliminate_redundant_edges()

bool tesseract::Trie::eliminate_redundant_edges ( NODE_REF  node,
const EDGE_RECORD edge1,
const EDGE_RECORD edge2 
)
protected

Definition at line 553 of file trie.cpp.

554 {
555 if (debug_level_ > 1) {
556 tprintf("\nCollapsing node %" PRIi64 ":\n", node);
558 tprintf("Candidate edges: ");
559 print_edge_rec(edge1);
560 tprintf(", ");
561 print_edge_rec(edge2);
562 tprintf("\n\n");
563 }
564 NODE_REF next_node1 = next_node_from_edge_rec(edge1);
565 NODE_REF next_node2 = next_node_from_edge_rec(edge2);
566 TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
567 // Translate all edges going to/from next_node2 to go to/from next_node1.
568 EDGE_RECORD *edge_ptr = nullptr;
569 EDGE_INDEX edge_index;
570 // The backward link in node to next_node2 will be zeroed out by the caller.
571 // Copy all the backward links in next_node2 to node next_node1
572 for (unsigned i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
573 const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
574 NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
575 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
576 int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
577 bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
578 add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE, curr_word_end,
579 curr_unichar_id);
580 // Relocate the corresponding forward edge in curr_next_node
581 ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE, curr_word_end,
582 curr_unichar_id, &edge_ptr, &edge_index));
583 set_next_node_in_edge_rec(edge_ptr, next_node1);
584 }
585 int next_node2_num_edges =
586 (next_node2_ptr->forward_edges.size() + next_node2_ptr->backward_edges.size());
587 if (debug_level_ > 1) {
588 tprintf("removed %d edges from node " REFFORMAT "\n", next_node2_num_edges, next_node2);
589 }
590 next_node2_ptr->forward_edges.clear();
591 next_node2_ptr->backward_edges.clear();
592 num_edges_ -= next_node2_num_edges;
593 return true;
594}
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:79
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
Definition: dawg.h:232
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:214

◆ end_of_word()

bool tesseract::Trie::end_of_word ( EDGE_REF  edge_ref) const
inlineoverridevirtual

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

Definition at line 134 of file trie.h.

134 {
135 if (edge_ref == NO_EDGE || num_edges_ == 0) {
136 return false;
137 }
138 return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
139 }

◆ get_reverse_policy_name()

const char * tesseract::Trie::get_reverse_policy_name ( RTLReversePolicy  reverse_policy)
static

Definition at line 45 of file trie.cpp.

45 {
46 return RTLReversePolicyNames[reverse_policy];
47}
const char *const RTLReversePolicyNames[]
Definition: trie.cpp:36

◆ initialize_patterns()

void tesseract::Trie::initialize_patterns ( UNICHARSET unicharset)

Definition at line 332 of file trie.cpp.

332 {
333 unicharset->unichar_insert(kAlphaPatternUnicode);
334 alpha_pattern_ = unicharset->unichar_to_id(kAlphaPatternUnicode);
335 unicharset->unichar_insert(kDigitPatternUnicode);
336 digit_pattern_ = unicharset->unichar_to_id(kDigitPatternUnicode);
337 unicharset->unichar_insert(kAlphanumPatternUnicode);
338 alphanum_pattern_ = unicharset->unichar_to_id(kAlphanumPatternUnicode);
339 unicharset->unichar_insert(kPuncPatternUnicode);
340 punc_pattern_ = unicharset->unichar_to_id(kPuncPatternUnicode);
341 unicharset->unichar_insert(kLowerPatternUnicode);
342 lower_pattern_ = unicharset->unichar_to_id(kLowerPatternUnicode);
343 unicharset->unichar_insert(kUpperPatternUnicode);
344 upper_pattern_ = unicharset->unichar_to_id(kUpperPatternUnicode);
346 unicharset_size_ = unicharset->size();
347}
bool initialized_patterns_
Definition: trie.h:416
static const char kLowerPatternUnicode[]
Definition: trie.h:70
static const char kAlphanumPatternUnicode[]
Definition: trie.h:68
static const char kUpperPatternUnicode[]
Definition: trie.h:71
static const char kDigitPatternUnicode[]
Definition: trie.h:67
static const char kPuncPatternUnicode[]
Definition: trie.h:69
static const char kAlphaPatternUnicode[]
Definition: trie.h:66

◆ KillEdge()

void tesseract::Trie::KillEdge ( EDGE_RECORD edge_rec) const
inline

Definition at line 150 of file trie.h.

150 {
151 *edge_rec &= ~letter_mask_;
152 *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
153 }

◆ link_edge()

void tesseract::Trie::link_edge ( EDGE_RECORD edge,
NODE_REF  nxt,
bool  repeats,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Sets up this edge record to the requested values.

Definition at line 295 of file trie.h.

296 {
297 EDGE_RECORD flags = 0;
298 if (repeats) {
299 flags |= MARKER_FLAG;
300 }
301 if (word_end) {
302 flags |= WERD_END_FLAG;
303 }
304 if (direction == BACKWARD_EDGE) {
305 flags |= DIRECTION_FLAG;
306 }
307 *edge = ((nxt << next_node_start_bit_) | (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
308 (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
309 }
#define DIRECTION_FLAG
Definition: dawg.h:81

◆ make_edge_ref()

EDGE_REF tesseract::Trie::make_edge_ref ( NODE_REF  node_index,
EDGE_INDEX  edge_index 
) const
inlineprotected

Constructs EDGE_REF from the given node_index and edge_index.

Definition at line 291 of file trie.h.

291 {
292 return ((node_index << flag_start_bit_) | (edge_index << LETTER_START_BIT));
293 }

◆ new_dawg_node()

NODE_REF tesseract::Trie::new_dawg_node ( )
protected

Definition at line 267 of file trie.cpp.

267 {
268 auto *node = new TRIE_NODE_RECORD();
269 nodes_.push_back(node);
270 return nodes_.size() - 1;
271}

◆ next_node()

NODE_REF tesseract::Trie::next_node ( EDGE_REF  edge_ref) const
inlineoverridevirtual

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 123 of file trie.h.

123 {
124 if (edge_ref == NO_EDGE || num_edges_ == 0) {
125 return NO_EDGE;
126 }
127 return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
128 }

◆ pattern_loop_edge()

EDGE_REF tesseract::Trie::pattern_loop_edge ( EDGE_REF  edge_ref,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
inlineoverridevirtual

Returns the given EDGE_REF if the EDGE_RECORD that it points to has a self loop and the given unichar_id matches the unichar_id stored in the EDGE_RECORD, returns NO_EDGE otherwise.

Reimplemented from tesseract::Dawg.

Definition at line 239 of file trie.h.

240 {
241 if (edge_ref == NO_EDGE) {
242 return NO_EDGE;
243 }
244 EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
245 return (marker_flag_from_edge_rec(*edge_rec) &&
246 unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
247 word_end == end_of_word_from_edge_rec(*edge_rec))
248 ? edge_ref
249 : NO_EDGE;
250 }

◆ print_all()

void tesseract::Trie::print_all ( const char *  msg,
int  max_num_edges 
)
inlineprotected

Definition at line 326 of file trie.h.

326 {
327 tprintf("\n__________________________\n%s\n", msg);
328 for (size_t i = 0; i < nodes_.size(); ++i) {
329 print_node(i, max_num_edges);
330 }
331 tprintf("__________________________\n");
332 }

◆ print_edge_rec()

void tesseract::Trie::print_edge_rec ( const EDGE_RECORD edge_rec) const
inlineprotected

Prints the given EDGE_RECORD.

Definition at line 311 of file trie.h.

311 {
312 tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
313 marker_flag_from_edge_rec(edge_rec) ? "R," : "",
314 (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
315 end_of_word_from_edge_rec(edge_rec) ? ",E" : "", unichar_id_from_edge_rec(edge_rec));
316 }
int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the direction flag of this edge.
Definition: dawg.h:218

◆ print_node()

void tesseract::Trie::print_node ( NODE_REF  node,
int  max_num_edges 
) const
overridevirtual

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

Definition at line 702 of file trie.cpp.

702 {
703 if (node == NO_EDGE) {
704 return; // nothing to print
705 }
706 TRIE_NODE_RECORD *node_ptr = nodes_[node];
707 int num_fwd = node_ptr->forward_edges.size();
708 int num_bkw = node_ptr->backward_edges.size();
709 EDGE_VECTOR *vec;
710 for (int dir = 0; dir < 2; ++dir) {
711 if (dir == 0) {
712 vec = &(node_ptr->forward_edges);
713 tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
714 } else {
715 vec = &(node_ptr->backward_edges);
716 tprintf("\t");
717 }
718 int i;
719 for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) && i < max_num_edges; ++i) {
720 if (DeadEdge((*vec)[i])) {
721 continue;
722 }
723 print_edge_rec((*vec)[i]);
724 tprintf(" ");
725 }
726 if (dir == 0 ? i < num_fwd : i < num_bkw) {
727 tprintf("...");
728 }
729 tprintf("\n");
730 }
731}
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:154

◆ read_and_add_word_list()

bool tesseract::Trie::read_and_add_word_list ( const char *  filename,
const UNICHARSET unicharset,
Trie::RTLReversePolicy  reverse 
)

Definition at line 273 of file trie.cpp.

274 {
275 std::vector<std::string> word_list;
276 if (!read_word_list(filename, &word_list)) {
277 return false;
278 }
279 std::sort(word_list.begin(), word_list.end(),
280 [](auto &s1, auto &s2) { return s1.size() > s2.size(); });
281 return add_word_list(word_list, unicharset, reverse_policy);
282}
bool add_word_list(const std::vector< std::string > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
Definition: trie.cpp:310
bool read_word_list(const char *filename, std::vector< std::string > *words)
Definition: trie.cpp:284

◆ read_pattern_list()

bool tesseract::Trie::read_pattern_list ( const char *  filename,
const UNICHARSET unicharset 
)

Definition at line 390 of file trie.cpp.

390 {
392 tprintf("please call initialize_patterns() before read_pattern_list()\n");
393 return false;
394 }
395
396 FILE *pattern_file = fopen(filename, "rb");
397 if (pattern_file == nullptr) {
398 tprintf("Error opening pattern file %s\n", filename);
399 return false;
400 }
401
402 int pattern_count = 0;
403 char string[CHARS_PER_LINE];
404 while (fgets(string, CHARS_PER_LINE, pattern_file) != nullptr) {
405 chomp_string(string); // remove newline
406 // Parse the pattern and construct a unichar id vector.
407 // Record the number of repetitions of each unichar in the parallel vector.
408 WERD_CHOICE word(&unicharset);
409 std::vector<bool> repetitions_vec;
410 const char *str_ptr = string;
411 int step = unicharset.step(str_ptr);
412 bool failed = false;
413 while (step > 0) {
414 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
415 if (step == 1 && *str_ptr == '\\') {
416 ++str_ptr;
417 if (*str_ptr == '\\') { // regular '\' unichar that was escaped
418 curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
419 } else {
420#if 0 // TODO: This code should be enabled if kSaneNumConcreteChars != 0.
421 if (word.length() < kSaneNumConcreteChars) {
422 tprintf(
423 "Please provide at least %d concrete characters at the"
424 " beginning of the pattern\n",
426 failed = true;
427 break;
428 }
429#endif
430 // Parse character class from expression.
431 curr_unichar_id = character_class_to_pattern(*str_ptr);
432 }
433 } else {
434 curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
435 }
436 if (curr_unichar_id == INVALID_UNICHAR_ID) {
437 failed = true;
438 break; // failed to parse this pattern
439 }
440 word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
441 repetitions_vec.push_back(false);
442 str_ptr += step;
443 step = unicharset.step(str_ptr);
444 // Check if there is a repetition pattern specified after this unichar.
445 if (step == 1 && *str_ptr == '\\' && *(str_ptr + 1) == '*') {
446 repetitions_vec[repetitions_vec.size() - 1] = true;
447 str_ptr += 2;
448 step = unicharset.step(str_ptr);
449 }
450 }
451 if (failed) {
452 tprintf("Invalid user pattern %s\n", string);
453 continue;
454 }
455 // Insert the pattern into the trie.
456 if (debug_level_ > 2) {
457 tprintf("Inserting expanded user pattern %s\n", word.debug_string().c_str());
458 }
459 if (!this->word_in_dawg(word)) {
460 this->add_word_to_dawg(word, &repetitions_vec);
461 if (!this->word_in_dawg(word)) {
462 tprintf("Error: failed to insert pattern '%s'\n", string);
463 }
464 }
465 ++pattern_count;
466 }
467 if (debug_level_) {
468 tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
469 }
470 fclose(pattern_file);
471 return true;
472}
#define CHARS_PER_LINE
Definition: dict.h:44
void chomp_string(char *str)
Definition: helpers.h:91
UNICHAR_ID character_class_to_pattern(char ch)
Definition: trie.cpp:372
static const int kSaneNumConcreteChars
Definition: trie.h:62

◆ read_word_list()

bool tesseract::Trie::read_word_list ( const char *  filename,
std::vector< std::string > *  words 
)

Definition at line 284 of file trie.cpp.

284 {
285 FILE *word_file;
286 char line_str[CHARS_PER_LINE];
287 int word_count = 0;
288
289 word_file = fopen(filename, "rb");
290 if (word_file == nullptr) {
291 return false;
292 }
293
294 while (fgets(line_str, sizeof(line_str), word_file) != nullptr) {
295 chomp_string(line_str); // remove newline
296 std::string word_str(line_str);
297 ++word_count;
298 if (debug_level_ && word_count % 10000 == 0) {
299 tprintf("Read %d words so far\n", word_count);
300 }
301 words->push_back(word_str);
302 }
303 if (debug_level_) {
304 tprintf("Read %d words total.\n", word_count);
305 }
306 fclose(word_file);
307 return true;
308}

◆ reduce_lettered_edges()

bool tesseract::Trie::reduce_lettered_edges ( EDGE_INDEX  edge_index,
UNICHAR_ID  unichar_id,
NODE_REF  node,
EDGE_VECTOR backward_edges,
std::vector< bool > &  reduced_nodes 
)
protected

Definition at line 596 of file trie.cpp.

597 {
598 if (debug_level_ > 1) {
599 tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
600 }
601 // Compare each of the edge pairs with the given unichar_id.
602 bool did_something = false;
603 for (unsigned i = edge_index; i < backward_edges->size() - 1; ++i) {
604 // Find the first edge that can be eliminated.
605 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
606 while (i < backward_edges->size()) {
607 if (!DeadEdge((*backward_edges)[i])) {
608 curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
609 if (curr_unichar_id != unichar_id) {
610 return did_something;
611 }
612 if (can_be_eliminated((*backward_edges)[i])) {
613 break;
614 }
615 }
616 ++i;
617 }
618 if (i == backward_edges->size()) {
619 break;
620 }
621 const EDGE_RECORD &edge_rec = (*backward_edges)[i];
622 // Compare it to the rest of the edges with the given unichar_id.
623 for (auto j = i + 1; j < backward_edges->size(); ++j) {
624 const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
625 if (DeadEdge(next_edge_rec)) {
626 continue;
627 }
628 UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
629 if (next_id != unichar_id) {
630 break;
631 }
632 if (end_of_word_from_edge_rec(next_edge_rec) == end_of_word_from_edge_rec(edge_rec) &&
633 can_be_eliminated(next_edge_rec) &&
634 eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
635 reduced_nodes[next_node_from_edge_rec(edge_rec)] = false;
636 did_something = true;
637 KillEdge(&(*backward_edges)[j]);
638 }
639 }
640 }
641 return did_something;
642}
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:150
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
Definition: trie.cpp:553
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:319

◆ reduce_node_input()

void tesseract::Trie::reduce_node_input ( NODE_REF  node,
std::vector< bool > &  reduced_nodes 
)
protected

Eliminates any redundant edges from this node in the Trie.

Definition at line 660 of file trie.cpp.

660 {
661 EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
662 sort_edges(&backward_edges);
663 if (debug_level_ > 1) {
664 tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
666 }
667
668 EDGE_INDEX edge_index = 0;
669 while (static_cast<size_t>(edge_index) < backward_edges.size()) {
670 if (DeadEdge(backward_edges[edge_index])) {
671 continue;
672 }
673 UNICHAR_ID unichar_id = unichar_id_from_edge_rec(backward_edges[edge_index]);
674 while (reduce_lettered_edges(edge_index, unichar_id, node, &backward_edges, reduced_nodes)) {
675 ;
676 }
677 while (static_cast<size_t>(++edge_index) < backward_edges.size()) {
678 UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
679 if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) {
680 break;
681 }
682 }
683 }
684 reduced_nodes[node] = true; // mark as reduced
685
686 if (debug_level_ > 1) {
687 tprintf("Node " REFFORMAT " after reduction:\n", node);
689 }
690
691 for (auto &backward_edge : backward_edges) {
692 if (DeadEdge(backward_edge)) {
693 continue;
694 }
696 if (next_node != 0 && !reduced_nodes[next_node]) {
697 reduce_node_input(next_node, reduced_nodes);
698 }
699 }
700}
void reduce_node_input(NODE_REF node, std::vector< bool > &reduced_nodes)
Definition: trie.cpp:660
void sort_edges(EDGE_VECTOR *edges)
Definition: trie.cpp:644
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, std::vector< bool > &reduced_nodes)
Definition: trie.cpp:596

◆ remove_edge()

void tesseract::Trie::remove_edge ( NODE_REF  node1,
NODE_REF  node2,
bool  word_end,
UNICHAR_ID  unichar_id 
)
inlineprotected

Definition at line 369 of file trie.h.

369 {
370 remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
371 remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
372 }
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:474

◆ remove_edge_linkage()

void tesseract::Trie::remove_edge_linkage ( NODE_REF  node1,
NODE_REF  node2,
int  direction,
bool  word_end,
UNICHAR_ID  unichar_id 
)
protected

Definition at line 474 of file trie.cpp.

475 {
476 EDGE_RECORD *edge_ptr = nullptr;
477 EDGE_INDEX edge_index = 0;
478 ASSERT_HOST(edge_char_of(node1, node2, direction, word_end, unichar_id, &edge_ptr, &edge_index));
479 if (debug_level_ > 1) {
480 tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
481 print_edge_rec(*edge_ptr);
482 tprintf("\n");
483 }
484 if (direction == FORWARD_EDGE) {
485 nodes_[node1]->forward_edges.erase(nodes_[node1]->forward_edges.begin() + edge_index);
486 } else if (node1 == 0) {
487 KillEdge(&nodes_[node1]->backward_edges[edge_index]);
488 root_back_freelist_.push_back(edge_index);
489 } else {
490 nodes_[node1]->backward_edges.erase(nodes_[node1]->backward_edges.begin() + edge_index);
491 }
492 --num_edges_;
493}

◆ sort_edges()

void tesseract::Trie::sort_edges ( EDGE_VECTOR edges)
protected

Order num_edges of consecutive EDGE_RECORDS in the given EDGE_VECTOR in increasing order of unichar ids. This function is normally called for all edges in a single node, and since number of edges in each node is usually quite small, selection sort is used.

Definition at line 644 of file trie.cpp.

644 {
645 int num_edges = edges->size();
646 if (num_edges <= 1) {
647 return;
648 }
649 std::vector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
650 sort_vec.reserve(num_edges);
651 for (int i = 0; i < num_edges; ++i) {
652 sort_vec.emplace_back(unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]);
653 }
654 std::sort(sort_vec.begin(), sort_vec.end());
655 for (int i = 0; i < num_edges; ++i) {
656 (*edges)[i] = sort_vec[i].data();
657 }
658}

◆ trie_to_dawg()

SquishedDawg * tesseract::Trie::trie_to_dawg ( )

Definition at line 508 of file trie.cpp.

508 {
509 root_back_freelist_.clear(); // Will be invalided by trie_to_dawg.
510 if (debug_level_ > 2) {
511 print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
512 }
513 std::vector<bool> reduced_nodes(nodes_.size());
514 this->reduce_node_input(0, reduced_nodes);
515
516 if (debug_level_ > 2) {
517 print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
518 }
519 // Build a translation map from node indices in nodes_ vector to
520 // their target indices in EDGE_ARRAY.
521 std::vector<NODE_REF> node_ref_map(nodes_.size() + 1);
522 unsigned i;
523 for (i = 0; i < nodes_.size(); ++i) {
524 node_ref_map[i + 1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
525 }
526 int num_forward_edges = node_ref_map[i];
527
528 // Convert nodes_ vector into EDGE_ARRAY translating the next node references
529 // in edges using node_ref_map. Empty nodes and backward edges are dropped.
530 auto edge_array = new EDGE_RECORD[num_forward_edges];
531 EDGE_ARRAY edge_array_ptr = edge_array;
532 for (i = 0; i < nodes_.size(); ++i) {
533 TRIE_NODE_RECORD *node_ptr = nodes_[i];
534 int end = node_ptr->forward_edges.size();
535 for (int j = 0; j < end; ++j) {
536 EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
537 NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
538 ASSERT_HOST(static_cast<size_t>(node_ref) < nodes_.size());
539 UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
540 link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
541 end_of_word_from_edge_rec(edge_rec), unichar_id);
542 if (j == end - 1) {
543 set_marker_flag_in_edge_rec(edge_array_ptr);
544 }
545 ++edge_array_ptr;
546 }
547 }
548
549 return new SquishedDawg(edge_array, num_forward_edges, type_, lang_, perm_, unicharset_size_,
551}
EDGE_RECORD * EDGE_ARRAY
Definition: dawg.h:48
std::string lang_
Definition: dawg.h:302
DawgType type_
Definition: dawg.h:303
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:305
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:326

◆ unichar_id_to_patterns()

void tesseract::Trie::unichar_id_to_patterns ( UNICHAR_ID  unichar_id,
const UNICHARSET unicharset,
std::vector< UNICHAR_ID > *  vec 
) const
overridevirtual

Fills vec with unichar ids that represent the character classes of the given unichar_id.

Reimplemented from tesseract::Dawg.

Definition at line 349 of file trie.cpp.

350 {
351 bool is_alpha = unicharset.get_isalpha(unichar_id);
352 if (is_alpha) {
353 vec->push_back(alpha_pattern_);
354 vec->push_back(alphanum_pattern_);
355 if (unicharset.get_islower(unichar_id)) {
356 vec->push_back(lower_pattern_);
357 } else if (unicharset.get_isupper(unichar_id)) {
358 vec->push_back(upper_pattern_);
359 }
360 }
361 if (unicharset.get_isdigit(unichar_id)) {
362 vec->push_back(digit_pattern_);
363 if (!is_alpha) {
364 vec->push_back(alphanum_pattern_);
365 }
366 }
367 if (unicharset.get_ispunctuation(unichar_id)) {
368 vec->push_back(punc_pattern_);
369 }
370}

◆ unichar_ids_of()

void tesseract::Trie::unichar_ids_of ( NODE_REF  node,
NodeChildVector vec,
bool  word_end 
) const
inlineoverridevirtual

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.

Definition at line 109 of file trie.h.

109 {
110 const EDGE_VECTOR &forward_edges = nodes_[static_cast<int>(node)]->forward_edges;
111 for (auto &edge : forward_edges) {
112 if (!word_end || end_of_word_from_edge_rec(edge)) {
113 vec->push_back(
114 NodeChild(unichar_id_from_edge_rec(edge), make_edge_ref(node, &edge - &forward_edges[0])));
115 }
116 }
117 }

Member Data Documentation

◆ alpha_pattern_

UNICHAR_ID tesseract::Trie::alpha_pattern_ = 0
protected

Definition at line 410 of file trie.h.

◆ alphanum_pattern_

UNICHAR_ID tesseract::Trie::alphanum_pattern_ = 0
protected

Definition at line 412 of file trie.h.

◆ deref_direction_mask_

uint64_t tesseract::Trie::deref_direction_mask_ = 0
protected

Definition at line 406 of file trie.h.

◆ deref_node_index_mask_

uint64_t tesseract::Trie::deref_node_index_mask_ = 0
protected

Definition at line 407 of file trie.h.

◆ digit_pattern_

UNICHAR_ID tesseract::Trie::digit_pattern_ = 0
protected

Definition at line 411 of file trie.h.

◆ initialized_patterns_

bool tesseract::Trie::initialized_patterns_ = false
protected

Definition at line 416 of file trie.h.

◆ kAlphanumPatternUnicode

const char tesseract::Trie::kAlphanumPatternUnicode = "\u2002"
static

Definition at line 68 of file trie.h.

◆ kAlphaPatternUnicode

const char tesseract::Trie::kAlphaPatternUnicode = "\u2000"
static

Definition at line 66 of file trie.h.

◆ kDigitPatternUnicode

const char tesseract::Trie::kDigitPatternUnicode = "\u2001"
static

Definition at line 67 of file trie.h.

◆ kLowerPatternUnicode

const char tesseract::Trie::kLowerPatternUnicode = "\u2004"
static

Definition at line 70 of file trie.h.

◆ kPuncPatternUnicode

const char tesseract::Trie::kPuncPatternUnicode = "\u2003"
static

Definition at line 69 of file trie.h.

◆ kSaneNumConcreteChars

const int tesseract::Trie::kSaneNumConcreteChars = 0
static

Definition at line 62 of file trie.h.

◆ kUpperPatternUnicode

const char tesseract::Trie::kUpperPatternUnicode = "\u2005"
static

Definition at line 71 of file trie.h.

◆ lower_pattern_

UNICHAR_ID tesseract::Trie::lower_pattern_ = 0
protected

Definition at line 414 of file trie.h.

◆ nodes_

TRIE_NODES tesseract::Trie::nodes_
protected

Definition at line 402 of file trie.h.

◆ num_edges_

uint64_t tesseract::Trie::num_edges_ = 0
protected

Definition at line 405 of file trie.h.

◆ punc_pattern_

UNICHAR_ID tesseract::Trie::punc_pattern_ = 0
protected

Definition at line 413 of file trie.h.

◆ root_back_freelist_

std::vector<EDGE_INDEX> tesseract::Trie::root_back_freelist_
protected

Definition at line 404 of file trie.h.

◆ upper_pattern_

UNICHAR_ID tesseract::Trie::upper_pattern_ = 0
protected

Definition at line 415 of file trie.h.


The documentation for this class was generated from the following files: