tesseract v5.3.3.20231005
trie.h
Go to the documentation of this file.
1/******************************************************************************
2 *
3 * File: trie.h
4 * Description: Functions to build a trie data structure.
5 * Author: Mark Seaman, SW Productivity
6 *
7 * (c) Copyright 1987, Hewlett-Packard Company.
8 ** Licensed under the Apache License, Version 2.0 (the "License");
9 ** you may not use this file except in compliance with the License.
10 ** You may obtain a copy of the License at
11 ** http://www.apache.org/licenses/LICENSE-2.0
12 ** Unless required by applicable law or agreed to in writing, software
13 ** distributed under the License is distributed on an "AS IS" BASIS,
14 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 ** See the License for the specific language governing permissions and
16 ** limitations under the License.
17 *
18 *****************************************************************************/
19#ifndef TRIE_H
20#define TRIE_H
21
22#include "dawg.h"
23
24namespace tesseract {
25
26class UNICHARSET;
27
28// Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
29// max int32, we will need to change vector to use int64 for size
30// and address indices. This does not seem to be needed immediately,
31// since currently the largest number of edges limit used by tesseract
32// (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
33// There are also int casts below to satisfy the WIN32 compiler that would
34// need to be changed.
35// It might be cleanest to change the types of most of the Trie/Dawg related
36// typedefs to int and restrict the casts to extracting these values from
37// the 64 bit EDGE_RECORD.
38using EDGE_INDEX = int64_t; // index of an edge in a given node
39using EDGE_VECTOR = std::vector<EDGE_RECORD>;
40
44};
45using TRIE_NODES = std::vector<TRIE_NODE_RECORD *>;
46
53class TESS_API Trie : public Dawg {
54public:
59 };
60
61 // Minimum number of concrete characters at the beginning of user patterns.
62 static const int kSaneNumConcreteChars = 0;
63 // Various unicode whitespace characters are used to denote unichar patterns,
64 // (character classifier would never produce these whitespace characters as a
65 // valid classification).
66 static const char kAlphaPatternUnicode[];
67 static const char kDigitPatternUnicode[];
68 static const char kAlphanumPatternUnicode[];
69 static const char kPuncPatternUnicode[];
70 static const char kLowerPatternUnicode[];
71 static const char kUpperPatternUnicode[];
72
73 static const char *get_reverse_policy_name(RTLReversePolicy reverse_policy);
74
75 // max_num_edges argument allows limiting the amount of memory this
76 // Trie can consume (if a new word insert would cause the Trie to
77 // contain more edges than max_num_edges, all the edges are cleared
78 // so that new inserts can proceed).
79 Trie(DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)
80 : Dawg(type, lang, perm, debug_level) {
81 init(unicharset_size);
82 deref_node_index_mask_ = ~letter_mask_;
83 new_dawg_node(); // need to allocate node 0
84 }
85 ~Trie() override {
86 for (auto node : nodes_) {
87 delete node;
88 }
89 }
90
91 // Reset the Trie to empty.
92 void clear();
93
95 EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override {
96 EDGE_RECORD *edge_ptr;
97 EDGE_INDEX edge_index;
98 if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id, &edge_ptr,
99 &edge_index)) {
100 return NO_EDGE;
101 }
102 return make_edge_ref(node_ref, edge_index);
103 }
104
109 void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override {
110 const EDGE_VECTOR &forward_edges = nodes_[static_cast<int>(node)]->forward_edges;
111 for (auto &edge : forward_edges) {
112 if (!word_end || end_of_word_from_edge_rec(edge)) {
113 vec->push_back(
114 NodeChild(unichar_id_from_edge_rec(edge), make_edge_ref(node, &edge - &forward_edges[0])));
115 }
116 }
117 }
118
123 NODE_REF next_node(EDGE_REF edge_ref) const override {
124 if (edge_ref == NO_EDGE || num_edges_ == 0) {
125 return NO_EDGE;
126 }
127 return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
128 }
129
134 bool end_of_word(EDGE_REF edge_ref) const override {
135 if (edge_ref == NO_EDGE || num_edges_ == 0) {
136 return false;
137 }
138 return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
139 }
140
142 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override {
143 if (edge_ref == NO_EDGE || num_edges_ == 0) {
144 return INVALID_UNICHAR_ID;
145 }
146 return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
147 }
148 // Sets the UNICHAR_ID in the given edge_rec to unicharset_size_, marking
149 // the edge dead.
150 void KillEdge(EDGE_RECORD *edge_rec) const {
151 *edge_rec &= ~letter_mask_;
152 *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
153 }
154 bool DeadEdge(const EDGE_RECORD &edge_rec) const {
155 return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
156 }
157
158 // Prints the contents of the node indicated by the given NODE_REF.
159 // At most max_num_edges will be printed.
160 void print_node(NODE_REF node, int max_num_edges) const override;
161
162 // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg.
163 // Eliminates redundant edges and returns the pointer to the SquishedDawg.
164 // Note: the caller is responsible for deallocating memory associated
165 // with the returned SquishedDawg pointer.
166 SquishedDawg *trie_to_dawg();
167
168 // Reads a list of words from the given file and adds into the Trie.
169 // Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse
170 // policy and information in the unicharset.
171 // Returns false on error.
172 bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset,
173 Trie::RTLReversePolicy reverse);
174
175 // Reads a list of words from the given file.
176 // Returns false on error.
177 bool read_word_list(const char *filename, std::vector<std::string> *words);
178 // Adds a list of words previously read using read_word_list to the trie
179 // using the given unicharset and reverse_policy to convert to unichar-ids.
180 // Returns false on error.
181 bool add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset,
182 Trie::RTLReversePolicy reverse_policy);
183
184 // Inserts the list of patterns from the given file into the Trie.
185 // The pattern list file should contain one pattern per line in UTF-8 format.
186 //
187 // Each pattern can contain any non-whitespace characters, however only the
188 // patterns that contain characters from the unicharset of the corresponding
189 // language will be useful.
190 // The only meta character is '\'. To be used in a pattern as an ordinary
191 // string it should be escaped with '\' (e.g. string "C:\Documents" should
192 // be written in the patterns file as "C:\\Documents").
193 // This function supports a very limited regular expression syntax. One can
194 // express a character, a certain character class and a number of times the
195 // entity should be repeated in the pattern.
196 //
197 // To denote a character class use one of:
198 // \c - unichar for which UNICHARSET::get_isalpha() is true (character)
199 // \d - unichar for which UNICHARSET::get_isdigit() is true
200 // \n - unichar for which UNICHARSET::get_isdigit() or
201 // UNICHARSET::isalpha() are true
202 // \p - unichar for which UNICHARSET::get_ispunct() is true
203 // \a - unichar for which UNICHARSET::get_islower() is true
204 // \A - unichar for which UNICHARSET::get_isupper() is true
205 //
206 // \* could be specified after each character or pattern to indicate that
207 // the character/pattern can be repeated any number of times before the next
208 // character/pattern occurs.
209 //
210 // Examples:
211 // 1-8\d\d-GOOG-411 will be expanded to strings:
212 // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411.
213 //
214 // http://www.\n\*.com will be expanded to strings like:
215 // http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com
216 //
217 // Note: In choosing which patterns to include please be aware of the fact
218 // providing very generic patterns will make tesseract run slower.
219 // For example \n\* at the beginning of the pattern will make Tesseract
220 // consider all the combinations of proposed character choices for each
221 // of the segmentations, which will be unacceptably slow.
222 // Because of potential problems with speed that could be difficult to
223 // identify, each user pattern has to have at least kSaneNumConcreteChars
224 // concrete characters from the unicharset at the beginning.
225 bool read_pattern_list(const char *filename, const UNICHARSET &unicharset);
226
227 // Initializes the values of *_pattern_ unichar ids.
228 // This function should be called before calling read_pattern_list().
229 void initialize_patterns(UNICHARSET *unicharset);
230
231 // Fills in the given unichar id vector with the unichar ids that represent
232 // the patterns of the character classes of the given unichar_id.
233 void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
234 std::vector<UNICHAR_ID> *vec) const override;
235
236 // Returns the given EDGE_REF if the EDGE_RECORD that it points to has
237 // a self loop and the given unichar_id matches the unichar_id stored in the
238 // EDGE_RECORD, returns NO_EDGE otherwise.
240 bool word_end) const override {
241 if (edge_ref == NO_EDGE) {
242 return NO_EDGE;
243 }
244 EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
245 return (marker_flag_from_edge_rec(*edge_rec) &&
246 unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
247 word_end == end_of_word_from_edge_rec(*edge_rec))
248 ? edge_ref
249 : NO_EDGE;
250 }
251
252 // Adds a word to the Trie (creates the necessary nodes and edges).
253 //
254 // If repetitions vector is not nullptr, each entry in the vector indicates
255 // whether the unichar id with the corresponding index in the word is allowed
256 // to repeat an unlimited number of times. For each entry that is true, MARKER
257 // flag of the corresponding edge created for this unichar id is set to true).
258 //
259 // Return true if add succeeded, false otherwise (e.g. when a word contained
260 // an invalid unichar id or the trie was getting too large and was cleared).
261 bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions);
262 bool add_word_to_dawg(const WERD_CHOICE &word) {
263 return add_word_to_dawg(word, nullptr);
264 }
265
266protected:
267 // The structure of an EDGE_REF for Trie edges is as follows:
268 // [LETTER_START_BIT, flag_start_bit_):
269 // edge index in *_edges in a TRIE_NODE_RECORD
270 // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector)
271 //
272 // With this arrangement there are enough bits to represent edge indices
273 // (each node can have at most unicharset_size_ forward edges and
274 // the position of flag_start_bit is set to be log2(unicharset_size_)).
275 // It is also possible to accommodate a maximum number of nodes that is at
276 // least as large as that of the SquishedDawg representation (in SquishedDawg
277 // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent
278 // the next node index).
279 //
280
281 // Returns the pointer to EDGE_RECORD after decoding the location
282 // of the edge from the information in the given EDGE_REF.
283 // This function assumes that EDGE_REF holds valid node/edge indices.
284 inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {
285 int edge_index = static_cast<int>((edge_ref & letter_mask_) >> LETTER_START_BIT);
286 int node_index = static_cast<int>((edge_ref & deref_node_index_mask_) >> flag_start_bit_);
287 TRIE_NODE_RECORD *node_rec = nodes_[node_index];
288 return &(node_rec->forward_edges[edge_index]);
289 }
291 inline EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const {
292 return ((node_index << flag_start_bit_) | (edge_index << LETTER_START_BIT));
293 }
295 inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end,
296 UNICHAR_ID unichar_id) {
297 EDGE_RECORD flags = 0;
298 if (repeats) {
299 flags |= MARKER_FLAG;
300 }
301 if (word_end) {
302 flags |= WERD_END_FLAG;
303 }
304 if (direction == BACKWARD_EDGE) {
305 flags |= DIRECTION_FLAG;
306 }
307 *edge = ((nxt << next_node_start_bit_) | (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
308 (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
309 }
311 inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {
312 tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
313 marker_flag_from_edge_rec(edge_rec) ? "R," : "",
314 (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
315 end_of_word_from_edge_rec(edge_rec) ? ",E" : "", unichar_id_from_edge_rec(edge_rec));
316 }
317 // Returns true if the next node in recorded the given EDGE_RECORD
318 // has exactly one forward edge.
319 inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {
320 NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
321 return (node_ref != NO_EDGE && nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
322 }
323
324 // Prints the contents of the Trie.
325 // At most max_num_edges will be printed for each node.
326 void print_all(const char *msg, int max_num_edges) {
327 tprintf("\n__________________________\n%s\n", msg);
328 for (size_t i = 0; i < nodes_.size(); ++i) {
329 print_node(i, max_num_edges);
330 }
331 tprintf("__________________________\n");
332 }
333
334 // Finds the edge with the given direction, word_end and unichar_id
335 // in the node indicated by node_ref. Fills in the pointer to the
336 // EDGE_RECORD and the index of the edge with the values
337 // corresponding to the edge found. Returns true if an edge was found.
338 bool edge_char_of(NODE_REF node_ref, NODE_REF next_node, int direction, bool word_end,
339 UNICHAR_ID unichar_id, EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const;
340
341 // Adds an single edge linkage between node1 and node2 in the direction
342 // indicated by direction argument.
343 bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end,
344 UNICHAR_ID unichar_id);
345
346 // Adds forward edge linkage from node1 to node2 and the corresponding
347 // backward edge linkage in the other direction.
348 bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end,
349 UNICHAR_ID unichar_id) {
350 return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE, word_end, unichar_id) &&
351 add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE, word_end, unichar_id));
352 }
353
354 // Sets the word ending flags in an already existing edge pair.
355 // Returns true on success.
356 void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats,
357 UNICHAR_ID unichar_id);
358
359 // Allocates space for a new node in the Trie.
360 NODE_REF new_dawg_node();
361
362 // Removes a single edge linkage to between node1 and node2 in the
363 // direction indicated by direction argument.
364 void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end,
365 UNICHAR_ID unichar_id);
366
367 // Removes forward edge linkage from node1 to node2 and the corresponding
368 // backward edge linkage in the other direction.
369 void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id) {
370 remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
371 remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
372 }
373
374 // Compares edge1 and edge2 in the given node to see if they point to two
375 // next nodes that could be collapsed. If they do, performs the reduction
376 // and returns true.
377 bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2);
378
379 // Assuming that edge_index indicates the first edge in a group of edges
380 // in this node with a particular letter value, looks through these edges
381 // to see if any of them can be collapsed. If so does it. Returns to the
382 // caller when all edges with this letter have been reduced.
383 // Returns true if further reduction is possible with this same letter.
384 bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node,
385 EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes);
386
393 void sort_edges(EDGE_VECTOR *edges);
394
396 void reduce_node_input(NODE_REF node, std::vector<bool> &reduced_nodes);
397
398 // Returns the pattern unichar id for the given character class code.
399 UNICHAR_ID character_class_to_pattern(char ch);
400
401 // Member variables
402 TRIE_NODES nodes_; // vector of nodes in the Trie
403 // Freelist of edges in the root backwards node that were previously zeroed.
404 std::vector<EDGE_INDEX> root_back_freelist_;
405 uint64_t num_edges_ = 0; // sum of all edges (forward and backward)
406 uint64_t deref_direction_mask_ = 0; // mask for EDGE_REF to extract direction
407 uint64_t deref_node_index_mask_ = 0; // mask for EDGE_REF to extract node index
408 // Variables for translating character class codes denoted in user patterns
409 // file to the unichar ids used to represent them in a Trie.
410 UNICHAR_ID alpha_pattern_ = 0;
411 UNICHAR_ID digit_pattern_ = 0;
412 UNICHAR_ID alphanum_pattern_ = 0;
413 UNICHAR_ID punc_pattern_ = 0;
414 UNICHAR_ID lower_pattern_ = 0;
415 UNICHAR_ID upper_pattern_ = 0;
416 bool initialized_patterns_ = false;
417};
418
419} // namespace tesseract
420
421#endif
#define WERD_END_FLAG
Definition: dawg.h:82
#define BACKWARD_EDGE
Definition: dawg.h:78
#define FORWARD_EDGE
Definition: dawg.h:77
#define MARKER_FLAG
Definition: dawg.h:80
#define REFFORMAT
Definition: dawg.h:85
#define LETTER_START_BIT
Definition: dawg.h:83
#define DIRECTION_FLAG
Definition: dawg.h:81
DawgType
Definition: dawg.h:64
int64_t EDGE_REF
Definition: dawg.h:49
uint64_t EDGE_RECORD
Definition: dawg.h:47
std::vector< EDGE_RECORD > EDGE_VECTOR
Definition: trie.h:39
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int64_t NODE_REF
Definition: dawg.h:50
std::vector< TRIE_NODE_RECORD * > TRIE_NODES
Definition: trie.h:45
int64_t EDGE_INDEX
Definition: trie.h:38
int UNICHAR_ID
Definition: unichar.h:34
std::vector< NodeChild > NodeChildVector
Definition: dawg.h:60
PermuterType
Definition: ratngs.h:235
type
Definition: upload.py:458
EDGE_VECTOR backward_edges
Definition: trie.h:43
EDGE_VECTOR forward_edges
Definition: trie.h:42
~Trie() override
Definition: trie.h:85
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:295
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:150
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override
Definition: trie.h:109
Trie(DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)
Definition: trie.h:79
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:369
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:348
NODE_REF next_node(EDGE_REF edge_ref) const override
Definition: trie.h:123
TRIE_NODES nodes_
Definition: trie.h:402
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:326
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override
Definition: trie.h:95
bool add_word_to_dawg(const WERD_CHOICE &word)
Definition: trie.h:262
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override
Definition: trie.h:142
EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const override
Definition: trie.h:239
RTLReversePolicy
Definition: trie.h:55
@ RRP_DO_NO_REVERSE
Definition: trie.h:56
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:57
@ RRP_FORCE_REVERSE
Definition: trie.h:58
bool end_of_word(EDGE_REF edge_ref) const override
Definition: trie.h:134
std::vector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:404
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:284
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:154
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
Definition: trie.h:291
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:319
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:311
#define TESS_API
Definition: export.h:32