62 static const int kSaneNumConcreteChars = 0;
66 static const char kAlphaPatternUnicode[];
67 static const char kDigitPatternUnicode[];
68 static const char kAlphanumPatternUnicode[];
69 static const char kPuncPatternUnicode[];
70 static const char kLowerPatternUnicode[];
71 static const char kUpperPatternUnicode[];
80 :
Dawg(
type, lang, perm, debug_level) {
81 init(unicharset_size);
82 deref_node_index_mask_ = ~letter_mask_;
86 for (
auto node : nodes_) {
98 if (!edge_char_of(node_ref, NO_EDGE,
FORWARD_EDGE, word_end, unichar_id, &edge_ptr,
102 return make_edge_ref(node_ref, edge_index);
110 const EDGE_VECTOR &forward_edges = nodes_[
static_cast<int>(node)]->forward_edges;
111 for (
auto &edge : forward_edges) {
112 if (!word_end || end_of_word_from_edge_rec(edge)) {
114 NodeChild(unichar_id_from_edge_rec(edge), make_edge_ref(node, &edge - &forward_edges[0])));
124 if (edge_ref == NO_EDGE || num_edges_ == 0) {
127 return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
135 if (edge_ref == NO_EDGE || num_edges_ == 0) {
138 return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
143 if (edge_ref == NO_EDGE || num_edges_ == 0) {
144 return INVALID_UNICHAR_ID;
146 return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
151 *edge_rec &= ~letter_mask_;
155 return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
160 void print_node(
NODE_REF node,
int max_num_edges)
const override;
172 bool read_and_add_word_list(
const char *filename,
const UNICHARSET &unicharset,
177 bool read_word_list(
const char *filename, std::vector<std::string> *words);
181 bool add_word_list(
const std::vector<std::string> &words,
const UNICHARSET &unicharset,
225 bool read_pattern_list(
const char *filename,
const UNICHARSET &unicharset);
229 void initialize_patterns(
UNICHARSET *unicharset);
234 std::vector<UNICHAR_ID> *vec)
const override;
240 bool word_end)
const override {
241 if (edge_ref == NO_EDGE) {
245 return (marker_flag_from_edge_rec(*edge_rec) &&
246 unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
247 word_end == end_of_word_from_edge_rec(*edge_rec))
261 bool add_word_to_dawg(
const WERD_CHOICE &word,
const std::vector<bool> *repetitions);
263 return add_word_to_dawg(word,
nullptr);
285 int edge_index =
static_cast<int>((edge_ref & letter_mask_) >>
LETTER_START_BIT);
286 int node_index =
static_cast<int>((edge_ref & deref_node_index_mask_) >> flag_start_bit_);
292 return ((node_index << flag_start_bit_) | (edge_index <<
LETTER_START_BIT));
307 *edge = ((nxt << next_node_start_bit_) | (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
313 marker_flag_from_edge_rec(edge_rec) ?
"R," :
"",
314 (direction_from_edge_rec(edge_rec) ==
FORWARD_EDGE) ?
"F" :
"B",
315 end_of_word_from_edge_rec(edge_rec) ?
",E" :
"", unichar_id_from_edge_rec(edge_rec));
320 NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
321 return (node_ref != NO_EDGE && nodes_[
static_cast<int>(node_ref)]->forward_edges.size() == 1);
327 tprintf(
"\n__________________________\n%s\n", msg);
328 for (
size_t i = 0;
i < nodes_.size(); ++
i) {
329 print_node(
i, max_num_edges);
331 tprintf(
"__________________________\n");
338 bool edge_char_of(
NODE_REF node_ref,
NODE_REF next_node,
int direction,
bool word_end,
343 bool add_edge_linkage(
NODE_REF node1,
NODE_REF node2,
bool repeats,
int direction,
bool word_end,
350 return (add_edge_linkage(node1, node2, repeats,
FORWARD_EDGE, word_end, unichar_id) &&
351 add_edge_linkage(node2, node1, repeats,
BACKWARD_EDGE, word_end, unichar_id));
364 void remove_edge_linkage(
NODE_REF node1,
NODE_REF node2,
int direction,
bool word_end,
370 remove_edge_linkage(node1, node2,
FORWARD_EDGE, word_end, unichar_id);
371 remove_edge_linkage(node2, node1,
BACKWARD_EDGE, word_end, unichar_id);
385 EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes);
396 void reduce_node_input(
NODE_REF node, std::vector<bool> &reduced_nodes);
405 uint64_t num_edges_ = 0;
406 uint64_t deref_direction_mask_ = 0;
407 uint64_t deref_node_index_mask_ = 0;
416 bool initialized_patterns_ =
false;
std::vector< EDGE_RECORD > EDGE_VECTOR
void tprintf(const char *format,...)
std::vector< TRIE_NODE_RECORD * > TRIE_NODES
std::vector< NodeChild > NodeChildVector
EDGE_VECTOR backward_edges
EDGE_VECTOR forward_edges
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
void KillEdge(EDGE_RECORD *edge_rec) const
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override
Trie(DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
NODE_REF next_node(EDGE_REF edge_ref) const override
void print_all(const char *msg, int max_num_edges)
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override
bool add_word_to_dawg(const WERD_CHOICE &word)
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override
EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const override
bool end_of_word(EDGE_REF edge_ref) const override
std::vector< EDGE_INDEX > root_back_freelist_
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
bool DeadEdge(const EDGE_RECORD &edge_rec) const
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
void print_edge_rec(const EDGE_RECORD &edge_rec) const