37# define NO_EDGE (int64_t)0xffffffffffffffffi64
40# define NO_EDGE (int64_t)0xffffffffffffffffll
77#define FORWARD_EDGE (int32_t)0
78#define BACKWARD_EDGE (int32_t)1
79#define MAX_NODE_EDGES_DISPLAY (int64_t)100
80#define MARKER_FLAG (int64_t)1
81#define DIRECTION_FLAG (int64_t)2
82#define WERD_END_FLAG (int64_t)4
83#define LETTER_START_BIT 0
84#define NUM_FLAG_BITS 3
85#define REFFORMAT "%" PRId64
88 {
false,
true,
true,
false},
89 {
true,
false,
false,
false},
90 {
true,
false,
false,
false},
91 {
false,
false,
false,
false},
94static const char kWildcard[] =
"*";
113 static const int16_t kDawgMagicNumber = 42;
122 inline const std::string &
lang()
const {
136 bool prefix_in_dawg(
const WERD_CHOICE &prefix,
bool requires_complete)
const;
140 int check_for_words(
const char *filename,
const UNICHARSET &unicharset,
141 bool enable_wildcard)
const;
145 void iterate_words(
const UNICHARSET &unicharset,
146 std::function<
void(
const WERD_CHOICE *)> cb)
const;
150 void iterate_words(
const UNICHARSET &unicharset,
151 const std::function<
void(
const char *)> &cb)
const;
157 bool word_end)
const = 0;
162 bool word_end)
const = 0;
183 std::vector<UNICHAR_ID> *vec)
const {
193 bool word_end)
const {
207 debug_level_(debug_level) {}
211 return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
215 return (edge_rec & (
MARKER_FLAG << flag_start_bit_)) != 0;
233 *edge_rec &= (~next_node_mask_);
234 *edge_rec |= ((
value << next_node_start_bit_) & next_node_mask_);
250 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
251 NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
252 bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
253 if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
254 curr_word_end, curr_unichar_id)) {
257 if (unichar_id > curr_unichar_id) {
260 if (unichar_id == curr_unichar_id) {
261 if (next_node > curr_next_node) {
264 if (next_node == curr_next_node) {
265 if (word_end > curr_word_end) {
279 return ((unichar_id == other_unichar_id) &&
280 (next_node == NO_EDGE || next_node == other_next_node) &&
281 (!word_end || (word_end == other_word_end)));
286 void init(
int unicharset_size);
297 void iterate_words_rec(
299 const std::function<
void(
const WERD_CHOICE *)> &cb)
const;
310 uint64_t next_node_mask_ = 0;
311 uint64_t flags_mask_ = 0;
312 uint64_t letter_mask_ = 0;
314 int flag_start_bit_ = 0;
315 int next_node_start_bit_ = 0;
384 const char *debug_msg) {
385 for (
auto &&position : *
this) {
386 if (position == new_pos) {
412 :
Dawg(
type, lang, perm, debug_level) {}
415 :
Dawg(
type, lang, perm, debug_level) {
419 num_forward_edges_in_node0 = num_forward_edges(0);
422 const std::string &lang,
PermuterType perm,
int unicharset_size,
424 :
Dawg(
type, lang, perm, debug_level),
426 num_edges_(num_edges) {
427 init(unicharset_size);
428 num_forward_edges_in_node0 = num_forward_edges(0);
429 if (debug_level > 3) {
430 print_all(
"SquishedDawg:");
437 if (!read_squished_dawg(fp)) {
440 num_forward_edges_in_node0 = num_forward_edges(0);
450 bool word_end)
const override;
455 bool word_end)
const override {
457 if (!edge_occupied(edge) || edge == NO_EDGE) {
460 assert(forward_edge(edge));
462 if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
463 vec->push_back(
NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
465 }
while (!last_edge(edge++));
471 return next_node_from_edge_rec((edges_[edge]));
477 return end_of_word_from_edge_rec((edges_[edge_ref]));
482 return unichar_id_from_edge_rec((edges_[edge_ref]));
487 void print_node(
NODE_REF node,
int max_num_edges)
const override;
496 file.OpenWrite(
nullptr);
497 if (!this->write_squished_dawg(&
file)) {
498 tprintf(
"Error serializing %s\n", filename);
501 if (!
file.CloseWrite(filename,
nullptr)) {
502 tprintf(
"Error writing file %s\n", filename);
511 set_next_node_in_edge_rec(&(edges_[edge_ref]),
value);
514 inline void set_empty_edge(
EDGE_REF edge_ref) {
515 (edges_[edge_ref] = next_node_mask_);
518 inline void clear_all_edges() {
519 for (
int edge = 0; edge < num_edges_; edge++) {
520 set_empty_edge(edge);
524 inline void clear_marker_flag(
EDGE_REF edge_ref) {
525 (edges_[edge_ref] &= ~(
MARKER_FLAG << flag_start_bit_));
528 inline bool forward_edge(
EDGE_REF edge_ref)
const {
529 return (edge_occupied(edge_ref) &&
530 (
FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
533 inline bool backward_edge(
EDGE_REF edge_ref)
const {
534 return (edge_occupied(edge_ref) &&
535 (
BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
538 inline bool edge_occupied(
EDGE_REF edge_ref)
const {
539 return (edges_[edge_ref] != next_node_mask_);
542 inline bool last_edge(
EDGE_REF edge_ref)
const {
543 return (edges_[edge_ref] & (
MARKER_FLAG << flag_start_bit_)) != 0;
547 int32_t num_forward_edges(
NODE_REF node)
const;
550 bool read_squished_dawg(TFile *
file);
553 void print_edge(
EDGE_REF edge)
const;
556 void print_all(
const char *msg) {
557 tprintf(
"\n__________________________\n%s\n", msg);
558 for (
int i = 0;
i < num_edges_; ++
i) {
561 tprintf(
"__________________________\n");
564 std::unique_ptr<EDGE_REF[]> build_node_map(int32_t *num_nodes)
const;
568 int32_t num_edges_ = 0;
569 int num_forward_edges_in_node0 = 0;
std::vector< int > SuccessorList
std::vector< SuccessorList * > SuccessorListsVector
void tprintf(const char *format,...)
std::vector< NodeChild > NodeChildVector
NodeChild(UNICHAR_ID id, EDGE_REF ref)
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const =0
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
virtual bool end_of_word(EDGE_REF edge_ref) const =0
Dawg(DawgType type, const std::string &lang, PermuterType perm, int debug_level)
const std::string & lang() const
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, std::vector< UNICHAR_ID > *vec) const
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
virtual void print_node(NODE_REF node, int max_num_edges) const =0
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
PermuterType permuter() const
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the direction flag of this edge.
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
bool operator==(const DawgPosition &other) const
DawgPosition(int dawg_idx, EDGE_REF dawgref, int punc_idx, EDGE_REF puncref, bool backtopunc)
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
bool end_of_word(EDGE_REF edge_ref) const override
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const override
SquishedDawg(DawgType type, const std::string &lang, PermuterType perm, int debug_level)
NODE_REF next_node(EDGE_REF edge) const override
SquishedDawg(const char *filename, DawgType type, const std::string &lang, PermuterType perm, int debug_level)
SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)
bool write_squished_dawg(const char *filename)
static FILE * Open(const std::string &filename, const std::string &mode)