42 bool requires_complete)
const {
44 return !requires_complete;
47 int end_index = word.
length() - 1;
48 for (
int i = 0;
i < end_index;
i++) {
50 if (edge == NO_EDGE) {
69 bool enable_wildcard)
const {
70 if (filename ==
nullptr) {
79 word_file = fopen(filename,
"r");
80 if (word_file ==
nullptr) {
81 tprintf(
"Error: Could not open file %s\n", filename);
90 enable_wildcard ? wildcard : INVALID_UNICHAR_ID)) {
91 tprintf(
"Missing word: %s\n",
string);
95 tprintf(
"Failed to create a valid word from %s\n",
string);
101 tprintf(
"Number of lost words=%d\n", misses);
107 std::function<
void(
const WERD_CHOICE *)> cb)
const {
112static void CallWithUTF8(
const std::function<
void(
const char *)> &cb,
120 const std::function<
void(
const char *)> &cb)
const {
121 using namespace std::placeholders;
123 std::bind(CallWithUTF8, cb, _1));
130 const std::function<
void(
const WERD_CHOICE *)> &cb)
const {
133 for (
auto &
i : children) {
148 if (wildcard != INVALID_UNICHAR_ID && word->
unichar_id(index) == wildcard) {
149 bool any_matched =
false;
152 for (
auto &
i : vec) {
161 auto word_end = index == word->
length() - 1;
163 if (edge != NO_EDGE) {
167 word->
print(
"match_words() found: ");
170 }
else if (node != 0) {
171 return match_words(word, index + 1, node, wildcard);
199 bool word_end)
const {
203 EDGE_REF end = num_forward_edges_in_node0 - 1;
205 while (start <= end) {
206 edge = (start + end) >> 1;
211 }
else if (compare == 1) {
218 if (edge != NO_EDGE && edge_occupied(edge)) {
224 }
while (!last_edge(edge++));
230int32_t SquishedDawg::num_forward_edges(
NODE_REF node)
const {
234 if (forward_edge(edge)) {
237 }
while (!last_edge(edge++));
244 if (node == NO_EDGE) {
249 const char *forward_string =
"FORWARD";
250 const char *backward_string =
" ";
252 const char *last_string =
"LAST";
253 const char *not_last_string =
" ";
255 const char *eow_string =
"EOW";
256 const char *not_eow_string =
" ";
258 const char *direction;
264 if (edge_occupied(edge)) {
266 direction = forward_edge(edge) ? forward_string : backward_string;
267 is_last = last_edge(edge) ? last_string : not_last_string;
268 eow =
end_of_word(edge) ? eow_string : not_eow_string;
272 edge,
next_node(edge), unichar_id, direction, is_last, eow);
274 if (edge - node > max_num_edges) {
277 }
while (!last_edge(edge++));
279 if (edge < num_edges_ && edge_occupied(edge) && backward_edge(edge)) {
281 direction = forward_edge(edge) ? forward_string : backward_string;
282 is_last = last_edge(edge) ? last_string : not_last_string;
283 eow =
end_of_word(edge) ? eow_string : not_eow_string;
287 ", unichar_id = %d, %s %s %s\n",
288 edge,
next_node(edge), unichar_id, direction, is_last, eow);
293 }
while (!last_edge(edge++));
301void SquishedDawg::print_edge(
EDGE_REF edge)
const {
302 if (edge == NO_EDGE) {
307 (forward_edge(edge) ?
"FORWARD" :
" "),
308 (last_edge(edge) ?
"LAST" :
" "),
313bool SquishedDawg::read_squished_dawg(TFile *
file) {
315 tprintf(
"Reading squished dawg\n");
321 if (!
file->DeSerialize(&magic)) {
329 int32_t unicharset_size;
330 if (!
file->DeSerialize(&unicharset_size)) {
333 if (!
file->DeSerialize(&num_edges_)) {
340 if (!
file->DeSerialize(&edges_[0], num_edges_)) {
344 tprintf(
"type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n",
346 for (
EDGE_REF edge = 0; edge < num_edges_; ++edge) {
353std::unique_ptr<EDGE_REF[]> SquishedDawg::build_node_map(
354 int32_t *num_nodes)
const {
356 std::unique_ptr<EDGE_REF[]> node_map(
new EDGE_REF[num_edges_]);
357 int32_t node_counter;
360 for (edge = 0; edge < num_edges_; edge++) {
364 node_counter = num_forward_edges(0);
367 for (edge = 0; edge < num_edges_; edge++) {
369 if (forward_edge(edge)) {
371 node_map[edge] = (edge ? node_counter : 0);
372 num_edges = num_forward_edges(edge);
374 node_counter += num_edges;
377 if (edge >= num_edges_) {
380 if (backward_edge(edge)) {
381 while (!last_edge(edge++)) {
394 int32_t node_count = 0;
399 tprintf(
"write_squished_dawg\n");
402 std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));
406 if (!
file->Serialize(&magic)) {
415 for (edge = 0; edge < num_edges_; edge++) {
416 if (forward_edge(edge)) {
422 if (!
file->Serialize(&num_edges)) {
427 tprintf(
"%d nodes in DAWG\n", node_count);
428 tprintf(
"%d edges in DAWG\n", num_edges);
431 for (edge = 0; edge < num_edges_; edge++) {
432 if (forward_edge(edge)) {
435 set_next_node(edge, node_map[old_index]);
436 temp_record = edges_[edge];
437 if (!
file->Serialize(&temp_record)) {
440 set_next_node(edge, old_index);
441 }
while (!last_edge(edge++));
443 if (edge >= num_edges_) {
446 if (backward_edge(edge)) {
447 while (!last_edge(edge++)) {
#define MAX_NODE_EDGES_DISPLAY
void tprintf(const char *format,...)
void chomp_string(char *str)
std::vector< NodeChild > NodeChildVector
void set_unichar_id(UNICHAR_ID unichar_id, unsigned index)
bool contains_unichar_id(UNICHAR_ID unichar_id) const
void string_and_lengths(std::string *word_str, std::string *word_lengths_str) const
UNICHAR_ID unichar_id(unsigned index) const
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const =0
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
virtual bool end_of_word(EDGE_REF edge_ref) const =0
void iterate_words(const UNICHARSET &unicharset, std::function< void(const WERD_CHOICE *)> cb) const
bool match_words(WERD_CHOICE *word, uint32_t index, NODE_REF node, UNICHAR_ID wildcard) const
void init(int unicharset_size)
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
virtual NODE_REF next_node(EDGE_REF edge_ref) const =0
void iterate_words_rec(const WERD_CHOICE &word_so_far, NODE_REF to_explore, const std::function< void(const WERD_CHOICE *)> &cb) const
static const int16_t kDawgMagicNumber
Magic number to determine endianness when reading the Dawg from file.
bool end_of_word(EDGE_REF edge_ref) const override
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const override
Returns the edge that corresponds to the letter out of this node.
NODE_REF next_node(EDGE_REF edge) const override
bool write_squished_dawg(TFile *file)
Writes the squished/reduced Dawg to a file.
void print_node(NODE_REF node, int max_num_edges) const override