65 " direction %d word_end %d unichar_id %d, exploring node:\n",
66 node_ref,
next_node, direction, word_end, unichar_id);
67 if (node_ref != NO_EDGE) {
71 if (node_ref == NO_EDGE) {
74 assert(
static_cast<size_t>(node_ref) <
nodes_.size());
76 :
nodes_[node_ref]->backward_edges;
77 int vec_size = vec.size();
83 while (start <= end) {
84 k = (start + end) >> 1;
87 *edge_ptr = &(vec[k]);
90 }
else if (compare == 1) {
97 for (
int i = 0;
i < vec_size; ++
i) {
101 *edge_ptr = &(edge_rec);
113 : &(
nodes_[node1]->backward_edges);
114 unsigned search_index;
117 while (search_index < vec->size() &&
122 search_index = vec->size();
125 link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
129 (*vec)[edge_index] = edge_rec;
130 }
else if (search_index < vec->size()) {
131 vec->insert(vec->begin() + search_index, edge_rec);
133 vec->push_back(edge_rec);
163 if (repetitions !=
nullptr) {
167 for (
unsigned i = 0;
i < word.
length(); ++
i) {
176 bool marker_flag =
false;
178 int32_t still_finding_chars =
true;
179 int32_t word_end =
false;
180 bool add_failed =
false;
184 word.
print(
"\nAdding word: ");
189 for (
i = 0;
i < word.
length() - 1; ++
i) {
191 marker_flag = (repetitions !=
nullptr) ? (*repetitions)[
i] :
false;
193 tprintf(
"Adding letter %d\n", unichar_id);
195 if (still_finding_chars) {
202 still_finding_chars =
false;
212 still_finding_chars =
false;
222 if (!still_finding_chars) {
227 if (the_next_node == 0) {
231 if (!
add_new_edge(last_node, the_next_node, marker_flag, word_end, unichar_id)) {
236 last_node = the_next_node;
241 marker_flag = (repetitions !=
nullptr) ? (*repetitions)[
i] :
false;
243 tprintf(
"Adding letter %d\n", unichar_id);
245 if (still_finding_chars &&
254 if (!add_failed && !
add_new_edge(last_node, the_next_node, marker_flag,
true, unichar_id)) {
259 tprintf(
"Re-initializing document dictionary...\n");
275 std::vector<std::string> word_list;
279 std::sort(word_list.begin(), word_list.end(),
280 [](
auto &s1,
auto &s2) { return s1.size() > s2.size(); });
289 word_file = fopen(filename,
"rb");
290 if (word_file ==
nullptr) {
294 while (fgets(line_str,
sizeof(line_str), word_file) !=
nullptr) {
296 std::string word_str(line_str);
299 tprintf(
"Read %d words so far\n", word_count);
301 words->push_back(word_str);
304 tprintf(
"Read %d words total.\n", word_count);
312 for (
const auto &
i : words) {
324 tprintf(
"Error: word '%s' not in DAWG after adding it\n",
i.c_str());
350 std::vector<UNICHAR_ID> *vec)
const {
351 bool is_alpha = unicharset.
get_isalpha(unichar_id);
375 }
else if (
ch ==
'd') {
377 }
else if (
ch ==
'n') {
379 }
else if (
ch ==
'p') {
381 }
else if (
ch ==
'a') {
383 }
else if (
ch ==
'A') {
386 return INVALID_UNICHAR_ID;
392 tprintf(
"please call initialize_patterns() before read_pattern_list()\n");
396 FILE *pattern_file = fopen(filename,
"rb");
397 if (pattern_file ==
nullptr) {
398 tprintf(
"Error opening pattern file %s\n", filename);
402 int pattern_count = 0;
409 std::vector<bool> repetitions_vec;
410 const char *str_ptr = string;
411 int step = unicharset.
step(str_ptr);
414 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
415 if (step == 1 && *str_ptr ==
'\\') {
417 if (*str_ptr ==
'\\') {
423 "Please provide at least %d concrete characters at the"
424 " beginning of the pattern\n",
436 if (curr_unichar_id == INVALID_UNICHAR_ID) {
441 repetitions_vec.push_back(
false);
443 step = unicharset.
step(str_ptr);
445 if (step == 1 && *str_ptr ==
'\\' && *(str_ptr + 1) ==
'*') {
446 repetitions_vec[repetitions_vec.size() - 1] =
true;
448 step = unicharset.
step(str_ptr);
452 tprintf(
"Invalid user pattern %s\n",
string);
462 tprintf(
"Error: failed to insert pattern '%s'\n",
string);
468 tprintf(
"Read %d valid patterns from %s\n", pattern_count, filename);
470 fclose(pattern_file);
485 nodes_[node1]->forward_edges.erase(
nodes_[node1]->forward_edges.begin() + edge_index);
486 }
else if (node1 == 0) {
490 nodes_[node1]->backward_edges.erase(
nodes_[node1]->backward_edges.begin() + edge_index);
513 std::vector<bool> reduced_nodes(
nodes_.size());
521 std::vector<NODE_REF> node_ref_map(
nodes_.size() + 1);
524 node_ref_map[
i + 1] = node_ref_map[
i] +
nodes_[
i]->forward_edges.size();
526 int num_forward_edges = node_ref_map[
i];
530 auto edge_array =
new EDGE_RECORD[num_forward_edges];
535 for (
int j = 0; j < end; ++j) {
556 tprintf(
"\nCollapsing node %" PRIi64
":\n", node);
582 curr_unichar_id, &edge_ptr, &edge_index));
585 int next_node2_num_edges =
588 tprintf(
"removed %d edges from node " REFFORMAT "\n", next_node2_num_edges, next_node2);
597 EDGE_VECTOR *backward_edges, std::vector<bool> &reduced_nodes) {
602 bool did_something =
false;
603 for (
unsigned i = edge_index;
i < backward_edges->size() - 1; ++
i) {
605 UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
606 while (i < backward_edges->size()) {
609 if (curr_unichar_id != unichar_id) {
610 return did_something;
618 if (
i == backward_edges->size()) {
623 for (
auto j =
i + 1; j < backward_edges->size(); ++j) {
624 const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
629 if (next_id != unichar_id) {
636 did_something =
true;
641 return did_something;
645 int num_edges = edges->size();
646 if (num_edges <= 1) {
649 std::vector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
650 sort_vec.reserve(num_edges);
651 for (
int i = 0;
i < num_edges; ++
i) {
654 std::sort(sort_vec.begin(), sort_vec.end());
655 for (
int i = 0;
i < num_edges; ++
i) {
656 (*edges)[
i] = sort_vec[
i].data();
669 while (
static_cast<size_t>(edge_index) < backward_edges.size()) {
670 if (
DeadEdge(backward_edges[edge_index])) {
677 while (
static_cast<size_t>(++edge_index) < backward_edges.size()) {
679 if (!
DeadEdge(backward_edges[edge_index]) &&
id != unichar_id) {
684 reduced_nodes[node] =
true;
691 for (
auto &backward_edge : backward_edges) {
703 if (node == NO_EDGE) {
710 for (
int dir = 0; dir < 2; ++dir) {
719 for (
i = 0; (dir == 0 ?
i < num_fwd :
i < num_bkw) &&
i < max_num_edges; ++
i) {
726 if (dir == 0 ?
i < num_fwd :
i < num_bkw) {
#define MAX_NODE_EDGES_DISPLAY
const char kDoNotReverse[]
std::vector< EDGE_RECORD > EDGE_VECTOR
const char kReverseIfHasRTL[]
void tprintf(const char *format,...)
void chomp_string(char *str)
const char kForceReverse[]
const char *const RTLReversePolicyNames[]
bool has_rtl_unichar_id() const
std::string debug_string() const
bool contains_unichar_id(UNICHAR_ID unichar_id) const
UNICHAR_ID unichar_id(unsigned index) const
void reverse_and_mirror_unichar_ids()
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
bool get_isalpha(UNICHAR_ID unichar_id) const
bool get_islower(UNICHAR_ID unichar_id) const
int step(const char *str) const
bool get_isupper(UNICHAR_ID unichar_id) const
bool get_isdigit(UNICHAR_ID unichar_id) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
bool get_ispunctuation(UNICHAR_ID unichar_id) const
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
EDGE_VECTOR backward_edges
EDGE_VECTOR forward_edges
UNICHAR_ID alpha_pattern_
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
void KillEdge(EDGE_RECORD *edge_rec) const
bool initialized_patterns_
static const char kLowerPatternUnicode[]
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
static const char kAlphanumPatternUnicode[]
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
NODE_REF next_node(EDGE_REF edge_ref) const override
void initialize_patterns(UNICHARSET *unicharset)
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
void print_all(const char *msg, int max_num_edges)
static const char kUpperPatternUnicode[]
UNICHAR_ID digit_pattern_
SquishedDawg * trie_to_dawg()
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const override
UNICHAR_ID lower_pattern_
static const char kDigitPatternUnicode[]
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
void reduce_node_input(NODE_REF node, std::vector< bool > &reduced_nodes)
UNICHAR_ID upper_pattern_
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
void sort_edges(EDGE_VECTOR *edges)
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, std::vector< bool > &reduced_nodes)
UNICHAR_ID alphanum_pattern_
static const char kPuncPatternUnicode[]
void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, std::vector< UNICHAR_ID > *vec) const override
bool add_word_to_dawg(const WERD_CHOICE &word, const std::vector< bool > *repetitions)
std::vector< EDGE_INDEX > root_back_freelist_
UNICHAR_ID character_class_to_pattern(char ch)
bool add_word_list(const std::vector< std::string > &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy)
void print_node(NODE_REF node, int max_num_edges) const override
static const char * get_reverse_policy_name(RTLReversePolicy reverse_policy)
static const char kAlphaPatternUnicode[]
bool DeadEdge(const EDGE_RECORD &edge_rec) const
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
static const int kSaneNumConcreteChars
void print_edge_rec(const EDGE_RECORD &edge_rec) const
bool read_word_list(const char *filename, std::vector< std::string > *words)
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)