tesseract v5.3.3.20231005
tesseract::SquishedDawg Class Reference

#include <dawg.h>

Inheritance diagram for tesseract::SquishedDawg:
tesseract::Dawg

Public Member Functions

 SquishedDawg (DawgType type, const std::string &lang, PermuterType perm, int debug_level)
 
 SquishedDawg (const char *filename, DawgType type, const std::string &lang, PermuterType perm, int debug_level)
 
 SquishedDawg (EDGE_ARRAY edges, int num_edges, DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)
 
 ~SquishedDawg () override
 
bool Load (TFile *fp)
 
int NumEdges ()
 
EDGE_REF edge_char_of (NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const override
 Returns the edge that corresponds to the letter out of this node. More...
 
void unichar_ids_of (NODE_REF node, NodeChildVector *vec, bool word_end) const override
 
NODE_REF next_node (EDGE_REF edge) const override
 
bool end_of_word (EDGE_REF edge_ref) const override
 
UNICHAR_ID edge_letter (EDGE_REF edge_ref) const override
 Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. More...
 
void print_node (NODE_REF node, int max_num_edges) const override
 
bool write_squished_dawg (TFile *file)
 Writes the squished/reduced Dawg to a file. More...
 
bool write_squished_dawg (const char *filename)
 
- Public Member Functions inherited from tesseract::Dawg
DawgType type () const
 
const std::string & lang () const
 
PermuterType permuter () const
 
virtual ~Dawg ()
 
bool word_in_dawg (const WERD_CHOICE &word) const
 Returns true if the given word is in the Dawg. More...
 
bool prefix_in_dawg (const WERD_CHOICE &prefix, bool requires_complete) const
 
int check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
 
void iterate_words (const UNICHARSET &unicharset, std::function< void(const WERD_CHOICE *)> cb) const
 
void iterate_words (const UNICHARSET &unicharset, const std::function< void(const char *)> &cb) const
 
virtual EDGE_REF edge_char_of (NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
 Returns the edge that corresponds to the letter out of this node. More...
 
virtual void unichar_ids_of (NODE_REF node, NodeChildVector *vec, bool word_end) const =0
 
virtual NODE_REF next_node (EDGE_REF edge_ref) const =0
 
virtual bool end_of_word (EDGE_REF edge_ref) const =0
 
virtual UNICHAR_ID edge_letter (EDGE_REF edge_ref) const =0
 Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. More...
 
virtual void print_node (NODE_REF node, int max_num_edges) const =0
 
virtual void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, std::vector< UNICHAR_ID > *vec) const
 
virtual EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
 

Additional Inherited Members

- Static Public Attributes inherited from tesseract::Dawg
static const int16_t kDawgMagicNumber = 42
 Magic number to determine endianness when reading the Dawg from file. More...
 
static const UNICHAR_ID kPatternUnicharID = 0
 
- Protected Member Functions inherited from tesseract::Dawg
 Dawg (DawgType type, const std::string &lang, PermuterType perm, int debug_level)
 
NODE_REF next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the next node visited by following this edge. More...
 
bool marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the marker flag of this edge. More...
 
int direction_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the direction flag of this edge. More...
 
bool end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns true if this edge marks the end of a word. More...
 
UNICHAR_ID unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns UNICHAR_ID recorded in this edge. More...
 
void set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value)
 Sets the next node link for this edge in the Dawg. More...
 
void set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec)
 Sets this edge record to be the last one in a sequence of edges. More...
 
int given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
 
bool edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
 
void init (int unicharset_size)
 
bool match_words (WERD_CHOICE *word, uint32_t index, NODE_REF node, UNICHAR_ID wildcard) const
 
void iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, const std::function< void(const WERD_CHOICE *)> &cb) const
 
- Protected Attributes inherited from tesseract::Dawg
std::string lang_
 
DawgType type_
 
PermuterType perm_
 Permuter code that should be used if the word is found in this Dawg. More...
 
uint64_t next_node_mask_ = 0
 
uint64_t flags_mask_ = 0
 
uint64_t letter_mask_ = 0
 
int unicharset_size_
 
int flag_start_bit_ = 0
 
int next_node_start_bit_ = 0
 
int debug_level_
 

Detailed Description

Concrete class that can operate on a compacted (squished) Dawg (read, search and write to file). This class is read-only in the sense that new words cannot be added to an instance of SquishedDawg. The underlying representation of the nodes and edges in SquishedDawg is stored as a contiguous EDGE_ARRAY (read from file or given as an argument to the constructor).

Definition at line 408 of file dawg.h.

Constructor & Destructor Documentation

◆ SquishedDawg() [1/3]

tesseract::SquishedDawg::SquishedDawg ( DawgType  type,
const std::string &  lang,
PermuterType  perm,
int  debug_level 
)
inline

Definition at line 410 of file dawg.h.

412 : Dawg(type, lang, perm, debug_level) {}
Dawg(DawgType type, const std::string &lang, PermuterType perm, int debug_level)
Definition: dawg.h:201
const std::string & lang() const
Definition: dawg.h:122
DawgType type() const
Definition: dawg.h:119

◆ SquishedDawg() [2/3]

tesseract::SquishedDawg::SquishedDawg ( const char *  filename,
DawgType  type,
const std::string &  lang,
PermuterType  perm,
int  debug_level 
)
inline

Definition at line 413 of file dawg.h.

415 : Dawg(type, lang, perm, debug_level) {
416 TFile file;
417 ASSERT_HOST(file.Open(filename, nullptr));
418 ASSERT_HOST(read_squished_dawg(&file));
419 num_forward_edges_in_node0 = num_forward_edges(0);
420 }
#define ASSERT_HOST(x)
Definition: errcode.h:54
static FILE * Open(const std::string &filename, const std::string &mode)
Definition: fileio.cpp:41

◆ SquishedDawg() [3/3]

tesseract::SquishedDawg::SquishedDawg ( EDGE_ARRAY  edges,
int  num_edges,
DawgType  type,
const std::string &  lang,
PermuterType  perm,
int  unicharset_size,
int  debug_level 
)
inline

Definition at line 421 of file dawg.h.

424 : Dawg(type, lang, perm, debug_level),
425 edges_(edges),
426 num_edges_(num_edges) {
427 init(unicharset_size);
428 num_forward_edges_in_node0 = num_forward_edges(0);
429 if (debug_level > 3) {
430 print_all("SquishedDawg:");
431 }
432 }
void init(int unicharset_size)
Definition: dawg.cpp:178

◆ ~SquishedDawg()

tesseract::SquishedDawg::~SquishedDawg ( )
override

Definition at line 194 of file dawg.cpp.

194 {
195 delete[] edges_;
196}

Member Function Documentation

◆ edge_char_of()

EDGE_REF tesseract::SquishedDawg::edge_char_of ( NODE_REF  node,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
overridevirtual

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

Definition at line 198 of file dawg.cpp.

199 {
200 EDGE_REF edge = node;
201 if (node == 0) { // binary search
202 EDGE_REF start = 0;
203 EDGE_REF end = num_forward_edges_in_node0 - 1;
204 int compare;
205 while (start <= end) {
206 edge = (start + end) >> 1; // (start + end) / 2
207 compare = given_greater_than_edge_rec(NO_EDGE, word_end, unichar_id,
208 edges_[edge]);
209 if (compare == 0) { // given == vec[k]
210 return edge;
211 } else if (compare == 1) { // given > vec[k]
212 start = edge + 1;
213 } else { // given < vec[k]
214 end = edge - 1;
215 }
216 }
217 } else { // linear search
218 if (edge != NO_EDGE && edge_occupied(edge)) {
219 do {
220 if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
221 (!word_end || end_of_word_from_edge_rec(edges_[edge]))) {
222 return (edge);
223 }
224 } while (!last_edge(edge++));
225 }
226 }
227 return (NO_EDGE); // not found
228}
int64_t EDGE_REF
Definition: dawg.h:49
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:247
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:223
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:227

◆ edge_letter()

UNICHAR_ID tesseract::SquishedDawg::edge_letter ( EDGE_REF  edge_ref) const
inlineoverridevirtual

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 481 of file dawg.h.

481 {
482 return unichar_id_from_edge_rec((edges_[edge_ref]));
483 }

◆ end_of_word()

bool tesseract::SquishedDawg::end_of_word ( EDGE_REF  edge_ref) const
inlineoverridevirtual

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

Definition at line 476 of file dawg.h.

476 {
477 return end_of_word_from_edge_rec((edges_[edge_ref]));
478 }

◆ Load()

bool tesseract::SquishedDawg::Load ( TFile fp)
inline

Definition at line 436 of file dawg.h.

436 {
437 if (!read_squished_dawg(fp)) {
438 return false;
439 }
440 num_forward_edges_in_node0 = num_forward_edges(0);
441 return true;
442 }

◆ next_node()

NODE_REF tesseract::SquishedDawg::next_node ( EDGE_REF  edge) const
inlineoverridevirtual

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 470 of file dawg.h.

470 {
471 return next_node_from_edge_rec((edges_[edge]));
472 }
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:210

◆ NumEdges()

int tesseract::SquishedDawg::NumEdges ( )
inline

Definition at line 444 of file dawg.h.

444 {
445 return num_edges_;
446 }

◆ print_node()

void tesseract::SquishedDawg::print_node ( NODE_REF  node,
int  max_num_edges 
) const
overridevirtual

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

Definition at line 243 of file dawg.cpp.

243 {
244 if (node == NO_EDGE) {
245 return; // nothing to print
246 }
247
248 EDGE_REF edge = node;
249 const char *forward_string = "FORWARD";
250 const char *backward_string = " ";
251
252 const char *last_string = "LAST";
253 const char *not_last_string = " ";
254
255 const char *eow_string = "EOW";
256 const char *not_eow_string = " ";
257
258 const char *direction;
259 const char *is_last;
260 const char *eow;
261
262 UNICHAR_ID unichar_id;
263
264 if (edge_occupied(edge)) {
265 do {
266 direction = forward_edge(edge) ? forward_string : backward_string;
267 is_last = last_edge(edge) ? last_string : not_last_string;
268 eow = end_of_word(edge) ? eow_string : not_eow_string;
269
270 unichar_id = edge_letter(edge);
271 tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
272 edge, next_node(edge), unichar_id, direction, is_last, eow);
273
274 if (edge - node > max_num_edges) {
275 return;
276 }
277 } while (!last_edge(edge++));
278
279 if (edge < num_edges_ && edge_occupied(edge) && backward_edge(edge)) {
280 do {
281 direction = forward_edge(edge) ? forward_string : backward_string;
282 is_last = last_edge(edge) ? last_string : not_last_string;
283 eow = end_of_word(edge) ? eow_string : not_eow_string;
284
285 unichar_id = edge_letter(edge);
286 tprintf(REFFORMAT " : next = " REFFORMAT
287 ", unichar_id = %d, %s %s %s\n",
288 edge, next_node(edge), unichar_id, direction, is_last, eow);
289
290 if (edge - node > MAX_NODE_EDGES_DISPLAY) {
291 return;
292 }
293 } while (!last_edge(edge++));
294 }
295 } else {
296 tprintf(REFFORMAT " : no edges in this node\n", node);
297 }
298 tprintf("\n");
299}
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:79
#define REFFORMAT
Definition: dawg.h:85
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
int UNICHAR_ID
Definition: unichar.h:34
bool end_of_word(EDGE_REF edge_ref) const override
Definition: dawg.h:476
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const override
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Definition: dawg.h:481
NODE_REF next_node(EDGE_REF edge) const override
Definition: dawg.h:470

◆ unichar_ids_of()

void tesseract::SquishedDawg::unichar_ids_of ( NODE_REF  node,
NodeChildVector vec,
bool  word_end 
) const
inlineoverridevirtual

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.

Definition at line 454 of file dawg.h.

455 {
456 EDGE_REF edge = node;
457 if (!edge_occupied(edge) || edge == NO_EDGE) {
458 return;
459 }
460 assert(forward_edge(edge)); // we don't expect any backward edges to
461 do { // be present when this function is called
462 if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
463 vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
464 }
465 } while (!last_edge(edge++));
466 }

◆ write_squished_dawg() [1/2]

bool tesseract::SquishedDawg::write_squished_dawg ( const char *  filename)
inline

Opens the file with the given filename and writes the squished/reduced Dawg to the file.

Definition at line 494 of file dawg.h.

494 {
495 TFile file;
496 file.OpenWrite(nullptr);
497 if (!this->write_squished_dawg(&file)) {
498 tprintf("Error serializing %s\n", filename);
499 return false;
500 }
501 if (!file.CloseWrite(filename, nullptr)) {
502 tprintf("Error writing file %s\n", filename);
503 return false;
504 }
505 return true;
506 }
bool write_squished_dawg(TFile *file)
Writes the squished/reduced Dawg to a file.
Definition: dawg.cpp:391

◆ write_squished_dawg() [2/2]

bool tesseract::SquishedDawg::write_squished_dawg ( TFile file)

Writes the squished/reduced Dawg to a file.

Definition at line 391 of file dawg.cpp.

391 {
392 EDGE_REF edge;
393 int32_t num_edges;
394 int32_t node_count = 0;
395 EDGE_REF old_index;
396 EDGE_RECORD temp_record;
397
398 if (debug_level_) {
399 tprintf("write_squished_dawg\n");
400 }
401
402 std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));
403
404 // Write the magic number to help detecting a change in endianness.
405 int16_t magic = kDawgMagicNumber;
406 if (!file->Serialize(&magic)) {
407 return false;
408 }
409 if (!file->Serialize(&unicharset_size_)) {
410 return false;
411 }
412
413 // Count the number of edges in this Dawg.
414 num_edges = 0;
415 for (edge = 0; edge < num_edges_; edge++) {
416 if (forward_edge(edge)) {
417 num_edges++;
418 }
419 }
420
421 // Write edge count to file.
422 if (!file->Serialize(&num_edges)) {
423 return false;
424 }
425
426 if (debug_level_) {
427 tprintf("%d nodes in DAWG\n", node_count);
428 tprintf("%d edges in DAWG\n", num_edges);
429 }
430
431 for (edge = 0; edge < num_edges_; edge++) {
432 if (forward_edge(edge)) { // write forward edges
433 do {
434 old_index = next_node_from_edge_rec(edges_[edge]);
435 set_next_node(edge, node_map[old_index]);
436 temp_record = edges_[edge];
437 if (!file->Serialize(&temp_record)) {
438 return false;
439 }
440 set_next_node(edge, old_index);
441 } while (!last_edge(edge++));
442
443 if (edge >= num_edges_) {
444 break;
445 }
446 if (backward_edge(edge)) { // skip back links
447 while (!last_edge(edge++)) {
448 ;
449 }
450 }
451
452 edge--;
453 }
454 }
455 return true;
456}
uint64_t EDGE_RECORD
Definition: dawg.h:47
int unicharset_size_
Definition: dawg.h:313
int debug_level_
Definition: dawg.h:317
static const int16_t kDawgMagicNumber
Magic number to determine endianness when reading the Dawg from file.
Definition: dawg.h:113

The documentation for this class was generated from the following files: