tesseract  4.00.00dev
tesseract::SquishedDawg Class Reference

#include <dawg.h>

Inheritance diagram for tesseract::SquishedDawg:
tesseract::Dawg

Public Member Functions

 SquishedDawg (DawgType type, const STRING &lang, PermuterType perm, int debug_level)
 
 SquishedDawg (const char *filename, DawgType type, const STRING &lang, PermuterType perm, int debug_level)
 
 SquishedDawg (EDGE_ARRAY edges, int num_edges, DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
 
virtual ~SquishedDawg ()
 
bool Load (TFile *fp)
 
int NumEdges ()
 
EDGE_REF edge_char_of (NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const
 Returns the edge that corresponds to the letter out of this node. More...
 
void unichar_ids_of (NODE_REF node, NodeChildVector *vec, bool word_end) const
 
NODE_REF next_node (EDGE_REF edge) const
 
bool end_of_word (EDGE_REF edge_ref) const
 
UNICHAR_ID edge_letter (EDGE_REF edge_ref) const
 Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF. More...
 
void print_node (NODE_REF node, int max_num_edges) const
 
bool write_squished_dawg (TFile *file)
 Writes the squished/reduced Dawg to a file. More...
 
bool write_squished_dawg (const char *filename)
 
- Public Member Functions inherited from tesseract::Dawg
DawgType type () const
 
const STRINGlang () const
 
PermuterType permuter () const
 
virtual ~Dawg ()
 
bool word_in_dawg (const WERD_CHOICE &word) const
 Returns true if the given word is in the Dawg. More...
 
bool prefix_in_dawg (const WERD_CHOICE &prefix, bool requires_complete) const
 
int check_for_words (const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
 
void iterate_words (const UNICHARSET &unicharset, TessCallback1< const WERD_CHOICE *> *cb) const
 
void iterate_words (const UNICHARSET &unicharset, TessCallback1< const char *> *cb) const
 
virtual void unichar_id_to_patterns (UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
 
virtual EDGE_REF pattern_loop_edge (EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
 

Additional Inherited Members

- Static Public Attributes inherited from tesseract::Dawg
static const inT16 kDawgMagicNumber = 42
 Magic number to determine endianness when reading the Dawg from file. More...
 
static const UNICHAR_ID kPatternUnicharID = 0
 
- Protected Member Functions inherited from tesseract::Dawg
 Dawg (DawgType type, const STRING &lang, PermuterType perm, int debug_level)
 
NODE_REF next_node_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the next node visited by following this edge. More...
 
bool marker_flag_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the marker flag of this edge. More...
 
int direction_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns the direction flag of this edge. More...
 
bool end_of_word_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns true if this edge marks the end of a word. More...
 
UNICHAR_ID unichar_id_from_edge_rec (const EDGE_RECORD &edge_rec) const
 Returns UNICHAR_ID recorded in this edge. More...
 
void set_next_node_in_edge_rec (EDGE_RECORD *edge_rec, EDGE_REF value)
 Sets the next node link for this edge in the Dawg. More...
 
void set_marker_flag_in_edge_rec (EDGE_RECORD *edge_rec)
 Sets this edge record to be the last one in a sequence of edges. More...
 
int given_greater_than_edge_rec (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
 
bool edge_rec_match (NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
 
void init (int unicharset_size)
 
bool match_words (WERD_CHOICE *word, inT32 index, NODE_REF node, UNICHAR_ID wildcard) const
 
void iterate_words_rec (const WERD_CHOICE &word_so_far, NODE_REF to_explore, TessCallback1< const WERD_CHOICE *> *cb) const
 
- Protected Attributes inherited from tesseract::Dawg
DawgType type_
 
STRING lang_
 
PermuterType perm_
 Permuter code that should be used if the word is found in this Dawg. More...
 
int unicharset_size_
 
int flag_start_bit_
 
int next_node_start_bit_
 
uinT64 next_node_mask_
 
uinT64 flags_mask_
 
uinT64 letter_mask_
 
int debug_level_
 

Detailed Description

Concrete class that can operate on a compacted (squished) Dawg (read, search and write to file). This class is read-only in the sense that new words can not be added to an instance of SquishedDawg. The underlying representation of the nodes and edges in SquishedDawg is stored as a contiguous EDGE_ARRAY (read from file or given as an argument to the constructor).

Definition at line 413 of file dawg.h.

Constructor & Destructor Documentation

◆ SquishedDawg() [1/3]

tesseract::SquishedDawg::SquishedDawg ( DawgType  type,
const STRING lang,
PermuterType  perm,
int  debug_level 
)
inline

Definition at line 415 of file dawg.h.

417  : Dawg(type, lang, perm, debug_level) {}
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:205
DawgType type() const
Definition: dawg.h:128

◆ SquishedDawg() [2/3]

tesseract::SquishedDawg::SquishedDawg ( const char *  filename,
DawgType  type,
const STRING lang,
PermuterType  perm,
int  debug_level 
)
inline

Definition at line 418 of file dawg.h.

420  : Dawg(type, lang, perm, debug_level) {
421  TFile file;
422  ASSERT_HOST(file.Open(filename, nullptr));
423  ASSERT_HOST(read_squished_dawg(&file));
424  num_forward_edges_in_node0 = num_forward_edges(0);
425  }
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:205
DawgType type() const
Definition: dawg.h:128
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ SquishedDawg() [3/3]

tesseract::SquishedDawg::SquishedDawg ( EDGE_ARRAY  edges,
int  num_edges,
DawgType  type,
const STRING lang,
PermuterType  perm,
int  unicharset_size,
int  debug_level 
)
inline

Definition at line 426 of file dawg.h.

429  : Dawg(type, lang, perm, debug_level),
430  edges_(edges),
431  num_edges_(num_edges) {
432  init(unicharset_size);
433  num_forward_edges_in_node0 = num_forward_edges(0);
434  if (debug_level > 3) print_all("SquishedDawg:");
435  }
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:205
DawgType type() const
Definition: dawg.h:128
void init(int unicharset_size)
Definition: dawg.cpp:176

◆ ~SquishedDawg()

tesseract::SquishedDawg::~SquishedDawg ( )
virtual

Definition at line 193 of file dawg.cpp.

193 { delete[] edges_; }

Member Function Documentation

◆ edge_char_of()

EDGE_REF tesseract::SquishedDawg::edge_char_of ( NODE_REF  node,
UNICHAR_ID  unichar_id,
bool  word_end 
) const
virtual

Returns the edge that corresponds to the letter out of this node.

Implements tesseract::Dawg.

Definition at line 195 of file dawg.cpp.

197  {
198  EDGE_REF edge = node;
199  if (node == 0) { // binary search
200  EDGE_REF start = 0;
201  EDGE_REF end = num_forward_edges_in_node0 - 1;
202  int compare;
203  while (start <= end) {
204  edge = (start + end) >> 1; // (start + end) / 2
205  compare = given_greater_than_edge_rec(NO_EDGE, word_end,
206  unichar_id, edges_[edge]);
207  if (compare == 0) { // given == vec[k]
208  return edge;
209  } else if (compare == 1) { // given > vec[k]
210  start = edge + 1;
211  } else { // given < vec[k]
212  end = edge - 1;
213  }
214  }
215  } else { // linear search
216  if (edge != NO_EDGE && edge_occupied(edge)) {
217  do {
218  if ((unichar_id_from_edge_rec(edges_[edge]) == unichar_id) &&
219  (!word_end || end_of_word_from_edge_rec(edges_[edge])))
220  return (edge);
221  } while (!last_edge(edge++));
222  }
223  }
224  return (NO_EDGE); // not found
225 }
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226
inT64 EDGE_REF
Definition: dawg.h:55
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:251
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230

◆ edge_letter()

UNICHAR_ID tesseract::SquishedDawg::edge_letter ( EDGE_REF  edge_ref) const
inlinevirtual

Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 478 of file dawg.h.

478  {
479  return unichar_id_from_edge_rec((edges_[edge_ref]));
480  }
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230

◆ end_of_word()

bool tesseract::SquishedDawg::end_of_word ( EDGE_REF  edge_ref) const
inlinevirtual

Returns true if the edge indicated by the given EDGE_REF marks the end of a word.

Implements tesseract::Dawg.

Definition at line 473 of file dawg.h.

473  {
474  return end_of_word_from_edge_rec((edges_[edge_ref]));
475  }
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226

◆ Load()

bool tesseract::SquishedDawg::Load ( TFile fp)
inline

Definition at line 439 of file dawg.h.

439  {
440  if (!read_squished_dawg(fp)) return false;
441  num_forward_edges_in_node0 = num_forward_edges(0);
442  return true;
443  }

◆ next_node()

NODE_REF tesseract::SquishedDawg::next_node ( EDGE_REF  edge) const
inlinevirtual

Returns the next node visited by following the edge indicated by the given EDGE_REF.

Implements tesseract::Dawg.

Definition at line 467 of file dawg.h.

467  {
468  return next_node_from_edge_rec((edges_[edge]));
469  }
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213

◆ NumEdges()

int tesseract::SquishedDawg::NumEdges ( )
inline

Definition at line 445 of file dawg.h.

445 { return num_edges_; }

◆ print_node()

void tesseract::SquishedDawg::print_node ( NODE_REF  node,
int  max_num_edges 
) const
virtual

Prints the contents of the node indicated by the given NODE_REF. At most max_num_edges will be printed.

Implements tesseract::Dawg.

Definition at line 240 of file dawg.cpp.

240  {
241  if (node == NO_EDGE) return; // nothing to print
242 
243  EDGE_REF edge = node;
244  const char *forward_string = "FORWARD";
245  const char *backward_string = " ";
246 
247  const char *last_string = "LAST";
248  const char *not_last_string = " ";
249 
250  const char *eow_string = "EOW";
251  const char *not_eow_string = " ";
252 
253  const char *direction;
254  const char *is_last;
255  const char *eow;
256 
257  UNICHAR_ID unichar_id;
258 
259  if (edge_occupied(edge)) {
260  do {
261  direction =
262  forward_edge(edge) ? forward_string : backward_string;
263  is_last = last_edge(edge) ? last_string : not_last_string;
264  eow = end_of_word(edge) ? eow_string : not_eow_string;
265 
266  unichar_id = edge_letter(edge);
267  tprintf(REFFORMAT " : next = " REFFORMAT ", unichar_id = %d, %s %s %s\n",
268  edge, next_node(edge), unichar_id,
269  direction, is_last, eow);
270 
271  if (edge - node > max_num_edges) return;
272  } while (!last_edge(edge++));
273 
274  if (edge < num_edges_ &&
275  edge_occupied(edge) && backward_edge(edge)) {
276  do {
277  direction =
278  forward_edge(edge) ? forward_string : backward_string;
279  is_last = last_edge(edge) ? last_string : not_last_string;
280  eow = end_of_word(edge) ? eow_string : not_eow_string;
281 
282  unichar_id = edge_letter(edge);
283  tprintf(REFFORMAT " : next = " REFFORMAT
284  ", unichar_id = %d, %s %s %s\n",
285  edge, next_node(edge), unichar_id,
286  direction, is_last, eow);
287 
288  if (edge - node > MAX_NODE_EDGES_DISPLAY) return;
289  } while (!last_edge(edge++));
290  }
291  }
292  else {
293  tprintf(REFFORMAT " : no edges in this node\n", node);
294  }
295  tprintf("\n");
296 }
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:87
#define REFFORMAT
Definition: dawg.h:93
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Definition: dawg.h:478
inT64 EDGE_REF
Definition: dawg.h:55
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
NODE_REF next_node(EDGE_REF edge) const
Definition: dawg.h:467
#define tprintf(...)
Definition: tprintf.h:31
bool end_of_word(EDGE_REF edge_ref) const
Definition: dawg.h:473
int UNICHAR_ID
Definition: unichar.h:35

◆ unichar_ids_of()

void tesseract::SquishedDawg::unichar_ids_of ( NODE_REF  node,
NodeChildVector vec,
bool  word_end 
) const
inlinevirtual

Fills the given NodeChildVector with all the unichar ids (and the corresponding EDGE_REFs) for which there is an edge out of this node.

Implements tesseract::Dawg.

Definition at line 453 of file dawg.h.

454  {
455  EDGE_REF edge = node;
456  if (!edge_occupied(edge) || edge == NO_EDGE) return;
457  assert(forward_edge(edge)); // we don't expect any backward edges to
458  do { // be present when this function is called
459  if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
460  vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
461  }
462  } while (!last_edge(edge++));
463  }
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:226
inT64 EDGE_REF
Definition: dawg.h:55
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:230

◆ write_squished_dawg() [1/2]

bool tesseract::SquishedDawg::write_squished_dawg ( TFile file)

Writes the squished/reduced Dawg to a file.

Definition at line 371 of file dawg.cpp.

371  {
372  EDGE_REF edge;
373  inT32 num_edges;
374  inT32 node_count = 0;
375  EDGE_REF old_index;
376  EDGE_RECORD temp_record;
377 
378  if (debug_level_) tprintf("write_squished_dawg\n");
379 
380  std::unique_ptr<EDGE_REF[]> node_map(build_node_map(&node_count));
381 
382  // Write the magic number to help detecting a change in endianness.
383  inT16 magic = kDawgMagicNumber;
384  if (file->FWrite(&magic, sizeof(magic), 1) != 1) return false;
385  if (file->FWrite(&unicharset_size_, sizeof(unicharset_size_), 1) != 1)
386  return false;
387 
388  // Count the number of edges in this Dawg.
389  num_edges = 0;
390  for (edge=0; edge < num_edges_; edge++)
391  if (forward_edge(edge))
392  num_edges++;
393 
394  // Write edge count to file.
395  if (file->FWrite(&num_edges, sizeof(num_edges), 1) != 1) return false;
396 
397  if (debug_level_) {
398  tprintf("%d nodes in DAWG\n", node_count);
399  tprintf("%d edges in DAWG\n", num_edges);
400  }
401 
402  for (edge = 0; edge < num_edges_; edge++) {
403  if (forward_edge(edge)) { // write forward edges
404  do {
405  old_index = next_node_from_edge_rec(edges_[edge]);
406  set_next_node(edge, node_map[old_index]);
407  temp_record = edges_[edge];
408  if (file->FWrite(&temp_record, sizeof(temp_record), 1) != 1)
409  return false;
410  set_next_node(edge, old_index);
411  } while (!last_edge(edge++));
412 
413  if (edge >= num_edges_) break;
414  if (backward_edge(edge)) // skip back links
415  while (!last_edge(edge++));
416 
417  edge--;
418  }
419  }
420  return true;
421 }
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:213
int unicharset_size_
Definition: dawg.h:309
int16_t inT16
Definition: host.h:36
inT64 EDGE_REF
Definition: dawg.h:55
static const inT16 kDawgMagicNumber
Magic number to determine endianness when reading the Dawg from file.
Definition: dawg.h:122
#define tprintf(...)
Definition: tprintf.h:31
int32_t inT32
Definition: host.h:38
uinT64 EDGE_RECORD
Definition: dawg.h:51
int debug_level_
Definition: dawg.h:316

◆ write_squished_dawg() [2/2]

bool tesseract::SquishedDawg::write_squished_dawg ( const char *  filename)
inline

Opens the file with the given filename and writes the squished/reduced Dawg to the file.

Definition at line 491 of file dawg.h.

491  {
492  TFile file;
493  file.OpenWrite(nullptr);
494  if (!this->write_squished_dawg(&file)) {
495  tprintf("Error serializing %s\n", filename);
496  return false;
497  }
498  if (!file.CloseWrite(filename, nullptr)) {
499  tprintf("Error writing file %s\n", filename);
500  return false;
501  }
502  return true;
503  }
bool write_squished_dawg(TFile *file)
Writes the squished/reduced Dawg to a file.
Definition: dawg.cpp:371
#define tprintf(...)
Definition: tprintf.h:31

The documentation for this class was generated from the following files: