All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract::WordListLangModel Class Reference

#include <word_list_lang_model.h>

Inheritance diagram for tesseract::WordListLangModel:
tesseract::LangModel

Public Member Functions

 WordListLangModel (CubeRecoContext *cntxt)
 
 ~WordListLangModel ()
 
LangModEdge * Root ()
 
LangModEdge ** GetEdges (CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
 
bool IsValidSequence (const char_32 *sequence, bool eow_flag, LangModEdge **edges)
 
bool IsLeadingPunc (char_32 ch)
 
bool IsTrailingPunc (char_32 ch)
 
bool IsDigit (char_32 ch)
 
bool AddString (const char *char_ptr)
 
bool AddString32 (const char_32 *char_32_ptr)
 
- Public Member Functions inherited from tesseract::LangModel
 LangModel ()
 
virtual ~LangModel ()
 
bool OOD ()
 
bool Numeric ()
 
bool WordList ()
 
bool Punc ()
 
void SetOOD (bool ood)
 
void SetNumeric (bool numeric)
 
void SetWordList (bool word_list)
 
void SetPunc (bool punc_enabled)
 

Static Public Member Functions

static void WordVariants (const CharSet &char_set, const UNICHARSET *uchset, string_32 str32, vector< WERD_CHOICE * > *word_variants)
 

Additional Inherited Members

- Protected Attributes inherited from tesseract::LangModel
bool ood_enabled_
 
bool numeric_enabled_
 
bool word_list_enabled_
 
bool punc_enabled_
 

Detailed Description

Definition at line 39 of file word_list_lang_model.h.

Constructor & Destructor Documentation

tesseract::WordListLangModel::WordListLangModel ( CubeRecoContext * cntxt)
explicit

Definition at line 29 of file word_list_lang_model.cpp.

29  {
30  cntxt_ = cntxt;
31  dawg_ = NULL;
32  init_ = false;
33 }
#define NULL
Definition: host.h:144
tesseract::WordListLangModel::~WordListLangModel ( )

Definition at line 35 of file word_list_lang_model.cpp.

35  {
36  Cleanup();
37 }

Member Function Documentation

bool tesseract::WordListLangModel::AddString ( const char *  char_ptr)

Definition at line 166 of file word_list_lang_model.cpp.

166  {
167  if (!init_ && !Init()) { // initialize if necessary
168  return false;
169  }
170 
171  string_32 str32;
172  CubeUtils::UTF8ToUTF32(char_ptr, &str32);
173  if (str32.length() < 1) {
174  return false;
175  }
176  return AddString32(str32.c_str());
177 }
bool AddString32(const char_32 *char_32_ptr)
basic_string< char_32 > string_32
Definition: string_32.h:41
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:266
bool tesseract::WordListLangModel::AddString32 ( const char_32 * char_32_ptr)

Definition at line 180 of file word_list_lang_model.cpp.

180  {
181  if (char_32_ptr == NULL) {
182  return false;
183  }
184  // get all the word variants
185  vector<WERD_CHOICE *> word_variants;
186  WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),
187  char_32_ptr, &word_variants);
188 
189  if (word_variants.size() > 0) {
190  // find the shortest variant
191  int shortest_word = 0;
192  for (int word = 1; word < word_variants.size(); word++) {
193  if (word_variants[shortest_word]->length() >
194  word_variants[word]->length()) {
195  shortest_word = word;
196  }
197  }
198  // only add the shortest grapheme interpretation of string to the word list
199  dawg_->add_word_to_dawg(*word_variants[shortest_word]);
200  }
201  for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }
202  return true;
203 }
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:178
const UNICHARSET * TessUnicharset() const
static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset, string_32 str32, vector< WERD_CHOICE * > *word_variants)
CharSet * CharacterSet() const
#define NULL
Definition: host.h:144
LangModEdge ** tesseract::WordListLangModel::GetEdges ( CharAltList * alt_list,
LangModEdge * edge,
int *  edge_cnt 
)
virtual

Implements tesseract::LangModel.

Definition at line 70 of file word_list_lang_model.cpp.

72  {
73  // initialize if necessary
74  if (init_ == false) {
75  if (Init() == false) {
76  return NULL;
77  }
78  }
79 
80  (*edge_cnt) = 0;
81 
82  EDGE_REF edge_ref;
83 
84  TessLangModEdge *tess_lm_edge = reinterpret_cast<TessLangModEdge *>(edge);
85 
86  if (tess_lm_edge == NULL) {
87  edge_ref = 0;
88  } else {
89  edge_ref = tess_lm_edge->EndEdge();
90 
91  // advance node
92  edge_ref = dawg_->next_node(edge_ref);
93  if (edge_ref == 0) {
94  return NULL;
95  }
96  }
97 
98  // allocate memory for edges
99  LangModEdge **edge_array = new LangModEdge *[kMaxEdge];
100  if (edge_array == NULL) {
101  return NULL;
102  }
103 
104  // now get all the emerging edges
105  (*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref,
106  edge_array + (*edge_cnt));
107 
108  return edge_array;
109 }
static int CreateChildren(CubeRecoContext *cntxt, const Dawg *edges, NODE_REF edge_reg, LangModEdge **lm_edges)
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:132
inT64 EDGE_REF
Definition: dawg.h:54
#define NULL
Definition: host.h:144
bool tesseract::WordListLangModel::IsDigit ( char_32  ch)
inline virtual

Implements tesseract::LangModel.

Definition at line 58 of file word_list_lang_model.h.

58 { return false; } // not yet implemented
bool tesseract::WordListLangModel::IsLeadingPunc ( char_32  ch)
inline virtual

Implements tesseract::LangModel.

Definition at line 56 of file word_list_lang_model.h.

56 { return false; } // not yet implemented
bool tesseract::WordListLangModel::IsTrailingPunc ( char_32  ch)
inline virtual

Implements tesseract::LangModel.

Definition at line 57 of file word_list_lang_model.h.

57 { return false; } // not yet implemented
bool tesseract::WordListLangModel::IsValidSequence ( const char_32 * sequence,
bool  eow_flag,
LangModEdge **  edges 
)
virtual

Implements tesseract::LangModel.

Definition at line 113 of file word_list_lang_model.cpp.

114  {
115  return false;
116 }
LangModEdge * tesseract::WordListLangModel::Root ( )
virtual

Implements tesseract::LangModel.

Definition at line 65 of file word_list_lang_model.cpp.

65  {
66  return NULL;
67 }
#define NULL
Definition: host.h:144
void tesseract::WordListLangModel::WordVariants ( const CharSet & char_set,
const UNICHARSET * uchset,
string_32  str32,
vector< WERD_CHOICE * > *  word_variants 
)
static

Definition at line 153 of file word_list_lang_model.cpp.

155  {
156  for (int i = 0; i < word_variants->size(); i++) {
157  delete (*word_variants)[i];
158  }
159  word_variants->clear();
160  string_32 prefix_str32;
161  WERD_CHOICE word_so_far(uchset);
162  WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
163 }
basic_string< char_32 > string_32
Definition: string_32.h:41
static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset, string_32 str32, vector< WERD_CHOICE * > *word_variants)

The documentation for this class was generated from the following files: