All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract::TessLangModel Class Reference

#include <tess_lang_model.h>

Inheritance diagram for tesseract::TessLangModel:
tesseract::LangModel

Public Member Functions

 TessLangModel (const string &lm_params, const string &data_file_path, bool load_system_dawg, TessdataManager *tessdata_manager, CubeRecoContext *cntxt)
 
 ~TessLangModel ()
 
TessLangModEdge * Root ()
 
LangModEdge ** GetEdges (CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
 
bool IsValidSequence (const char_32 *sequence, bool eow_flag, LangModEdge **final_edge=NULL)
 
bool IsLeadingPunc (char_32 ch)
 
bool IsTrailingPunc (char_32 ch)
 
bool IsDigit (char_32 ch)
 
void RemoveInvalidCharacters (string *lm_str)
 
- Public Member Functions inherited from tesseract::LangModel
 LangModel ()
 
virtual ~LangModel ()
 
bool OOD ()
 
bool Numeric ()
 
bool WordList ()
 
bool Punc ()
 
void SetOOD (bool ood)
 
void SetNumeric (bool numeric)
 
void SetWordList (bool word_list)
 
void SetPunc (bool punc_enabled)
 

Additional Inherited Members

- Protected Attributes inherited from tesseract::LangModel
bool ood_enabled_
 
bool numeric_enabled_
 
bool word_list_enabled_
 
bool punc_enabled_
 

Detailed Description

Definition at line 38 of file tess_lang_model.h.

Constructor & Destructor Documentation

tesseract::TessLangModel::TessLangModel ( const string &  lm_params,
const string &  data_file_path,
bool  load_system_dawg,
TessdataManager *  tessdata_manager,
CubeRecoContext *  cntxt 
)

Definition at line 60 of file tess_lang_model.cpp.

64  {
65  cntxt_ = cntxt;
66  has_case_ = cntxt_->HasCase();
67  // Load the rest of the language model elements from file
68  LoadLangModelElements(lm_params);
69  // Load word_dawgs_ if needed.
70  if (tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET)) {
71  word_dawgs_ = new DawgVector();
72  if (load_system_dawg &&
73  tessdata_manager->SeekToStart(TESSDATA_CUBE_SYSTEM_DAWG)) {
74  // The last parameter to the Dawg constructor (the debug level) is set to
75  // false, until Cube has a way to express its preferred debug level.
76  *word_dawgs_ += new SquishedDawg(tessdata_manager->GetDataFilePtr(),
78  cntxt_->Lang().c_str(),
79  SYSTEM_DAWG_PERM, false);
80  }
81  } else {
82  word_dawgs_ = NULL;
83  }
84 }
GenericVector< Dawg * > DawgVector
Definition: dict.h:50
#define NULL
Definition: host.h:144
const string & Lang() const
tesseract::TessLangModel::~TessLangModel ( )
inline

Definition at line 45 of file tess_lang_model.h.

45  {
46  if (word_dawgs_ != NULL) {
47  word_dawgs_->delete_data_pointers();
48  delete word_dawgs_;
49  }
50  }
void delete_data_pointers()
#define NULL
Definition: host.h:144

Member Function Documentation

LangModEdge ** tesseract::TessLangModel::GetEdges ( CharAltList *  alt_list,
LangModEdge *  edge,
int *  edge_cnt 
)
virtual

Implements tesseract::LangModel.

Definition at line 169 of file tess_lang_model.cpp.

171  {
172  TessLangModEdge *tess_lm_edge =
173  reinterpret_cast<TessLangModEdge *>(lang_mod_edge);
174  LangModEdge **edge_array = NULL;
175  (*edge_cnt) = 0;
176 
177  // if we are starting from the root, we'll instantiate every DAWG
178  // and get the all the edges that emerge from the root
179  if (tess_lm_edge == NULL) {
180  // get DAWG count from Tesseract
181  int dawg_cnt = NumDawgs();
182  // preallocate the edge buffer
183  (*edge_cnt) = dawg_cnt * max_edge_;
184  edge_array = new LangModEdge *[(*edge_cnt)];
185  if (edge_array == NULL) {
186  return NULL;
187  }
188 
189  for (int dawg_idx = (*edge_cnt) = 0; dawg_idx < dawg_cnt; dawg_idx++) {
190  const Dawg *curr_dawg = GetDawg(dawg_idx);
191  // Only look through word Dawgs (since there is a special way of
192  // handling numbers and punctuation).
193  if (curr_dawg->type() == DAWG_TYPE_WORD) {
194  (*edge_cnt) += FanOut(alt_list, curr_dawg, 0, 0, NULL, true,
195  edge_array + (*edge_cnt));
196  }
197  } // dawg
198 
199  (*edge_cnt) += FanOut(alt_list, number_dawg_, 0, 0, NULL, true,
200  edge_array + (*edge_cnt));
201 
202  // OOD: it is intentionally not added to the list to make sure it comes
203  // at the end
204  (*edge_cnt) += FanOut(alt_list, ood_dawg_, 0, 0, NULL, true,
205  edge_array + (*edge_cnt));
206 
207  // set the root flag for all root edges
208  for (int edge_idx = 0; edge_idx < (*edge_cnt); edge_idx++) {
209  edge_array[edge_idx]->SetRoot(true);
210  }
211  } else { // not starting at the root
212  // preallocate the edge buffer
213  (*edge_cnt) = max_edge_;
214  // allocate memory for edges
215  edge_array = new LangModEdge *[(*edge_cnt)];
216  if (edge_array == NULL) {
217  return NULL;
218  }
219 
220  // get the FanOut edges from the root of each dawg
221  (*edge_cnt) = FanOut(alt_list,
222  tess_lm_edge->GetDawg(),
223  tess_lm_edge->EndEdge(), tess_lm_edge->EdgeMask(),
224  tess_lm_edge->EdgeString(), false, edge_array);
225  }
226  return edge_array;
227 }
#define NULL
Definition: host.h:144
bool tesseract::TessLangModel::IsDigit ( char_32  ch)
virtual

Implements tesseract::LangModel.

Definition at line 162 of file tess_lang_model.cpp.

162  {
163  return digits_.find(ch) != string::npos;
164 }
bool tesseract::TessLangModel::IsLeadingPunc ( char_32  ch)
virtual

Implements tesseract::LangModel.

Definition at line 154 of file tess_lang_model.cpp.

154  {
155  return lead_punc_.find(ch) != string::npos;
156 }
bool tesseract::TessLangModel::IsTrailingPunc ( char_32  ch)
virtual

Implements tesseract::LangModel.

Definition at line 158 of file tess_lang_model.cpp.

158  {
159  return trail_punc_.find(ch) != string::npos;
160 }
bool tesseract::TessLangModel::IsValidSequence ( const char_32 *  sequence,
bool  eow_flag,
LangModEdge **  final_edge = NULL 
)
virtual

Implements tesseract::LangModel.

Definition at line 145 of file tess_lang_model.cpp.

146  {
147  if (final_edge != NULL) {
148  (*final_edge) = NULL;
149  }
150 
151  return IsValidSequence(NULL, sequence, eow_flag, final_edge);
152 }
bool IsValidSequence(const char_32 *sequence, bool eow_flag, LangModEdge **final_edge=NULL)
#define NULL
Definition: host.h:144
void tesseract::TessLangModel::RemoveInvalidCharacters ( string *  lm_str)

Definition at line 482 of file tess_lang_model.cpp.

482  {
483  CharSet *char_set = cntxt_->CharacterSet();
484  tesseract::string_32 lm_str32;
485  CubeUtils::UTF8ToUTF32(lm_str->c_str(), &lm_str32);
486 
487  int len = CubeUtils::StrLen(lm_str32.c_str());
488  char_32 *clean_str32 = new char_32[len + 1];
489  if (!clean_str32)
490  return;
491  int clean_len = 0;
492  for (int i = 0; i < len; ++i) {
493  int class_id = char_set->ClassID((char_32)lm_str32[i]);
494  if (class_id != INVALID_UNICHAR_ID) {
495  clean_str32[clean_len] = lm_str32[i];
496  ++clean_len;
497  }
498  }
499  clean_str32[clean_len] = 0;
500  if (clean_len < len) {
501  lm_str->clear();
502  CubeUtils::UTF32ToUTF8(clean_str32, lm_str);
503  }
504  delete [] clean_str32;
505 }
basic_string< char_32 > string_32
Definition: string_32.h:41
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:266
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
Definition: cube_utils.cpp:282
CharSet * CharacterSet() const
signed int char_32
Definition: string_32.h:40
TessLangModEdge* tesseract::TessLangModel::Root ( )
inline virtual

Implements tesseract::LangModel.

Definition at line 53 of file tess_lang_model.h.

53  {
54  return NULL;
55  }
#define NULL
Definition: host.h:144

The documentation for this class was generated from the following files: