All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
word_list_lang_model.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: word_list_lang_model.cpp
3  * Description: Implementation of the Word List Language Model Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <string>
21 #include <vector>
22 #include "word_list_lang_model.h"
23 #include "cube_utils.h"
24 
25 #include "ratngs.h"
26 #include "trie.h"
27 
28 namespace tesseract {
30  cntxt_ = cntxt;
31  dawg_ = NULL;
32  init_ = false;
33 }
34 
36  Cleanup();
37 }
38 
39 // Cleanup
40 void WordListLangModel::Cleanup() {
41  if (dawg_ != NULL) {
42  delete dawg_;
43  dawg_ = NULL;
44  }
45  init_ = false;
46 }
47 
48 // Initialize the language model
49 bool WordListLangModel::Init() {
50  if (init_ == true) {
51  return true;
52  }
53  // The last parameter to the Trie constructor (the debug level) is set to
54  // false for now, until Cube has a way to express its preferred debug level.
55  dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM,
56  cntxt_->CharacterSet()->ClassCount(), false);
57  if (dawg_ == NULL) {
58  return false;
59  }
60  init_ = true;
61  return true;
62 }
63 
64 // return a pointer to the root
66  return NULL;
67 }
68 
69 // return the edges emerging from the current state
71  LangModEdge *edge,
72  int *edge_cnt) {
73  // initialize if necessary
74  if (init_ == false) {
75  if (Init() == false) {
76  return NULL;
77  }
78  }
79 
80  (*edge_cnt) = 0;
81 
82  EDGE_REF edge_ref;
83 
84  TessLangModEdge *tess_lm_edge = reinterpret_cast<TessLangModEdge *>(edge);
85 
86  if (tess_lm_edge == NULL) {
87  edge_ref = 0;
88  } else {
89  edge_ref = tess_lm_edge->EndEdge();
90 
91  // advance node
92  edge_ref = dawg_->next_node(edge_ref);
93  if (edge_ref == 0) {
94  return NULL;
95  }
96  }
97 
98  // allocate memory for edges
99  LangModEdge **edge_array = new LangModEdge *[kMaxEdge];
100  if (edge_array == NULL) {
101  return NULL;
102  }
103 
104  // now get all the emerging edges
105  (*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref,
106  edge_array + (*edge_cnt));
107 
108  return edge_array;
109 }
110 
111 // Returns true if the given char_32 sequence is accepted by the language model.
112 // TODO(ahmadab): currently not implemented; always returns false.
114  bool terminal, LangModEdge **edges) {
115  return false;
116 }
117 
118 // Recursive helper function for WordVariants().
119 void WordListLangModel::WordVariants(const CharSet &char_set,
120  string_32 prefix_str32,
121  WERD_CHOICE *word_so_far,
122  string_32 str32,
123  vector<WERD_CHOICE *> *word_variants) {
124  int str_len = str32.length();
125  if (str_len == 0) {
126  if (word_so_far->length() > 0) {
127  word_variants->push_back(new WERD_CHOICE(*word_so_far));
128  }
129  } else {
130  // Try out all the possible prefixes of the str32.
131  for (int len = 1; len <= str_len; len++) {
132  // Check if prefix is supported in character set.
133  string_32 str_pref32 = str32.substr(0, len);
134  int class_id = char_set.ClassID(reinterpret_cast<const char_32 *>(
135  str_pref32.c_str()));
136  if (class_id <= 0) {
137  continue;
138  } else {
139  string_32 new_prefix_str32 = prefix_str32 + str_pref32;
140  string_32 new_str32 = str32.substr(len);
141  word_so_far->append_unichar_id(class_id, 1, 0.0, 0.0);
142  WordVariants(char_set, new_prefix_str32, word_so_far, new_str32,
143  word_variants);
144  word_so_far->remove_last_unichar_id();
145  }
146  }
147  }
148 }
149 
150 // Compute all the variants of a 32-bit string in terms of the class-ids
151 // This is needed for languages that have ligatures. A word can then have more
152 // than one spelling in terms of the class-ids
154  const UNICHARSET *uchset, string_32 str32,
155  vector<WERD_CHOICE *> *word_variants) {
156  for (int i = 0; i < word_variants->size(); i++) {
157  delete (*word_variants)[i];
158  }
159  word_variants->clear();
160  string_32 prefix_str32;
161  WERD_CHOICE word_so_far(uchset);
162  WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants);
163 }
164 
165 // add a new UTF-8 string to the lang model
166 bool WordListLangModel::AddString(const char *char_ptr) {
167  if (!init_ && !Init()) { // initialize if necessary
168  return false;
169  }
170 
171  string_32 str32;
172  CubeUtils::UTF8ToUTF32(char_ptr, &str32);
173  if (str32.length() < 1) {
174  return false;
175  }
176  return AddString32(str32.c_str());
177 }
178 
179 // add a new UTF-32 string to the lang model
180 bool WordListLangModel::AddString32(const char_32 *char_32_ptr) {
181  if (char_32_ptr == NULL) {
182  return false;
183  }
184  // get all the word variants
185  vector<WERD_CHOICE *> word_variants;
186  WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(),
187  char_32_ptr, &word_variants);
188 
189  if (word_variants.size() > 0) {
190  // find the shortest variant
191  int shortest_word = 0;
192  for (int word = 1; word < word_variants.size(); word++) {
193  if (word_variants[shortest_word]->length() >
194  word_variants[word]->length()) {
195  shortest_word = word;
196  }
197  }
198  // only add the shortest grapheme interpretation of string to the word list
199  dawg_->add_word_to_dawg(*word_variants[shortest_word]);
200  }
201  for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; }
202  return true;
203 }
204 
205 }
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:446
bool AddString32(const char_32 *char_32_ptr)
int length() const
Definition: ratngs.h:300
bool IsValidSequence(const char_32 *sequence, bool eow_flag, LangModEdge **edges)
basic_string< char_32 > string_32
Definition: string_32.h:41
WordListLangModel(CubeRecoContext *cntxt)
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:178
static int CreateChildren(CubeRecoContext *cntxt, const Dawg *edges, NODE_REF edge_reg, LangModEdge **lm_edges)
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:266
int ClassID(const char_32 *str) const
Definition: char_set.h:54
void remove_last_unichar_id()
Definition: ratngs.h:480
const UNICHARSET * TessUnicharset() const
LangModEdge ** GetEdges(CharAltList *alt_list, LangModEdge *edge, int *edge_cnt)
static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset, string_32 str32, vector< WERD_CHOICE * > *word_variants)
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:132
CharSet * CharacterSet() const
inT64 EDGE_REF
Definition: dawg.h:54
signed int char_32
Definition: string_32.h:40
int ClassCount() const
Definition: char_set.h:111
#define NULL
Definition: host.h:144
bool AddString(const char *char_ptr)