All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
tesseract::WordUnigrams Class Reference

#include <word_unigrams.h>

Public Member Functions

 WordUnigrams ()
 
 ~WordUnigrams ()
 
int Cost (const char_32 *str32, LangModel *lang_mod, CharSet *char_set) const
 

Static Public Member Functions

static WordUnigramsCreate (const string &data_file_path, const string &lang)
 

Protected Member Functions

int CostInternal (const char *str) const
 

Detailed Description

Definition at line 34 of file word_unigrams.h.

Constructor & Destructor Documentation

tesseract::WordUnigrams::WordUnigrams ( )

Definition at line 32 of file word_unigrams.cpp.

32  {
33  costs_ = NULL;
34  words_ = NULL;
35  word_cnt_ = 0;
36 }
#define NULL
Definition: host.h:144
tesseract::WordUnigrams::~WordUnigrams ( )

Definition at line 38 of file word_unigrams.cpp.

38  {
39  if (words_ != NULL) {
40  if (words_[0] != NULL) {
41  delete []words_[0];
42  }
43 
44  delete []words_;
45  words_ = NULL;
46  }
47 
48  if (costs_ != NULL) {
49  delete []costs_;
50  }
51 }
#define NULL
Definition: host.h:144

Member Function Documentation

int tesseract::WordUnigrams::Cost ( const char_32 key_str32,
LangModel lang_mod,
CharSet char_set 
) const

Split input into space-separated tokens, strip trailing punctuation from each, determine case properties, call UTF-8 flavor of cost function on each word, and aggregate all into single mean word cost.

Definition at line 154 of file word_unigrams.cpp.

156  {
157  if (!key_str32)
158  return 0;
159  // convert string to UTF8 to split into space-separated words
160  string key_str;
161  CubeUtils::UTF32ToUTF8(key_str32, &key_str);
162  vector<string> words;
163  CubeUtils::SplitStringUsing(key_str, " \t", &words);
164 
165  // no words => no cost
166  if (words.size() <= 0) {
167  return 0;
168  }
169 
170  // aggregate the costs of all the words
171  int cost = 0;
172  for (int word_idx = 0; word_idx < words.size(); word_idx++) {
173  // convert each word back to UTF32 for analyzing case and punctuation
174  string_32 str32;
175  CubeUtils::UTF8ToUTF32(words[word_idx].c_str(), &str32);
176  int len = CubeUtils::StrLen(str32.c_str());
177 
178  // strip all trailing punctuation
179  string clean_str;
180  int clean_len = len;
181  bool trunc = false;
182  while (clean_len > 0 &&
183  lang_mod->IsTrailingPunc(str32.c_str()[clean_len - 1])) {
184  --clean_len;
185  trunc = true;
186  }
187 
188  // If either the original string was not truncated (no trailing
189  // punctuation) or the entire string was removed (all characters
190  // are trailing punctuation), evaluate original word as is;
191  // otherwise, copy all but the trailing punctuation characters
192  char_32 *clean_str32 = NULL;
193  if (clean_len == 0 || !trunc) {
194  clean_str32 = CubeUtils::StrDup(str32.c_str());
195  } else {
196  clean_str32 = new char_32[clean_len + 1];
197  for (int i = 0; i < clean_len; ++i) {
198  clean_str32[i] = str32[i];
199  }
200  clean_str32[clean_len] = '\0';
201  }
202  ASSERT_HOST(clean_str32 != NULL);
203 
204  string str8;
205  CubeUtils::UTF32ToUTF8(clean_str32, &str8);
206  int word_cost = CostInternal(str8.c_str());
207 
208  // if case invariant, get costs of all-upper-case and all-lower-case
209  // versions and return the min cost
210  if (clean_len >= kMinLengthNumOrCaseInvariant &&
211  CubeUtils::IsCaseInvariant(clean_str32, char_set)) {
212  char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set);
213  if (lower_32) {
214  string lower_8;
215  CubeUtils::UTF32ToUTF8(lower_32, &lower_8);
216  word_cost = MIN(word_cost, CostInternal(lower_8.c_str()));
217  delete [] lower_32;
218  }
219  char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set);
220  if (upper_32) {
221  string upper_8;
222  CubeUtils::UTF32ToUTF8(upper_32, &upper_8);
223  word_cost = MIN(word_cost, CostInternal(upper_8.c_str()));
224  delete [] upper_32;
225  }
226  }
227 
228  if (clean_len >= kMinLengthNumOrCaseInvariant) {
229  // if characters are all numeric, incur 0 word cost
230  bool is_numeric = true;
231  for (int i = 0; i < clean_len; ++i) {
232  if (!lang_mod->IsDigit(clean_str32[i]))
233  is_numeric = false;
234  }
235  if (is_numeric)
236  word_cost = 0;
237  }
238  delete [] clean_str32;
239  cost += word_cost;
240  } // word_idx
241 
242  // return the mean cost
243  return static_cast<int>(cost / static_cast<double>(words.size()));
244 }
#define MIN(x, y)
Definition: ndminx.h:28
basic_string< char_32 > string_32
Definition: string_32.h:41
static char_32 * ToLower(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:348
#define ASSERT_HOST(x)
Definition: errcode.h:84
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:266
int CostInternal(const char *str) const
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
Definition: cube_utils.cpp:282
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
Definition: cube_utils.cpp:230
signed int char_32
Definition: string_32.h:40
#define NULL
Definition: host.h:144
static char_32 * ToUpper(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:381
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:294
static char_32 * StrDup(const char_32 *str)
Definition: cube_utils.cpp:90
int tesseract::WordUnigrams::CostInternal ( const char *  key_str) const
protected

Search for UTF-8 string using binary search of sorted words_ array.

Definition at line 249 of file word_unigrams.cpp.

249  {
250  if (strlen(key_str) == 0)
251  return not_in_list_cost_;
252  int hi = word_cnt_ - 1;
253  int lo = 0;
254  while (lo <= hi) {
255  int current = (hi + lo) / 2;
256  int comp = strcmp(key_str, words_[current]);
257  // a match
258  if (comp == 0) {
259  return costs_[current];
260  }
261  if (comp < 0) {
262  // go lower
263  hi = current - 1;
264  } else {
265  // go higher
266  lo = current + 1;
267  }
268  }
269  return not_in_list_cost_;
270 }
WordUnigrams * tesseract::WordUnigrams::Create ( const string &  data_file_path,
const string &  lang 
)
static

Load the word-list and unigrams from file and create an object The word list is assumed to be sorted in lexicographic order.

Definition at line 57 of file word_unigrams.cpp.

58  {
59  string file_name;
60  string str;
61 
62  file_name = data_file_path + lang;
63  file_name += ".cube.word-freq";
64 
65  // load the string into memory
66  if (CubeUtils::ReadFileToString(file_name, &str) == false) {
67  return NULL;
68  }
69 
70  // split into lines
71  vector<string> str_vec;
72  CubeUtils::SplitStringUsing(str, "\r\n \t", &str_vec);
73  if (str_vec.size() < 2) {
74  return NULL;
75  }
76 
77  // allocate memory
78  WordUnigrams *word_unigrams_obj = new WordUnigrams();
79  if (word_unigrams_obj == NULL) {
80  fprintf(stderr, "Cube ERROR (WordUnigrams::Create): could not create "
81  "word unigrams object.\n");
82  return NULL;
83  }
84 
85  int full_len = str.length();
86  int word_cnt = str_vec.size() / 2;
87  word_unigrams_obj->words_ = new char*[word_cnt];
88  word_unigrams_obj->costs_ = new int[word_cnt];
89 
90  if (word_unigrams_obj->words_ == NULL ||
91  word_unigrams_obj->costs_ == NULL) {
92  fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating "
93  "word unigram fields.\n");
94  delete word_unigrams_obj;
95  return NULL;
96  }
97 
98  word_unigrams_obj->words_[0] = new char[full_len];
99  if (word_unigrams_obj->words_[0] == NULL) {
100  fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating "
101  "word unigram fields.\n");
102  delete word_unigrams_obj;
103  return NULL;
104  }
105 
106  // construct sorted list of words and costs
107  word_unigrams_obj->word_cnt_ = 0;
108  char *char_buff = word_unigrams_obj->words_[0];
109  word_cnt = 0;
110  int max_cost = 0;
111 
112  for (int wrd = 0; wrd < str_vec.size(); wrd += 2) {
113  word_unigrams_obj->words_[word_cnt] = char_buff;
114 
115  strcpy(char_buff, str_vec[wrd].c_str());
116  char_buff += (str_vec[wrd].length() + 1);
117 
118  if (sscanf(str_vec[wrd + 1].c_str(), "%d",
119  word_unigrams_obj->costs_ + word_cnt) != 1) {
120  fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error reading "
121  "word unigram data.\n");
122  delete word_unigrams_obj;
123  return NULL;
124  }
125  // update max cost
126  max_cost = MAX(max_cost, word_unigrams_obj->costs_[word_cnt]);
127  word_cnt++;
128  }
129  word_unigrams_obj->word_cnt_ = word_cnt;
130 
131  // compute the not-in-list-cost by assuming that a word not in the list
132  // [ahmadab]: This can be computed as follows:
133  // - Given that the distribution of words follow Zipf's law:
134  // (F = K / (rank ^ S)), where s is slightly > 1.0
135  // - Number of words in the list is N
136  // - The mean frequency of a word that did not appear in the list is the
137  // area under the rest of the Zipf's curve divided by 2 (the mean)
138  // - The area would be the bound integral from N to infinity =
139  // (K * S) / (N ^ (S + 1)) ~= K / (N ^ 2)
140  // - Given that cost = -LOG(prob), the cost of an unlisted word would be
141  // = max_cost + 2*LOG(N)
142  word_unigrams_obj->not_in_list_cost_ = max_cost +
143  (2 * CubeUtils::Prob2Cost(1.0 / word_cnt));
144  // success
145  return word_unigrams_obj;
146 }
#define MAX(x, y)
Definition: ndminx.h:24
static int Prob2Cost(double prob_val)
Definition: cube_utils.cpp:37
static bool ReadFileToString(const string &file_name, string *str)
Definition: cube_utils.cpp:195
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
Definition: cube_utils.cpp:230
#define NULL
Definition: host.h:144

The documentation for this class was generated from the following files: