All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
word_unigrams.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: word_unigrams.cpp
3  * Description: Implementation of the Word Unigrams Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <math.h>
21 #include <string>
22 #include <vector>
23 #include <algorithm>
24 
25 #include "const.h"
26 #include "cube_utils.h"
27 #include "ndminx.h"
28 #include "word_unigrams.h"
29 
30 namespace tesseract {
31 
33  costs_ = NULL;
34  words_ = NULL;
35  word_cnt_ = 0;
36 }
37 
39  if (words_ != NULL) {
40  if (words_[0] != NULL) {
41  delete []words_[0];
42  }
43 
44  delete []words_;
45  words_ = NULL;
46  }
47 
48  if (costs_ != NULL) {
49  delete []costs_;
50  }
51 }
52 
57 WordUnigrams *WordUnigrams::Create(const string &data_file_path,
58  const string &lang) {
59  string file_name;
60  string str;
61 
62  file_name = data_file_path + lang;
63  file_name += ".cube.word-freq";
64 
65  // load the string into memory
66  if (CubeUtils::ReadFileToString(file_name, &str) == false) {
67  return NULL;
68  }
69 
70  // split into lines
71  vector<string> str_vec;
72  CubeUtils::SplitStringUsing(str, "\r\n \t", &str_vec);
73  if (str_vec.size() < 2) {
74  return NULL;
75  }
76 
77  // allocate memory
78  WordUnigrams *word_unigrams_obj = new WordUnigrams();
79  if (word_unigrams_obj == NULL) {
80  fprintf(stderr, "Cube ERROR (WordUnigrams::Create): could not create "
81  "word unigrams object.\n");
82  return NULL;
83  }
84 
85  int full_len = str.length();
86  int word_cnt = str_vec.size() / 2;
87  word_unigrams_obj->words_ = new char*[word_cnt];
88  word_unigrams_obj->costs_ = new int[word_cnt];
89 
90  if (word_unigrams_obj->words_ == NULL ||
91  word_unigrams_obj->costs_ == NULL) {
92  fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating "
93  "word unigram fields.\n");
94  delete word_unigrams_obj;
95  return NULL;
96  }
97 
98  word_unigrams_obj->words_[0] = new char[full_len];
99  if (word_unigrams_obj->words_[0] == NULL) {
100  fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating "
101  "word unigram fields.\n");
102  delete word_unigrams_obj;
103  return NULL;
104  }
105 
106  // construct sorted list of words and costs
107  word_unigrams_obj->word_cnt_ = 0;
108  char *char_buff = word_unigrams_obj->words_[0];
109  word_cnt = 0;
110  int max_cost = 0;
111 
112  for (int wrd = 0; wrd < str_vec.size(); wrd += 2) {
113  word_unigrams_obj->words_[word_cnt] = char_buff;
114 
115  strcpy(char_buff, str_vec[wrd].c_str());
116  char_buff += (str_vec[wrd].length() + 1);
117 
118  if (sscanf(str_vec[wrd + 1].c_str(), "%d",
119  word_unigrams_obj->costs_ + word_cnt) != 1) {
120  fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error reading "
121  "word unigram data.\n");
122  delete word_unigrams_obj;
123  return NULL;
124  }
125  // update max cost
126  max_cost = MAX(max_cost, word_unigrams_obj->costs_[word_cnt]);
127  word_cnt++;
128  }
129  word_unigrams_obj->word_cnt_ = word_cnt;
130 
131  // compute the not-in-list-cost by assuming that a word not in the list
132  // [ahmadab]: This can be computed as follows:
133  // - Given that the distribution of words follow Zipf's law:
134  // (F = K / (rank ^ S)), where s is slightly > 1.0
135  // - Number of words in the list is N
136  // - The mean frequency of a word that did not appear in the list is the
137  // area under the rest of the Zipf's curve divided by 2 (the mean)
138  // - The area would be the bound integral from N to infinity =
139  // (K * S) / (N ^ (S + 1)) ~= K / (N ^ 2)
140  // - Given that cost = -LOG(prob), the cost of an unlisted word would be
141  // = max_cost + 2*LOG(N)
142  word_unigrams_obj->not_in_list_cost_ = max_cost +
143  (2 * CubeUtils::Prob2Cost(1.0 / word_cnt));
144  // success
145  return word_unigrams_obj;
146 }
147 
154 int WordUnigrams::Cost(const char_32 *key_str32,
155  LangModel *lang_mod,
156  CharSet *char_set) const {
157  if (!key_str32)
158  return 0;
159  // convert string to UTF8 to split into space-separated words
160  string key_str;
161  CubeUtils::UTF32ToUTF8(key_str32, &key_str);
162  vector<string> words;
163  CubeUtils::SplitStringUsing(key_str, " \t", &words);
164 
165  // no words => no cost
166  if (words.size() <= 0) {
167  return 0;
168  }
169 
170  // aggregate the costs of all the words
171  int cost = 0;
172  for (int word_idx = 0; word_idx < words.size(); word_idx++) {
173  // convert each word back to UTF32 for analyzing case and punctuation
174  string_32 str32;
175  CubeUtils::UTF8ToUTF32(words[word_idx].c_str(), &str32);
176  int len = CubeUtils::StrLen(str32.c_str());
177 
178  // strip all trailing punctuation
179  string clean_str;
180  int clean_len = len;
181  bool trunc = false;
182  while (clean_len > 0 &&
183  lang_mod->IsTrailingPunc(str32.c_str()[clean_len - 1])) {
184  --clean_len;
185  trunc = true;
186  }
187 
188  // If either the original string was not truncated (no trailing
189  // punctuation) or the entire string was removed (all characters
190  // are trailing punctuation), evaluate original word as is;
191  // otherwise, copy all but the trailing punctuation characters
192  char_32 *clean_str32 = NULL;
193  if (clean_len == 0 || !trunc) {
194  clean_str32 = CubeUtils::StrDup(str32.c_str());
195  } else {
196  clean_str32 = new char_32[clean_len + 1];
197  for (int i = 0; i < clean_len; ++i) {
198  clean_str32[i] = str32[i];
199  }
200  clean_str32[clean_len] = '\0';
201  }
202  ASSERT_HOST(clean_str32 != NULL);
203 
204  string str8;
205  CubeUtils::UTF32ToUTF8(clean_str32, &str8);
206  int word_cost = CostInternal(str8.c_str());
207 
208  // if case invariant, get costs of all-upper-case and all-lower-case
209  // versions and return the min cost
210  if (clean_len >= kMinLengthNumOrCaseInvariant &&
211  CubeUtils::IsCaseInvariant(clean_str32, char_set)) {
212  char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set);
213  if (lower_32) {
214  string lower_8;
215  CubeUtils::UTF32ToUTF8(lower_32, &lower_8);
216  word_cost = MIN(word_cost, CostInternal(lower_8.c_str()));
217  delete [] lower_32;
218  }
219  char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set);
220  if (upper_32) {
221  string upper_8;
222  CubeUtils::UTF32ToUTF8(upper_32, &upper_8);
223  word_cost = MIN(word_cost, CostInternal(upper_8.c_str()));
224  delete [] upper_32;
225  }
226  }
227 
228  if (clean_len >= kMinLengthNumOrCaseInvariant) {
229  // if characters are all numeric, incur 0 word cost
230  bool is_numeric = true;
231  for (int i = 0; i < clean_len; ++i) {
232  if (!lang_mod->IsDigit(clean_str32[i]))
233  is_numeric = false;
234  }
235  if (is_numeric)
236  word_cost = 0;
237  }
238  delete [] clean_str32;
239  cost += word_cost;
240  } // word_idx
241 
242  // return the mean cost
243  return static_cast<int>(cost / static_cast<double>(words.size()));
244 }
245 
249 int WordUnigrams::CostInternal(const char *key_str) const {
250  if (strlen(key_str) == 0)
251  return not_in_list_cost_;
252  int hi = word_cnt_ - 1;
253  int lo = 0;
254  while (lo <= hi) {
255  int current = (hi + lo) / 2;
256  int comp = strcmp(key_str, words_[current]);
257  // a match
258  if (comp == 0) {
259  return costs_[current];
260  }
261  if (comp < 0) {
262  // go lower
263  hi = current - 1;
264  } else {
265  // go higher
266  lo = current + 1;
267  }
268  }
269  return not_in_list_cost_;
270 }
271 } // namespace tesseract
#define MAX(x, y)
Definition: ndminx.h:24
static int Prob2Cost(double prob_val)
Definition: cube_utils.cpp:37
#define MIN(x, y)
Definition: ndminx.h:28
basic_string< char_32 > string_32
Definition: string_32.h:41
static bool ReadFileToString(const string &file_name, string *str)
Definition: cube_utils.cpp:195
static char_32 * ToLower(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:348
#define ASSERT_HOST(x)
Definition: errcode.h:84
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:266
static WordUnigrams * Create(const string &data_file_path, const string &lang)
int Cost(const char_32 *str32, LangModel *lang_mod, CharSet *char_set) const
int CostInternal(const char *str) const
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
Definition: cube_utils.cpp:282
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
Definition: cube_utils.cpp:230
virtual bool IsDigit(char_32 ch)=0
signed int char_32
Definition: string_32.h:40
#define NULL
Definition: host.h:144
static char_32 * ToUpper(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:381
virtual bool IsTrailingPunc(char_32 ch)=0
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:294
static char_32 * StrDup(const char_32 *str)
Definition: cube_utils.cpp:90