word_size_model.cpp
/**********************************************************************
 * File:        word_size_model.cpp
 * Description: Implementation of the Word Size Model Class
 * Author:      Ahmad Abdulkader
 * Created:     2008
 *
 * (C) Copyright 2008, Google Inc.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include <math.h>
#include <stdio.h>   // fprintf, sscanf
#include <string.h>  // memset
#include <string>
#include <vector>
#include "word_size_model.h"
#include "cube_utils.h"

namespace tesseract {

WordSizeModel::WordSizeModel(CharSet * char_set, bool contextual) {
  char_set_ = char_set;
  contextual_ = contextual;
}

WordSizeModel::~WordSizeModel() {
  for (int fnt = 0; fnt < font_pair_size_models_.size(); fnt++) {
    FontPairSizeInfo fnt_info = font_pair_size_models_[fnt];
    delete []fnt_info.pair_size_info[0];
    delete []fnt_info.pair_size_info;
  }
}

WordSizeModel *WordSizeModel::Create(const string &data_file_path,
                                     const string &lang,
                                     CharSet *char_set,
                                     bool contextual) {
  WordSizeModel *obj = new WordSizeModel(char_set, contextual);
  if (!obj) {
    fprintf(stderr, "Cube ERROR (WordSizeModel::Create): unable to allocate "
            "new word size model object\n");
    return NULL;
  }

  if (!obj->Init(data_file_path, lang)) {
    delete obj;
    return NULL;
  }
  return obj;
}

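// Loads the pair-wise character size statistics of the language from the
// <data_file_path><lang>.cube.size file and builds the per-font lookup
// tables later used by Cost().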
bool WordSizeModel::Init(const string &data_file_path, const string &lang) {
  string stats_file_name;
  stats_file_name = data_file_path + lang;
  stats_file_name += ".cube.size";

  // read file to memory
  string str_data;

  if (!CubeUtils::ReadFileToString(stats_file_name, &str_data)) {
    return false;
  }

  // split to words
  vector<string> tokens;
  CubeUtils::SplitStringUsing(str_data, "\t\r\n", &tokens);
  if (tokens.size() < 1) {
    fprintf(stderr, "Cube ERROR (WordSizeModel::Init): invalid "
            "file contents: %s\n", stats_file_name.c_str());
    return false;
  }

  font_pair_size_models_.clear();

  // token count per line depends on whether the language is contextual or not
  int token_cnt = contextual_ ?
      (kExpectedTokenCount + 4) : kExpectedTokenCount;
  // the count of size classes depends on whether the language is contextual
  // or not. For non-contextual languages (Ex: Eng), it is equal to the class
  // count. For contextual languages (Ex: Ara), it is equal to the class count
  // multiplied by the position count (4: start, middle, final, isolated)
  int size_class_cnt = contextual_ ?
      (char_set_->ClassCount() * 4) : char_set_->ClassCount();
  string fnt_name = "";

  for (int tok = 0; tok < tokens.size(); tok += token_cnt) {
    // a new font, write the old font data and re-init
    if (tok == 0 || fnt_name != tokens[tok]) {
      FontPairSizeInfo fnt_info;

      fnt_info.pair_size_info = new PairSizeInfo *[size_class_cnt];
      if (!fnt_info.pair_size_info) {
        fprintf(stderr, "Cube ERROR (WordSizeModel::Init): error allocating "
                "memory for font pair size info\n");
        return false;
      }

      fnt_info.pair_size_info[0] =
          new PairSizeInfo[size_class_cnt * size_class_cnt];
      if (!fnt_info.pair_size_info[0]) {
        fprintf(stderr, "Cube ERROR (WordSizeModel::Init): error allocating "
                "memory for font pair size info\n");
        return false;
      }

      memset(fnt_info.pair_size_info[0], 0, size_class_cnt * size_class_cnt *
             sizeof(PairSizeInfo));

      for (int cls = 1; cls < size_class_cnt; cls++) {
        fnt_info.pair_size_info[cls] =
            fnt_info.pair_size_info[cls - 1] + size_class_cnt;
      }

      // strip out path and extension
      string stripped_font_name = tokens[tok].substr(0, tokens[tok].find('.'));
      string::size_type strt_pos = stripped_font_name.find_last_of("/\\");
      if (strt_pos != string::npos) {
        fnt_info.font_name = stripped_font_name.substr(strt_pos);
      } else {
        fnt_info.font_name = stripped_font_name;
      }
      font_pair_size_models_.push_back(fnt_info);
    }

    // parse the data
    int cls_0;
    int cls_1;
    double delta_top;
    double wid_0;
    double hgt_0;
    double wid_1;
    double hgt_1;
    int size_code_0;
    int size_code_1;

    // read and parse the tokens
    if (contextual_) {
      int start_0;
      int end_0;
      int start_1;
      int end_1;
      // The expected format for a character size bigram is as follows:
      // ClassId0<delim>Start-flag0<delim>End-flag0<delim>String0(ignored)
      // Width0<delim>Height0<delim>
      // ClassId1<delim>Start-flag1<delim>End-flag1<delim>String1(ignored)
      // HeightDelta<delim>Width1<delim>Height1<delim>
      // In case of non-contextual languages, the Start and End flags are
      // omitted
      if (sscanf(tokens[tok + 1].c_str(), "%d", &cls_0) != 1 ||
          sscanf(tokens[tok + 2].c_str(), "%d", &start_0) != 1 ||
          sscanf(tokens[tok + 3].c_str(), "%d", &end_0) != 1 ||
          sscanf(tokens[tok + 5].c_str(), "%lf", &wid_0) != 1 ||
          sscanf(tokens[tok + 6].c_str(), "%lf", &hgt_0) != 1 ||
          sscanf(tokens[tok + 7].c_str(), "%d", &cls_1) != 1 ||
          sscanf(tokens[tok + 8].c_str(), "%d", &start_1) != 1 ||
          sscanf(tokens[tok + 9].c_str(), "%d", &end_1) != 1 ||
          sscanf(tokens[tok + 11].c_str(), "%lf", &delta_top) != 1 ||
          sscanf(tokens[tok + 12].c_str(), "%lf", &wid_1) != 1 ||
          sscanf(tokens[tok + 13].c_str(), "%lf", &hgt_1) != 1 ||
          (start_0 != 0 && start_0 != 1) || (end_0 != 0 && end_0 != 1) ||
          (start_1 != 0 && start_1 != 1) || (end_1 != 0 && end_1 != 1)) {
        fprintf(stderr, "Cube ERROR (WordSizeModel::Init): bad format at "
                "line %d\n", 1 + (tok / token_cnt));
        return false;
      }
      size_code_0 = SizeCode(cls_0, start_0, end_0);
      size_code_1 = SizeCode(cls_1, start_1, end_1);
    } else {
      if (sscanf(tokens[tok + 1].c_str(), "%d", &cls_0) != 1 ||
          sscanf(tokens[tok + 3].c_str(), "%lf", &wid_0) != 1 ||
          sscanf(tokens[tok + 4].c_str(), "%lf", &hgt_0) != 1 ||
          sscanf(tokens[tok + 5].c_str(), "%d", &cls_1) != 1 ||
          sscanf(tokens[tok + 7].c_str(), "%lf", &delta_top) != 1 ||
          sscanf(tokens[tok + 8].c_str(), "%lf", &wid_1) != 1 ||
          sscanf(tokens[tok + 9].c_str(), "%lf", &hgt_1) != 1) {
        fprintf(stderr, "Cube ERROR (WordSizeModel::Init): bad format at "
                "line %d\n", 1 + (tok / token_cnt));
        return false;
      }
      size_code_0 = cls_0;
      size_code_1 = cls_1;
    }

    // copy the data to the size tables
    FontPairSizeInfo fnt_info = font_pair_size_models_.back();
    fnt_info.pair_size_info[size_code_0][size_code_1].delta_top =
        static_cast<int>(delta_top * kShapeModelScale);
    fnt_info.pair_size_info[size_code_0][size_code_1].wid_0 =
        static_cast<int>(wid_0 * kShapeModelScale);
    fnt_info.pair_size_info[size_code_0][size_code_1].hgt_0 =
        static_cast<int>(hgt_0 * kShapeModelScale);
    fnt_info.pair_size_info[size_code_0][size_code_1].wid_1 =
        static_cast<int>(wid_1 * kShapeModelScale);
    fnt_info.pair_size_info[size_code_0][size_code_1].hgt_1 =
        static_cast<int>(hgt_1 * kShapeModelScale);

    fnt_name = tokens[tok];
  }

  return true;
}

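// Returns the size cost of a word (a sequence of character samples) as the
// mean pair-wise size mismatch against the best matching font, cast to an
// integer. Returns 0 for words of fewer than two samples, and WORST_COST if
// no font produced a single valid character pair.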
int WordSizeModel::Cost(CharSamp **samp_array, int samp_cnt) const {
  if (samp_cnt < 2) {
    return 0;
  }
  double best_dist = static_cast<double>(WORST_COST);
  int best_fnt = -1;
  for (int fnt = 0; fnt < font_pair_size_models_.size(); fnt++) {
    const FontPairSizeInfo *fnt_info = &font_pair_size_models_[fnt];
    double mean_dist = 0;
    int pair_cnt = 0;

    for (int smp_0 = 0; smp_0 < samp_cnt; smp_0++) {
      int cls_0 = char_set_->ClassID(samp_array[smp_0]->StrLabel());
      if (cls_0 < 1) {
        continue;
      }
      // compute size code for samp 0 based on class id and position
      int size_code_0;
      if (contextual_) {
        size_code_0 = SizeCode(cls_0,
                               samp_array[smp_0]->FirstChar() == 0 ? 0 : 1,
                               samp_array[smp_0]->LastChar() == 0 ? 0 : 1);
      } else {
        size_code_0 = cls_0;
      }

      int char0_height = samp_array[smp_0]->Height();
      int char0_width = samp_array[smp_0]->Width();
      int char0_top = samp_array[smp_0]->Top();

      for (int smp_1 = smp_0 + 1; smp_1 < samp_cnt; smp_1++) {
        int cls_1 = char_set_->ClassID(samp_array[smp_1]->StrLabel());
        if (cls_1 < 1) {
          continue;
        }
        // compute size code for samp 1 based on class id and position
        int size_code_1;
        if (contextual_) {
          size_code_1 = SizeCode(cls_1,
                                 samp_array[smp_1]->FirstChar() == 0 ? 0 : 1,
                                 samp_array[smp_1]->LastChar() == 0 ? 0 : 1);
        } else {
          size_code_1 = cls_1;
        }
        double dist = PairCost(
            char0_width, char0_height, char0_top, samp_array[smp_1]->Width(),
            samp_array[smp_1]->Height(), samp_array[smp_1]->Top(),
            fnt_info->pair_size_info[size_code_0][size_code_1]);
        if (dist > 0) {
          mean_dist += dist;
          pair_cnt++;
        }
      }  // smp_1
    }  // smp_0
    if (pair_cnt == 0) {
      continue;
    }
    mean_dist /= pair_cnt;
    if (best_fnt == -1 || mean_dist < best_dist) {
      best_dist = mean_dist;
      best_fnt = fnt;
    }
  }
  if (best_fnt == -1) {
    return static_cast<int>(WORST_COST);
  } else {
    return static_cast<int>(best_dist);
  }
}

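// Computes the size mismatch between an observed character pair and the model
// pair for a given font: the observed pair is scaled so that character 0
// matches the model height, then the absolute differences in widths, height
// and top delta are summed.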
double WordSizeModel::PairCost(int width_0, int height_0, int top_0,
                               int width_1, int height_1, int top_1,
                               const PairSizeInfo& pair_info) {
  double scale_factor = static_cast<double>(pair_info.hgt_0) /
      static_cast<double>(height_0);
  double dist = 0.0;
  if (scale_factor > 0) {
    double norm_width_0 = width_0 * scale_factor;
    double norm_width_1 = width_1 * scale_factor;
    double norm_height_1 = height_1 * scale_factor;
    double norm_delta_top = (top_1 - top_0) * scale_factor;

    // accumulate the distance between the model character and the
    // predicted one on all dimensions of the pair
    dist += fabs(pair_info.wid_0 - norm_width_0);
    dist += fabs(pair_info.wid_1 - norm_width_1);
    dist += fabs(pair_info.hgt_1 - norm_height_1);
    dist += fabs(pair_info.delta_top - norm_delta_top);
  }
  return dist;
}
}  // namespace tesseract
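As a rough usage sketch (not part of the file above): the model is created from the language's .cube.size data file and then queried with an array of character samples. The data directory, language code, and the WordSizeCost helper below are assumptions for illustration; the actual call sites live elsewhere in the Cube recognizer.

// Hypothetical call site: score the size consistency of a segmented word.
#include "word_size_model.h"

// Returns the size cost of a word, or -1 if the <data_dir><lang>.cube.size
// model data cannot be loaded. Path and language code are illustrative only.
int WordSizeCost(tesseract::CharSet *char_set,
                 tesseract::CharSamp **samps, int samp_cnt) {
  tesseract::WordSizeModel *model = tesseract::WordSizeModel::Create(
      "/usr/share/tessdata/", "ara", char_set, /*contextual=*/true);
  if (model == NULL) {
    return -1;  // missing or malformed .cube.size file
  }
  // Cost() returns 0 for words of fewer than two samples and WORST_COST
  // when no font in the table produced a valid character pair.
  int cost = model->Cost(samps, samp_cnt);
  delete model;
  return cost;
}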