All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
char_bigrams.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: char_bigrams.cpp
3  * Description: Implementation of a Character Bigrams Class
4  * Author: Ahmad Abdulkader
5  * Created: 2007
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <algorithm>
21 #include <math.h>
22 #include <string>
23 #include <vector>
24 
25 #include "char_bigrams.h"
26 #include "cube_utils.h"
27 #include "ndminx.h"
28 #include "cube_const.h"
29 
30 namespace tesseract {
31 
33  memset(&bigram_table_, 0, sizeof(bigram_table_));
34 }
35 
37  if (bigram_table_.char_bigram != NULL) {
38  for (int ch1 = 0; ch1 <= bigram_table_.max_char; ch1++) {
39  CharBigram *char_bigram = bigram_table_.char_bigram + ch1;
40 
41  if (char_bigram->bigram != NULL) {
42  delete []char_bigram->bigram;
43  }
44  }
45  delete []bigram_table_.char_bigram;
46  }
47 }
48 
49 CharBigrams *CharBigrams::Create(const string &data_file_path,
50  const string &lang) {
51  string file_name;
52  string str;
53 
54  file_name = data_file_path + lang;
55  file_name += ".cube.bigrams";
56 
57  // load the string into memory
58  if (!CubeUtils::ReadFileToString(file_name, &str)) {
59  return NULL;
60  }
61 
62  // construct a new object
63  CharBigrams *char_bigrams_obj = new CharBigrams();
64  if (char_bigrams_obj == NULL) {
65  fprintf(stderr, "Cube ERROR (CharBigrams::Create): could not create "
66  "character bigrams object.\n");
67  return NULL;
68  }
69  CharBigramTable *table = &char_bigrams_obj->bigram_table_;
70 
71  table->total_cnt = 0;
72  table->max_char = -1;
73  table->char_bigram = NULL;
74 
75  // split into lines
76  vector<string> str_vec;
77  CubeUtils::SplitStringUsing(str, "\r\n", &str_vec);
78 
79  for (int big = 0; big < str_vec.size(); big++) {
80  char_32 ch1;
81  char_32 ch2;
82  int cnt;
83  if (sscanf(str_vec[big].c_str(), "%d %x %x", &cnt, &ch1, &ch2) != 3) {
84  fprintf(stderr, "Cube ERROR (CharBigrams::Create): invalid format "
85  "reading line: %s\n", str_vec[big].c_str());
86  delete char_bigrams_obj;
87  return NULL;
88  }
89 
90  // expand the bigram table
91  if (ch1 > table->max_char) {
92  CharBigram *char_bigram = new CharBigram[ch1 + 1];
93  if (char_bigram == NULL) {
94  fprintf(stderr, "Cube ERROR (CharBigrams::Create): error allocating "
95  "additional memory for character bigram table.\n");
96  return NULL;
97  }
98 
99  if (table->char_bigram != NULL && table->max_char >= 0) {
100  memcpy(char_bigram, table->char_bigram,
101  (table->max_char + 1) * sizeof(*char_bigram));
102 
103  delete []table->char_bigram;
104  }
105  table->char_bigram = char_bigram;
106 
107  // init
108  for (int new_big = table->max_char + 1; new_big <= ch1; new_big++) {
109  table->char_bigram[new_big].total_cnt = 0;
110  table->char_bigram[new_big].max_char = -1;
111  table->char_bigram[new_big].bigram = NULL;
112  }
113  table->max_char = ch1;
114  }
115 
116  if (ch2 > table->char_bigram[ch1].max_char) {
117  Bigram *bigram = new Bigram[ch2 + 1];
118  if (bigram == NULL) {
119  fprintf(stderr, "Cube ERROR (CharBigrams::Create): error allocating "
120  "memory for bigram.\n");
121  delete char_bigrams_obj;
122  return NULL;
123  }
124 
125  if (table->char_bigram[ch1].bigram != NULL &&
126  table->char_bigram[ch1].max_char >= 0) {
127  memcpy(bigram, table->char_bigram[ch1].bigram,
128  (table->char_bigram[ch1].max_char + 1) * sizeof(*bigram));
129  delete []table->char_bigram[ch1].bigram;
130  }
131  table->char_bigram[ch1].bigram = bigram;
132 
133  // init
134  for (int new_big = table->char_bigram[ch1].max_char + 1;
135  new_big <= ch2; new_big++) {
136  table->char_bigram[ch1].bigram[new_big].cnt = 0;
137  }
138  table->char_bigram[ch1].max_char = ch2;
139  }
140 
141  table->char_bigram[ch1].bigram[ch2].cnt = cnt;
142  table->char_bigram[ch1].total_cnt += cnt;
143  table->total_cnt += cnt;
144  }
145 
146  // compute costs (-log probs)
147  table->worst_cost = static_cast<int>(
148  -PROB2COST_SCALE * log(0.5 / table->total_cnt));
149  for (char_32 ch1 = 0; ch1 <= table->max_char; ch1++) {
150  for (char_32 ch2 = 0; ch2 <= table->char_bigram[ch1].max_char; ch2++) {
151  int cnt = table->char_bigram[ch1].bigram[ch2].cnt;
152  table->char_bigram[ch1].bigram[ch2].cost =
153  static_cast<int>(-PROB2COST_SCALE *
154  log(MAX(0.5, static_cast<double>(cnt)) /
155  table->total_cnt));
156  }
157  }
158  return char_bigrams_obj;
159 }
160 
161 int CharBigrams::PairCost(char_32 ch1, char_32 ch2) const {
162  if (ch1 > bigram_table_.max_char) {
163  return bigram_table_.worst_cost;
164  }
165  if (ch2 > bigram_table_.char_bigram[ch1].max_char) {
166  return bigram_table_.worst_cost;
167  }
168  return bigram_table_.char_bigram[ch1].bigram[ch2].cost;
169 }
170 
171 int CharBigrams::Cost(const char_32 *char_32_ptr, CharSet *char_set) const {
172  if (!char_32_ptr || char_32_ptr[0] == 0) {
173  return bigram_table_.worst_cost;
174  }
175  int cost = MeanCostWithSpaces(char_32_ptr);
176  if (CubeUtils::StrLen(char_32_ptr) >= kMinLengthCaseInvariant &&
177  CubeUtils::IsCaseInvariant(char_32_ptr, char_set)) {
178  char_32 *lower_32 = CubeUtils::ToLower(char_32_ptr, char_set);
179  if (lower_32 && lower_32[0] != 0) {
180  int cost_lower = MeanCostWithSpaces(lower_32);
181  cost = MIN(cost, cost_lower);
182  delete [] lower_32;
183  }
184  char_32 *upper_32 = CubeUtils::ToUpper(char_32_ptr, char_set);
185  if (upper_32 && upper_32[0] != 0) {
186  int cost_upper = MeanCostWithSpaces(upper_32);
187  cost = MIN(cost, cost_upper);
188  delete [] upper_32;
189  }
190  }
191  return cost;
192 }
193 
194 int CharBigrams::MeanCostWithSpaces(const char_32 *char_32_ptr) const {
195  if (!char_32_ptr)
196  return bigram_table_.worst_cost;
197  int len = CubeUtils::StrLen(char_32_ptr);
198  int cost = 0;
199  int c = 0;
200  cost = PairCost(' ', char_32_ptr[0]);
201  for (c = 1; c < len; c++) {
202  cost += PairCost(char_32_ptr[c - 1], char_32_ptr[c]);
203  }
204  cost += PairCost(char_32_ptr[len - 1], ' ');
205  return static_cast<int>(cost / static_cast<double>(len + 1));
206 }
207 } // namespace tesseract
#define MAX(x, y)
Definition: ndminx.h:24
#define MIN(x, y)
Definition: ndminx.h:28
static bool ReadFileToString(const string &file_name, string *str)
Definition: cube_utils.cpp:195
static CharBigrams * Create(const string &data_file_path, const string &lang)
#define PROB2COST_SCALE
Definition: cube_const.h:24
int Cost(const char_32 *str, CharSet *char_set) const
static char_32 * ToLower(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:348
int PairCost(char_32 ch1, char_32 ch2) const
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
Definition: cube_utils.cpp:230
int MeanCostWithSpaces(const char_32 *char_32_ptr) const
signed int char_32
Definition: string_32.h:40
#define NULL
Definition: host.h:144
static char_32 * ToUpper(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:381
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:294