All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
cube_utils.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: cube_utils.cpp
3  * Description: Implementation of the Cube Utilities Class
4  * Author: Ahmad Abdulkader
5  * Created: 2008
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <math.h>
21 #include <string>
22 #include <vector>
23 #include "cube_utils.h"
24 #include "char_set.h"
25 #include "unichar.h"
26 
27 namespace tesseract {
29 }
30 
32 }
33 
37 int CubeUtils::Prob2Cost(double prob_val) {
38  if (prob_val < MIN_PROB) {
39  return MIN_PROB_COST;
40  }
41  return static_cast<int>(-log(prob_val) * PROB2COST_SCALE);
42 }
43 
47 double CubeUtils::Cost2Prob(int cost) {
48  return exp(-cost / PROB2COST_SCALE);
49 }
50 
54 int CubeUtils::StrLen(const char_32 *char_32_ptr) {
55  if (char_32_ptr == NULL) {
56  return 0;
57  }
58  int len = -1;
59  while (char_32_ptr[++len]);
60  return len;
61 }
62 
66 int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) {
67  const char_32 *pch1 = str1;
68  const char_32 *pch2 = str2;
69 
70  for (; (*pch1) != 0 && (*pch2) != 0; pch1++, pch2++) {
71  if ((*pch1) != (*pch2)) {
72  return (*pch1) - (*pch2);
73  }
74  }
75 
76  if ((*pch1) == 0) {
77  if ((*pch2) == 0) {
78  return 0;
79  } else {
80  return -1;
81  }
82  } else {
83  return 1;
84  }
85 }
86 
91  int len = StrLen(str32);
92  char_32 *new_str = new char_32[len + 1];
93  if (new_str == NULL) {
94  return NULL;
95  }
96  memcpy(new_str, str32, len * sizeof(*str32));
97  new_str[len] = 0;
98  return new_str;
99 }
100 
104 CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top,
105  int wid, int hgt) {
106  // get the raw img data from the image
107  unsigned char *temp_buff = GetImageData(pix, left, top, wid, hgt);
108  if (temp_buff == NULL) {
109  return NULL;
110  }
111 
112  // create a char samp from temp buffer
113  CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff);
114 
115  // clean up temp buffer
116  delete []temp_buff;
117  return char_samp;
118 }
119 
124  // parameter check
125  if (char_samp == NULL) {
126  return NULL;
127  }
128 
129  // get the raw data
130  int stride = char_samp->Stride();
131  int wid = char_samp->Width();
132  int hgt = char_samp->Height();
133 
134  Pix *pix = pixCreate(wid, hgt, 1);
135  if (pix == NULL) {
136  return NULL;
137  }
138 
139  // copy the contents
140  unsigned char *line = char_samp->RawData();
141  for (int y = 0; y < hgt ; y++, line += stride) {
142  for (int x = 0; x < wid; x++) {
143  if (line[x] != 0) {
144  pixSetPixel(pix, x, y, 0);
145  } else {
146  pixSetPixel(pix, x, y, 255);
147  }
148  }
149  }
150 
151  return pix;
152 }
153 
157 unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top,
158  int wid, int hgt) {
159  // skip invalid dimensions
160  if (left < 0 || top < 0 || wid < 0 || hgt < 0 ||
161  (left + wid) > pix->w || (top + hgt) > pix->h ||
162  pix->d != 1) {
163  return NULL;
164  }
165 
166  // copy the char img to a temp buffer
167  unsigned char *temp_buff = new unsigned char[wid * hgt];
168  if (temp_buff == NULL) {
169  return NULL;
170  }
171  l_int32 w;
172  l_int32 h;
173  l_int32 d;
174  l_int32 wpl;
175  l_uint32 *line;
176  l_uint32 *data;
177 
178  pixGetDimensions(pix, &w, &h, &d);
179  wpl = pixGetWpl(pix);
180  data = pixGetData(pix);
181  line = data + (top * wpl);
182 
183  for (int y = 0, off = 0; y < hgt ; y++) {
184  for (int x = 0; x < wid; x++, off++) {
185  temp_buff[off] = GET_DATA_BIT(line, x + left) ? 0 : 255;
186  }
187  line += wpl;
188  }
189  return temp_buff;
190 }
191 
195 bool CubeUtils::ReadFileToString(const string &file_name, string *str) {
196  str->clear();
197  FILE *fp = fopen(file_name.c_str(), "rb");
198  if (fp == NULL) {
199  return false;
200  }
201 
202  // get the size of the size
203  fseek(fp, 0, SEEK_END);
204  int file_size = ftell(fp);
205  if (file_size < 1) {
206  fclose(fp);
207  return false;
208  }
209  // adjust string size
210  str->reserve(file_size);
211  // read the contents
212  rewind(fp);
213  char *buff = new char[file_size];
214  if (buff == NULL) {
215  fclose(fp);
216  return false;
217  }
218  int read_bytes = fread(buff, 1, static_cast<int>(file_size), fp);
219  if (read_bytes == file_size) {
220  str->append(buff, file_size);
221  }
222  delete []buff;
223  fclose(fp);
224  return (read_bytes == file_size);
225 }
226 
230 void CubeUtils::SplitStringUsing(const string &str,
231  const string &delims,
232  vector<string> *str_vec) {
233  // Optimize the common case where delims is a single character.
234  if (delims[0] != '\0' && delims[1] == '\0') {
235  char c = delims[0];
236  const char* p = str.data();
237  const char* end = p + str.size();
238  while (p != end) {
239  if (*p == c) {
240  ++p;
241  } else {
242  const char* start = p;
243  while (++p != end && *p != c);
244  str_vec->push_back(string(start, p - start));
245  }
246  }
247  return;
248  }
249 
250  string::size_type begin_index, end_index;
251  begin_index = str.find_first_not_of(delims);
252  while (begin_index != string::npos) {
253  end_index = str.find_first_of(delims, begin_index);
254  if (end_index == string::npos) {
255  str_vec->push_back(str.substr(begin_index));
256  return;
257  }
258  str_vec->push_back(str.substr(begin_index, (end_index - begin_index)));
259  begin_index = str.find_first_not_of(delims, end_index);
260  }
261 }
262 
266 void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) {
267  str32->clear();
268  int len = strlen(utf8_str);
269  int step = 0;
270  for (int ch = 0; ch < len; ch += step) {
271  step = UNICHAR::utf8_step(utf8_str + ch);
272  if (step > 0) {
273  UNICHAR uni_ch(utf8_str + ch, step);
274  (*str32) += uni_ch.first_uni();
275  }
276  }
277 }
278 
282 void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) {
283  str->clear();
284  for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++) {
285  UNICHAR uni_ch((*ch_32));
286  char *utf8 = uni_ch.utf8_str();
287  if (utf8 != NULL) {
288  (*str) += utf8;
289  delete []utf8;
290  }
291  }
292 }
293 
294 bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) {
295  bool all_one_case = true;
296  bool capitalized;
297  bool prev_upper;
298  bool prev_lower;
299  bool first_upper;
300  bool first_lower;
301  bool cur_upper;
302  bool cur_lower;
303 
304  string str8;
305  if (!char_set) {
306  // If cube char_set is missing, use C-locale-dependent functions
307  // on UTF8 characters to determine case properties.
308  first_upper = isupper(str32[0]);
309  first_lower = islower(str32[0]);
310  if (first_upper)
311  capitalized = true;
312  prev_upper = first_upper;
313  prev_lower = islower(str32[0]);
314  for (int c = 1; str32[c] != 0; ++c) {
315  cur_upper = isupper(str32[c]);
316  cur_lower = islower(str32[c]);
317  if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
318  all_one_case = false;
319  if (cur_upper)
320  capitalized = false;
321  prev_upper = cur_upper;
322  prev_lower = cur_lower;
323  }
324  } else {
325  UNICHARSET *unicharset = char_set->InternalUnicharset();
326  // Use UNICHARSET functions to determine case properties
327  first_upper = unicharset->get_isupper(char_set->ClassID(str32[0]));
328  first_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
329  if (first_upper)
330  capitalized = true;
331  prev_upper = first_upper;
332  prev_lower = unicharset->get_islower(char_set->ClassID(str32[0]));
333 
334  for (int c = 1; c < StrLen(str32); ++c) {
335  cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c]));
336  cur_lower = unicharset->get_islower(char_set->ClassID(str32[c]));
337  if ((prev_upper && cur_lower) || (prev_lower && cur_upper))
338  all_one_case = false;
339  if (cur_upper)
340  capitalized = false;
341  prev_upper = cur_upper;
342  prev_lower = cur_lower;
343  }
344  }
345  return all_one_case || capitalized;
346 }
347 
348 char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) {
349  if (!char_set) {
350  return NULL;
351  }
352  UNICHARSET *unicharset = char_set->InternalUnicharset();
353  int len = StrLen(str32);
354  char_32 *lower = new char_32[len + 1];
355  if (!lower)
356  return NULL;
357  for (int i = 0; i < len; ++i) {
358  char_32 ch = str32[i];
359  if (ch == INVALID_UNICHAR_ID) {
360  delete [] lower;
361  return NULL;
362  }
363  // convert upper-case characters to lower-case
364  if (unicharset->get_isupper(char_set->ClassID(ch))) {
365  UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch));
366  const char_32 *str32_lower = char_set->ClassString(uid_lower);
367  // expect lower-case version of character to be a single character
368  if (!str32_lower || StrLen(str32_lower) != 1) {
369  delete [] lower;
370  return NULL;
371  }
372  lower[i] = str32_lower[0];
373  } else {
374  lower[i] = ch;
375  }
376  }
377  lower[len] = 0;
378  return lower;
379 }
380 
381 char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) {
382  if (!char_set) {
383  return NULL;
384  }
385  UNICHARSET *unicharset = char_set->InternalUnicharset();
386  int len = StrLen(str32);
387  char_32 *upper = new char_32[len + 1];
388  if (!upper)
389  return NULL;
390  for (int i = 0; i < len; ++i) {
391  char_32 ch = str32[i];
392  if (ch == INVALID_UNICHAR_ID) {
393  delete [] upper;
394  return NULL;
395  }
396  // convert lower-case characters to upper-case
397  if (unicharset->get_islower(char_set->ClassID(ch))) {
398  UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch));
399  const char_32 *str32_upper = char_set->ClassString(uid_upper);
400  // expect upper-case version of character to be a single character
401  if (!str32_upper || StrLen(str32_upper) != 1) {
402  delete [] upper;
403  return NULL;
404  }
405  upper[i] = str32_upper[0];
406  } else {
407  upper[i] = ch;
408  }
409  }
410  upper[len] = 0;
411  return upper;
412 }
413 } // namespace tesseract
#define MIN_PROB
Definition: cube_const.h:28
static Pix * PixFromCharSample(CharSamp *char_samp)
Definition: cube_utils.cpp:123
static int Prob2Cost(double prob_val)
Definition: cube_utils.cpp:37
unsigned char * RawData() const
Definition: bmp_8.h:51
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:463
basic_string< char_32 > string_32
Definition: string_32.h:41
static bool ReadFileToString(const string &file_name, string *str)
Definition: cube_utils.cpp:195
#define PROB2COST_SCALE
Definition: cube_const.h:24
static char_32 * ToLower(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:348
unsigned short Width() const
Definition: bmp_8.h:48
static CharSamp * FromRawData(int left, int top, int wid, int hgt, unsigned char *data)
Definition: char_samp.cpp:273
UNICHARSET * InternalUnicharset()
Definition: char_set.h:121
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:266
int ClassID(const char_32 *str) const
Definition: char_set.h:54
int first_uni() const
Definition: unichar.cpp:97
char * utf8_str() const
Definition: unichar.cpp:125
#define MIN_PROB_COST
Definition: cube_const.h:26
int UNICHAR_ID
Definition: unichar.h:33
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:134
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:456
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:631
unsigned short Stride() const
Definition: bmp_8.h:49
static double Cost2Prob(int cost)
Definition: cube_utils.cpp:47
static int StrLen(const char_32 *str)
Definition: cube_utils.cpp:54
static void UTF32ToUTF8(const char_32 *utf32_str, string *str)
Definition: cube_utils.cpp:282
static void SplitStringUsing(const string &str, const string &delims, vector< string > *str_vec)
Definition: cube_utils.cpp:230
static int StrCmp(const char_32 *str1, const char_32 *str2)
Definition: cube_utils.cpp:66
unsigned short Height() const
Definition: bmp_8.h:50
signed int char_32
Definition: string_32.h:40
#define NULL
Definition: host.h:144
static char_32 * ToUpper(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:381
static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set)
Definition: cube_utils.cpp:294
static char_32 * StrDup(const char_32 *str)
Definition: cube_utils.cpp:90
static CharSamp * CharSampleFromPix(Pix *pix, int left, int top, int wid, int hgt)
Definition: cube_utils.cpp:104
const char_32 * ClassString(int class_id) const
Definition: char_set.h:104