All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
char_set.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: char_set.cpp
3  * Description: Implementation of a Character Set Class
4  * Author: Ahmad Abdulkader
5  * Created: 2007
6  *
7  * (C) Copyright 2008, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <string>
21 
22 #include "char_set.h"
23 #include "cube_utils.h"
24 #include "tessdatamanager.h"
25 
26 namespace tesseract {
27 
29  class_cnt_ = 0;
30  class_strings_ = NULL;
31  unicharset_map_ = NULL;
32  init_ = false;
33 
34  // init hash table
35  memset(hash_bin_size_, 0, sizeof(hash_bin_size_));
36 }
37 
39  if (class_strings_ != NULL) {
40  for (int cls = 0; cls < class_cnt_; cls++) {
41  if (class_strings_[cls] != NULL) {
42  delete class_strings_[cls];
43  }
44  }
45  delete []class_strings_;
46  class_strings_ = NULL;
47  }
48  delete []unicharset_map_;
49 }
50 
51 // Creates CharSet object by reading the unicharset from the
52 // TessDatamanager, and mapping Cube's unicharset to Tesseract's if
53 // they differ.
55  UNICHARSET *tess_unicharset) {
56  CharSet *char_set = new CharSet();
57  if (char_set == NULL) {
58  return NULL;
59  }
60 
61  // First look for Cube's unicharset; if not there, use tesseract's
62  bool cube_unicharset_exists;
63  if (!(cube_unicharset_exists =
64  tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET)) &&
65  !tessdata_manager->SeekToStart(TESSDATA_UNICHARSET)) {
66  fprintf(stderr, "Cube ERROR (CharSet::Create): could not find "
67  "either cube or tesseract unicharset\n");
68  return NULL;
69  }
70  FILE *charset_fp = tessdata_manager->GetDataFilePtr();
71  if (!charset_fp) {
72  fprintf(stderr, "Cube ERROR (CharSet::Create): could not load "
73  "a unicharset\n");
74  return NULL;
75  }
76 
77  // If we found a cube unicharset separate from tesseract's, load it and
78  // map its unichars to tesseract's; if only one unicharset exists,
79  // just load it.
80  bool loaded;
81  if (cube_unicharset_exists) {
82  char_set->cube_unicharset_.load_from_file(charset_fp);
83  loaded = tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET);
84  loaded = loaded && char_set->LoadSupportedCharList(
85  tessdata_manager->GetDataFilePtr(), tess_unicharset);
86  char_set->unicharset_ = &char_set->cube_unicharset_;
87  } else {
88  loaded = char_set->LoadSupportedCharList(charset_fp, NULL);
89  char_set->unicharset_ = tess_unicharset;
90  }
91  if (!loaded) {
92  delete char_set;
93  return NULL;
94  }
95 
96  char_set->init_ = true;
97  return char_set;
98 }
99 
100 // Load the list of supported chars from the given data file pointer.
101 bool CharSet::LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset) {
102  if (init_)
103  return true;
104 
105  char str_line[256];
106  // init hash table
107  memset(hash_bin_size_, 0, sizeof(hash_bin_size_));
108  // read the char count
109  if (fgets(str_line, sizeof(str_line), fp) == NULL) {
110  fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not "
111  "read char count.\n");
112  return false;
113  }
114  class_cnt_ = atoi(str_line);
115  if (class_cnt_ < 2) {
116  fprintf(stderr, "Cube ERROR (CharSet::InitMemory): invalid "
117  "class count: %d\n", class_cnt_);
118  return false;
119  }
120  // memory for class strings
121  class_strings_ = new string_32*[class_cnt_];
122  if (class_strings_ == NULL) {
123  fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not "
124  "allocate memory for class strings.\n");
125  return false;
126  }
127  // memory for unicharset map
128  if (tess_unicharset) {
129  unicharset_map_ = new int[class_cnt_];
130  if (unicharset_map_ == NULL) {
131  fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not "
132  "allocate memory for unicharset map.\n");
133  return false;
134  }
135  }
136 
137  // Read in character strings and add to hash table
138  for (int class_id = 0; class_id < class_cnt_; class_id++) {
139  // Read the class string
140  if (fgets(str_line, sizeof(str_line), fp) == NULL) {
141  fprintf(stderr, "Cube ERROR (CharSet::ReadAndHashStrings): "
142  "could not read class string with class_id=%d.\n", class_id);
143  return false;
144  }
145  // Terminate at space if any
146  char *p = strchr(str_line, ' ');
147  if (p != NULL)
148  *p = '\0';
149  // Convert to UTF32 and store
150  string_32 str32;
151  // Convert NULL to a space
152  if (strcmp(str_line, "NULL") == 0) {
153  strcpy(str_line, " ");
154  }
155  CubeUtils::UTF8ToUTF32(str_line, &str32);
156  class_strings_[class_id] = new string_32(str32);
157  if (class_strings_[class_id] == NULL) {
158  fprintf(stderr, "Cube ERROR (CharSet::ReadAndHashStrings): could not "
159  "allocate memory for class string with class_id=%d.\n", class_id);
160  return false;
161  }
162 
163  // Add to hash-table
164  int hash_val = Hash(reinterpret_cast<const char_32 *>(str32.c_str()));
165  if (hash_bin_size_[hash_val] >= kMaxHashSize) {
166  fprintf(stderr, "Cube ERROR (CharSet::LoadSupportedCharList): hash "
167  "table is full.\n");
168  return false;
169  }
170  hash_bins_[hash_val][hash_bin_size_[hash_val]++] = class_id;
171 
172  if (tess_unicharset != NULL) {
173  // Add class id to unicharset map
174  UNICHAR_ID tess_id = tess_unicharset->unichar_to_id(str_line);
175  if (tess_id == INVALID_UNICHAR_ID) {
176  tess_unicharset->unichar_insert(str_line);
177  tess_id = tess_unicharset->unichar_to_id(str_line);
178  }
179  ASSERT_HOST(tess_id != INVALID_UNICHAR_ID);
180  unicharset_map_[class_id] = tess_id;
181  }
182  }
183  return true;
184 }
185 
186 } // tesseract
FILE * GetDataFilePtr() const
const UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
basic_string< char_32 > string_32
Definition: string_32.h:41
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:346
#define ASSERT_HOST(x)
Definition: errcode.h:84
static void UTF8ToUTF32(const char *utf8_str, string_32 *str32)
Definition: cube_utils.cpp:266
int UNICHAR_ID
Definition: unichar.h:33
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
bool SeekToStart(TessdataType tessdata_type)
#define NULL
Definition: host.h:144
static CharSet * Create(TessdataManager *tessdata_manager, UNICHARSET *tess_unicharset)
Definition: char_set.cpp:54