tesseract  4.00.00dev
unicharcompress.cpp
Go to the documentation of this file.
1 // File: unicharcompress.cpp
3 // Description: Unicode re-encoding using a sequence of smaller numbers in
4 // place of a single large code for CJK, similarly for Indic,
5 // and dissection of ligatures for other scripts.
6 // Author: Ray Smith
7 // Created: Wed Mar 04 14:45:01 PST 2015
8 //
9 // (C) Copyright 2015, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 #include "unicharcompress.h"
23 #include <algorithm>
24 #include <memory>
25 #include "tprintf.h"
26 
27 namespace tesseract {
28 
// String used to represent the null_id in direct_set.
const char* kNullChar = "<nul>";
// Radix to make unique values from the stored radical codes.
const int kRadicalRadix = 29;

// "Hash" function for a radical-code sequence: accumulates the elements as
// digits in base kRadicalRadix, giving a unique number for each short code
// sequence that we can use as the key in a hash map of ints instead of
// trying to hash the vectors. (Note: NOT a simple sum of elements; each
// position is weighted by a power of the radix.)
static int RadicalPreHash(const std::vector<int>& rs) {
  size_t result = 0;
  for (int radical : rs) {
    result *= kRadicalRadix;
    result += radical;
  }
  // Explicit narrowing: the value is only ever used as a map key, so losing
  // high bits on very long sequences is acceptable.
  return static_cast<int>(result);
}

// A hash map to convert unicodes to radical encoding.
typedef std::unordered_map<int, std::unique_ptr<std::vector<int>>> RSMap;
// A hash map to count occurrences of each radical encoding.
typedef std::unordered_map<int, int> RSCounts;
50 
51 static bool DecodeRadicalLine(STRING* radical_data_line, RSMap* radical_map) {
52  if (radical_data_line->length() == 0 || (*radical_data_line)[0] == '#')
53  return true;
54  GenericVector<STRING> entries;
55  radical_data_line->split(' ', &entries);
56  if (entries.size() < 2) return false;
57  char* end = nullptr;
58  int unicode = strtol(&entries[0][0], &end, 10);
59  if (*end != '\0') return false;
60  std::unique_ptr<std::vector<int>> radicals(new std::vector<int>);
61  for (int i = 1; i < entries.size(); ++i) {
62  int radical = strtol(&entries[i][0], &end, 10);
63  if (*end != '\0') return false;
64  radicals->push_back(radical);
65  }
66  (*radical_map)[unicode] = std::move(radicals);
67  return true;
68 }
69 
70 // Helper function builds the RSMap from the radical-stroke file, which has
71 // already been read into a STRING. Returns false on error.
72 // The radical_stroke_table is non-const because it gets split and the caller
73 // is unlikely to want to use it again.
74 static bool DecodeRadicalTable(STRING* radical_data, RSMap* radical_map) {
76  radical_data->split('\n', &lines);
77  for (int i = 0; i < lines.size(); ++i) {
78  if (!DecodeRadicalLine(&lines[i], radical_map)) {
79  tprintf("Invalid format in radical table at line %d: %s\n", i,
80  lines[i].string());
81  return false;
82  }
83  }
84  return true;
85 }
86 
87 UnicharCompress::UnicharCompress() : code_range_(0) {}
91  Cleanup();
92  encoder_ = src.encoder_;
93  code_range_ = src.code_range_;
94  SetupDecoder();
95  return *this;
96 }
97 
98 // Computes the encoding for the given unicharset. It is a requirement that
99 // the file training/langdata/radical-stroke.txt have been read into the
100 // input string radical_stroke_table.
101 // Returns false if the encoding cannot be constructed.
102 bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
103  STRING* radical_stroke_table) {
104  RSMap radical_map;
105  if (radical_stroke_table != nullptr &&
106  !DecodeRadicalTable(radical_stroke_table, &radical_map))
107  return false;
108  encoder_.clear();
109  UNICHARSET direct_set;
110  // To avoid unused codes, clear the special codes from the direct_set.
111  direct_set.clear();
112  // Always keep space as 0;
113  direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
114  // Null char is next if we have one.
115  if (null_id >= 0) {
116  direct_set.unichar_insert(kNullChar);
117  }
118  RSCounts radical_counts;
119  // In the initial map, codes [0, unicharset.size()) are
120  // reserved for non-han/hangul sequences of 1 or more unicodes.
121  int hangul_offset = unicharset.size();
122  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
123  const int kTotalJamos = kLCount + kVCount + kTCount;
124  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
125  // to measure the number of radicals and strokes, initially we use the same
126  // code range for all 3 Han code positions, and fix them after.
127  int han_offset = hangul_offset + kTotalJamos;
128  int max_num_strokes = -1;
129  for (int u = 0; u <= unicharset.size(); ++u) {
130  // We special-case allow null_id to be equal to unicharset.size() in case
131  // there is no space in unicharset for it.
132  if (u == unicharset.size() && u != null_id) break; // Finished
133  RecodedCharID code;
134  // Convert to unicodes.
135  std::vector<char32> unicodes;
136  string cleaned;
137  if (u < unicharset.size())
138  cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
139  if (u < unicharset.size() &&
140  (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
141  // Check single unicodes for Hangul/Han and encode if so.
142  int unicode = unicodes[0];
143  int leading, vowel, trailing;
144  auto it = radical_map.find(unicode);
145  if (it != radical_map.end()) {
146  // This is Han. Use the radical codes directly.
147  int num_radicals = it->second->size();
148  for (int c = 0; c < num_radicals; ++c) {
149  code.Set(c, han_offset + (*it->second)[c]);
150  }
151  int pre_hash = RadicalPreHash(*it->second);
152  int num_samples = radical_counts[pre_hash]++;
153  if (num_samples > 0)
154  code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
155  } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
156  // This is Hangul. Since we know the exact size of each part at compile
157  // time, it gets the bottom set of codes.
158  code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
159  trailing + kLCount + kVCount + hangul_offset);
160  }
161  }
162  // If the code is still empty, it wasn't Han or Hangul.
163  if (code.length() == 0) {
164  // Special cases.
165  if (u == UNICHAR_SPACE) {
166  code.Set(0, 0); // Space.
167  } else if (u == null_id || (unicharset.has_special_codes() &&
169  code.Set(0, direct_set.unichar_to_id(kNullChar));
170  } else {
171  // Add the direct_set unichar-ids of the unicodes in sequence to the
172  // code.
173  for (int i = 0; i < unicodes.size(); ++i) {
174  int position = code.length();
175  if (position >= RecodedCharID::kMaxCodeLen) {
176  tprintf("Unichar %d=%s is too long to encode!!\n", u,
177  unicharset.id_to_unichar(u));
178  return false;
179  }
180  int uni = unicodes[i];
181  UNICHAR unichar(uni);
182  char* utf8 = unichar.utf8_str();
183  if (!direct_set.contains_unichar(utf8))
184  direct_set.unichar_insert(utf8);
185  code.Set(position, direct_set.unichar_to_id(utf8));
186  delete[] utf8;
187  if (direct_set.size() >
188  unicharset.size() + !unicharset.has_special_codes()) {
189  // Code space got bigger!
190  tprintf("Code space expanded from original unicharset!!\n");
191  return false;
192  }
193  }
194  }
195  }
196  encoder_.push_back(code);
197  }
198  // Now renumber Han to make all codes unique. We already added han_offset to
199  // all Han. Now separate out the radical, stroke, and count codes for Han.
200  // In the uniqued Han encoding, the 1st code uses the next radical_map.size()
201  // values, the 2nd code uses the next max_num_strokes+1 values, and the 3rd
202  // code uses the rest for the max number of duplicated radical/stroke combos.
203  int code_offset = 0;
204  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
205  int max_offset = 0;
206  for (int u = 0; u < unicharset.size(); ++u) {
207  RecodedCharID* code = &encoder_[u];
208  if (code->length() <= i) continue;
209  max_offset = std::max(max_offset, (*code)(i)-han_offset);
210  code->Set(i, (*code)(i) + code_offset);
211  }
212  if (max_offset == 0) break;
213  code_offset += max_offset + 1;
214  }
215  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
216  SetupDecoder();
217  return true;
218 }
219 
220 // Sets up an encoder that doesn't change the unichars at all, so it just
221 // passes them through unchanged.
224  for (int u = 0; u < unicharset.size(); ++u) {
225  RecodedCharID code;
226  code.Set(0, u);
227  codes.push_back(code);
228  }
229  if (!unicharset.has_special_codes()) {
230  RecodedCharID code;
231  code.Set(0, unicharset.size());
232  codes.push_back(code);
233  }
234  SetupDirect(codes);
235 }
236 
237 // Sets up an encoder directly using the given encoding vector, which maps
238 // unichar_ids to the given codes.
240  encoder_ = codes;
241  ComputeCodeRange();
242  SetupDecoder();
243 }
244 
245 // Renumbers codes to eliminate unused values.
246 void UnicharCompress::DefragmentCodeValues(int encoded_null) {
247  // There may not be any Hangul, but even if there is, it is possible that not
248  // all codes are used. Likewise with the Han encoding, it is possible that not
249  // all numbers of strokes are used.
250  ComputeCodeRange();
251  GenericVector<int> offsets;
252  offsets.init_to_size(code_range_, 0);
253  // Find which codes are used
254  for (int c = 0; c < encoder_.size(); ++c) {
255  const RecodedCharID& code = encoder_[c];
256  for (int i = 0; i < code.length(); ++i) {
257  offsets[code(i)] = 1;
258  }
259  }
260  // Compute offsets based on code use.
261  int offset = 0;
262  for (int i = 0; i < offsets.size(); ++i) {
263  // If not used, decrement everything above here.
264  // We are moving encoded_null to the end, so it is not "used".
265  if (offsets[i] == 0 || i == encoded_null) {
266  --offset;
267  } else {
268  offsets[i] = offset;
269  }
270  }
271  if (encoded_null >= 0) {
272  // The encoded_null is moving to the end, for the benefit of TensorFlow,
273  // which is offsets.size() + offsets.back().
274  offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null;
275  }
276  // Now apply the offsets.
277  for (int c = 0; c < encoder_.size(); ++c) {
278  RecodedCharID* code = &encoder_[c];
279  for (int i = 0; i < code->length(); ++i) {
280  int value = (*code)(i);
281  code->Set(i, value + offsets[value]);
282  }
283  }
284  ComputeCodeRange();
285 }
286 
287 // Encodes a single unichar_id. Returns the length of the code, or zero if
288 // invalid input, and the encoding itself
289 int UnicharCompress::EncodeUnichar(int unichar_id, RecodedCharID* code) const {
290  if (unichar_id < 0 || unichar_id >= encoder_.size()) return 0;
291  *code = encoder_[unichar_id];
292  return code->length();
293 }
294 
295 // Decodes code, returning the original unichar-id, or
296 // INVALID_UNICHAR_ID if the input is invalid.
298  int len = code.length();
299  if (len <= 0 || len > RecodedCharID::kMaxCodeLen) return INVALID_UNICHAR_ID;
300  auto it = decoder_.find(code);
301  if (it == decoder_.end()) return INVALID_UNICHAR_ID;
302  return it->second;
303 }
304 
305 // Writes to the given file. Returns false in case of error.
307  return encoder_.SerializeClasses(fp);
308 }
309 
310 // Reads from the given file. Returns false in case of error.
312  if (!encoder_.DeSerializeClasses(fp)) return false;
313  ComputeCodeRange();
314  SetupDecoder();
315  return true;
316 }
317 
318 // Returns a STRING containing a text file that describes the encoding thus:
319 // <index>[,<index>]*<tab><UTF8-str><newline>
320 // In words, a comma-separated list of one or more indices, followed by a tab
321 // and the UTF-8 string that the code represents per line. Most simple scripts
322 // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean
323 // and the Indic scripts will contain a many-to-many mapping.
324 // See the class comment above for details.
326  const UNICHARSET& unicharset) const {
327  STRING encoding;
328  for (int c = 0; c < encoder_.size(); ++c) {
329  const RecodedCharID& code = encoder_[c];
330  if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
331  // Don't show the duplicate entry.
332  continue;
333  }
334  encoding.add_str_int("", code(0));
335  for (int i = 1; i < code.length(); ++i) {
336  encoding.add_str_int(",", code(i));
337  }
338  encoding += "\t";
339  if (c >= unicharset.size() || (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT &&
340  unicharset.has_special_codes())) {
341  encoding += kNullChar;
342  } else {
343  encoding += unicharset.id_to_unichar(c);
344  }
345  encoding += "\n";
346  }
347  return encoding;
348 }
349 
350 // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing.
351 // Note that the returned values are 0-based indices, NOT unicode Jamo.
352 // Returns false if the input is not in the Hangul unicode range.
353 /* static */
354 bool UnicharCompress::DecomposeHangul(int unicode, int* leading, int* vowel,
355  int* trailing) {
356  if (unicode < kFirstHangul) return false;
357  int offset = unicode - kFirstHangul;
358  if (offset >= kNumHangul) return false;
359  const int kNCount = kVCount * kTCount;
360  *leading = offset / kNCount;
361  *vowel = (offset % kNCount) / kTCount;
362  *trailing = offset % kTCount;
363  return true;
364 }
365 
366 // Computes the value of code_range_ from the encoder_.
367 void UnicharCompress::ComputeCodeRange() {
368  code_range_ = -1;
369  for (int c = 0; c < encoder_.size(); ++c) {
370  const RecodedCharID& code = encoder_[c];
371  for (int i = 0; i < code.length(); ++i) {
372  if (code(i) > code_range_) code_range_ = code(i);
373  }
374  }
375  ++code_range_;
376 }
377 
378 // Initializes the decoding hash_map from the encoding array.
379 void UnicharCompress::SetupDecoder() {
380  Cleanup();
381  is_valid_start_.init_to_size(code_range_, false);
382  for (int c = 0; c < encoder_.size(); ++c) {
383  const RecodedCharID& code = encoder_[c];
384  decoder_[code] = c;
385  is_valid_start_[code(0)] = true;
386  RecodedCharID prefix = code;
387  int len = code.length() - 1;
388  prefix.Truncate(len);
389  auto final_it = final_codes_.find(prefix);
390  if (final_it == final_codes_.end()) {
392  code_list->push_back(code(len));
393  final_codes_[prefix] = code_list;
394  while (--len >= 0) {
395  prefix.Truncate(len);
396  auto next_it = next_codes_.find(prefix);
397  if (next_it == next_codes_.end()) {
399  code_list->push_back(code(len));
400  next_codes_[prefix] = code_list;
401  } else {
402  // We still have to search the list as we may get here via multiple
403  // lengths of code.
404  if (!next_it->second->contains(code(len)))
405  next_it->second->push_back(code(len));
406  break; // This prefix has been processed.
407  }
408  }
409  } else {
410  if (!final_it->second->contains(code(len)))
411  final_it->second->push_back(code(len));
412  }
413  }
414 }
415 
416 // Frees allocated memory.
417 void UnicharCompress::Cleanup() {
418  decoder_.clear();
419  is_valid_start_.clear();
420  for (auto it = next_codes_.begin(); it != next_codes_.end(); ++it) {
421  delete it->second;
422  }
423  for (auto it = final_codes_.begin(); it != final_codes_.end(); ++it) {
424  delete it->second;
425  }
426  next_codes_.clear();
427  final_codes_.clear();
428 }
429 
430 } // namespace tesseract.
static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing)
STRING GetEncodingAsString(const UNICHARSET &unicharset) const
static const int kFirstHangul
void add_str_int(const char *str, int number)
Definition: strngs.cpp:381
std::unordered_map< int, std::unique_ptr< std::vector< int > > > RSMap
T & back() const
UnicharCompress & operator=(const UnicharCompress &src)
static const int kMaxCodeLen
int size() const
Definition: genericvector.h:72
static string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
const char * kNullChar
void SetupDirect(const GenericVector< RecodedCharID > &codes)
#define tprintf(...)
Definition: tprintf.h:31
const int kRadicalRadix
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table)
int EncodeUnichar(int unichar_id, RecodedCharID *code) const
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:668
int size() const
Definition: unicharset.h:338
int push_back(T object)
bool has_special_codes() const
Definition: unicharset.h:721
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:623
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
Definition: strngs.h:45
void Set(int index, int value)
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:213
void Truncate(int length)
std::unordered_map< int, int > RSCounts
bool Serialize(TFile *fp) const
char * utf8_str() const
Definition: unichar.cpp:127
void clear()
Definition: unicharset.h:303
void SetupPassThrough(const UNICHARSET &unicharset)
void Set3(int code0, int code1, int code2)
void init_to_size(int size, T t)
void split(const char c, GenericVector< STRING > *splited)
Definition: strngs.cpp:286
int DecodeUnichar(const RecodedCharID &code) const
inT32 length() const
Definition: strngs.cpp:193
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288