tesseract  4.00.00dev
tesseract::UnicharCompress Class Reference

#include <unicharcompress.h>

Public Member Functions

 UnicharCompress ()
 
 UnicharCompress (const UnicharCompress &src)
 
 ~UnicharCompress ()
 
UnicharCompress & operator= (const UnicharCompress &src)
 
bool ComputeEncoding (const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table)
 
void SetupPassThrough (const UNICHARSET &unicharset)
 
void SetupDirect (const GenericVector< RecodedCharID > &codes)
 
int code_range () const
 
int EncodeUnichar (int unichar_id, RecodedCharID *code) const
 
int DecodeUnichar (const RecodedCharID &code) const
 
bool IsValidFirstCode (int code) const
 
const GenericVector< int > * GetNextCodes (const RecodedCharID &code) const
 
const GenericVector< int > * GetFinalCodes (const RecodedCharID &code) const
 
bool Serialize (TFile *fp) const
 
bool DeSerialize (TFile *fp)
 
STRING GetEncodingAsString (const UNICHARSET &unicharset) const
 

Static Public Member Functions

static bool DecomposeHangul (int unicode, int *leading, int *vowel, int *trailing)
 

Static Public Attributes

static const int kFirstHangul = 0xac00
 
static const int kNumHangul = 11172
 
static const int kLCount = 19
 
static const int kVCount = 21
 
static const int kTCount = 28
 

Detailed Description

Definition at line 134 of file unicharcompress.h.

Constructor & Destructor Documentation

◆ UnicharCompress() [1/2]

tesseract::UnicharCompress::UnicharCompress ( )

Definition at line 87 of file unicharcompress.cpp.

87 : code_range_(0) {}

◆ UnicharCompress() [2/2]

tesseract::UnicharCompress::UnicharCompress ( const UnicharCompress & src)

Definition at line 88 of file unicharcompress.cpp.

88 { *this = src; }

◆ ~UnicharCompress()

tesseract::UnicharCompress::~UnicharCompress ( )

Definition at line 89 of file unicharcompress.cpp.

89 { Cleanup(); }

Member Function Documentation

◆ code_range()

int tesseract::UnicharCompress::code_range ( ) const
inline

Definition at line 167 of file unicharcompress.h.

167 { return code_range_; }

◆ ComputeEncoding()

bool tesseract::UnicharCompress::ComputeEncoding ( const UNICHARSET & unicharset,
int  null_id,
STRING * radical_stroke_table 
)

Definition at line 102 of file unicharcompress.cpp.

103  {
104  RSMap radical_map;
105  if (radical_stroke_table != nullptr &&
106  !DecodeRadicalTable(radical_stroke_table, &radical_map))
107  return false;
108  encoder_.clear();
109  UNICHARSET direct_set;
110  // To avoid unused codes, clear the special codes from the direct_set.
111  direct_set.clear();
112  // Always keep space as 0;
113  direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
114  // Null char is next if we have one.
115  if (null_id >= 0) {
116  direct_set.unichar_insert(kNullChar);
117  }
118  RSCounts radical_counts;
119  // In the initial map, codes [0, unicharset.size()) are
120  // reserved for non-han/hangul sequences of 1 or more unicodes.
121  int hangul_offset = unicharset.size();
122  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
123  const int kTotalJamos = kLCount + kVCount + kTCount;
124  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
125  // to measure the number of radicals and strokes, initially we use the same
126  // code range for all 3 Han code positions, and fix them after.
127  int han_offset = hangul_offset + kTotalJamos;
128  int max_num_strokes = -1;
129  for (int u = 0; u <= unicharset.size(); ++u) {
130  // We special-case allow null_id to be equal to unicharset.size() in case
131  // there is no space in unicharset for it.
132  if (u == unicharset.size() && u != null_id) break; // Finished
133  RecodedCharID code;
134  // Convert to unicodes.
135  std::vector<char32> unicodes;
136  string cleaned;
137  if (u < unicharset.size())
138  cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
139  if (u < unicharset.size() &&
140  (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
141  // Check single unicodes for Hangul/Han and encode if so.
142  int unicode = unicodes[0];
143  int leading, vowel, trailing;
144  auto it = radical_map.find(unicode);
145  if (it != radical_map.end()) {
146  // This is Han. Use the radical codes directly.
147  int num_radicals = it->second->size();
148  for (int c = 0; c < num_radicals; ++c) {
149  code.Set(c, han_offset + (*it->second)[c]);
150  }
151  int pre_hash = RadicalPreHash(*it->second);
152  int num_samples = radical_counts[pre_hash]++;
153  if (num_samples > 0)
154  code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
155  } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
156  // This is Hangul. Since we know the exact size of each part at compile
157  // time, it gets the bottom set of codes.
158  code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
159  trailing + kLCount + kVCount + hangul_offset);
160  }
161  }
162  // If the code is still empty, it wasn't Han or Hangul.
163  if (code.length() == 0) {
164  // Special cases.
165  if (u == UNICHAR_SPACE) {
166  code.Set(0, 0); // Space.
167  } else if (u == null_id || (unicharset.has_special_codes() &&
168  u < SPECIAL_UNICHAR_CODES_COUNT)) {
169  code.Set(0, direct_set.unichar_to_id(kNullChar));
170  } else {
171  // Add the direct_set unichar-ids of the unicodes in sequence to the
172  // code.
173  for (int i = 0; i < unicodes.size(); ++i) {
174  int position = code.length();
175  if (position >= RecodedCharID::kMaxCodeLen) {
176  tprintf("Unichar %d=%s is too long to encode!!\n", u,
177  unicharset.id_to_unichar(u));
178  return false;
179  }
180  int uni = unicodes[i];
181  UNICHAR unichar(uni);
182  char* utf8 = unichar.utf8_str();
183  if (!direct_set.contains_unichar(utf8))
184  direct_set.unichar_insert(utf8);
185  code.Set(position, direct_set.unichar_to_id(utf8));
186  delete[] utf8;
187  if (direct_set.size() >
188  unicharset.size() + !unicharset.has_special_codes()) {
189  // Code space got bigger!
190  tprintf("Code space expanded from original unicharset!!\n");
191  return false;
192  }
193  }
194  }
195  }
196  encoder_.push_back(code);
197  }
198  // Now renumber Han to make all codes unique. We already added han_offset to
199  // all Han. Now separate out the radical, stroke, and count codes for Han.
200  // In the uniqued Han encoding, the 1st code uses the next radical_map.size()
201  // values, the 2nd code uses the next max_num_strokes+1 values, and the 3rd
202  // code uses the rest for the max number of duplicated radical/stroke combos.
203  int code_offset = 0;
204  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
205  int max_offset = 0;
206  for (int u = 0; u < unicharset.size(); ++u) {
207  RecodedCharID* code = &encoder_[u];
208  if (code->length() <= i) continue;
209  max_offset = std::max(max_offset, (*code)(i)-han_offset);
210  code->Set(i, (*code)(i) + code_offset);
211  }
212  if (max_offset == 0) break;
213  code_offset += max_offset + 1;
214  }
215  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
216  SetupDecoder();
217  return true;
218 }
static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing)
std::unordered_map< int, std::unique_ptr< std::vector< int > > > RSMap
static const int kMaxCodeLen
static string CleanupString(const char *utf8_str)
Definition: unicharset.h:241
const char * kNullChar
#define tprintf(...)
Definition: tprintf.h:31
const int kRadicalRadix
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:668
int size() const
Definition: unicharset.h:338
bool has_special_codes() const
Definition: unicharset.h:721
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:623
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:207
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:213
std::unordered_map< int, int > RSCounts
void clear()
Definition: unicharset.h:303
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ DecodeUnichar()

int tesseract::UnicharCompress::DecodeUnichar ( const RecodedCharID & code) const

Definition at line 297 of file unicharcompress.cpp.

297  {
298  int len = code.length();
299  if (len <= 0 || len > RecodedCharID::kMaxCodeLen) return INVALID_UNICHAR_ID;
300  auto it = decoder_.find(code);
301  if (it == decoder_.end()) return INVALID_UNICHAR_ID;
302  return it->second;
303 }
static const int kMaxCodeLen

◆ DecomposeHangul()

bool tesseract::UnicharCompress::DecomposeHangul ( int  unicode,
int *  leading,
int *  vowel,
int *  trailing 
)
static

Definition at line 354 of file unicharcompress.cpp.

355  {
356  if (unicode < kFirstHangul) return false;
357  int offset = unicode - kFirstHangul;
358  if (offset >= kNumHangul) return false;
359  const int kNCount = kVCount * kTCount;
360  *leading = offset / kNCount;
361  *vowel = (offset % kNCount) / kTCount;
362  *trailing = offset % kTCount;
363  return true;
364 }
static const int kFirstHangul

◆ DeSerialize()

bool tesseract::UnicharCompress::DeSerialize ( TFile * fp)

Definition at line 311 of file unicharcompress.cpp.

311  {
312  if (!encoder_.DeSerializeClasses(fp)) return false;
313  ComputeCodeRange();
314  SetupDecoder();
315  return true;
316 }

◆ EncodeUnichar()

int tesseract::UnicharCompress::EncodeUnichar ( int  unichar_id,
RecodedCharID * code 
) const

Definition at line 289 of file unicharcompress.cpp.

289  {
290  if (unichar_id < 0 || unichar_id >= encoder_.size()) return 0;
291  *code = encoder_[unichar_id];
292  return code->length();
293 }

◆ GetEncodingAsString()

STRING tesseract::UnicharCompress::GetEncodingAsString ( const UNICHARSET & unicharset) const

Definition at line 325 of file unicharcompress.cpp.

326  {
327  STRING encoding;
328  for (int c = 0; c < encoder_.size(); ++c) {
329  const RecodedCharID& code = encoder_[c];
330  if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
331  // Don't show the duplicate entry.
332  continue;
333  }
334  encoding.add_str_int("", code(0));
335  for (int i = 1; i < code.length(); ++i) {
336  encoding.add_str_int(",", code(i));
337  }
338  encoding += "\t";
339  if (c >= unicharset.size() || (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT &&
340  unicharset.has_special_codes())) {
341  encoding += kNullChar;
342  } else {
343  encoding += unicharset.id_to_unichar(c);
344  }
345  encoding += "\n";
346  }
347  return encoding;
348 }
void add_str_int(const char *str, int number)
Definition: strngs.cpp:381
const char * kNullChar
int size() const
Definition: unicharset.h:338
bool has_special_codes() const
Definition: unicharset.h:721
Definition: strngs.h:45
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:288

◆ GetFinalCodes()

const GenericVector<int>* tesseract::UnicharCompress::GetFinalCodes ( const RecodedCharID & code) const
inline

Definition at line 185 of file unicharcompress.h.

185  {
186  auto it = final_codes_.find(code);
187  return it == final_codes_.end() ? NULL : it->second;
188  }

◆ GetNextCodes()

const GenericVector<int>* tesseract::UnicharCompress::GetNextCodes ( const RecodedCharID & code) const
inline

Definition at line 179 of file unicharcompress.h.

179  {
180  auto it = next_codes_.find(code);
181  return it == next_codes_.end() ? NULL : it->second;
182  }

◆ IsValidFirstCode()

bool tesseract::UnicharCompress::IsValidFirstCode ( int  code) const
inline

Definition at line 176 of file unicharcompress.h.

176 { return is_valid_start_[code]; }

◆ operator=()

UnicharCompress & tesseract::UnicharCompress::operator= ( const UnicharCompress & src)

Definition at line 90 of file unicharcompress.cpp.

90  {
91  Cleanup();
92  encoder_ = src.encoder_;
93  code_range_ = src.code_range_;
94  SetupDecoder();
95  return *this;
96 }

◆ Serialize()

bool tesseract::UnicharCompress::Serialize ( TFile * fp) const

Definition at line 306 of file unicharcompress.cpp.

306  {
307  return encoder_.SerializeClasses(fp);
308 }

◆ SetupDirect()

void tesseract::UnicharCompress::SetupDirect ( const GenericVector< RecodedCharID > &  codes)

Definition at line 239 of file unicharcompress.cpp.

239  {
240  encoder_ = codes;
241  ComputeCodeRange();
242  SetupDecoder();
243 }

◆ SetupPassThrough()

void tesseract::UnicharCompress::SetupPassThrough ( const UNICHARSET & unicharset)

Definition at line 222 of file unicharcompress.cpp.

222  {
223  GenericVector<RecodedCharID> codes;
224  for (int u = 0; u < unicharset.size(); ++u) {
225  RecodedCharID code;
226  code.Set(0, u);
227  codes.push_back(code);
228  }
229  if (!unicharset.has_special_codes()) {
230  RecodedCharID code;
231  code.Set(0, unicharset.size());
232  codes.push_back(code);
233  }
234  SetupDirect(codes);
235 }
void SetupDirect(const GenericVector< RecodedCharID > &codes)
int size() const
Definition: unicharset.h:338
int push_back(T object)
bool has_special_codes() const
Definition: unicharset.h:721

Member Data Documentation

◆ kFirstHangul

const int tesseract::UnicharCompress::kFirstHangul = 0xac00
static

Definition at line 142 of file unicharcompress.h.

◆ kLCount

const int tesseract::UnicharCompress::kLCount = 19
static

Definition at line 147 of file unicharcompress.h.

◆ kNumHangul

const int tesseract::UnicharCompress::kNumHangul = 11172
static

Definition at line 144 of file unicharcompress.h.

◆ kTCount

const int tesseract::UnicharCompress::kTCount = 28
static

Definition at line 149 of file unicharcompress.h.

◆ kVCount

const int tesseract::UnicharCompress::kVCount = 21
static

Definition at line 148 of file unicharcompress.h.


The documentation for this class was generated from the following files: