#include <unicharcompress.h>

Public Member Functions
	UnicharCompress ()

	UnicharCompress (const UnicharCompress &src)

	~UnicharCompress ()

UnicharCompress &	operator= (const UnicharCompress &src)

bool	ComputeEncoding (const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table)

void	SetupPassThrough (const UNICHARSET &unicharset)

void	SetupDirect (const GenericVector< RecodedCharID > &codes)

int	code_range () const

int	EncodeUnichar (int unichar_id, RecodedCharID *code) const

int	DecodeUnichar (const RecodedCharID &code) const

bool	IsValidFirstCode (int code) const

const GenericVector< int > *	GetNextCodes (const RecodedCharID &code) const

const GenericVector< int > *	GetFinalCodes (const RecodedCharID &code) const

bool	Serialize (TFile *fp) const

bool	DeSerialize (TFile *fp)

STRING	GetEncodingAsString (const UNICHARSET &unicharset) const

Static Public Member Functions
static bool	DecomposeHangul (int unicode, int *leading, int *vowel, int *trailing)

Static Public Attributes
static const int	kFirstHangul = 0xac00

static const int	kNumHangul = 11172

static const int	kLCount = 19

static const int	kVCount = 21

static const int	kTCount = 28

Detailed Description

Definition at line 134 of file unicharcompress.h.

Constructor & Destructor Documentation

◆ UnicharCompress() [1/2]

tesseract::UnicharCompress::UnicharCompress ( )

Definition at line 87 of file unicharcompress.cpp.

87 : code_range_(0) {}

◆ UnicharCompress() [2/2]

tesseract::UnicharCompress::UnicharCompress ( const UnicharCompress & src )

Definition at line 88 of file unicharcompress.cpp.

88 { *this = src; }

◆ ~UnicharCompress()

tesseract::UnicharCompress::~UnicharCompress ( )

Definition at line 89 of file unicharcompress.cpp.

89 { Cleanup(); }

Member Function Documentation

◆ code_range()

int tesseract::UnicharCompress::code_range ( ) const

inline

Definition at line 167 of file unicharcompress.h.

167 { return code_range_; }

◆ ComputeEncoding()

bool tesseract::UnicharCompress::ComputeEncoding	(	const UNICHARSET &	unicharset,
		int	null_id,
		STRING *	radical_stroke_table
	)

Definition at line 102 of file unicharcompress.cpp.

                                                                     {
   // Builds encoder_ with one RecodedCharID per entry of unicharset (plus one
   // extra entry when null_id == unicharset.size()), then renumbers the codes
   // and sets up the decoder.
   // Returns false if the radical/stroke table cannot be decoded, if a unichar
   // needs more than RecodedCharID::kMaxCodeLen codes, or if the direct code
   // set grows beyond the size of the original unicharset; true otherwise.
   RSMap radical_map;
   // A null radical_stroke_table is allowed; radical_map then stays empty and
   // no unichar is treated as Han below.
   if (radical_stroke_table != nullptr &&
       !DecodeRadicalTable(radical_stroke_table, &radical_map))
     return false;
   encoder_.clear();
   UNICHARSET direct_set;
   // To avoid unused codes, clear the special codes from the direct_set.
   direct_set.clear();
   // Always keep space as 0;
   direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
   // Null char is next if we have one.
   if (null_id >= 0) {
     direct_set.unichar_insert(kNullChar);
   }
   // Counts how many Han characters share the same radical pre-hash so far.
   RSCounts radical_counts;
   // In the initial map, codes [0, unicharset.size()) are
   // reserved for non-han/hangul sequences of 1 or more unicodes.
   int hangul_offset = unicharset.size();
   // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
   const int kTotalJamos = kLCount + kVCount + kTCount;
   // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
   // to measure the number of radicals and strokes, initially we use the same
   // code range for all 3 Han code positions, and fix them after.
   int han_offset = hangul_offset + kTotalJamos;
   // NOTE(review): max_num_strokes is not referenced again in this function
   // as shown here.
   int max_num_strokes = -1;
   // Note the inclusive upper bound: one extra iteration is permitted for the
   // null_id == unicharset.size() case handled on the next line.
   for (int u = 0; u <= unicharset.size(); ++u) {
     // We special-case allow null_id to be equal to unicharset.size() in case
     // there is no space in unicharset for it.
     if (u == unicharset.size() && u != null_id) break;  // Finished
     RecodedCharID code;
     // Convert to unicodes.
     std::vector<char32> unicodes;
     string cleaned;
     if (u < unicharset.size())
       cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
     if (u < unicharset.size() &&
         (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
       // Check single unicodes for Hangul/Han and encode if so.
       int unicode = unicodes[0];
       int leading, vowel, trailing;
       auto it = radical_map.find(unicode);
       if (it != radical_map.end()) {
         // This is Han. Use the radical codes directly.
         int num_radicals = it->second->size();
         for (int c = 0; c < num_radicals; ++c) {
           code.Set(c, han_offset + (*it->second)[c]);
         }
         int pre_hash = RadicalPreHash(*it->second);
         // Post-increment: num_samples is the count seen BEFORE this one, so
         // the first character with a given pre-hash gets no extra code and
         // later ones get a trailing code that tells them apart.
         int num_samples = radical_counts[pre_hash]++;
         if (num_samples > 0)
           code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
       } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
         // This is Hangul. Since we know the exact size of each part at compile
         // time, it gets the bottom set of codes.
         code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
                   trailing + kLCount + kVCount + hangul_offset);
       }
     }
     // If the code is still empty, it wasn't Han or Hangul.
     if (code.length() == 0) {
       // Special cases.
       if (u == UNICHAR_SPACE) {
         code.Set(0, 0);  // Space.
       } else if (u == null_id || (unicharset.has_special_codes() &&
                                   u < SPECIAL_UNICHAR_CODES_COUNT)) {
         // The null id and every special code share the null char's code.
         code.Set(0, direct_set.unichar_to_id(kNullChar));
       } else {
         // Add the direct_set unichar-ids of the unicodes in sequence to the
         // code.
         for (int i = 0; i < unicodes.size(); ++i) {
           int position = code.length();
           if (position >= RecodedCharID::kMaxCodeLen) {
             tprintf("Unichar %d=%s is too long to encode!!\n", u,
                     unicharset.id_to_unichar(u));
             return false;
           }
           int uni = unicodes[i];
           UNICHAR unichar(uni);
           // The buffer obtained from utf8_str() is released with delete[]
           // below, before either exit from this iteration.
           char* utf8 = unichar.utf8_str();
           if (!direct_set.contains_unichar(utf8))
             direct_set.unichar_insert(utf8);
           code.Set(position, direct_set.unichar_to_id(utf8));
           delete[] utf8;
           // The bool is added as 0 or 1: one extra direct code is tolerated
           // when unicharset has no special codes.
           if (direct_set.size() >
               unicharset.size() + !unicharset.has_special_codes()) {
             // Code space got bigger!
             tprintf("Code space expanded from original unicharset!!\n");
             return false;
           }
         }
       }
     }
     encoder_.push_back(code);
   }
   // Now renumber Han to make all codes unique. We already added han_offset to
   // all Han. Now separate out the radical, stroke, and count codes for Han.
   // In the uniqued Han encoding, the 1st code uses the next radical_map.size()
   // values, the 2nd code uses the next max_num_strokes+1 values, and the 3rd
   // code uses the rest for the max number of duplicated radical/stroke combos.
   int code_offset = 0;
   for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
     int max_offset = 0;
     // Note that code_offset is added to every code that has an i-th position,
     // not only to Han codes; max_offset is measured relative to han_offset.
     for (int u = 0; u < unicharset.size(); ++u) {
       RecodedCharID* code = &encoder_[u];
       if (code->length() <= i) continue;
       max_offset = std::max(max_offset, (*code)(i)-han_offset);
       code->Set(i, (*code)(i) + code_offset);
     }
     // No value above han_offset at this position: nothing left to separate.
     if (max_offset == 0) break;
     code_offset += max_offset + 1;
   }
   // Passes 1 when a null id was supplied, otherwise -1.
   // NOTE(review): the meaning of this argument is defined by
   // DefragmentCodeValues, which is not shown here - confirm.
   DefragmentCodeValues(null_id >= 0 ? 1 : -1);
   SetupDecoder();
   return true;
 }

◆ DecodeUnichar()

int tesseract::UnicharCompress::DecodeUnichar ( const RecodedCharID & code ) const

Definition at line 297 of file unicharcompress.cpp.

                                                                   {
   int len = code.length();
   if (len <= 0 || len > RecodedCharID::kMaxCodeLen) return INVALID_UNICHAR_ID;
   auto it = decoder_.find(code);
   if (it == decoder_.end()) return INVALID_UNICHAR_ID;
   return it->second;
 }

◆ DecomposeHangul()

bool tesseract::UnicharCompress::DecomposeHangul	(	int	unicode,
		int *	leading,
		int *	vowel,
		int *	trailing
	)

static

Definition at line 354 of file unicharcompress.cpp.

                                                      {
   if (unicode < kFirstHangul) return false;
   int offset = unicode - kFirstHangul;
   if (offset >= kNumHangul) return false;
   const int kNCount = kVCount * kTCount;
   *leading = offset / kNCount;
   *vowel = (offset % kNCount) / kTCount;
   *trailing = offset % kTCount;
   return true;
 }

◆ DeSerialize()

bool tesseract::UnicharCompress::DeSerialize ( TFile * fp )

Definition at line 311 of file unicharcompress.cpp.

                                            {
   if (!encoder_.DeSerializeClasses(fp)) return false;
   ComputeCodeRange();
   SetupDecoder();
   return true;
 }

◆ EncodeUnichar()

int tesseract::UnicharCompress::EncodeUnichar	(	int	unichar_id,
		RecodedCharID *	code
	)		const

Definition at line 289 of file unicharcompress.cpp.

                                                                             {
   if (unichar_id < 0 || unichar_id >= encoder_.size()) return 0;
   *code = encoder_[unichar_id];
   return code->length();
 }

◆ GetEncodingAsString()

STRING tesseract::UnicharCompress::GetEncodingAsString ( const UNICHARSET & unicharset ) const

Definition at line 325 of file unicharcompress.cpp.

                                         {
   STRING encoding;
   for (int c = 0; c < encoder_.size(); ++c) {
     const RecodedCharID& code = encoder_[c];
     if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
       // Don't show the duplicate entry.
       continue;
     }
     encoding.add_str_int("", code(0));
     for (int i = 1; i < code.length(); ++i) {
       encoding.add_str_int(",", code(i));
     }
     encoding += "\t";
     if (c >= unicharset.size() || (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT &&
                                    unicharset.has_special_codes())) {
       encoding += kNullChar;
     } else {
       encoding += unicharset.id_to_unichar(c);
     }
     encoding += "\n";
   }
   return encoding;
 }

◆ GetFinalCodes()

const GenericVector<int>* tesseract::UnicharCompress::GetFinalCodes ( const RecodedCharID & code ) const

inline

Definition at line 185 of file unicharcompress.h.

                                                                            {
     auto it = final_codes_.find(code);
     return it == final_codes_.end() ? NULL : it->second;
   }

◆ GetNextCodes()

const GenericVector<int>* tesseract::UnicharCompress::GetNextCodes ( const RecodedCharID & code ) const

inline

Definition at line 179 of file unicharcompress.h.

                                                                           {
     auto it = next_codes_.find(code);
     return it == next_codes_.end() ? NULL : it->second;
   }

◆ IsValidFirstCode()

bool tesseract::UnicharCompress::IsValidFirstCode ( int code ) const

inline

Definition at line 176 of file unicharcompress.h.

176 { return is_valid_start_[code]; }

◆ operator=()

UnicharCompress & tesseract::UnicharCompress::operator= ( const UnicharCompress & src )

Definition at line 90 of file unicharcompress.cpp.

                                                                       {
   Cleanup();
   encoder_ = src.encoder_;
   code_range_ = src.code_range_;
   SetupDecoder();
   return *this;
 }

◆ Serialize()

bool tesseract::UnicharCompress::Serialize ( TFile * fp ) const

Definition at line 306 of file unicharcompress.cpp.

                                                {
   return encoder_.SerializeClasses(fp);
 }

◆ SetupDirect()

void tesseract::UnicharCompress::SetupDirect ( const GenericVector< RecodedCharID > & codes )

Definition at line 239 of file unicharcompress.cpp.

                                                                            {
   // Installs the caller-supplied codes as the encoder table, replacing
   // whatever encoder_ held before.
   encoder_ = codes;
   // Both calls run after encoder_ has been replaced.
   // NOTE(review): presumably they derive code_range_ and the decoder from
   // encoder_ - their bodies are not shown here, confirm.
   ComputeCodeRange();
   SetupDecoder();
 }

◆ SetupPassThrough()

void tesseract::UnicharCompress::SetupPassThrough ( const UNICHARSET & unicharset )

Definition at line 222 of file unicharcompress.cpp.

                                                                    {
   GenericVector<RecodedCharID> codes;
   for (int u = 0; u < unicharset.size(); ++u) {
     RecodedCharID code;
     code.Set(0, u);
     codes.push_back(code);
   }
   if (!unicharset.has_special_codes()) {
     RecodedCharID code;
     code.Set(0, unicharset.size());
     codes.push_back(code);
   }
   SetupDirect(codes);
 }

Member Data Documentation

◆ kFirstHangul

const int tesseract::UnicharCompress::kFirstHangul = 0xac00

static

Definition at line 142 of file unicharcompress.h.

◆ kLCount

const int tesseract::UnicharCompress::kLCount = 19

static

Definition at line 147 of file unicharcompress.h.

◆ kNumHangul

const int tesseract::UnicharCompress::kNumHangul = 11172

static

Definition at line 144 of file unicharcompress.h.

◆ kTCount

const int tesseract::UnicharCompress::kTCount = 28

static

Definition at line 149 of file unicharcompress.h.

◆ kVCount

const int tesseract::UnicharCompress::kVCount = 21

static

Definition at line 148 of file unicharcompress.h.

The documentation for this class was generated from the following files:

/home/stweil/src/github/tesseract-ocr/tesseract/ccutil/unicharcompress.h
/home/stweil/src/github/tesseract-ocr/tesseract/ccutil/unicharcompress.cpp

Public Member Functions

Static Public Member Functions

Static Public Attributes

Detailed Description

Constructor & Destructor Documentation

◆ UnicharCompress() [1/2]

◆ UnicharCompress() [2/2]

◆ ~UnicharCompress()

Member Function Documentation

◆ code_range()

◆ ComputeEncoding()

◆ DecodeUnichar()

◆ DecomposeHangul()

◆ DeSerialize()

◆ EncodeUnichar()

◆ GetEncodingAsString()

◆ GetFinalCodes()

◆ GetNextCodes()

◆ IsValidFirstCode()

◆ operator=()

◆ Serialize()

◆ SetupDirect()

◆ SetupPassThrough()

Member Data Documentation

◆ kFirstHangul

◆ kLCount

◆ kNumHangul

◆ kTCount

◆ kVCount