#include <unicharcompress.h>

Public Member Functions
	UnicharCompress ()

	UnicharCompress (const UnicharCompress &src)

	~UnicharCompress ()

UnicharCompress &	operator= (const UnicharCompress &src)

bool	ComputeEncoding (const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)

void	SetupPassThrough (const UNICHARSET &unicharset)

void	SetupDirect (const std::vector< RecodedCharID > &codes)

int	code_range () const

int	EncodeUnichar (unsigned unichar_id, RecodedCharID *code) const

int	DecodeUnichar (const RecodedCharID &code) const

bool	IsValidFirstCode (int code) const

const std::vector< int > *	GetNextCodes (const RecodedCharID &code) const

const std::vector< int > *	GetFinalCodes (const RecodedCharID &code) const

bool	Serialize (TFile *fp) const

bool	DeSerialize (TFile *fp)

std::string	GetEncodingAsString (const UNICHARSET &unicharset) const

Static Public Member Functions
static bool	DecomposeHangul (int unicode, int *leading, int *vowel, int *trailing)

Static Public Attributes
static const int	kFirstHangul = 0xac00

static const int	kNumHangul = 11172

static const int	kLCount = 19

static const int	kVCount = 21

static const int	kTCount = 28

Detailed Description

Definition at line 139 of file unicharcompress.h.

Constructor & Destructor Documentation

◆ UnicharCompress() [1/2]

tesseract::UnicharCompress::UnicharCompress ( )

Definition at line 90 of file unicharcompress.cpp.

: code_range_(0) {}

◆ UnicharCompress() [2/2]

tesseract::UnicharCompress::UnicharCompress ( const UnicharCompress & src )

Definition at line 91 of file unicharcompress.cpp.

                                                           {
  *this = src;
}

◆ ~UnicharCompress()

tesseract::UnicharCompress::~UnicharCompress ( )

Definition at line 94 of file unicharcompress.cpp.

                                  {
  Cleanup();
}

Member Function Documentation

◆ code_range()

int tesseract::UnicharCompress::code_range ( ) const

inline

Definition at line 171 of file unicharcompress.h.

                         {
    return code_range_;
  }

◆ ComputeEncoding()

bool tesseract::UnicharCompress::ComputeEncoding	(	const UNICHARSET &	unicharset,
		int	null_id,
		std::string *	radical_stroke_table
	)

Definition at line 109 of file unicharcompress.cpp.

                                                                       {
  RSMap radical_map;
  if (radical_stroke_table != nullptr && !DecodeRadicalTable(*radical_stroke_table, &radical_map)) {
    return false;
  }
  encoder_.clear();
  UNICHARSET direct_set;
  // To avoid unused codes, clear the special codes from the direct_set.
  direct_set.clear();
  // Always keep space as 0;
  direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
  // Null char is next if we have one.
  if (null_id >= 0) {
    direct_set.unichar_insert(kNullChar);
  }
  RSCounts radical_counts;
  // In the initial map, codes [0, unicharset.size()) are
  // reserved for non-han/hangul sequences of 1 or more unicodes.
  int hangul_offset = unicharset.size();
  // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos).
  const int kTotalJamos = kLCount + kVCount + kTCount;
  // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard
  // to measure the number of radicals and strokes, initially we use the same
  // code range for all 3 Han code positions, and fix them after.
  int han_offset = hangul_offset + kTotalJamos;
  for (unsigned u = 0; u <= unicharset.size(); ++u) {
    // We special-case allow null_id to be equal to unicharset.size() in case
    // there is no space in unicharset for it.
    if (u == unicharset.size() && static_cast<int>(u) != null_id) {
      break; // Finished
    }
    RecodedCharID code;
    // Convert to unicodes.
    std::vector<char32> unicodes;
    std::string cleaned;
    if (u < unicharset.size()) {
      cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
    }
    if (u < unicharset.size() && (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
      // Check single unicodes for Hangul/Han and encode if so.
      int unicode = unicodes[0];
      int leading, vowel, trailing;
      auto it = radical_map.find(unicode);
      if (it != radical_map.end()) {
        // This is Han. Use the radical codes directly.
        int num_radicals = it->second->size();
        for (int c = 0; c < num_radicals; ++c) {
          code.Set(c, han_offset + (*it->second)[c]);
        }
        int pre_hash = RadicalPreHash(*it->second);
        int num_samples = radical_counts[pre_hash]++;
        if (num_samples > 0) {
          code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
        }
      } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
        // This is Hangul. Since we know the exact size of each part at compile
        // time, it gets the bottom set of codes.
        code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset,
                  trailing + kLCount + kVCount + hangul_offset);
      }
    }
    // If the code is still empty, it wasn't Han or Hangul.
    if (code.empty()) {
      // Special cases.
      if (u == UNICHAR_SPACE) {
        code.Set(0, 0); // Space.
      } else if (static_cast<int>(u) == null_id ||
                 (unicharset.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT)) {
        code.Set(0, direct_set.unichar_to_id(kNullChar));
      } else {
        // Add the direct_set unichar-ids of the unicodes in sequence to the
        // code.
        for (int uni : unicodes) {
          int position = code.length();
          if (position >= RecodedCharID::kMaxCodeLen) {
            tprintf("Unichar %d=%s is too long to encode!!\n", u, unicharset.id_to_unichar(u));
            return false;
          }
          UNICHAR unichar(uni);
          char *utf8 = unichar.utf8_str();
          if (!direct_set.contains_unichar(utf8)) {
            direct_set.unichar_insert(utf8);
          }
          code.Set(position, direct_set.unichar_to_id(utf8));
          delete[] utf8;
          if (direct_set.size() > unicharset.size() + !unicharset.has_special_codes()) {
            // Code space got bigger!
            tprintf("Code space expanded from original unicharset!!\n");
            return false;
          }
        }
      }
    }
    encoder_.push_back(code);
  }
  // Now renumber Han to make all codes unique. We already added han_offset to
  // all Han. Now separate out the radical, stroke, and count codes for Han.
  int code_offset = 0;
  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
    int max_offset = 0;
    for (unsigned u = 0; u < unicharset.size(); ++u) {
      RecodedCharID *code = &encoder_[u];
      if (code->length() <= i) {
        continue;
      }
      max_offset = std::max(max_offset, (*code)(i)-han_offset);
      code->Set(i, (*code)(i) + code_offset);
    }
    if (max_offset == 0) {
      break;
    }
    code_offset += max_offset + 1;
  }
  DefragmentCodeValues(null_id >= 0 ? 1 : -1);
  SetupDecoder();
  return true;
}

◆ DecodeUnichar()

int tesseract::UnicharCompress::DecodeUnichar ( const RecodedCharID & code ) const

Definition at line 305 of file unicharcompress.cpp.

                                                                  {
  int len = code.length();
  if (len <= 0 || len > RecodedCharID::kMaxCodeLen) {
    return INVALID_UNICHAR_ID;
  }
  auto it = decoder_.find(code);
  if (it == decoder_.end()) {
    return INVALID_UNICHAR_ID;
  }
  return it->second;
}

◆ DecomposeHangul()

bool tesseract::UnicharCompress::DecomposeHangul	(	int	unicode,
		int *	leading,
		int *	vowel,
		int *	trailing
	)

static

Definition at line 367 of file unicharcompress.cpp.

                                                                                          {
  if (unicode < kFirstHangul) {
    return false;
  }
  int offset = unicode - kFirstHangul;
  if (offset >= kNumHangul) {
    return false;
  }
  const int kNCount = kVCount * kTCount;
  *leading = offset / kNCount;
  *vowel = (offset % kNCount) / kTCount;
  *trailing = offset % kTCount;
  return true;
}

◆ DeSerialize()

bool tesseract::UnicharCompress::DeSerialize ( TFile * fp )

Definition at line 323 of file unicharcompress.cpp.

                                           {
  if (!fp->DeSerialize(encoder_)) {
    return false;
  }
  ComputeCodeRange();
  SetupDecoder();
  return true;
}

◆ EncodeUnichar()

int tesseract::UnicharCompress::EncodeUnichar	(	unsigned	unichar_id,
		RecodedCharID *	code
	)		const

Definition at line 295 of file unicharcompress.cpp.

                                                                                 {
  if (unichar_id >= encoder_.size()) {
    return 0;
  }
  *code = encoder_[unichar_id];
  return code->length();
}

◆ GetEncodingAsString()

std::string tesseract::UnicharCompress::GetEncodingAsString ( const UNICHARSET & unicharset ) const

Definition at line 339 of file unicharcompress.cpp.

                                                                                 {
  std::string encoding;
  for (unsigned c = 0; c < encoder_.size(); ++c) {
    const RecodedCharID &code = encoder_[c];
    if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) {
      // Don't show the duplicate entry.
      continue;
    }
    encoding += std::to_string(code(0));
    for (int i = 1; i < code.length(); ++i) {
      encoding += "," + std::to_string(code(i));
    }
    encoding += "\t";
    if (c >= unicharset.size() ||
        (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && unicharset.has_special_codes())) {
      encoding += kNullChar;
    } else {
      encoding += unicharset.id_to_unichar(c);
    }
    encoding += "\n";
  }
  return encoding;
}

◆ GetFinalCodes()

const std::vector< int > * tesseract::UnicharCompress::GetFinalCodes ( const RecodedCharID & code ) const

inline

Definition at line 193 of file unicharcompress.h.

                                                                       {
    auto it = final_codes_.find(code);
    return it == final_codes_.end() ? nullptr : it->second;
  }

◆ GetNextCodes()

const std::vector< int > * tesseract::UnicharCompress::GetNextCodes ( const RecodedCharID & code ) const

inline

Definition at line 187 of file unicharcompress.h.

                                                                      {
    auto it = next_codes_.find(code);
    return it == next_codes_.end() ? nullptr : it->second;
  }

◆ IsValidFirstCode()

bool tesseract::UnicharCompress::IsValidFirstCode ( int code ) const

inline

Definition at line 182 of file unicharcompress.h.

                                        {
    return is_valid_start_[code];
  }

◆ operator=()

UnicharCompress & tesseract::UnicharCompress::operator= ( const UnicharCompress & src )

Definition at line 97 of file unicharcompress.cpp.

                                                                      {
  Cleanup();
  encoder_ = src.encoder_;
  code_range_ = src.code_range_;
  SetupDecoder();
  return *this;
}

◆ Serialize()

bool tesseract::UnicharCompress::Serialize ( TFile * fp ) const

Definition at line 318 of file unicharcompress.cpp.

                                               {
  return fp->Serialize(encoder_);
}

◆ SetupDirect()

void tesseract::UnicharCompress::SetupDirect ( const std::vector< RecodedCharID > & codes )

Definition at line 247 of file unicharcompress.cpp.

                                                                       {
  encoder_ = codes;
  ComputeCodeRange();
  SetupDecoder();
}

◆ SetupPassThrough()

void tesseract::UnicharCompress::SetupPassThrough ( const UNICHARSET & unicharset )

Definition at line 230 of file unicharcompress.cpp.

                                                                   {
  std::vector<RecodedCharID> codes;
  for (unsigned u = 0; u < unicharset.size(); ++u) {
    RecodedCharID code;
    code.Set(0, u);
    codes.push_back(code);
  }
  if (!unicharset.has_special_codes()) {
    RecodedCharID code;
    code.Set(0, unicharset.size());
    codes.push_back(code);
  }
  SetupDirect(codes);
}

Member Data Documentation

◆ kFirstHangul

const int tesseract::UnicharCompress::kFirstHangul = 0xac00

static

Definition at line 147 of file unicharcompress.h.

◆ kLCount

const int tesseract::UnicharCompress::kLCount = 19

static

Definition at line 152 of file unicharcompress.h.

◆ kNumHangul

const int tesseract::UnicharCompress::kNumHangul = 11172

static

Definition at line 149 of file unicharcompress.h.

◆ kTCount

const int tesseract::UnicharCompress::kTCount = 28

static

Definition at line 154 of file unicharcompress.h.

◆ kVCount

const int tesseract::UnicharCompress::kVCount = 21

static

Definition at line 153 of file unicharcompress.h.

The documentation for this class was generated from the following files:

/media/home/debian/src/github/tesseract-ocr/tesseract/src/ccutil/unicharcompress.h
/media/home/debian/src/github/tesseract-ocr/tesseract/src/ccutil/unicharcompress.cpp

Public Member Functions

Static Public Member Functions

Static Public Attributes

Detailed Description

Constructor & Destructor Documentation

◆ UnicharCompress() [1/2]

◆ UnicharCompress() [2/2]

◆ ~UnicharCompress()

Member Function Documentation

◆ code_range()

◆ ComputeEncoding()

◆ DecodeUnichar()

◆ DecomposeHangul()

◆ DeSerialize()

◆ EncodeUnichar()

◆ GetEncodingAsString()

◆ GetFinalCodes()

◆ GetNextCodes()

◆ IsValidFirstCode()

◆ operator=()

◆ Serialize()

◆ SetupDirect()

◆ SetupPassThrough()

Member Data Documentation

◆ kFirstHangul

◆ kLCount

◆ kNumHangul

◆ kTCount

◆ kVCount