29static const char *kNullChar =
"<nul>";
36static int RadicalPreHash(
const std::vector<int> &rs) {
38 for (
int radical : rs) {
46using RSMap = std::unordered_map<int, std::unique_ptr<std::vector<int>>>;
50static bool DecodeRadicalLine(std::string &radical_data_line,
RSMap *radical_map) {
51 if (radical_data_line.empty() || (radical_data_line)[0] ==
'#') {
54 std::vector<std::string> entries =
split(radical_data_line,
' ');
55 if (entries.size() < 2) {
59 int unicode = strtol(&entries[0][0], &end, 10);
63 std::unique_ptr<std::vector<int>> radicals(
new std::vector<int>);
64 for (
size_t i = 1;
i < entries.size(); ++
i) {
65 int radical = strtol(&entries[
i][0], &end, 10);
69 radicals->push_back(radical);
71 (*radical_map)[unicode] = std::move(radicals);
79static bool DecodeRadicalTable(std::string &radical_data,
RSMap *radical_map) {
80 std::vector<std::string> lines =
split(radical_data,
'\n');
81 for (
unsigned i = 0;
i < lines.size(); ++
i) {
82 if (!DecodeRadicalLine(lines[
i], radical_map)) {
83 tprintf(
"Invalid format in radical table at line %d: %s\n",
i, lines[
i].c_str());
99 encoder_ = src.encoder_;
100 code_range_ = src.code_range_;
110 std::string *radical_stroke_table) {
112 if (radical_stroke_table !=
nullptr && !DecodeRadicalTable(*radical_stroke_table, &radical_map)) {
128 int hangul_offset = unicharset.
size();
134 int han_offset = hangul_offset + kTotalJamos;
135 for (
unsigned u = 0; u <= unicharset.
size(); ++u) {
138 if (u == unicharset.
size() &&
static_cast<int>(u) != null_id) {
143 std::vector<char32> unicodes;
145 if (u < unicharset.
size()) {
150 int unicode = unicodes[0];
151 int leading, vowel, trailing;
152 auto it = radical_map.find(unicode);
153 if (it != radical_map.end()) {
155 int num_radicals = it->second->size();
156 for (
int c = 0; c < num_radicals; ++c) {
157 code.
Set(c, han_offset + (*it->second)[c]);
159 int pre_hash = RadicalPreHash(*it->second);
160 int num_samples = radical_counts[pre_hash]++;
161 if (num_samples > 0) {
167 code.
Set3(leading + hangul_offset, vowel +
kLCount + hangul_offset,
176 }
else if (
static_cast<int>(u) == null_id ||
182 for (
int uni : unicodes) {
183 int position = code.
length();
197 tprintf(
"Code space expanded from original unicharset!!\n");
203 encoder_.push_back(code);
210 for (
unsigned u = 0; u < unicharset.
size(); ++u) {
215 max_offset = std::max(max_offset, (*code)(
i)-han_offset);
216 code->
Set(
i, (*code)(
i) + code_offset);
218 if (max_offset == 0) {
221 code_offset += max_offset + 1;
223 DefragmentCodeValues(null_id >= 0 ? 1 : -1);
231 std::vector<RecodedCharID> codes;
232 for (
unsigned u = 0; u < unicharset.
size(); ++u) {
235 codes.push_back(code);
239 code.
Set(0, unicharset.
size());
240 codes.push_back(code);
254void UnicharCompress::DefragmentCodeValues(
int encoded_null) {
259 std::vector<int> offsets(code_range_);
261 for (
auto &code : encoder_) {
262 for (
int i = 0;
i < code.length(); ++
i) {
263 offsets[code(
i)] = 1;
268 for (
unsigned i = 0;
i < offsets.size(); ++
i) {
271 if (offsets[
i] == 0 ||
i ==
static_cast<unsigned>(encoded_null)) {
277 if (encoded_null >= 0) {
280 offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null;
283 for (
auto &c : encoder_) {
284 RecodedCharID *code = &c;
285 for (
int i = 0;
i < code->length(); ++
i) {
296 if (unichar_id >= encoder_.size()) {
299 *code = encoder_[unichar_id];
308 return INVALID_UNICHAR_ID;
310 auto it = decoder_.find(code);
311 if (it == decoder_.end()) {
312 return INVALID_UNICHAR_ID;
340 std::string encoding;
341 for (
unsigned c = 0; c < encoder_.size(); ++c) {
347 encoding += std::to_string(code(0));
348 for (
int i = 1;
i < code.
length(); ++
i) {
349 encoding +=
"," + std::to_string(code(
i));
352 if (c >= unicharset.
size() ||
354 encoding += kNullChar;
376 *leading = offset / kNCount;
377 *vowel = (offset % kNCount) /
kTCount;
383void UnicharCompress::ComputeCodeRange() {
385 for (
auto &code : encoder_) {
386 for (
int i = 0;
i < code.length(); ++
i) {
387 if (code(
i) > code_range_) {
388 code_range_ = code(
i);
396void UnicharCompress::SetupDecoder() {
398 is_valid_start_.clear();
399 is_valid_start_.resize(code_range_);
400 for (
unsigned c = 0; c < encoder_.size(); ++c) {
401 const RecodedCharID &code = encoder_[c];
403 is_valid_start_[code(0)] =
true;
404 RecodedCharID prefix = code;
405 int len = code.length() - 1;
406 prefix.Truncate(len);
407 auto final_it = final_codes_.find(prefix);
408 if (final_it == final_codes_.end()) {
409 auto *code_list =
new std::vector<int>;
410 code_list->push_back(code(len));
411 final_codes_[prefix] = code_list;
413 prefix.Truncate(len);
414 auto next_it = next_codes_.find(prefix);
415 if (next_it == next_codes_.end()) {
416 auto *code_list =
new std::vector<int>;
417 code_list->push_back(code(len));
418 next_codes_[prefix] = code_list;
422 if (!
contains(*next_it->second, code(len))) {
423 next_it->second->push_back(code(len));
429 if (!
contains(*final_it->second, code(len))) {
430 final_it->second->push_back(code(len));
437void UnicharCompress::Cleanup() {
439 is_valid_start_.clear();
440 for (
auto &next_code : next_codes_) {
441 delete next_code.second;
443 for (
auto &final_code : final_codes_) {
444 delete final_code.second;
447 final_codes_.clear();
std::unordered_map< int, std::unique_ptr< std::vector< int > > > RSMap
void tprintf(const char *format,...)
std::unordered_map< int, int > RSCounts
@ SPECIAL_UNICHAR_CODES_COUNT
const std::vector< std::string > split(const std::string &s, char c)
bool contains(const std::vector< T > &data, const T &value)
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
bool DeSerialize(std::string &data)
bool Serialize(const std::string &data)
void Set(int index, int value)
static const int kMaxCodeLen
void Set3(int code0, int code1, int code2)
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
static bool DecomposeHangul(int unicode, int *leading, int *vowel, int *trailing)
bool DeSerialize(TFile *fp)
static const int kNumHangul
UnicharCompress & operator=(const UnicharCompress &src)
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
static const int kFirstHangul
void SetupPassThrough(const UNICHARSET &unicharset)
int DecodeUnichar(const RecodedCharID &code) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
bool Serialize(TFile *fp) const
void SetupDirect(const std::vector< RecodedCharID > &codes)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
bool has_special_codes() const
const char * id_to_unichar(UNICHAR_ID id) const
bool contains_unichar(const char *const unichar_repr) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
static std::string CleanupString(const char *utf8_str)