19 class UnicharcompressTest :
public ::testing::Test {
// Fixture helper: reads the unicharset file named by unicharset_name and the
// radical-stroke table from the test source tree, then writes an encoding
// string to "<unicharset_name>.encoding.txt" under FLAGS_test_tmpdir.
// NOTE(review): several statements of this function (including where
// `encoding` is produced) are not visible in this excerpt.
22 void LoadUnicharset(
const string& unicharset_name) {
// Input paths are built relative to FLAGS_test_srcdir.
23 string radical_stroke_file =
24 file::JoinPath(FLAGS_test_srcdir,
25 "langdata/radical-stroke.txt");
26 string unicharset_file = file::JoinPath(
27 FLAGS_test_srcdir,
"testdata",
// Both files are read whole into strings; each read status goes through CHECK_OK.
30 CHECK_OK(file::GetContents(unicharset_file, &uni_data, file::Defaults()));
32 CHECK_OK(file::GetContents(radical_stroke_file, &radical_data,
// The radical-stroke text is copied into a STRING.
36 STRING radical_str(radical_data.c_str());
// The encoding is saved in the temp directory under a name derived from the
// unicharset name.
44 string output_name = file::JoinPath(
45 FLAGS_test_tmpdir, absl::StrCat(unicharset_name,
".encoding.txt"));
// Copy `encoding` into a string using an explicit pointer and length.
47 string encoding_str(&encoding[0], encoding.
size());
48 CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
49 LOG(INFO) <<
"Wrote encoding to:" << output_name;
// Fixture helper taking no arguments.
// NOTE(review): only one statement is visible in this excerpt; judging by the
// name this presumably round-trips the compressor through serialization -
// TODO confirm against the full file.
52 void SerializeAndUndo() {
// Opens `rfp` over the in-memory bytes held in `data`.
58 rfp.Open(&data[0], data.
size());
// Returns true when lang equals one of the language codes compared below
// ("chi_sim", "chi_tra", "kor", ...).
// NOTE(review): the comparison chain continues past the last visible line of
// this excerpt, so the full list of accepted codes is not shown here.
62 bool IsCJKLang(
const string&
lang) {
63 return lang ==
"chi_sim" || lang ==
"chi_tra" || lang ==
"kor" ||
// Returns true when lang equals one of the language codes compared below.
// The match is an exact string comparison against each code in turn.
// NOTE(review): the chain ends with a trailing `||`, so further codes follow
// that are not visible in this excerpt.
67 bool IsIndicLang(
const string&
lang) {
68 return lang ==
"asm" || lang ==
"ben" || lang ==
"bih" || lang ==
"hin" ||
69 lang ==
"mar" || lang ==
"nep" || lang ==
"san" || lang ==
"bod" ||
70 lang ==
"dzo" || lang ==
"guj" || lang ==
"kan" || lang ==
"mal" ||
71 lang ==
"ori" || lang ==
"pan" || lang ==
"sin" || lang ==
"tam" ||
// Fixture helper: checks the encoding held in compressed_ for the language
// code lang. The visible checks are: each code decodes back to its unichar
// id, every code value lies in [0, code_range), every code value is used at
// least once, and the extensions of each code are consistent (via
// CheckCodeExtensions).
// NOTE(review): many statements (the enclosing loops, the definitions of
// code_range, zeros, code, len, unichar_id) are not visible in this excerpt.
76 void ExpectCorrect(
const string&
lang) {
// One RecodedCharID per code value, initialised from `zeros`; it is used
// below as a per-position usage counter.
82 std::vector<RecodedCharID> times_seen(code_range, zeros);
// Decoding the code must give back the unichar id it was produced for.
99 EXPECT_EQ(unichar_id,
compressed_.DecodeUnichar(code));
// Range-check each element of the code and count its use at position i.
101 for (
int i = 0; i < len; ++i) {
102 int code_val = code(i);
103 EXPECT_GE(code_val, 0);
104 EXPECT_LT(code_val, code_range);
105 times_seen[code_val].Set(i, times_seen[code_val](i) + 1);
// For each code value c, num_used counts the positions at which c was seen;
// it must be at least 1. (The loop over i is not visible in this excerpt.)
109 for (
int c = 0; c < code_range; ++c) {
112 if (times_seen[c](i) != 0) ++num_used;
114 EXPECT_GE(num_used, 1) <<
"c=" << c <<
"/" << code_range;
119 CheckCodeExtensions(code, times_seen);
// Extra handling for CJK and Indic languages; the branch body is not visible
// in this excerpt.
123 if (IsCJKLang(lang) || IsIndicLang(lang)) {
128 LOG(INFO) <<
"Compressed unicharset of " <<
unicharset_.
size() <<
" to " 133 void CheckCodeExtensions(
const RecodedCharID& code,
134 const std::vector<RecodedCharID>& times_seen) {
// CheckCodeExtensions: given a code prefix `code` and the usage counters
// built by ExpectCorrect, checks every listed continuation of the prefix.
// NOTE(review): the excerpt fuses the end of ExpectCorrect with this
// signature, and the statements that obtain final_codes / next_codes are
// not visible.
// `extended` is a working copy whose element at index `length` is overwritten
// with each candidate continuation.
135 RecodedCharID extended = code;
136 int length = code.length();
// Each final code must have been seen at position `length`, and the prefix
// plus that code must decode to a valid unichar id.
138 if (final_codes != NULL) {
139 for (
int i = 0; i < final_codes->
size(); ++i) {
140 int ending = (*final_codes)[i];
141 EXPECT_GT(times_seen[ending](length), 0);
142 extended.Set(length, ending);
143 int unichar_id =
compressed_.DecodeUnichar(extended);
144 EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
// Each next code must have been seen at position `length`; the lengthened
// prefix is then checked recursively.
148 if (next_codes != NULL) {
149 for (
int i = 0; i < next_codes->
size(); ++i) {
150 int extension = (*next_codes)[i];
151 EXPECT_GT(times_seen[extension](length), 0);
152 extended.Set(length, extension);
153 CheckCodeExtensions(extended, times_seen);
// Loads the chi_tra unicharset and runs ExpectCorrect on it, then does the
// same for chi_sim, logging which language is under test before each.
165 TEST_F(UnicharcompressTest, DoesChinese) {
166 LOG(INFO) <<
"Testing chi_tra";
167 LoadUnicharset(
"chi_tra.unicharset");
168 ExpectCorrect(
"chi_tra");
169 LOG(INFO) <<
"Testing chi_sim";
170 LoadUnicharset(
"chi_sim.unicharset");
171 ExpectCorrect(
"chi_sim");
// Loads the jpn unicharset and runs ExpectCorrect on it.
174 TEST_F(UnicharcompressTest, DoesJapanese) {
175 LOG(INFO) <<
"Testing jpn";
176 LoadUnicharset(
"jpn.unicharset");
177 ExpectCorrect(
"jpn");
// Loads the kor unicharset and runs ExpectCorrect on it.
180 TEST_F(UnicharcompressTest, DoesKorean) {
181 LOG(INFO) <<
"Testing kor";
182 LoadUnicharset(
"kor.unicharset");
183 ExpectCorrect(
"kor");
// Loads the kan unicharset and runs ExpectCorrect on it twice.
// NOTE(review): a statement between the two ExpectCorrect calls is not
// visible in this excerpt; presumably the second call re-validates after
// that step - TODO confirm against the full file.
186 TEST_F(UnicharcompressTest, DoesKannada) {
187 LOG(INFO) <<
"Testing kan";
188 LoadUnicharset(
"kan.unicharset");
189 ExpectCorrect(
"kan");
191 ExpectCorrect(
"kan");
// Loads the mar unicharset and runs ExpectCorrect on it.
194 TEST_F(UnicharcompressTest, DoesMarathi) {
195 LOG(INFO) <<
"Testing mar";
196 LoadUnicharset(
"mar.unicharset");
197 ExpectCorrect(
"mar");
// Loads the eng unicharset and runs ExpectCorrect on it.
200 TEST_F(UnicharcompressTest, DoesEnglish) {
201 LOG(INFO) <<
"Testing eng";
202 LoadUnicharset(
"eng.unicharset");
203 ExpectCorrect(
"eng");
// Loads the por unicharset and runs ExpectCorrect on it, then iterates over
// `len` elements.
// NOTE(review): the statements that define `len` and the body of the loop
// are not visible in this excerpt, so the ligature-specific checks cannot be
// described from here.
208 TEST_F(UnicharcompressTest, DoesLigaturesWithDoubles) {
209 LOG(INFO) <<
"Testing por with ligatures";
210 LoadUnicharset(
"por.unicharset");
211 ExpectCorrect(
"por");
219 for (
int i = 0; i < len; ++i) {
// Loads the trivial unicharset, validates it with ExpectCorrect, then checks
// the textual form of the encoding line by line.
// NOTE(review): the statement that produces `encoding` is not visible in
// this excerpt.
228 TEST_F(UnicharcompressTest, GetEncodingAsString) {
229 LoadUnicharset(
"trivial.unicharset");
230 ExpectCorrect(
"trivial");
// Copy `encoding` into a string using an explicit pointer and length.
232 string encoding_str(&encoding[0], encoding.
length());
// Split on newlines, dropping empty pieces; exactly five lines are expected.
233 std::vector<string> lines =
234 strings::Split(encoding_str,
"\n", strings::SkipEmpty());
235 EXPECT_EQ(5, lines.size());
// Each expected line is a comma-separated code, a tab, then the text it
// encodes. Space, "i" and "f" get the single codes 0, 1 and 2.
237 EXPECT_EQ(
"0\t ", lines[0]);
239 EXPECT_EQ(
"1\ti", lines[1]);
241 EXPECT_EQ(
"2\tf", lines[2]);
// "fi" is expected as the two-element code "2,1", i.e. the codes listed
// above for "f" and "i".
244 EXPECT_EQ(
"2,1\tfi", lines[3]);
246 EXPECT_EQ(
"3\t<nul>", lines[4]);
UnicharCompress compressed_
static const int kMaxCodeLen
bool has_special_codes() const
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)