tesseract  4.00.00dev
unicharcompress_test.cc
Go to the documentation of this file.
1 // (C) Copyright 2017, Google Inc.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 // http://www.apache.org/licenses/LICENSE-2.0
6 // Unless required by applicable law or agreed to in writing, software
7 // distributed under the License is distributed on an "AS IS" BASIS,
8 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9 // See the License for the specific language governing permissions and
10 // limitations under the License.
11 #include "unicharcompress.h"
12 #include "gunit.h"
13 #include "serialis.h"
14 #include "printf.h"
15 
16 namespace tesseract {
17 namespace {
18 
19 class UnicharcompressTest : public ::testing::Test {
20  protected:
21  // Loads and compresses the given unicharset.
22  void LoadUnicharset(const string& unicharset_name) {
23  string radical_stroke_file =
24  file::JoinPath(FLAGS_test_srcdir,
25  "langdata/radical-stroke.txt");
26  string unicharset_file = file::JoinPath(
27  FLAGS_test_srcdir, "testdata",
28  unicharset_name);
29  string uni_data;
30  CHECK_OK(file::GetContents(unicharset_file, &uni_data, file::Defaults()));
31  string radical_data;
32  CHECK_OK(file::GetContents(radical_stroke_file, &radical_data,
33  file::Defaults()));
34  CHECK(
35  unicharset_.load_from_inmemory_file(uni_data.data(), uni_data.size()));
36  STRING radical_str(radical_data.c_str());
37  null_char_ =
39  compressed_.ComputeEncoding(unicharset_, null_char_, &radical_str);
40  // Get the encoding of the null char.
41  RecodedCharID code;
42  compressed_.EncodeUnichar(null_char_, &code);
43  encoded_null_char_ = code(0);
44  string output_name = file::JoinPath(
45  FLAGS_test_tmpdir, absl::StrCat(unicharset_name, ".encoding.txt"));
46  STRING encoding = compressed_.GetEncodingAsString(unicharset_);
47  string encoding_str(&encoding[0], encoding.size());
48  CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
49  LOG(INFO) << "Wrote encoding to:" << output_name;
50  }
51  // Serializes and de-serializes compressed_ over itself.
52  void SerializeAndUndo() {
54  TFile wfp;
55  wfp.OpenWrite(&data);
56  EXPECT_TRUE(compressed_.Serialize(&wfp));
57  TFile rfp;
58  rfp.Open(&data[0], data.size());
59  EXPECT_TRUE(compressed_.DeSerialize(&rfp));
60  }
61  // Returns true if the lang is in CJK.
62  bool IsCJKLang(const string& lang) {
63  return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" ||
64  lang == "jpn";
65  }
66  // Returns true if the lang is Indic.
67  bool IsIndicLang(const string& lang) {
68  return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" ||
69  lang == "mar" || lang == "nep" || lang == "san" || lang == "bod" ||
70  lang == "dzo" || lang == "guj" || lang == "kan" || lang == "mal" ||
71  lang == "ori" || lang == "pan" || lang == "sin" || lang == "tam" ||
72  lang == "tel";
73  }
74 
75  // Expects the appropriate results from the compressed_ unicharset_.
76  void ExpectCorrect(const string& lang) {
77  // Count the number of times each code is used in each element of
78  // RecodedCharID.
79  RecodedCharID zeros;
80  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) zeros.Set(i, 0);
81  int code_range = compressed_.code_range();
82  std::vector<RecodedCharID> times_seen(code_range, zeros);
83  for (int u = 0; u <= unicharset_.size(); ++u) {
84  if (u != UNICHAR_SPACE && u != null_char_ &&
87  continue; // Not used so not encoded.
88  }
89  RecodedCharID code;
90  int len = compressed_.EncodeUnichar(u, &code);
91  // Check round-trip encoding.
92  int unichar_id;
93  GenericVector<UNICHAR_ID> normed_ids;
94  if (u == null_char_ || u == unicharset_.size()) {
95  unichar_id = null_char_;
96  } else {
97  unichar_id = u;
98  }
99  EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code));
100  // Check that the codes are valid.
101  for (int i = 0; i < len; ++i) {
102  int code_val = code(i);
103  EXPECT_GE(code_val, 0);
104  EXPECT_LT(code_val, code_range);
105  times_seen[code_val].Set(i, times_seen[code_val](i) + 1);
106  }
107  }
108  // Check that each code is used in at least one position.
109  for (int c = 0; c < code_range; ++c) {
110  int num_used = 0;
111  for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
112  if (times_seen[c](i) != 0) ++num_used;
113  }
114  EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range;
115  }
116  // Check that GetNextCodes/GetFinalCodes lists match the times_seen,
117  // and create valid codes.
118  RecodedCharID code;
119  CheckCodeExtensions(code, times_seen);
120  // Finally, we achieved all that using a codebook < 10% of the size of
121  // the original unicharset, for CK or Indic, and 20% with J, but just
122  // no bigger for all others.
123  if (IsCJKLang(lang) || IsIndicLang(lang)) {
124  EXPECT_LT(code_range, unicharset_.size() / (lang == "jpn" ? 5 : 10));
125  } else {
126  EXPECT_LE(code_range, unicharset_.size() + 1);
127  }
128  LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to "
129  << code_range;
130  }
131  // Checks for extensions of the current code that either finish a code, or
132  // extend it and checks those extensions recursively.
133  void CheckCodeExtensions(const RecodedCharID& code,
134  const std::vector<RecodedCharID>& times_seen) {
135  RecodedCharID extended = code;
136  int length = code.length();
137  const GenericVector<int>* final_codes = compressed_.GetFinalCodes(code);
138  if (final_codes != NULL) {
139  for (int i = 0; i < final_codes->size(); ++i) {
140  int ending = (*final_codes)[i];
141  EXPECT_GT(times_seen[ending](length), 0);
142  extended.Set(length, ending);
143  int unichar_id = compressed_.DecodeUnichar(extended);
144  EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
145  }
146  }
147  const GenericVector<int>* next_codes = compressed_.GetNextCodes(code);
148  if (next_codes != NULL) {
149  for (int i = 0; i < next_codes->size(); ++i) {
150  int extension = (*next_codes)[i];
151  EXPECT_GT(times_seen[extension](length), 0);
152  extended.Set(length, extension);
153  CheckCodeExtensions(extended, times_seen);
154  }
155  }
156  }
157 
158  UnicharCompress compressed_;
161  // The encoding of the null_char_.
163 };
164 
165 TEST_F(UnicharcompressTest, DoesChinese) {
166  LOG(INFO) << "Testing chi_tra";
167  LoadUnicharset("chi_tra.unicharset");
168  ExpectCorrect("chi_tra");
169  LOG(INFO) << "Testing chi_sim";
170  LoadUnicharset("chi_sim.unicharset");
171  ExpectCorrect("chi_sim");
172 }
173 
174 TEST_F(UnicharcompressTest, DoesJapanese) {
175  LOG(INFO) << "Testing jpn";
176  LoadUnicharset("jpn.unicharset");
177  ExpectCorrect("jpn");
178 }
179 
180 TEST_F(UnicharcompressTest, DoesKorean) {
181  LOG(INFO) << "Testing kor";
182  LoadUnicharset("kor.unicharset");
183  ExpectCorrect("kor");
184 }
185 
186 TEST_F(UnicharcompressTest, DoesKannada) {
187  LOG(INFO) << "Testing kan";
188  LoadUnicharset("kan.unicharset");
189  ExpectCorrect("kan");
190  SerializeAndUndo();
191  ExpectCorrect("kan");
192 }
193 
194 TEST_F(UnicharcompressTest, DoesMarathi) {
195  LOG(INFO) << "Testing mar";
196  LoadUnicharset("mar.unicharset");
197  ExpectCorrect("mar");
198 }
199 
200 TEST_F(UnicharcompressTest, DoesEnglish) {
201  LOG(INFO) << "Testing eng";
202  LoadUnicharset("eng.unicharset");
203  ExpectCorrect("eng");
204 }
205 
206 // Tests that a unicharset that contains double-letter ligatures (eg ff) has
207 // no null char in the encoding at all.
208 TEST_F(UnicharcompressTest, DoesLigaturesWithDoubles) {
209  LOG(INFO) << "Testing por with ligatures";
210  LoadUnicharset("por.unicharset");
211  ExpectCorrect("por");
212  // Check that any unichar-id that is encoded with multiple codes has the
213  // correct encoded_nulll_char_ in between.
214  for (int u = 0; u <= unicharset_.size(); ++u) {
215  RecodedCharID code;
216  int len = compressed_.EncodeUnichar(u, &code);
217  if (len > 1) {
218  // The should not be any null char in the code.
219  for (int i = 0; i < len; ++i) {
220  EXPECT_NE(encoded_null_char_, code(i));
221  }
222  }
223  }
224 }
225 
226 // Tests that GetEncodingAsString returns the right result for a trivial
227 // unicharset.
228 TEST_F(UnicharcompressTest, GetEncodingAsString) {
229  LoadUnicharset("trivial.unicharset");
230  ExpectCorrect("trivial");
231  STRING encoding = compressed_.GetEncodingAsString(unicharset_);
232  string encoding_str(&encoding[0], encoding.length());
233  std::vector<string> lines =
234  strings::Split(encoding_str, "\n", strings::SkipEmpty());
235  EXPECT_EQ(5, lines.size());
236  // The first line is always space.
237  EXPECT_EQ("0\t ", lines[0]);
238  // Next we have i.
239  EXPECT_EQ("1\ti", lines[1]);
240  // Next we have f.
241  EXPECT_EQ("2\tf", lines[2]);
242  // Next we have the fi ligature: fi. There are no nulls in it, as there are no
243  // repeated letter ligatures in this unicharset, unlike por.unicharset above.
244  EXPECT_EQ("2,1\tfi", lines[3]);
245  // Finally the null character.
246  EXPECT_EQ("3\t<nul>", lines[4]);
247 }
248 
249 } // namespace
250 } // namespace tesseract
inT32 size() const
Definition: strngs.h:69
UnicharCompress compressed_
static const int kMaxCodeLen
int size() const
Definition: genericvector.h:72
int size() const
Definition: unicharset.h:338
bool has_special_codes() const
Definition: unicharset.h:721
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
Definition: unicharset.cpp:752
UNICHARSET unicharset_
Definition: strngs.h:45
int null_char_
int encoded_null_char_
inT32 length() const
Definition: strngs.cpp:193