tesseract v5.3.3.20231005
unicharcompress_test.cc
Go to the documentation of this file.
1// (C) Copyright 2017, Google Inc.
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5// http://www.apache.org/licenses/LICENSE-2.0
6// Unless required by applicable law or agreed to in writing, software
7// distributed under the License is distributed on an "AS IS" BASIS,
8// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9// See the License for the specific language governing permissions and
10// limitations under the License.
11
12#include <string>
13
14#include <allheaders.h>
15
16#include "include_gunit.h"
17#include "log.h" // for LOG
18#include "serialis.h"
19#include "tprintf.h"
20#include "unicharcompress.h"
21
22namespace tesseract {
23
25protected:
26 void SetUp() override {
27 std::locale::global(std::locale(""));
29 }
30
31 // Loads and compresses the given unicharset.
32 void LoadUnicharset(const std::string &unicharset_name) {
33 std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
34 std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
35 std::string radical_data;
36 CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
37 CHECK(unicharset_.load_from_file(unicharset_file.c_str()));
38 std::string radical_str(radical_data.c_str());
41 // Get the encoding of the null char.
42 RecodedCharID code;
44 encoded_null_char_ = code(0);
45 std::string output_name =
46 file::JoinPath(FLAGS_test_tmpdir, unicharset_name) + ".encoding.txt";
47 std::string encoding = compressed_.GetEncodingAsString(unicharset_);
48 std::string encoding_str(&encoding[0], encoding.size());
49 CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
50 LOG(INFO) << "Wrote encoding to:" << output_name;
51 }
52 // Serializes and de-serializes compressed_ over itself.
54 std::vector<char> data;
55 TFile wfp;
56 wfp.OpenWrite(&data);
58 TFile rfp;
59 rfp.Open(&data[0], data.size());
61 }
// Returns true if lang is one of the CJK language codes listed below.
// The match is exact and case-sensitive.
// Declared static because the check reads no fixture state.
static bool IsCJKLang(const std::string &lang) {
  static const char *const kCJKLangs[] = {"chi_sim", "chi_tra", "kor", "jpn"};
  for (const char *candidate : kCJKLangs) {
    if (lang == candidate) {
      return true;
    }
  }
  return false;
}
// Returns true if lang is one of the Indic language codes listed below.
// The match is exact and case-sensitive.
// Declared static because the check reads no fixture state.
static bool IsIndicLang(const std::string &lang) {
  static const char *const kIndicLangs[] = {
      "asm", "ben", "bih", "hin", "mar", "nep", "san", "bod", "dzo",
      "guj", "kan", "mal", "ori", "pan", "sin", "tam", "tel"};
  for (const char *candidate : kIndicLangs) {
    if (lang == candidate) {
      return true;
    }
  }
  return false;
}
73
74 // Expects the appropriate results from the compressed_ unicharset_.
75 void ExpectCorrect(const std::string &lang) {
76 // Count the number of times each code is used in each element of
77 // RecodedCharID.
78 RecodedCharID zeros;
79 for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
80 zeros.Set(i, 0);
81 }
82 int code_range = compressed_.code_range();
83 std::vector<RecodedCharID> times_seen(code_range, zeros);
84 for (int u = 0; u <= unicharset_.size(); ++u) {
85 if (u != UNICHAR_SPACE && u != null_char_ &&
86 (u == unicharset_.size() ||
88 continue; // Not used so not encoded.
89 }
90 RecodedCharID code;
91 int len = compressed_.EncodeUnichar(u, &code);
92 // Check round-trip encoding.
93 int unichar_id;
94 std::vector<UNICHAR_ID> normed_ids;
95 if (u == null_char_ || u == unicharset_.size()) {
96 unichar_id = null_char_;
97 } else {
98 unichar_id = u;
99 }
100 EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code));
101 // Check that the codes are valid.
102 for (int i = 0; i < len; ++i) {
103 int code_val = code(i);
104 EXPECT_GE(code_val, 0);
105 EXPECT_LT(code_val, code_range);
106 times_seen[code_val].Set(i, times_seen[code_val](i) + 1);
107 }
108 }
109 // Check that each code is used in at least one position.
110 for (int c = 0; c < code_range; ++c) {
111 int num_used = 0;
112 for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
113 if (times_seen[c](i) != 0) {
114 ++num_used;
115 }
116 }
117 EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range;
118 }
119 // Check that GetNextCodes/GetFinalCodes lists match the times_seen,
120 // and create valid codes.
121 RecodedCharID code;
122 CheckCodeExtensions(code, times_seen);
123 // Finally, we achieved all that using a codebook < 10% of the size of
124 // the original unicharset, for CK or Indic, and 20% with J, but just
125 // no bigger for all others.
126 if (IsCJKLang(lang) || IsIndicLang(lang)) {
127 EXPECT_LT(code_range, unicharset_.size() / (lang == "jpn" ? 5 : 10));
128 } else {
129 EXPECT_LE(code_range, unicharset_.size() + 1);
130 }
131 LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to " << code_range;
132 }
133 // Checks for extensions of the current code that either finish a code, or
134 // extend it and checks those extensions recursively.
136 const std::vector<RecodedCharID> &times_seen) {
137 RecodedCharID extended = code;
138 int length = code.length();
139 const std::vector<int> *final_codes = compressed_.GetFinalCodes(code);
140 if (final_codes != nullptr) {
141 for (int ending : *final_codes) {
142 EXPECT_GT(times_seen[ending](length), 0);
143 extended.Set(length, ending);
144 int unichar_id = compressed_.DecodeUnichar(extended);
145 EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
146 }
147 }
148 const std::vector<int> *next_codes = compressed_.GetNextCodes(code);
149 if (next_codes != nullptr) {
150 for (int extension : *next_codes) {
151 EXPECT_GT(times_seen[extension](length), 0);
152 extended.Set(length, extension);
153 CheckCodeExtensions(extended, times_seen);
154 }
155 }
156 }
157
161 // The encoding of the null_char_.
163};
164
166 LOG(INFO) << "Testing chi_tra";
167 LoadUnicharset("chi_tra.unicharset");
168 ExpectCorrect("chi_tra");
169 LOG(INFO) << "Testing chi_sim";
170 LoadUnicharset("chi_sim.unicharset");
171 ExpectCorrect("chi_sim");
172}
173
175 LOG(INFO) << "Testing jpn";
176 LoadUnicharset("jpn.unicharset");
177 ExpectCorrect("jpn");
178}
179
181 LOG(INFO) << "Testing kor";
182 LoadUnicharset("kor.unicharset");
183 ExpectCorrect("kor");
184}
185
187 LOG(INFO) << "Testing kan";
188 LoadUnicharset("kan.unicharset");
189 ExpectCorrect("kan");
190 SerializeAndUndo();
191 ExpectCorrect("kan");
192}
193
195 LOG(INFO) << "Testing mar";
196 LoadUnicharset("mar.unicharset");
197 ExpectCorrect("mar");
198}
199
201 LOG(INFO) << "Testing eng";
202 LoadUnicharset("eng.unicharset");
203 ExpectCorrect("eng");
204}
205
206// Tests that a unicharset that contains double-letter ligatures (eg ff) has
207// no null char in the encoding at all.
208TEST_F(UnicharcompressTest, DoesLigaturesWithDoubles) {
209 LOG(INFO) << "Testing por with ligatures";
210 LoadUnicharset("por.unicharset");
211 ExpectCorrect("por");
212 // Check that any unichar-id that is encoded with multiple codes has the
213 // correct encoded_nulll_char_ in between.
214 for (int u = 0; u <= unicharset_.size(); ++u) {
215 RecodedCharID code;
216 int len = compressed_.EncodeUnichar(u, &code);
217 if (len > 1) {
218 // The should not be any null char in the code.
219 for (int i = 0; i < len; ++i) {
220 EXPECT_NE(encoded_null_char_, code(i));
221 }
222 }
223 }
224}
225
226// Tests that GetEncodingAsString returns the right result for a trivial
227// unicharset.
228TEST_F(UnicharcompressTest, GetEncodingAsString) {
229 LoadUnicharset("trivial.unicharset");
230 ExpectCorrect("trivial");
231 std::string encoding = compressed_.GetEncodingAsString(unicharset_);
232 std::string encoding_str(&encoding[0], encoding.length());
233 std::vector<std::string> lines = split(encoding_str, '\n');
234 EXPECT_EQ(5, lines.size());
235 // The first line is always space.
236 EXPECT_EQ("0\t ", lines[0]);
237 // Next we have i.
238 EXPECT_EQ("1\ti", lines[1]);
239 // Next we have f.
240 EXPECT_EQ("2\tf", lines[2]);
241 // Next we have the fi ligature: fi. There are no nulls in it, as there are no
242 // repeated letter ligatures in this unicharset, unlike por.unicharset above.
243 EXPECT_EQ("2,1\tfi", lines[3]);
244 // Finally the null character.
245 EXPECT_EQ("3\t<nul>", lines[4]);
246}
247
248} // namespace tesseract
@ LOG
@ INFO
Definition: log.h:28
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:2043
#define EXPECT_NE(val1, val2)
Definition: gtest.h:2045
#define EXPECT_GT(val1, val2)
Definition: gtest.h:2053
#define EXPECT_GE(val1, val2)
Definition: gtest.h:2051
#define EXPECT_TRUE(condition)
Definition: gtest.h:1982
#define EXPECT_LE(val1, val2)
Definition: gtest.h:2047
#define EXPECT_LT(val1, val2)
Definition: gtest.h:2049
#define CHECK(condition)
Definition: include_gunit.h:76
#define CHECK_OK(test)
Definition: include_gunit.h:84
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ UNICHAR_BROKEN
Definition: unicharset.h:38
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40
const std::vector< std::string > split(const std::string &s, char c)
Definition: helpers.h:43
TEST_F(EuroText, FastLatinOCR)
void OpenWrite(std::vector< char > *data)
Definition: serialis.cpp:246
bool Open(const char *filename, FileReader reader)
Definition: serialis.cpp:140
void Set(int index, int value)
static const int kMaxCodeLen
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
const std::vector< int > * GetFinalCodes(const RecodedCharID &code) const
const std::vector< int > * GetNextCodes(const RecodedCharID &code) const
int DecodeUnichar(const RecodedCharID &code) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
bool Serialize(TFile *fp) const
bool has_special_codes() const
Definition: unicharset.h:756
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
size_t size() const
Definition: unicharset.h:355
static int Defaults()
Definition: include_gunit.h:61
static void MakeTmpdir()
Definition: include_gunit.h:38
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65
static bool SetContents(const std::string &name, const std::string &contents, bool)
Definition: include_gunit.h:56
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:52
void ExpectCorrect(const std::string &lang)
bool IsIndicLang(const std::string &lang)
void CheckCodeExtensions(const RecodedCharID &code, const std::vector< RecodedCharID > &times_seen)
bool IsCJKLang(const std::string &lang)
void LoadUnicharset(const std::string &unicharset_name)