tesseract v5.3.3.20231005
tesseract::UnicharcompressTest Class Reference
Inheritance diagram for tesseract::UnicharcompressTest:
testing::Test

Protected Member Functions

void SetUp () override
 
void LoadUnicharset (const std::string &unicharset_name)
 
void SerializeAndUndo ()
 
bool IsCJKLang (const std::string &lang)
 
bool IsIndicLang (const std::string &lang)
 
void ExpectCorrect (const std::string &lang)
 
void CheckCodeExtensions (const RecodedCharID &code, const std::vector< RecodedCharID > &times_seen)
 
- Protected Member Functions inherited from testing::Test
 Test ()
 
virtual void SetUp ()
 
virtual void TearDown ()
 

Protected Attributes

UnicharCompress compressed_
 
UNICHARSET unicharset_
 
int null_char_
 
int encoded_null_char_
 

Additional Inherited Members

- Public Member Functions inherited from testing::Test
virtual ~Test ()
 
- Static Public Member Functions inherited from testing::Test
static void SetUpTestSuite ()
 
static void TearDownTestSuite ()
 
static void TearDownTestCase ()
 
static void SetUpTestCase ()
 
static bool HasFatalFailure ()
 
static bool HasNonfatalFailure ()
 
static bool IsSkipped ()
 
static bool HasFailure ()
 
static void RecordProperty (const std::string &key, const std::string &value)
 
static void RecordProperty (const std::string &key, int value)
 

Detailed Description

Definition at line 24 of file unicharcompress_test.cc.

Member Function Documentation

◆ CheckCodeExtensions()

void tesseract::UnicharcompressTest::CheckCodeExtensions ( const RecodedCharID &  code,
const std::vector< RecodedCharID > &  times_seen 
)
inline protected

Definition at line 135 of file unicharcompress_test.cc.

136 {
137 RecodedCharID extended = code;
138 int length = code.length();
139 const std::vector<int> *final_codes = compressed_.GetFinalCodes(code);
140 if (final_codes != nullptr) {
141 for (int ending : *final_codes) {
142 EXPECT_GT(times_seen[ending](length), 0);
143 extended.Set(length, ending);
144 int unichar_id = compressed_.DecodeUnichar(extended);
145 EXPECT_NE(INVALID_UNICHAR_ID, unichar_id);
146 }
147 }
148 const std::vector<int> *next_codes = compressed_.GetNextCodes(code);
149 if (next_codes != nullptr) {
150 for (int extension : *next_codes) {
151 EXPECT_GT(times_seen[extension](length), 0);
152 extended.Set(length, extension);
153 CheckCodeExtensions(extended, times_seen);
154 }
155 }
156 }
#define EXPECT_NE(val1, val2)
Definition: gtest.h:2045
#define EXPECT_GT(val1, val2)
Definition: gtest.h:2053
const std::vector< int > * GetFinalCodes(const RecodedCharID &code) const
const std::vector< int > * GetNextCodes(const RecodedCharID &code) const
int DecodeUnichar(const RecodedCharID &code) const
void CheckCodeExtensions(const RecodedCharID &code, const std::vector< RecodedCharID > &times_seen)

◆ ExpectCorrect()

void tesseract::UnicharcompressTest::ExpectCorrect ( const std::string &  lang)
inline protected

Definition at line 75 of file unicharcompress_test.cc.

75 {
76 // Count the number of times each code is used in each element of
77 // RecodedCharID.
78 RecodedCharID zeros;
79 for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
80 zeros.Set(i, 0);
81 }
82 int code_range = compressed_.code_range();
83 std::vector<RecodedCharID> times_seen(code_range, zeros);
84 for (int u = 0; u <= unicharset_.size(); ++u) {
85 if (u != UNICHAR_SPACE && u != null_char_ &&
86 (u == unicharset_.size() ||
87 (unicharset_.has_special_codes() && u < SPECIAL_UNICHAR_CODES_COUNT))) {
88 continue; // Not used so not encoded.
89 }
90 RecodedCharID code;
91 int len = compressed_.EncodeUnichar(u, &code);
92 // Check round-trip encoding.
93 int unichar_id;
94 std::vector<UNICHAR_ID> normed_ids;
95 if (u == null_char_ || u == unicharset_.size()) {
96 unichar_id = null_char_;
97 } else {
98 unichar_id = u;
99 }
100 EXPECT_EQ(unichar_id, compressed_.DecodeUnichar(code));
101 // Check that the codes are valid.
102 for (int i = 0; i < len; ++i) {
103 int code_val = code(i);
104 EXPECT_GE(code_val, 0);
105 EXPECT_LT(code_val, code_range);
106 times_seen[code_val].Set(i, times_seen[code_val](i) + 1);
107 }
108 }
109 // Check that each code is used in at least one position.
110 for (int c = 0; c < code_range; ++c) {
111 int num_used = 0;
112 for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
113 if (times_seen[c](i) != 0) {
114 ++num_used;
115 }
116 }
117 EXPECT_GE(num_used, 1) << "c=" << c << "/" << code_range;
118 }
119 // Check that GetNextCodes/GetFinalCodes lists match the times_seen,
120 // and create valid codes.
121 RecodedCharID code;
122 CheckCodeExtensions(code, times_seen);
123 // Finally, we achieved all that using a codebook < 10% of the size of
124 // the original unicharset, for CK or Indic, and 20% with J, but just
125 // no bigger for all others.
126 if (IsCJKLang(lang) || IsIndicLang(lang)) {
127 EXPECT_LT(code_range, unicharset_.size() / (lang == "jpn" ? 5 : 10));
128 } else {
129 EXPECT_LE(code_range, unicharset_.size() + 1);
130 }
131 LOG(INFO) << "Compressed unicharset of " << unicharset_.size() << " to " << code_range;
132 }
@ LOG
@ INFO
Definition: log.h:28
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:2043
#define EXPECT_GE(val1, val2)
Definition: gtest.h:2051
#define EXPECT_LE(val1, val2)
Definition: gtest.h:2047
#define EXPECT_LT(val1, val2)
Definition: gtest.h:2049
@ UNICHAR_SPACE
Definition: unicharset.h:36
@ SPECIAL_UNICHAR_CODES_COUNT
Definition: unicharset.h:40
static const int kMaxCodeLen
int EncodeUnichar(unsigned unichar_id, RecodedCharID *code) const
bool has_special_codes() const
Definition: unicharset.h:756
size_t size() const
Definition: unicharset.h:355
bool IsIndicLang(const std::string &lang)
bool IsCJKLang(const std::string &lang)

◆ IsCJKLang()

bool tesseract::UnicharcompressTest::IsCJKLang ( const std::string &  lang)
inline protected

Definition at line 63 of file unicharcompress_test.cc.

63 {
64 return lang == "chi_sim" || lang == "chi_tra" || lang == "kor" || lang == "jpn";
65 }

◆ IsIndicLang()

bool tesseract::UnicharcompressTest::IsIndicLang ( const std::string &  lang)
inline protected

Definition at line 67 of file unicharcompress_test.cc.

67 {
68 return lang == "asm" || lang == "ben" || lang == "bih" || lang == "hin" || lang == "mar" ||
69 lang == "nep" || lang == "san" || lang == "bod" || lang == "dzo" || lang == "guj" ||
70 lang == "kan" || lang == "mal" || lang == "ori" || lang == "pan" || lang == "sin" ||
71 lang == "tam" || lang == "tel";
72 }

◆ LoadUnicharset()

void tesseract::UnicharcompressTest::LoadUnicharset ( const std::string &  unicharset_name)
inline protected

Definition at line 32 of file unicharcompress_test.cc.

32 {
33 std::string radical_stroke_file = file::JoinPath(LANGDATA_DIR, "radical-stroke.txt");
34 std::string unicharset_file = file::JoinPath(TESTDATA_DIR, unicharset_name);
35 std::string radical_data;
36 CHECK_OK(file::GetContents(radical_stroke_file, &radical_data, file::Defaults()));
37 CHECK(unicharset_.load_from_file(unicharset_file.c_str()));
38 std::string radical_str(radical_data.c_str());
41 // Get the encoding of the null char.
42 RecodedCharID code;
44 encoded_null_char_ = code(0);
45 std::string output_name =
46 file::JoinPath(FLAGS_test_tmpdir, unicharset_name) + ".encoding.txt";
47 std::string encoding = compressed_.GetEncodingAsString(unicharset_);
48 std::string encoding_str(&encoding[0], encoding.size());
49 CHECK_OK(file::SetContents(output_name, encoding_str, file::Defaults()));
50 LOG(INFO) << "Wrote encoding to:" << output_name;
51 }
#define CHECK(condition)
Definition: include_gunit.h:76
#define CHECK_OK(test)
Definition: include_gunit.h:84
@ UNICHAR_BROKEN
Definition: unicharset.h:38
std::string GetEncodingAsString(const UNICHARSET &unicharset) const
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table)
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
static int Defaults()
Definition: include_gunit.h:61
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65
static bool SetContents(const std::string &name, const std::string &contents, bool)
Definition: include_gunit.h:56
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:52

◆ SerializeAndUndo()

void tesseract::UnicharcompressTest::SerializeAndUndo ( )
inline protected

Definition at line 53 of file unicharcompress_test.cc.

53 {
54 std::vector<char> data;
55 TFile wfp;
56 wfp.OpenWrite(&data);
58 TFile rfp;
59 rfp.Open(&data[0], data.size());
61 }
#define EXPECT_TRUE(condition)
Definition: gtest.h:1982
bool Serialize(TFile *fp) const

◆ SetUp()

void tesseract::UnicharcompressTest::SetUp ( )
inline override protected virtual

Reimplemented from testing::Test.

Definition at line 26 of file unicharcompress_test.cc.

26 {
27 std::locale::global(std::locale(""));
29 }
static void MakeTmpdir()
Definition: include_gunit.h:38

Member Data Documentation

◆ compressed_

UnicharCompress tesseract::UnicharcompressTest::compressed_
protected

Definition at line 158 of file unicharcompress_test.cc.

◆ encoded_null_char_

int tesseract::UnicharcompressTest::encoded_null_char_
protected

Definition at line 162 of file unicharcompress_test.cc.

◆ null_char_

int tesseract::UnicharcompressTest::null_char_
protected

Definition at line 160 of file unicharcompress_test.cc.

◆ unicharset_

UNICHARSET tesseract::UnicharcompressTest::unicharset_
protected

Definition at line 159 of file unicharcompress_test.cc.


The documentation for this class was generated from the following file: