tesseract v5.3.3.20231005
unicharset_test.cc
Go to the documentation of this file.
1// (C) Copyright 2017, Google Inc.
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5// http://www.apache.org/licenses/LICENSE-2.0
6// Unless required by applicable law or agreed to in writing, software
7// distributed under the License is distributed on an "AS IS" BASIS,
8// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9// See the License for the specific language governing permissions and
10// limitations under the License.
11
12#include "unicharset.h"
13#include <string>
14#include "gmock/gmock.h" // for testing::ElementsAreArray
15#include "include_gunit.h"
16#include "log.h" // for LOG
17
18using testing::ElementsAreArray;
19
20namespace tesseract {
21
23protected:
24 void SetUp() override {
25 std::locale::global(std::locale(""));
26 }
27};
28
30 // This test verifies basic insertion, unichar_to_id, and encode.
31 UNICHARSET u;
32 u.unichar_insert("a");
33 EXPECT_EQ(u.size(), 4);
34 u.unichar_insert("f");
35 EXPECT_EQ(u.size(), 5);
36 u.unichar_insert("i");
37 EXPECT_EQ(u.size(), 6);
38 // The fi ligature is NOT added because it can be encoded with a cleanup as f
39 // then i.
40 u.unichar_insert("\ufb01");
41 EXPECT_EQ(u.size(), 6);
42 u.unichar_insert("e");
43 EXPECT_EQ(u.size(), 7);
44 u.unichar_insert("n");
45 EXPECT_EQ(u.size(), 8);
46 EXPECT_EQ(u.unichar_to_id("f"), 4);
47 EXPECT_EQ(u.unichar_to_id("i"), 5);
48 // The fi ligature has no valid id.
49 EXPECT_EQ(u.unichar_to_id("\ufb01"), INVALID_UNICHAR_ID);
50 // The fi pair has no valid id.
51 EXPECT_EQ(u.unichar_to_id("fi"), INVALID_UNICHAR_ID);
52 std::vector<int> labels;
53 EXPECT_TRUE(u.encode_string("affine", true, &labels, nullptr, nullptr));
54 std::vector<int> v(&labels[0], &labels[0] + labels.size());
55 EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6}));
56 // With the fi ligature encoding fails without a pre-cleanup.
57 std::string lig_str = "af\ufb01ne";
58 EXPECT_FALSE(u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));
59 lig_str = u.CleanupString(lig_str.c_str());
60 EXPECT_TRUE(u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));
61 v = std::vector<int>(&labels[0], &labels[0] + labels.size());
62 EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6}));
63}
64
65TEST(UnicharsetTest, Multibyte) {
66 // This test verifies basic insertion, unichar_to_id, and encode.
67 // The difference from Basic above is that now we are testing multi-byte
68 // unicodes instead of single byte.
69 UNICHARSET u;
70 // Insert some Arabic letters.
71 u.unichar_insert("\u0627");
72 EXPECT_EQ(u.size(), 4);
73 u.unichar_insert("\u062c");
74 EXPECT_EQ(u.size(), 5);
75 u.unichar_insert("\u062f");
76 EXPECT_EQ(u.size(), 6);
77 u.unichar_insert("\ufb01"); // fi ligature is added as fi pair.
78 EXPECT_EQ(u.size(), 7);
79 u.unichar_insert("\u062b");
80 EXPECT_EQ(u.size(), 8);
81 u.unichar_insert("\u0635");
82 EXPECT_EQ(u.size(), 9);
83 EXPECT_EQ(u.unichar_to_id("\u0627"), 3);
84 EXPECT_EQ(u.unichar_to_id("\u062c"), 4);
85 // The first two bytes of this string is \u0627, which matches id 3;
86 EXPECT_EQ(u.unichar_to_id("\u0627\u062c", 2), 3);
87 EXPECT_EQ(u.unichar_to_id("\u062f"), 5);
88 // Individual f and i are not present, but they are there as a pair.
89 EXPECT_EQ(u.unichar_to_id("f"), INVALID_UNICHAR_ID);
90 EXPECT_EQ(u.unichar_to_id("i"), INVALID_UNICHAR_ID);
91 EXPECT_EQ(u.unichar_to_id("fi"), 6);
92 // The fi ligature is findable.
93 EXPECT_EQ(u.unichar_to_id("\ufb01"), 6);
94 std::vector<int> labels;
96 u.encode_string("\u0627\u062c\u062c\u062f\u0635\u062b", true, &labels, nullptr, nullptr));
97 std::vector<int> v(&labels[0], &labels[0] + labels.size());
98 EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 8, 7}));
99 // With the fi ligature the fi is picked out.
100 std::vector<char> lengths;
101 unsigned encoded_length;
102 std::string src_str = "\u0627\u062c\ufb01\u0635\u062b";
103 // src_str has to be pre-cleaned for lengths to be correct.
104 std::string cleaned = u.CleanupString(src_str.c_str());
105 EXPECT_TRUE(u.encode_string(cleaned.c_str(), true, &labels, &lengths, &encoded_length));
106 EXPECT_EQ(encoded_length, cleaned.size());
107 std::string len_str(&lengths[0], lengths.size());
108 EXPECT_STREQ(len_str.c_str(), "\002\002\002\002\002");
109 v = std::vector<int>(&labels[0], &labels[0] + labels.size());
110 EXPECT_THAT(v, ElementsAreArray({3, 4, 6, 8, 7}));
111}
112
113TEST(UnicharsetTest, MultibyteBigrams) {
114 // This test verifies basic insertion, unichar_to_id, and encode.
115 // The difference from Basic above is that now we are testing multi-byte
116 // unicodes instead of single byte.
117 UNICHARSET u;
118 // Insert some Arabic letters.
119 u.unichar_insert("\u0c9c");
120 EXPECT_EQ(u.size(), 4);
121 u.unichar_insert("\u0cad");
122 EXPECT_EQ(u.size(), 5);
123 u.unichar_insert("\u0ccd\u0c9c");
124 EXPECT_EQ(u.size(), 6);
125 u.unichar_insert("\u0ccd");
126 EXPECT_EQ(u.size(), 7);
127 // By default the encodable bigram is NOT added.
128 u.unichar_insert("\u0ccd\u0cad");
129 EXPECT_EQ(u.size(), 7);
130 // It is added if we force it to be.
131 u.unichar_insert("\u0ccd\u0cad", OldUncleanUnichars::kTrue);
132 EXPECT_EQ(u.size(), 8);
133 std::vector<char> data;
135 fp.OpenWrite(&data);
136 u.save_to_file(&fp);
137 fp.Open(&data[0], data.size());
138 UNICHARSET v;
139 v.load_from_file(&fp, false);
140 EXPECT_EQ(v.unichar_to_id("\u0c9c"), 3);
141 EXPECT_EQ(v.unichar_to_id("\u0cad"), 4);
142 EXPECT_EQ(v.unichar_to_id("\u0ccd\u0c9c"), 5);
143 EXPECT_EQ(v.unichar_to_id("\u0ccd"), 6);
144 EXPECT_EQ(v.unichar_to_id("\u0ccd\u0cad"), 7);
145}
146
148 // This test verifies an old unicharset that contains fi/fl ligatures loads
149 // and keeps all the entries.
150 std::string filename = file::JoinPath(TESTDATA_DIR, "eng.unicharset");
151 UNICHARSET u;
152 LOG(INFO) << "Filename=" << filename;
153 EXPECT_TRUE(u.load_from_file(filename.c_str()));
154 EXPECT_EQ(u.size(), 111);
155}
156
157} // namespace tesseract
@ LOG
@ INFO
Definition: log.h:28
#define EXPECT_THAT(value, matcher)
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:2043
#define EXPECT_TRUE(condition)
Definition: gtest.h:1982
#define EXPECT_STREQ(s1, s2)
Definition: gtest.h:2112
#define EXPECT_FALSE(condition)
Definition: gtest.h:1986
TEST(TesseractInstanceTest, TestMultipleTessInstances)
void OpenWrite(std::vector< char > *data)
Definition: serialis.cpp:246
bool Open(const char *filename, FileReader reader)
Definition: serialis.cpp:140
bool encode_string(const char *str, bool give_up_on_failure, std::vector< UNICHAR_ID > *encoding, std::vector< char > *lengths, unsigned *encoded_length) const
Definition: unicharset.cpp:239
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:391
bool save_to_file(const char *const filename) const
Definition: unicharset.h:361
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:186
size_t size() const
Definition: unicharset.h:355
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:265
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65