tesseract v5.3.3.20231005
tatweel_test.cc
Go to the documentation of this file.
1// (C) Copyright 2017, Google Inc.
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5// http://www.apache.org/licenses/LICENSE-2.0
6// Unless required by applicable law or agreed to in writing, software
7// distributed under the License is distributed on an "AS IS" BASIS,
8// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9// See the License for the specific language governing permissions and
10// limitations under the License.
11
12#if defined(_WIN32)
13# include <io.h> // for _access
14#else
15# include <unistd.h> // for access
16#endif
17
18#include "dawg.h"
19#include "include_gunit.h"
20#include "trie.h"
21#include "unicharset.h"
22#include "util/utf8/unicodetext.h" // for UnicodeText
23
24namespace tesseract {
25
// Stand-in for std::filesystem::exists (C++17).
// Returns true when the platform access call (_access on Windows, access
// elsewhere) succeeds for `filename` with mode 0.
static bool file_exists(const char *filename) {
#if defined(_WIN32)
  const int status = _access(filename, 0);
#else
  const int status = access(filename, 0);
#endif
  return status == 0;
}
34
36protected:
37 void SetUp() override {
38 static std::locale system_locale("");
39 std::locale::global(system_locale);
40 }
41
43 std::string filename = TestDataNameToPath("ara.wordlist");
44 if (file_exists(filename.c_str())) {
45 std::string wordlist("\u0640");
46 CHECK_OK(file::GetContents(filename, &wordlist, file::Defaults()));
47 // Put all the unicodes in the unicharset_.
48 UnicodeText text;
49 text.PointToUTF8(wordlist.data(), wordlist.size());
50 int num_tatweel = 0;
51 for (auto it = text.begin(); it != text.end(); ++it) {
52 std::string utf8 = it.get_utf8_string();
53 if (utf8.find("\u0640") != std::string::npos)
54 ++num_tatweel;
55 unicharset_.unichar_insert(utf8.c_str());
56 }
57 LOG(INFO) << "Num tatweels in source data=" << num_tatweel;
58 EXPECT_GT(num_tatweel, 0);
59 }
60 }
61
62 std::string TestDataNameToPath(const std::string &name) {
63 return file::JoinPath(TESTDATA_DIR, name);
64 }
66};
67
68TEST_F(TatweelTest, UnicharsetIgnoresTatweel) {
69 // This test verifies that the unicharset ignores the Tatweel character.
70 for (int i = 0; i < unicharset_.size(); ++i) {
71 const char *utf8 = unicharset_.id_to_unichar(i);
72 EXPECT_EQ(strstr(utf8, reinterpret_cast<const char *>(u8"\u0640")), nullptr);
73 }
74}
75
76TEST_F(TatweelTest, DictIgnoresTatweel) {
77 // This test verifies that the dictionary ignores the Tatweel character.
78 tesseract::Trie trie(tesseract::DAWG_TYPE_WORD, "ara", SYSTEM_DAWG_PERM, unicharset_.size(), 0);
79 std::string filename = TestDataNameToPath("ara.wordlist");
80 if (!file_exists(filename.c_str())) {
81 LOG(INFO) << "Skip test because of missing " << filename;
82 GTEST_SKIP();
83 } else {
84 EXPECT_TRUE(trie.read_and_add_word_list(filename.c_str(), unicharset_,
86 EXPECT_EQ(0, trie.check_for_words(filename.c_str(), unicharset_, false));
87 }
88}
89
90TEST_F(TatweelTest, UnicharsetLoadKeepsTatweel) {
91 // This test verifies that a load of an existing unicharset keeps any
92 // existing tatweel for backwards compatibility.
93 std::string filename = TestDataNameToPath("ara.unicharset");
94 if (!file_exists(filename.c_str())) {
95 LOG(INFO) << "Skip test because of missing " << filename;
96 GTEST_SKIP();
97 } else {
98 EXPECT_TRUE(unicharset_.load_from_file(filename.c_str()));
99 int num_tatweel = 0;
100 for (int i = 0; i < unicharset_.size(); ++i) {
101 const char *utf8 = unicharset_.id_to_unichar(i);
102 if (strstr(utf8, reinterpret_cast<const char *>(u8"\u0640")) != nullptr) {
103 ++num_tatweel;
104 }
105 }
106 LOG(INFO) << "Num tatweels in unicharset=" << num_tatweel;
107 EXPECT_EQ(num_tatweel, 4);
108 }
109}
110
111} // namespace tesseract
@ LOG
@ INFO
Definition: log.h:28
#define GTEST_SKIP()
Definition: gtest.h:1889
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:2043
#define EXPECT_GT(val1, val2)
Definition: gtest.h:2053
#define EXPECT_TRUE(condition)
Definition: gtest.h:1982
#define CHECK_OK(test)
Definition: include_gunit.h:84
@ DAWG_TYPE_WORD
Definition: dawg.h:66
std::string TestDataNameToPath(const std::string &name)
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:244
TEST_F(EuroText, FastLatinOCR)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:654
int check_for_words(const char *filename, const UNICHARSET &unicharset, bool enable_wildcard) const
Definition: dawg.cpp:68
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:273
@ RRP_REVERSE_IF_HAS_RTL
Definition: trie.h:57
static int Defaults()
Definition: include_gunit.h:61
static std::string JoinPath(const std::string &s1, const std::string &s2)
Definition: include_gunit.h:65
static bool GetContents(const std::string &filename, std::string *out, int)
Definition: include_gunit.h:52
void SetUp() override
Definition: tatweel_test.cc:37
std::string TestDataNameToPath(const std::string &name)
Definition: tatweel_test.cc:62
const_iterator end() const
Definition: unicodetext.cc:412
UnicodeText & PointToUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:254
const_iterator begin() const
Definition: unicodetext.cc:408