tesseract v5.3.3.20231005
validate_grapheme_test.cc
Go to the documentation of this file.
1// (C) Copyright 2017, Google Inc.
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5// http://www.apache.org/licenses/LICENSE-2.0
6// Unless required by applicable law or agreed to in writing, software
7// distributed under the License is distributed on an "AS IS" BASIS,
8// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9// See the License for the specific language governing permissions and
10// limitations under the License.
11
12#include "include_gunit.h"
13#include "normstrngs.h"
14#include "normstrngs_test.h"
15
16namespace tesseract {
17
18TEST(ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) {
19 std::string str = "\u0c15\u0c3f\u0c15\u0c0e"; // KA - dep I - KA - ind E.
20 std::vector<std::string> glyphs;
22 GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
24 // It made 3 graphemes.
25 EXPECT_EQ(glyphs.size(), 3);
26 EXPECT_EQ(glyphs[0], std::string("\u0c15\u0c3f"));
27 EXPECT_EQ(glyphs[1], std::string("\u0c15"));
28 EXPECT_EQ(glyphs[2], std::string("\u0c0e"));
29}
30
31TEST(ValidateGraphemeTest, SingleConsonantOK) {
32 std::string str = "\u0cb9"; // HA
33 std::vector<std::string> glyphs;
35 GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
37 EXPECT_EQ(glyphs.size(), 1);
38 EXPECT_EQ(glyphs[0], str);
39}
40
41TEST(ValidateGraphemeTest, SimpleCV) {
42 std::string str = "\u0cb9\u0cbf"; // HA I
43 std::vector<std::string> glyphs;
45 GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
47 EXPECT_EQ(glyphs.size(), 1);
48 EXPECT_EQ(glyphs[0], str);
49}
50
51TEST(ValidateGraphemeTest, SubscriptConjunct) {
52 std::string str = "\u0cb9\u0ccd\u0c95\u0cbf"; // HA Virama KA I
53 std::vector<std::string> glyphs;
55 GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
57 EXPECT_EQ(glyphs.size(), 1);
58 EXPECT_EQ(glyphs[0], str);
60 GraphemeNormMode::kGlyphSplit, true, str.c_str(),
61 &glyphs))
63 EXPECT_EQ(glyphs.size(), 3);
64 EXPECT_EQ(glyphs[1], std::string("\u0ccd\u0c95"));
65}
66
67TEST(ValidateGraphemeTest, HalfFormJoiner) {
68 std::string str = "\u0d15\u0d4d\u200d\u0d24"; // KA Virama ZWJ Ta
69 std::vector<std::string> glyphs;
71 GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
73 EXPECT_EQ(glyphs.size(), 1);
74 EXPECT_EQ(glyphs[0], str);
76 GraphemeNormMode::kGlyphSplit, true, str.c_str(),
77 &glyphs))
79 EXPECT_EQ(glyphs.size(), 2) << PrintStringVectorWithUnicodes(glyphs);
80 EXPECT_EQ(glyphs[0], std::string("\u0d15\u0d4d\u200d"));
81}
82
83TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) {
84 std::string str = "\u0d15\u200d\u0d4d\u0d24"; // KA ZWI Virama Ta
85 std::vector<std::string> glyphs;
87 GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
89 EXPECT_EQ(glyphs.size(), 1);
90 EXPECT_EQ(glyphs[0], str);
92 GraphemeNormMode::kGlyphSplit, true, str.c_str(),
93 &glyphs))
95 EXPECT_EQ(glyphs.size(), 3);
96 EXPECT_EQ(glyphs[1], std::string("\u200d\u0d4d"));
97}
98
99TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {
100 std::string str = "\u0d15\u200c\u0d4d\u0d24"; // KA ZWNJ Virama Ta
101 std::vector<std::string> glyphs;
103 GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
105 EXPECT_EQ(glyphs.size(), 1);
106 EXPECT_EQ(glyphs[0], str);
108 GraphemeNormMode::kGlyphSplit, true, str.c_str(),
109 &glyphs))
111 EXPECT_EQ(glyphs.size(), 3);
112 EXPECT_EQ(glyphs[1], std::string("\u200c\u0d4d"));
113 // Malaylam only, so not allowed in Telugu.
114 str = "\u0c15\u200c\u0c4d\u0c24"; // KA ZWNJ Virama Ta
116 GraphemeNormMode::kCombined, true, str.c_str(),
117 &glyphs))
119}
120
121TEST(ValidateGraphemeTest, ExplicitViramaNonJoiner) {
122 std::string str = "\u0d15\u0d4d\u200c\u0d24"; // KA Virama ZWNJ Ta
123 std::vector<std::string> glyphs;
125 GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
127 EXPECT_EQ(glyphs.size(), 2);
128 EXPECT_EQ(glyphs[1], std::string("\u0d24"));
130 GraphemeNormMode::kGlyphSplit, true, str.c_str(),
131 &glyphs))
133 EXPECT_EQ(glyphs.size(), 3);
134 EXPECT_EQ(glyphs[1], std::string("\u0d4d\u200c"));
135}
136
137TEST(ValidateGraphemeTest, ThaiGraphemes) {
138 // This is a single grapheme unless in glyph split mode
139 std::string str = "\u0e14\u0e38\u0e4a";
140 std::vector<std::string> glyphs;
142 GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
144 EXPECT_EQ(glyphs.size(), 1);
145 EXPECT_EQ(glyphs[0], str);
147 GraphemeNormMode::kGlyphSplit, true, str.c_str(),
148 &glyphs))
150 EXPECT_EQ(glyphs.size(), 3);
151 EXPECT_EQ(glyphs[0], std::string("\u0e14"));
152}
153
154TEST(ValidateGraphemeTest, NoLonelyJoinersQuote) {
155 std::string str = "'\u0d24\u0d23\u0d32\u0d4d'\u200d";
156 std::vector<std::string> glyphs;
157 // Returns true, but the joiner is gone.
159 GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
161 EXPECT_EQ(glyphs.size(), 5);
162 EXPECT_EQ(glyphs[0], std::string("'"));
163 EXPECT_EQ(glyphs[1], std::string("\u0d24"));
164 EXPECT_EQ(glyphs[2], std::string("\u0d23"));
165 EXPECT_EQ(glyphs[3], std::string("\u0d32\u0d4d\u200c"));
166 EXPECT_EQ(glyphs[4], std::string("'"));
167}
168
169} // namespace tesseract
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:2043
#define EXPECT_TRUE(condition)
Definition: gtest.h:1982
#define EXPECT_FALSE(condition)
Definition: gtest.h:1986
std::string PrintString32WithUnicodes(const std::string &str)
std::string PrintStringVectorWithUnicodes(const std::vector< std::string > &glyphs)
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:179
TEST(TesseractInstanceTest, TestMultipleTessInstances)