tesseract v5.3.3.20231005
pango_font_info_test.cc
Go to the documentation of this file.
1// (C) Copyright 2017, Google Inc.
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5// http://www.apache.org/licenses/LICENSE-2.0
6// Unless required by applicable law or agreed to in writing, software
7// distributed under the License is distributed on an "AS IS" BASIS,
8// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9// See the License for the specific language governing permissions and
10// limitations under the License.
11
12#include "pango_font_info.h"
13#include <pango/pango.h>
14#include <cstdio>
15#include <string>
16#include "commandlineflags.h"
17#include "fileio.h"
18#include "gmock/gmock-matchers.h" // for EXPECT_THAT
19#include "include_gunit.h"
20#ifdef INCLUDE_TENSORFLOW
21# include "util/utf8/unicodetext.h" // for UnicodeText
22#endif
23
24namespace tesseract {
25
// Names of the fonts expected in the testdata directory, in the order the
// tests compare them against FontUtils::ListAvailableFonts().
const char *kExpectedFontNames[] = {"Arab",
                                    "Arial Bold Italic",
                                    "DejaVu Sans Ultra-Light",
                                    "Lohit Hindi",
#if PANGO_VERSION <= 12005
                                    "Times New Roman",
#else
                                    "Times New Roman,", // Pango v1.36.2 requires a trailing ','
#endif
                                    "UnBatang",
                                    "Verdana"};
38
// Sample text used in tests: one UTF-8 string per script exercised below.
const char kArabicText[] = "والفكر والصراع 1234,\nوالفكر والصراع";
const char kEngText[] = "the quick brown fox jumps over the lazy dog";
const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा";
const char kKorText[] = "이는 것으로";
// Hindi words containing illegal vowel sequences. The list is terminated by a
// nullptr sentinel so callers can iterate without knowing its length.
const char *kBadlyFormedHinWords[] = {
#if PANGO_VERSION <= 12005
    "उपयोक्ताो", "नहीें", "कहीअे", "पत्रिाका", "छह्णाीस",
#endif
    // Pango v1.36.2 will render the above words even though they are invalid.
    "प्रंात", nullptr};
51
52static PangoFontMap *font_map;
53
55protected:
56 void SetUp() override {
57 if (!font_map) {
58 font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);
59 }
60 pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));
61 }
62
63 // Creates a fake fonts.conf file that points to the testdata fonts for
64 // fontconfig to initialize with.
65 static void SetUpTestCase() {
66 static std::locale system_locale("");
67 std::locale::global(system_locale);
68
69 FLAGS_fonts_dir = TESTING_DIR;
70 FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
73 }
74
76};
77
78TEST_F(PangoFontInfoTest, TestNonDefaultConstructor) {
79 PangoFontInfo font("Arial Bold Italic 12");
80 EXPECT_EQ(12, font.font_size());
81 EXPECT_EQ("Arial", font.family_name());
82}
83
84TEST_F(PangoFontInfoTest, DoesParseFontDescriptionName) {
85 EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Bold Italic 12"));
86 EXPECT_EQ(12, font_info_.font_size());
87 EXPECT_EQ("Arial", font_info_.family_name());
88
89 EXPECT_TRUE(font_info_.ParseFontDescriptionName("Verdana 10"));
90 EXPECT_EQ(10, font_info_.font_size());
91 EXPECT_EQ("Verdana", font_info_.family_name());
92
93 EXPECT_TRUE(font_info_.ParseFontDescriptionName("DejaVu Sans Ultra-Light"));
94 EXPECT_EQ("DejaVu Sans", font_info_.family_name());
95}
96
97TEST_F(PangoFontInfoTest, DoesParseMissingFonts) {
98 // Font family one of whose faces exists but this one doesn't.
99 EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Italic 12"));
100 EXPECT_EQ(12, font_info_.font_size());
101 EXPECT_EQ("Arial", font_info_.family_name());
102
103 // Font family that doesn't exist in testdata. It will still parse the
104 // description name. But without the file, it will not be able to populate
105 // some font family details, like is_monospace().
106 EXPECT_TRUE(font_info_.ParseFontDescriptionName("Georgia 10"));
107 EXPECT_EQ(10, font_info_.font_size());
108 EXPECT_EQ("Georgia", font_info_.family_name());
109}
110
111TEST_F(PangoFontInfoTest, DoesGetSpacingProperties) {
112 EXPECT_TRUE(font_info_.ParseFontDescriptionName("Arial Italic 12"));
113 int x_bearing, x_advance;
114 EXPECT_TRUE(font_info_.GetSpacingProperties("A", &x_bearing, &x_advance));
115 EXPECT_GT(x_advance, 0);
116 EXPECT_TRUE(font_info_.GetSpacingProperties("a", &x_bearing, &x_advance));
117 EXPECT_GT(x_advance, 0);
118}
119
120TEST_F(PangoFontInfoTest, CanRenderString) {
121 font_info_.ParseFontDescriptionName("Verdana 12");
122 EXPECT_TRUE(font_info_.CanRenderString(kEngText, strlen(kEngText)));
123
124 font_info_.ParseFontDescriptionName("UnBatang 12");
125 EXPECT_TRUE(font_info_.CanRenderString(kKorText, strlen(kKorText)));
126
127 font_info_.ParseFontDescriptionName("Lohit Hindi 12");
128 EXPECT_TRUE(font_info_.CanRenderString(kHinText, strlen(kHinText)));
129}
130
131TEST_F(PangoFontInfoTest, CanRenderLigature) {
132 font_info_.ParseFontDescriptionName("Arab 12");
133 const char kArabicLigature[] = "لا";
134 EXPECT_TRUE(font_info_.CanRenderString(kArabicLigature, strlen(kArabicLigature)));
135
136 printf("Next word\n");
137 EXPECT_TRUE(font_info_.CanRenderString(kArabicText, strlen(kArabicText)));
138}
139
140TEST_F(PangoFontInfoTest, CannotRenderUncoveredString) {
141 font_info_.ParseFontDescriptionName("Verdana 12");
142 EXPECT_FALSE(font_info_.CanRenderString(kKorText, strlen(kKorText)));
143}
144
145TEST_F(PangoFontInfoTest, CannotRenderInvalidString) {
146 font_info_.ParseFontDescriptionName("Lohit Hindi 12");
147 for (int i = 0; kBadlyFormedHinWords[i] != nullptr; ++i) {
149 font_info_.CanRenderString(kBadlyFormedHinWords[i], strlen(kBadlyFormedHinWords[i])))
150 << "Can render " << kBadlyFormedHinWords[i];
151 }
152}
153
154TEST_F(PangoFontInfoTest, CanDropUncoveredChars) {
155 font_info_.ParseFontDescriptionName("Verdana 12");
156 // Verdana cannot render the "ff" ligature
157 std::string word = "office";
158 EXPECT_EQ(1, font_info_.DropUncoveredChars(&word));
159 EXPECT_EQ("oice", word);
160
161 // Don't drop non-letter characters like word joiners.
162 const char *kJoiners[] = {
163 "\u2060", // U+2060 (WJ)
164 "\u200C", // U+200C (ZWJ)
165 "\u200D" // U+200D (ZWNJ)
166 };
167 for (auto &kJoiner : kJoiners) {
168 word = kJoiner;
169 EXPECT_EQ(0, font_info_.DropUncoveredChars(&word));
170 EXPECT_STREQ(kJoiner, word.c_str());
171 }
172}
173
174// ------------------------ FontUtils ------------------------------------
175
177protected:
178 void SetUp() override {
180 }
181 // Creates a fake fonts.conf file that points to the testdata fonts for
182 // fontconfig to initialize with.
183 static void SetUpTestCase() {
184 FLAGS_fonts_dir = TESTING_DIR;
185 FLAGS_fontconfig_tmpdir = FLAGS_test_tmpdir;
186 if (!font_map) {
187 font_map = pango_cairo_font_map_new_for_font_type(CAIRO_FONT_TYPE_FT);
188 }
189 pango_cairo_font_map_set_default(PANGO_CAIRO_FONT_MAP(font_map));
190 }
191
192#ifdef INCLUDE_TENSORFLOW
193 void CountUnicodeChars(const char *utf8_text, std::unordered_map<char32, int64_t> *ch_map) {
194 ch_map->clear();
195 UnicodeText ut;
196 ut.PointToUTF8(utf8_text, strlen(utf8_text));
197 for (UnicodeText::const_iterator it = ut.begin(); it != ut.end(); ++it) {
198# if 0
199 if (UnicodeProps::IsWhitespace(*it)) continue;
200# else
201 if (std::isspace(*it))
202 continue;
203# endif
204 ++(*ch_map)[*it];
205 }
206 }
207#endif
208};
209
210TEST_F(FontUtilsTest, DoesFindAvailableFonts) {
211 EXPECT_TRUE(FontUtils::IsAvailableFont("Arial Bold Italic"));
213 EXPECT_TRUE(FontUtils::IsAvailableFont("DejaVu Sans Ultra-Light"));
214
215 // Test that we can support font name convention for Pango v1.30.2 even when
216 // we are running an older version.
217 EXPECT_TRUE(FontUtils::IsAvailableFont("Times New Roman,"));
218}
219
220TEST_F(FontUtilsTest, DoesDetectMissingFonts) {
221 // Only bold italic face is available.
223 // Don't have a ttf for the Courier family.
225 // Pango "synthesizes" the italic font from the available Verdana Regular and
226 // includes it in its list, but it is not really loadable.
227 EXPECT_FALSE(FontUtils::IsAvailableFont("Verdana Italic"));
228 // We have "Dejavu Sans Ultra-Light" but not its medium weight counterpart.
230}
231
232TEST_F(FontUtilsTest, DoesListAvailableFonts) {
233 const std::vector<std::string> &fonts = FontUtils::ListAvailableFonts();
234 EXPECT_THAT(fonts, ::testing::ElementsAreArray(kExpectedFontNames));
235 for (auto &font : fonts) {
236 PangoFontInfo font_info;
237 EXPECT_TRUE(font_info.ParseFontDescriptionName(font));
238 }
239}
240
#ifdef INCLUDE_TENSORFLOW
// BestFonts must return a non-empty list and flag exactly the fonts able to
// render the characters of each sample text.
TEST_F(FontUtilsTest, DoesFindBestFonts) {
  std::unordered_map<char32, int64_t> ch_map;
  CountUnicodeChars(kEngText, &ch_map);
  EXPECT_EQ(26u, ch_map.size()); // 26 letters
  std::vector<std::pair<const char *, std::vector<bool>>> font_flags;
  std::string best_list = FontUtils::BestFonts(ch_map, &font_flags);
  EXPECT_FALSE(best_list.empty());
  // All fonts except Lohit Hindi should render English text.
  EXPECT_EQ(countof(kExpectedFontNames) - 1, font_flags.size());

  CountUnicodeChars(kKorText, &ch_map);
  best_list = FontUtils::BestFonts(ch_map, &font_flags);
  EXPECT_FALSE(best_list.empty());
  // Only UnBatang font family is able to render Korean.
  // Fatal assertion: the element access below is only valid when the vector
  // really holds an entry.
  ASSERT_EQ(1u, font_flags.size());
  EXPECT_STREQ("UnBatang", font_flags[0].first);
}
#endif
261
262TEST_F(FontUtilsTest, DoesSelectFont) {
263 const char *kLangText[] = {kArabicText, kEngText, kHinText, kKorText, nullptr};
264 const char *kLangNames[] = {"Arabic", "English", "Hindi", "Korean", nullptr};
265 for (int i = 0; kLangText[i] != nullptr; ++i) {
266 SCOPED_TRACE(kLangNames[i]);
267 std::vector<std::string> graphemes;
268 std::string selected_font;
270 FontUtils::SelectFont(kLangText[i], strlen(kLangText[i]), &selected_font, &graphemes));
271 EXPECT_TRUE(selected_font.size());
272 EXPECT_TRUE(graphemes.size());
273 }
274}
275
276TEST_F(FontUtilsTest, DoesFailToSelectFont) {
277 const char kMixedScriptText[] = "पिताने विवाह की | والفكر والصراع";
278 std::vector<std::string> graphemes;
279 std::string selected_font;
280 EXPECT_FALSE(FontUtils::SelectFont(kMixedScriptText, strlen(kMixedScriptText), &selected_font,
281 &graphemes));
282}
283
#if 0
// Needs fix. FontUtils::GetAllRenderableCharacters was removed
// because of deprecated pango_coverage_max.
// The test stays disabled; it is kept so it can be revived if an equivalent
// API is reintroduced.
TEST_F(FontUtilsTest, GetAllRenderableCharacters) {
  const int32_t kHindiChar = 0x0905;
  const int32_t kArabicChar = 0x0623;
  const int32_t kMongolianChar = 0x180E; // Mongolian vowel separator
  const int32_t kOghamChar = 0x1680;     // Ogham space mark
  std::vector<bool> unicode_mask;
  FontUtils::GetAllRenderableCharacters(&unicode_mask);
  EXPECT_TRUE(unicode_mask['A']);
  EXPECT_TRUE(unicode_mask['1']);
  EXPECT_TRUE(unicode_mask[kHindiChar]);
  EXPECT_TRUE(unicode_mask[kArabicChar]);
  EXPECT_FALSE(unicode_mask[kMongolianChar]); // no font for mongolian.
#  if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
  EXPECT_FALSE(unicode_mask[kOghamChar]);     // no font for ogham.
#  endif
  unicode_mask.clear();

  std::vector<std::string> selected_fonts;
  selected_fonts.push_back("Lohit Hindi");
  FontUtils::GetAllRenderableCharacters(selected_fonts, &unicode_mask);
  EXPECT_TRUE(unicode_mask['1']);
  EXPECT_TRUE(unicode_mask[kHindiChar]);
  EXPECT_FALSE(unicode_mask['A']);            // Lohit doesn't render English,
  EXPECT_FALSE(unicode_mask[kArabicChar]);    // or Arabic,
  EXPECT_FALSE(unicode_mask[kMongolianChar]); // or Mongolian,
  EXPECT_FALSE(unicode_mask[kOghamChar]);     // or Ogham.
  unicode_mask.clear();

  // Check that none of the included fonts cover the Mongolian or Ogham space
  // characters.
  for (const char *font_name : kExpectedFontNames) {
    // Build the trace through std::string: adding two const char* operands
    // with '+' is not valid C++.
    const std::string tracestring = std::string("Testing ") + font_name;
    SCOPED_TRACE(tracestring);
    FontUtils::GetAllRenderableCharacters(font_name, &unicode_mask);
#  if 0 // TODO: check fails because DejaVu Sans Ultra-Light supports ogham
    EXPECT_FALSE(unicode_mask[kOghamChar]);
#  endif
    EXPECT_FALSE(unicode_mask[kMongolianChar]);
    unicode_mask.clear();
  }
}
#endif
329
330} // namespace tesseract
#define EXPECT_THAT(value, matcher)
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:2043
#define SCOPED_TRACE(message)
Definition: gtest.h:2281
#define EXPECT_GT(val1, val2)
Definition: gtest.h:2053
#define EXPECT_TRUE(condition)
Definition: gtest.h:1982
#define EXPECT_STREQ(s1, s2)
Definition: gtest.h:2112
#define EXPECT_FALSE(condition)
Definition: gtest.h:1986
const char kArabicText[]
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:228
const char kHinText[]
const char kEngText[]
const char kKorText[]
constexpr size_t countof(T const (&)[N]) noexcept
Definition: serialis.h:34
const char * kBadlyFormedHinWords[]
const char * kExpectedFontNames[]
TEST_F(EuroText, FastLatinOCR)
bool ParseFontDescriptionName(const std::string &name)
const std::string & family_name() const
static std::string BestFonts(const std::unordered_map< char32, int64_t > &ch_map, std::vector< std::pair< const char *, std::vector< bool > > > *font_flag)
static bool SelectFont(const char *utf8_word, const int utf8_len, std::string *font_name, std::vector< std::string > *graphemes)
static bool IsAvailableFont(const char *font_desc)
static const std::vector< std::string > & ListAvailableFonts()
static void MakeTmpdir()
Definition: include_gunit.h:38
const_iterator end() const
Definition: unicodetext.cc:412
UnicodeText & PointToUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:254
const_iterator begin() const
Definition: unicodetext.cc:408