tesseract v5.3.3.20231005
normstrngs_test.cc
Go to the documentation of this file.
1// (C) Copyright 2017, Google Inc.
2// Licensed under the Apache License, Version 2.0 (the "License");
3// you may not use this file except in compliance with the License.
4// You may obtain a copy of the License at
5// http://www.apache.org/licenses/LICENSE-2.0
6// Unless required by applicable law or agreed to in writing, software
7// distributed under the License is distributed on an "AS IS" BASIS,
8// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9// See the License for the specific language governing permissions and
10// limitations under the License.
11
12#include "normstrngs.h"
13#include <tesseract/unichar.h>
14#include "include_gunit.h"
15#include "normstrngs_test.h"
16#ifdef INCLUDE_TENSORFLOW
17# include "util/utf8/unilib.h" // for UniLib
18#endif
19
20#include "include_gunit.h"
21
22namespace tesseract {
23
24#if defined(MISSING_CODE)
25static std::string EncodeAsUTF8(const char32 ch32) {
26 UNICHAR uni_ch(ch32);
27 return std::string(uni_ch.utf8(), uni_ch.utf8_len());
28}
29#endif
30
31TEST(NormstrngsTest, BasicText) {
32 const char *kBasicText = "AbCd Ef";
33 std::string result;
35 GraphemeNorm::kNormalize, kBasicText, &result));
36 EXPECT_STREQ(kBasicText, result.c_str());
37}
38
39TEST(NormstrngsTest, LigatureText) {
40 const char *kTwoByteLigText = "ij"; // U+0133 (ij) -> ij
41 std::string result;
43 GraphemeNorm::kNormalize, kTwoByteLigText, &result));
44 EXPECT_STREQ("ij", result.c_str());
45
46 const char *kThreeByteLigText = "finds"; // U+FB01 (fi) -> fi
48 GraphemeNorm::kNormalize, kThreeByteLigText, &result));
49 EXPECT_STREQ("finds", result.c_str());
50}
51
52TEST(NormstrngsTest, OcrSpecificNormalization) {
53 const char *kSingleQuoteText = "‘Hi"; // U+2018 (‘) -> U+027 (')
54 std::string result;
56 GraphemeNorm::kNormalize, kSingleQuoteText, &result));
57 EXPECT_STREQ("'Hi", result.c_str());
58
59 const char *kDoubleQuoteText = "“Hi"; // U+201C (“) -> U+022 (")
61 GraphemeNorm::kNormalize, kDoubleQuoteText, &result));
62 EXPECT_STREQ("\"Hi", result.c_str());
63
64 const char *kEmDash = "Hi—"; // U+2014 (—) -> U+02D (-)
66 GraphemeNorm::kNormalize, kEmDash, &result));
67 EXPECT_STREQ("Hi-", result.c_str());
68 // Without the ocr normalization, these changes are not made.
70 kSingleQuoteText, &result));
71 EXPECT_STREQ(kSingleQuoteText, result.c_str());
73 kDoubleQuoteText, &result));
74 EXPECT_STREQ(kDoubleQuoteText, result.c_str());
76 kEmDash, &result));
77 EXPECT_STREQ(kEmDash, result.c_str());
78}
79
80// Sample text used in tests.
// English, Hindi and Korean samples; DetectsCorrectText expects each of them
// to be returned unchanged by normalization.
81const char kEngText[] = "the quick brown fox jumps over the lazy dog";
82const char kHinText[] = "पिताने विवाह की | हो गई उद्विग्न वह सोचा";
83const char kKorText[] = "이는 것으로";
84// Hindi words containing illegal vowel sequences.
// Iterated by DetectsIncorrectText.
85const char *kBadlyFormedHinWords[] = {"उपयोक्ताो", "नहीें", "प्रंात", "कहीअे", "पत्रिाका", "छह्णाीस"};
86// Thai illegal sequences.
// Iterated by DetectsIncorrectText.
87const char *kBadlyFormedThaiWords[] = {"ฤิ", "กา้ํ", "กิำ", "นำ้", "เเก"};
88
89TEST(NormstrngsTest, DetectsCorrectText) {
90 std::string chars;
92 kEngText, &chars));
93 EXPECT_STREQ(kEngText, chars.c_str());
94
96 kHinText, &chars))
97 << "Incorrect text: '" << kHinText << "'";
98 EXPECT_STREQ(kHinText, chars.c_str());
99
101 kKorText, &chars));
102 EXPECT_STREQ(kKorText, chars.c_str());
103}
104
105TEST(NormstrngsTest, DetectsIncorrectText) {
106 for (auto &kBadlyFormedHinWord : kBadlyFormedHinWords) {
108 GraphemeNorm::kNormalize, kBadlyFormedHinWord, nullptr))
109 << kBadlyFormedHinWord;
110 }
111 for (auto &kBadlyFormedThaiWord : kBadlyFormedThaiWords) {
113 GraphemeNorm::kNormalize, kBadlyFormedThaiWord, nullptr))
114 << kBadlyFormedThaiWord;
115 }
116}
117
118TEST(NormstrngsTest, NonIndicTextDoesntBreakIndicRules) {
119 std::string nonindic = "Here's some latin text.";
120 std::string dest;
122 nonindic.c_str(), &dest))
123 << PrintString32WithUnicodes(nonindic);
124 EXPECT_EQ(dest, nonindic);
125}
126
127TEST(NormstrngsTest, NoLonelyJoiners) {
128 std::string str = "x\u200d\u0d06\u0d34\u0d02";
129 std::vector<std::string> glyphs;
130 // Returns true, but the joiner is gone.
132 GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
134 EXPECT_EQ(glyphs.size(), 3);
135 EXPECT_EQ(glyphs[0], std::string("x"));
136 EXPECT_EQ(glyphs[1], std::string("\u0d06"));
137 EXPECT_EQ(glyphs[2], std::string("\u0d34\u0d02"));
138}
139
140TEST(NormstrngsTest, NoLonelyJoinersPlus) {
141 std::string str = "\u0d2a\u200d+\u0d2a\u0d4b";
142 std::vector<std::string> glyphs;
143 // Returns true, but the joiner is gone.
145 GraphemeNormMode::kCombined, true, str.c_str(), &glyphs))
147 EXPECT_EQ(glyphs.size(), 3);
148 EXPECT_EQ(glyphs[0], std::string("\u0d2a"));
149 EXPECT_EQ(glyphs[1], std::string("+"));
150 EXPECT_EQ(glyphs[2], std::string("\u0d2a\u0d4b"));
151}
152
153TEST(NormstrngsTest, NoLonelyJoinersNonAlpha) {
154 std::string str = "\u200d+\u200c\u200d";
155 // Returns true, but the joiners are gone.
156 ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 1, 1, 1, std::string("+"));
157 str = "\u200d\u200c\u200d";
158 // Without the plus, the string is invalid.
159 std::string result;
161 str.c_str(), &result))
162 << PrintString32WithUnicodes(result);
163}
164
// Input: three Arabic letters (U+0628) separated by U+200C and U+200D.
// NOTE(review): listing line 168, the only statement that checks anything, is
// missing from this extract, so the test body currently verifies nothing.
// Presumably it was an ExpectGraphemeModeResults(...) call expecting the
// string back unchanged -- restore it from the upstream file.
165TEST(NormstrngsTest, JoinersStayInArabic) {
166 std::string str = "\u0628\u200c\u0628\u200d\u0628";
167 // Returns true, string untouched.
169}
170
// Input: the single code point U+0CEA.
// NOTE(review): listing line 173, the only statement that checks anything, is
// missing from this extract, so the test body currently verifies nothing.
// Presumably it was an ExpectGraphemeModeResults(...) call -- restore it from
// the upstream file.
171TEST(NormstrngsTest, DigitOK) {
172 std::string str = "\u0cea"; // Digit 4.
174}
175
// Inputs: U+0964 and then U+0965, each on its own.
// NOTE(review): listing lines 178 and 180, the statements that check each
// input, are missing from this extract, so the test body currently verifies
// nothing. Presumably they were ExpectGraphemeModeResults(...) calls --
// restore them from the upstream file.
176TEST(NormstrngsTest, DandaOK) {
177 std::string str = "\u0964"; // Single danda.
179 str = "\u0965"; // Double danda.
181}
182
183TEST(NormstrngsTest, AllScriptsRegtest) {
184 // Tests some valid text in a large number of scripts, some of which were
185 // found to be rejected by an earlier version.
186 const std::vector<std::pair<std::string, std::string>> kScriptText(
187 {{"Arabic",
188 " فكان منهم علقمة بن قيس ، و إبراهيم النخعي ، و الأسود بن"
189 "توفي بالمدينة في هذه السنة وهو ابن مائة وعشرين سنة "
190 "مجموعه هیچ اثری در فنون هنر و ادب و ترجمه، تقدیم پیشگاه ارجمند "
191 "سازنده تاریخ نگاه میکرد و به اصطلاح انسان و فطرت انسانی را زیربنای"},
192 {"Armenian",
193 "անտիկ աշխարհի փիլիսոփաների կենսագրությունը, թե′ նրանց ուս-"
194 "պատրաստւում է դալ (բուլամա): Կովկասում կաթից նաև պատ-"
195 "Հոգաբարձութեան յղել այդ անձին յիմարութիւնը հաստա-"
196 "գծերը եւ միջագծերը կը համրուին վարէն վեր:"},
197 {"Bengali",
198 "এসে দাঁড়ায় দাও খানি উঁচিয়ে নিয়ে । ঝরনার স্বচ্ছ জলে প্রতিবিম্বিত "
199 "পাঠিয়ে, গোবিন্দ স্মরণ করে, নির্ভয়ে রওনা হয়েছিল। তাতে সে "
200 "সুলতার। মনে পড়ে বিয়ের সময় বাবা এদের বাড়ি থেকে ঘুরে "
201 "কিন্তু তারপর মাতৃহৃদয় কেমন করে আছে? কী"},
202 {"Cyrillic",
203 "достей, є ще нагороди й почесті, є хай і сумнівна, але слава, "
204 "вып., 96б). Параўн. найсвятший у 1 знач., насвятейший у 1 знач., "
205 "»Правді«, — гітлерівські окупанти винищували нижчі раси, після дру- "
206 "І знов майдан зачорнів од народу. Всередині чоло-"},
207 {"Devanagari",
208 "डा॰ नै हात्तीमाथि चढेर त्यो भएनेर आइपुगे। राजालाई देखी "
209 "बाबतीत लिहिणे ही एक मोठीच जबाबदारी आहे. काकासाहेबांच्या कार्याचा "
210 "प्रबंध, आधोगिक प्रबंध तथा बैंकिंग एवम वाणिज्य आदि विषयों में "
211 "चित्रकृती दिल्या. शंभराहून अधिक देश आज आपापले चित्रपट निर्माण करीत"},
212 {"Greek",
213 "Μέσα ένα τετράδιο είχα στριμώξει το πρώτο "
214 "νον αξίως τού ευαγγελίου τού χριστού πολιτεύεσθε, ίνα "
215 "οὐδεμία ὑπ' αὐτοῦ μνεία γίνεται τῶν οἰκείων χωρίων. "
216 "είτα την φάσιν αυτήν ην ούτος εποιήσατο κατά του Μίκω-"},
217 {"Gujarati",
218 "ઉપહારગૃહે ને નાટ્યસ્થળે આ એ જ તેલ કડકડતું "
219 "શકી. ભાવવધારો અટકાવી નથી શકી અને બેકારીને "
220 "ત્યાં વાંકુથી પાછે આવ્યો, ચોરીનો માલ સોંપવા ! "
221 "કહી. એણે રેશમના કપડામાં વીંટી રાખેલ કુંવરીની છબી"},
222 {"Gurmukhi",
223 "ਯਾਦ ਰਹੇ ਕਿ ‘ਨਫਰਤ ’ ਦਾ ਵਿਸ਼ਾ ਕ੍ਰਾਤੀ ਨਹੀ ਹੈ ਅਤੇ ਕਵੀ ਦੀ ਇਹ "
224 "ਮਹਾਂ ਨੰਦਾ ਕੋਲ ਇਕ ਚੀਜ਼ ਸੀ ਉਹ ਸੀ ਸਚ, ਕੋਰਾ ਸਚ, ਬੇਧਤ੍ਰਕ ਕਹਿੳ "
225 "ਭੂਰਾ ਸਾਨੂੰ ਥੜਾ ਚੰਗਾ ਲਗਦਾ ਸੀ । ਉਸ ਦਾ ਇਕ ਪੈਰ ਜਨਮ ਤੋ "
226 "ਨੂੰ ਇਹ ਅਧਿਕਾਰ ਦਿੱਤਾ ਕਿ ਉਹ ਸਿੱਖ ਵਿਰੋਧ ਦਾ ਸੰਗਠਨ ਕਰੇ ਅਤੇ 3 ਸਤੰਬਰ,"},
227 {"Hangul",
228 "로 들어갔다. 이대통령은 아이젠하워 대통령의 뒷모습을 보면서 "
229 "그것뿐인 줄 아요? 노름도 했다 캅니다. 빌어묵을 놈이 그러 "
230 "의 가장 과학적 태도이며, 우리 역사를 가장 정확하게 학습할 수 있는 "
231 "마르크스 레"
232 "각하는 그는 그들의 식사보장을 위해 때때로 집에"},
233 {"HanS",
234 "大凡世界上的先生可 分 三 种: 第一种只会教书, 只会拿一 "
235 "书像是探宝一样,在茶叶店里我买过西湖龙井﹑黄山毛峰﹑福建的铁观音﹑大红"
236 " "
237 "持 “左” 倾冒险主义的干部,便扣上 “富农 "
238 "笑说:“我听说了,王总工程师也跟我说过了,只是工作忙,谁"},
239 {"HanT",
240 "叁、 銀行資產管理的群組分析模式 "
241 "民國六十三年,申請就讀台灣大學歷史研究所,並從事著述,"
242 "質言之﹐在社會結構中﹐性質﹑特徵﹑地位相類似的一羣人﹐由於 "
243 "董橋,一九四二年生,福建晉江人,國立成功大學外"},
244 {"Hebrew",
245 " אֵ-לִי, אֵ-לִי, כֵּיַצד מְטַפְּסִים בְּקִירוֹת שֶׁל זְכוּכִי"
246 " הראשון חוצה אותי שוב. אני בסיבוב הרביעי, הוא בטח מתחיל את"
247 " ווערטער געהאט, אבער דער עיקר איז ניט דאָס וואָרט, נאָר"
248 " על גחלת היהדות המקורית בעירך, נתת צביון ואופי מיוחד"},
249 {"Japanese",
250 "は異民族とみなされていた。楚の荘王(前613〜前 "
251 "を詳細に吟味する。実際の治療活動の領域は便宜上、(1) 障害者 "
252 "困難性は多角企業の場合原則として部門別に判断されている.). "
253 "☆ご希望の団体には見本をお送りします"},
254 {"Kannada",
255 "ಕೂಡ ಯುದ್ಧ ಮಾಡಿ ಜಯಪಡೆ. ನಂತರ ನಗರದೊಳಕ್ಕೆ ನಡೆ ಇದನ್ನು "
256 "ಅಸಹ್ಯದೃಶ್ಯ ಯಾರಿಗಾದರೂ ನಾಚಿಕೆತರುವಂತಹದಾಗಿದೆ. ಆರೋಗ್ಯ ದೃಷ್ಟಿ "
257 "ಯಾಗಲಿ, ಮೋಹನನಾಗಲಿ ಇಂಥ ಬಿಸಿಲಿನಲ್ಲಿ ಎಂದೂ ಬಹಳ ಹೊತ್ತು "
258 "\"ಇದೆ...ಖಂಡಿತಾ ಇದೆ\" ಅಂದ ಮನಸ್ಸಿನಲ್ಲಿಯೇ ವಂದಿಸುತ್ತಾ,"},
259 {"Khmer",
260 "សិតសក់និងផ្លាស់សម្លៀកបំពាក់ពេលយប់ចេញ។ "
261 "និយាយអំពីនគរនេះ ប្រាប់ដល់លោកទាំងមូលឲ្យដឹងច្បាស់លាស់អំពី "
262 "កន្លះកាថាសម្រាប់ទន្ទេញឲ្យងាយចាំ បោះពុម្ពនៅក្នុងទ្រង់ទ្រាយបច្ចុប្បន្ន "
263 "ឯកសារនេះបានផ្សព្វផ្សាយនៅក្នុងសន្និសីទ"},
264 {"Lao",
265 "ເອີຍ ! ຟັງສຽງຟ້າມັນຮ້ອງຮ່ວນ ມັນດັງໄກໆ ເອີຍ "
266 "ໄດລຽງດູລາວມາດວບຄວາມລາບາກຫລາຍ; "
267 "ບາງໄດ້ ເຈົ້າລອງສູ້ບໍ່ໄດ້ຈຶ່ງຫນີລົງມາວຽງຈັນ. "
268 "ລົບອອກຈາກ 3 ເຫລືອ 1, ຂ້ອຍຂຽນ 1 (1)"},
269 {"Latin",
270 "režisoru, palīdzēja to manu domīgo, kluso Dzejas metru ielikt "
271 "Ešte nedávno sa chcel mladý Novomeský „liečiť” "
272 "tiivisia kysymyksiä, mistä seuraa, että spekula- | don luonteesta "
273 "Grabiel Sanchez, yang bertani selama 120 tahun meninggal"},
274 {"Malayalam",
275 "അമൂർത്തചിത്രമായിരിക്കും. ഛേ! ആ വീട്ടിലേക്ക് അവളൊന്നിച്ച് പോകേണ്ടതാ "
276 "മൃഗങ്ങൾക്ക് എന്തെക്കിലും പറ്റിയാൽ മാത്രം ഞാനതു "
277 "വെലക്ക് വേണമെങ്കിൽ തരാം. എന്തോ തരും? പറ. "
278 "എല്ലാം കഴിഞ്ഞ് സീനിയറിന്റെ അടുത്തു ചെന്ന് കാൽതൊട്ട"},
279 {"Tamil",
280 "பொருத்தமாகப் பாடினாள் நம் ஔவைப் பாட்டி. காவிரி "
281 "உள்ளடக்கி நிற்பது விநோத வார்த்தையின் அஃறிணை "
282 "சூரிய கிரஹண சமயத்தில் குருக்ஷேத்திரம் செல்வது "
283 "காலங்களில் வெளியே போகும்பொழுது, 'ஸார்', 'ஸார்',"},
284 {"Telugu",
285 "1892లో ఆమె 10వ సంవత్సరంలో గుంటూరు తాలూకా వేములాపాడు "
286 "ఫండ్స్ చట్టము'నందు చేయబడెను. తరువాత క్రీ. శ. "
287 "సంచారము చేయును. మీరు ఇప్పుడే కాళకాలయమునకు "
288 "ఎంతటి సరళమైన భాషలో వ్రాశాడో విశదమవుతుంది. పైగా ఆనాటి భాష"},
289 {"Thai",
290 "อ้อ! กับนัง....แม่ยอดพระกลิ่น นั่นเอง ! หรับก็ย่อมจะรู้โดยชัดเจนว่า "
291 "ถ้าตราบใดยังมีเรือปืนอยู่ใกล้ ๆ แล้ว ตราบนั้น "
292 "พระดำรินี้ ที่มีคตีทำกรวยหมากและธูปเทียน "
293 "อันยานมีเรือเปนต้นฃ้ามยาก ฯ เพราะว่าแม่น้ำนั่นมีน้ำใสยิ่ง แม้เพียง"},
294 {"Vietnamese",
295 "vợ đến tai mụ hung thần Xăng-tô- mê-a. Mụ vô cùng "
296 "chiếc xe con gấu chạy qua nhà. Nhưng thỉnh thoảng "
297 "hòa hoãn với người Pháp để cho họ được dựng một ngôi nhà thờ nhỏ bằng "
298 "Cặp câu đói súc tích mà sâu sắc, là lời chúc lời"}});
299
300 for (const auto &p : kScriptText) {
301 std::string normalized;
304 tesseract::GraphemeNorm::kNormalize, p.second.c_str(), &normalized))
305 << "Script=" << p.first << " text=" << p.second;
306 }
307}
308
309TEST(NormstrngsTest, IsWhitespace) {
310 // U+0020 is whitespace
315 // U+2000 through U+200A
316 for (char32 ch = 0x2000; ch <= 0x200A; ++ch) {
317 char text[80];
318 snprintf(text, sizeof(text), "Failed at U+%x", ch);
319 SCOPED_TRACE(text);
321 }
322 // U+3000 is whitespace
323 EXPECT_TRUE(IsWhitespace(0x3000));
324 // ZWNBSP is not considered a space.
325 EXPECT_FALSE(IsWhitespace(0xFEFF));
326}
327
328TEST(NormstrngsTest, SpanUTF8Whitespace) {
329 EXPECT_EQ(4, SpanUTF8Whitespace(" \t\r\n"));
330 EXPECT_EQ(4, SpanUTF8Whitespace(" \t\r\nabc"));
331 EXPECT_EQ(0, SpanUTF8Whitespace("abc \t\r\nabc"));
333}
334
335TEST(NormstrngsTest, SpanUTF8NotWhitespace) {
336 const char kHinText[] = "पिताने विवाह";
337 const char kKorText[] = "이는 것으로 다시 넣을";
338 const char kMixedText[] = "والفكر 123 والصراع abc";
339
342 EXPECT_EQ(0, SpanUTF8NotWhitespace("\rabc"));
343 EXPECT_EQ(0, SpanUTF8NotWhitespace("\tabc"));
344 EXPECT_EQ(0, SpanUTF8NotWhitespace("\nabc"));
345 EXPECT_EQ(3, SpanUTF8NotWhitespace("abc def"));
349}
350
351// Test that the method clones the util/utf8/unilib definition of
352// interchange validity.
353TEST(NormstrngsTest, IsInterchangeValid) {
354#ifdef INCLUDE_TENSORFLOW
355 const int32_t kMinUnicodeValue = 33;
356 const int32_t kMaxUnicodeValue = 0x10FFFF;
357 for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
358 char text[80];
359 snprintf(text, sizeof(text), "Failed at U+%x", ch);
360 SCOPED_TRACE(text);
362 }
363#else
364 GTEST_SKIP();
365#endif
366}
367
368// Test that the method clones the util/utf8/unilib definition of
369// 7-bit ASCII interchange validity.
371#if defined(MISSING_CODE) && defined(INCLUDE_TENSORFLOW)
372 const int32_t kMinUnicodeValue = 33;
373 const int32_t kMaxUnicodeValue = 0x10FFFF;
374 for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
375 char text[80];
376 snprintf(text, sizeof(text), "Failed at U+%x", ch);
377 SCOPED_TRACE(text);
378 std::string str = EncodeAsUTF8(ch);
380 }
381#else
382 // Skipped because of missing UniLib::IsInterchangeValid7BitAscii.
383 GTEST_SKIP();
384#endif
385}
386
387// Test that the method clones the util/utf8/unilib definition of
388// fullwidth-halfwidth .
389TEST(NormstrngsTest, FullwidthToHalfwidth) {
390 // U+FF21 -> U+0041 (Latin capital letter A)
391 EXPECT_EQ('A', FullwidthToHalfwidth(0xFF21));
392 // U+FF05 -> U+0025 (percent sign)
393 EXPECT_EQ('%', FullwidthToHalfwidth(0xFF05));
394 // U+FFE6 -> U+20A9 (won sign)
395 EXPECT_EQ(0x20A9, FullwidthToHalfwidth(0xFFE6));
396
397#if defined(MISSING_CODE) && defined(INCLUDE_TENSORFLOW)
398 // Skipped because of missing UniLib::FullwidthToHalfwidth.
399 const int32_t kMinUnicodeValue = 33;
400 const int32_t kMaxUnicodeValue = 0x10FFFF;
401 for (int32_t ch = kMinUnicodeValue; ch <= kMaxUnicodeValue; ++ch) {
402 if (!IsValidCodepoint(ch))
403 continue;
404 char text[80];
405 snprintf(text, sizeof(text), "Failed at U+%x", ch);
406 SCOPED_TRACE(text);
407 std::string str = EncodeAsUTF8(ch);
408 const std::string expected_half_str =
409 UniLib::FullwidthToHalfwidth(str.c_str(), str.length(), true);
410 EXPECT_EQ(expected_half_str, EncodeAsUTF8(FullwidthToHalfwidth(ch)));
411 }
412#endif
413}
414
415} // namespace tesseract
signed int char32
const char * p
#define GTEST_SKIP()
Definition: gtest.h:1889
#define EXPECT_EQ(val1, val2)
Definition: gtest.h:2043
#define SCOPED_TRACE(message)
Definition: gtest.h:2281
#define EXPECT_TRUE(condition)
Definition: gtest.h:1982
#define EXPECT_STREQ(s1, s2)
Definition: gtest.h:2112
#define EXPECT_FALSE(condition)
Definition: gtest.h:1986
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:228
const char kMixedText[]
unsigned int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:237
const char kHinText[]
signed int char32
Definition: unichar.h:49
std::string PrintString32WithUnicodes(const std::string &str)
void ExpectGraphemeModeResults(const std::string &str, UnicodeNormMode u_mode, int unicode_count, int glyph_count, int grapheme_count, const std::string &target_str)
const char kEngText[]
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:261
const char kKorText[]
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:179
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:152
const char * kBadlyFormedHinWords[]
const char * kBadlyFormedThaiWords[]
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:276
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:282
unsigned int SpanUTF8NotWhitespace(const char *text)
Definition: normstrngs.cpp:249
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:223
TEST(TesseractInstanceTest, TestMultipleTessInstances)
dest
Definition: upload.py:409
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33