24#include <unordered_map>
30#include "unicode/normalizer2.h"
31#include "unicode/translit.h"
32#include "unicode/uchar.h"
33#include "unicode/unorm2.h"
34#include "unicode/uscript.h"
// Tests ch against a fixed table of hyphen/dash code points.
// The table is declared with 13 entries; only the first seven ('-' and
// U+2010..U+2015) are visible in this excerpt.
// NOTE(review): the result of a match / no match is on lines not shown here
// (presumably returns true / false -- confirm against the full file).
38static bool is_hyphen_punc(
const char32 ch) {
39 static const int kNumHyphenPuncUnicodes = 13;
40 static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
41 '-', 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015,
// Linear scan over the table.
// NOTE(review): the loop variable is declared int while the table holds
// char32; confirm the two types are interchangeable in this project.
49 for (
int kHyphenPuncUnicode : kHyphenPuncUnicodes) {
50 if (kHyphenPuncUnicode ==
ch) {
// Tests ch against a fixed table of single-quote code points.
// The table is declared with 8 entries; its contents are not visible in this
// excerpt.
// NOTE(review): the result of a match / no match is on lines not shown here
// (presumably returns true / false -- confirm against the full file).
57static bool is_single_quote(
const char32 ch) {
58 static const int kNumSingleQuoteUnicodes = 8;
59 static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
// Linear scan over the table, comparing each entry with ch.
69 for (
int kSingleQuoteUnicode : kSingleQuoteUnicodes) {
70 if (kSingleQuoteUnicode ==
ch) {
// Tests ch against a fixed table of double-quote code points.
// The table is declared with 8 entries; its contents are not visible in this
// excerpt.
// NOTE(review): the result of a match / no match is on lines not shown here
// (presumably returns true / false -- confirm against the full file).
77static bool is_double_quote(
const char32 ch) {
78 static const int kNumDoubleQuoteUnicodes = 8;
79 static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
// Linear scan over the table, comparing each entry with ch.
90 for (
int kDoubleQuoteUnicode : kDoubleQuoteUnicodes) {
91 if (kDoubleQuoteUnicode ==
ch) {
// Tail of a function signature whose beginning is not visible in this excerpt.
// Later call sites in this file invoke it as
// NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32).
// Visible behaviour: decodes str8 as UTF-8 into an ICU string, normalises it
// with an ICU Normalizer2, and appends the resulting code points to *normed32.
101 std::vector<char32> *normed32) {
// Build a UTF-16 ICU string from the input bytes, interpreted as UTF-8.
103 icu::UnicodeString uch_str(str8,
"UTF-8");
104 IcuErrorCode error_code;
// The normalisation form name and the compose/decompose mode are chosen on
// lines that are not visible here.
106 const char *norm_type =
// getInstance returns a shared normalizer instance (not owned by the caller,
// per the ICU Normalizer2 documentation).
112 const icu::Normalizer2 *normalizer =
113 icu::Normalizer2::getInstance(
nullptr, norm_type, compose, error_code);
114 error_code.assertSuccess();
116 icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code);
117 error_code.assertSuccess();
// length() counts UTF-16 code units, so this is an upper bound on the number
// of code points pushed below.
119 normed32->reserve(norm_str.length());
// Walk the string one code point at a time: moveIndex32 advances past a whole
// surrogate pair, and char32At reads the full code point at the offset.
120 for (
int offset = 0; offset < norm_str.length(); offset = norm_str.moveIndex32(offset, 1)) {
121 char32 ch = norm_str.char32At(offset);
// Lines between the read and the push are not visible; ch may be modified
// there (the caller passes an ocr_normalize option -- confirm).
129 normed32->push_back(
ch);
// Rewrites *str32 in place.
// NOTE(review): only the compaction statement is visible in this excerpt; the
// test deciding which characters are kept is on lines not shown (the name
// suggests zero-width joiner marks are dropped -- confirm).
134static void StripJoiners(std::vector<char32> *str32) {
// Copy the current character to the write position and advance it.
143 (*str32)[len++] =
ch;
// Tail of a function signature whose beginning is not visible in this excerpt
// (presumably NormalizeUTF8String, the only listed function with a
// std::string *normalized parameter -- confirm).
// Visible pipeline: normalise str8 to UTF-32, strip joiners, then split the
// result into graphemes via a call whose name is not visible here.
154 std::string *normalized) {
155 std::vector<char32> normed32;
156 NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
158 StripJoiners(&normed32);
159 std::vector<std::vector<char32>> graphemes;
161 normed32, &graphemes);
// No graphemes, or an empty first grapheme, is handled as a separate case;
// the body of that branch is not visible.
162 if (graphemes.empty() || graphemes[0].empty()) {
164 }
// normalized is an optional output: it is only used when non-null.
else if (normalized !=
nullptr) {
169 if (normalized !=
nullptr) {
// Tail of a function signature whose beginning is not visible in this excerpt
// (presumably NormalizeCleanAndSegmentUTF8, the only listed function with a
// std::vector<std::string> *graphemes parameter -- confirm).
// Visible pipeline: normalise str8 to UTF-32, strip joiners, segment into
// UTF-32 graphemes (call not visible), then fill *graphemes.
181 std::vector<std::string> *graphemes) {
182 std::vector<char32> normed32;
183 NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
184 StripJoiners(&normed32);
185 std::vector<std::vector<char32>> graphemes32;
// Re-concatenate the segmented graphemes so the result can be compared with
// the unsegmented input.
190 std::vector<char32> cleaned32;
191 for (
const auto &g : graphemes32) {
192 cleaned32.insert(cleaned32.end(), g.begin(), g.end());
// A difference means segmentation changed the text; the handling of that case
// is on lines not shown.
194 if (cleaned32 != normed32) {
// One output entry is expected per UTF-32 grapheme; the loop body that
// produces each entry is not visible.
200 graphemes->reserve(graphemes32.size());
201 for (
const auto &grapheme : graphemes32) {
// Fragment of a function whose signature is not visible in this excerpt
// (presumably OCRNormalize(char32 ch) -- confirm).
// Classifies ch by testing, in order, hyphen punctuation, single quote, then
// double quote; what each branch produces is on lines not shown.
209 if (is_hyphen_punc(
ch)) {
211 }
else if (is_single_quote(
ch)) {
213 }
else if (is_double_quote(
ch)) {
// Return expression of a function whose signature is not visible in this
// excerpt (presumably IsValidCodepoint(const char32 ch) -- confirm).
// True for values below 0xD800 or in [0xE000, 0x10FFFF], i.e. everything up to
// 0x10FFFF except the range 0xD800..0xDFFF.
// The first comparison is done on the value converted to uint32_t, so a
// negative ch (if char32 is a signed type -- confirm) cannot pass it.
225 return (
static_cast<uint32_t
>(
ch) < 0xD800) || (
ch >= 0xE000 &&
ch <= 0x10FFFF);
// Return expression of a function whose signature is not visible in this
// excerpt (presumably IsWhitespace(const char32 ch) -- confirm).
// Delegates to ICU's u_isUWhiteSpace, which tests the Unicode White_Space
// property of the code point.
230 return u_isUWhiteSpace(
static_cast<UChar32
>(
ch));
244 n_white += it.utf8_len();
256 n_notwhite += it.utf8_len();
// Middle of a boolean expression; its beginning and end are not visible in
// this excerpt (presumably the return value of IsInterchangeValid -- confirm).
// The visible terms reject the last two code points (xxFFFE and xxFFFF) of
// every range from 0xFFFE..0xFFFF up to 0x10FFFE..0x10FFFF.
263 !(
ch >= 0xFFFE &&
ch <= 0xFFFF) && !(
ch >= 0x1FFFE &&
ch <= 0x1FFFF) &&
264 !(
ch >= 0x2FFFE &&
ch <= 0x2FFFF) && !(
ch >= 0x3FFFE &&
ch <= 0x3FFFF) &&
265 !(
ch >= 0x4FFFE &&
ch <= 0x4FFFF) && !(
ch >= 0x5FFFE &&
ch <= 0x5FFFF) &&
266 !(
ch >= 0x6FFFE &&
ch <= 0x6FFFF) && !(
ch >= 0x7FFFE &&
ch <= 0x7FFFF) &&
267 !(
ch >= 0x8FFFE &&
ch <= 0x8FFFF) && !(
ch >= 0x9FFFE &&
ch <= 0x9FFFF) &&
268 !(
ch >= 0xAFFFE &&
ch <= 0xAFFFF) && !(
ch >= 0xBFFFE &&
ch <= 0xBFFFF) &&
269 !(
ch >= 0xCFFFE &&
ch <= 0xCFFFF) && !(
ch >= 0xDFFFE &&
ch <= 0xDFFFF) &&
270 !(
ch >= 0xEFFFE &&
ch <= 0xEFFFF) && !(
ch >= 0xFFFFE &&
ch <= 0xFFFFF) &&
271 !(
ch >= 0x10FFFE &&
ch <= 0x10FFFF) &&
// ISO control characters are rejected unless explicitly allowed: newline,
// form feed and tab are visible here; the list continues on lines not shown.
272 (!u_isISOControl(
static_cast<UChar32
>(
ch)) ||
ch ==
'\n' ||
ch ==
'\f' ||
ch ==
'\t' ||
// Middle of a boolean expression; its beginning and end are not visible in
// this excerpt (presumably part of IsInterchangeValid7BitAscii -- confirm).
// ISO control characters are rejected unless explicitly allowed: newline,
// form feed and tab are visible here; the list continues on lines not shown.
278 (!u_isISOControl(
static_cast<UChar32
>(
ch)) ||
ch ==
'\n' ||
ch ==
'\f' ||
ch ==
'\t' ||
// Fragment of a function whose signature is not visible in this excerpt
// (presumably FullwidthToHalfwidth(const char32 ch) -- confirm).
// Wraps the single code point ch in an ICU string and runs ICU's
// "Fullwidth-Halfwidth" transliterator over it in the forward direction.
298 icu::UnicodeString uch_str(
static_cast<UChar32
>(
ch));
// NOTE(review): createInstance constructs a new transliterator on each call
// and, per ICU's factory convention, the caller owns the result; verify that
// it is deleted on the lines not shown, otherwise every call leaks.
299 const icu::Transliterator *fulltohalf =
300 icu::Transliterator::createInstance(
"Fullwidth-Halfwidth", UTRANS_FORWARD, error_code);
301 error_code.assertSuccess();
// transliterate modifies uch_str in place.
304 fulltohalf->transliterate(uch_str);
#define ASSERT_HOST_MSG(x,...)
bool IsOCREquivalent(char32 ch1, char32 ch2)
bool IsWhitespace(const char32 ch)
char32 OCRNormalize(char32 ch)
unsigned int SpanUTF8Whitespace(const char *text)
bool IsInterchangeValid(const char32 ch)
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
bool IsInterchangeValid7BitAscii(const char32 ch)
char32 FullwidthToHalfwidth(const char32 ch)
unsigned int SpanUTF8NotWhitespace(const char *text)
bool IsValidCodepoint(const char32 ch)
bool IsUTF8Whitespace(const char *text)
static const_iterator begin(const char *utf8_str, int byte_length)
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
static const_iterator end(const char *utf8_str, int byte_length)
static const char32 kZeroWidthNonJoiner
static bool IsZeroWidthMark(char32 ch)
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
static const char32 kZeroWidthJoiner