3#include "unicode/uchar.h"
8 const unsigned num_codes =
codes_.size();
12 int num_codes_in_grapheme = 0;
29 tprintf(
"Two grapheme links in a row:0x%x 0x%x\n", prev_ch,
ch);
34 IsBadlyFormed(prev_ch,
ch)) {
40 if (num_codes_in_grapheme > 0 && !is_combiner && !prev_is_fwd_combiner) {
44 ++num_codes_in_grapheme;
45 prev_prev_ch = prev_ch;
49 if (num_codes_in_grapheme > 0) {
61 if (u_hasBinaryProperty(
ch, UCHAR_GRAPHEME_LINK)) {
64 if (u_isUWhiteSpace(
ch)) {
71 int char_type = u_charType(
ch);
72 if (char_type == U_NON_SPACING_MARK || char_type == U_ENCLOSING_MARK ||
83 if (IsBadlyFormedIndicVowel(prev_ch,
ch)) {
85 tprintf(
"Badly formed Indic vowel sequence:0x%x 0x%x\n", prev_ch,
ch);
89 if (IsBadlyFormedThai(prev_ch,
ch)) {
91 tprintf(
"Badly formed Thai:0x%x 0x%x\n", prev_ch,
ch);
111bool ValidateGrapheme::IsBadlyFormedIndicVowel(
char32 prev_ch,
char32 ch) {
112 return ((prev_ch == 0x905 && (
ch == 0x946 ||
ch == 0x93E)) || (prev_ch == 0x909 &&
ch == 0x941) ||
113 (prev_ch == 0x90F && (
ch >= 0x945 &&
ch <= 0x947)) ||
114 (prev_ch == 0x905 && (
ch >= 0x949 &&
ch <= 0x94C)) ||
115 (prev_ch == 0x906 && (
ch >= 0x949 &&
ch <= 0x94C)) ||
117 (prev_ch == 0x93E && (
ch >= 0x945 &&
ch <= 0x948)) ||
119 (prev_ch == 0x94D && (
ch >= 0x93E &&
ch <= 0x94C)) ||
121 (prev_ch == 0x985 &&
ch == 0x9BE) ||
123 (prev_ch == 0xC12 && (
ch == 0xC55 ||
ch == 0xC4C)) ||
125 (prev_ch == 0xC92 &&
ch == 0xCCC));
129static bool IsThaiConsonant(
char32 ch) {
130 return 0xe01 <=
ch &&
ch <= 0xe2e;
134static bool IsThaiBeforeConsonantVowel(
char32 ch) {
135 return 0xe40 <=
ch &&
ch <= 0xe44;
139static bool IsThaiToneMark(
char32 ch) {
140 return 0xe48 <=
ch &&
ch <= 0xe4b;
145static bool IsThaiTonableVowel(
char32 ch) {
146 return (0xe34 <=
ch &&
ch <= 0xe39) ||
ch == 0xe31;
155bool ValidateGrapheme::IsBadlyFormedThai(
char32 prev_ch,
char32 ch) {
157 if (IsThaiToneMark(
ch) && !(IsThaiConsonant(prev_ch) || IsThaiTonableVowel(prev_ch))) {
161 if ((IsThaiTonableVowel(
ch) ||
ch == 0xe47) && !IsThaiConsonant(prev_ch)) {
165 if (
ch == 0xe4c && !(IsThaiConsonant(prev_ch) || prev_ch == 0xe38 || prev_ch == 0xe34)) {
171 if (
ch == 0xe4d && !(IsThaiConsonant(prev_ch) || prev_ch == 0xe48 || prev_ch == 0xe49)) {
175 if ((
ch == 0xe30 ||
ch == 0xe32 ||
ch == 0xe33) &&
176 !(IsThaiConsonant(prev_ch) || IsThaiToneMark(prev_ch)) &&
177 !(prev_ch == 0xe32 &&
ch == 0xe30) && !(prev_ch == 0xe4d &&
ch == 0xe32)) {
182 if (IsThaiBeforeConsonantVowel(
ch) &&
183 (IsThaiBeforeConsonantVowel(prev_ch) || prev_ch == 0xe31 || prev_ch == 0xe37)) {
187 if ((0xe30 <=
ch &&
ch <= 0xe4D) && prev_ch == 0xe24) {
void tprintf(const char *format,...)
CharClass UnicodeToCharClass(char32 ch) const override
bool ConsumeGraphemeIfValid() override
static const char32 kZeroWidthNonJoiner
static bool IsVedicAccent(char32 unicode)
void MultiCodePart(unsigned length)
std::vector< IndicPair > codes_
static const char32 kZeroWidthJoiner