4 #include <unordered_map> 9 #include "unicode/uchar.h" 10 #include "unicode/uscript.h" 36 std::vector<std::vector<char32>>* dest) {
38 std::vector<std::vector<char32>> graphemes;
53 std::unique_ptr<Validator> validator(
55 for (
const auto& grapheme : graphemes) {
56 if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {
69 return std::unique_ptr<Validator>(
72 return std::unique_ptr<Validator>(
75 return std::unique_ptr<Validator>(
78 return std::unique_ptr<Validator>(
90 std::vector<std::vector<char32>>* dest) {
106 std::vector<std::vector<char32>>* dest) {
110 dest->reserve(dest->size() +
output_.size());
114 std::move(
parts_.begin(),
parts_.end(), std::back_inserter(*dest));
117 dest->push_back(std::vector<char32>());
122 dest->back().insert(dest->back().end(),
output_.begin(),
output_.end());
127 const std::pair<int, int>& p2) {
128 return p1.second < p2.second;
135 const std::vector<char32>& utf32) {
136 std::unordered_map<int, int> histogram;
143 UScriptCode script_code = uscript_getScript(ch, err);
145 script_code != USCRIPT_COMMON) ||
146 script_code == USCRIPT_MYANMAR) {
147 if (script_code == USCRIPT_MYANMAR)
152 if (!histogram.empty()) {
154 std::max_element(histogram.begin(), histogram.end(),
CmpPairSecond)
172 (unicode & 0x7f) == 0x4d) ||
180 return 0x1cd0 <= unicode && unicode < 0x1d00;
191 codes_.reserve(text.size());
static const char32 kInvalid
virtual bool ConsumeGraphemeIfValid()=0
static const char32 kKhmerVirama
static const char32 kLeftToRightMark
bool CmpPairSecond(const std::pair< int, int > &p1, const std::pair< int, int > &p2)
static const char32 kZeroWidthSpace
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
static const char32 kMaxViramaScriptUnicode
virtual CharClass UnicodeToCharClass(char32 ch) const =0
void ComputeClassCodes(const std::vector< char32 > &text)
std::vector< IndicPair > codes_
static bool IsVedicAccent(char32 unicode)
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
static const char32 kMyanmarVirama
static const char32 kMaxSinhalaUnicode
static const char32 kSinhalaVirama
static const int kIndicCodePageSize
bool IsSubscriptScript() const
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
std::vector< std::vector< char32 > > parts_
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
static bool IsVirama(char32 unicode)
static const char32 kRightToLeftMark
std::vector< char32 > output_
static const char32 kZeroWidthJoiner
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
static const char32 kMinIndicUnicode
static const char32 kZeroWidthNonJoiner