5#include <unordered_map>
9#include "unicode/uchar.h"
10#include "unicode/uscript.h"
41 const std::vector<char32> &src,
42 std::vector<std::vector<char32>> *
dest) {
44 std::vector<std::vector<char32>> graphemes;
60 std::unique_ptr<Validator> validator(
ScriptValidator(script, report_errors));
61 for (
const auto &grapheme : graphemes) {
62 if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme,
dest)) {
73#define CASE(e, T) case ViramaScript::e: return std::make_unique<T>(script, report_errors)
80 return std::make_unique<ValidateIndic>(script, report_errors);
90 const std::vector<char32> &src,
91 std::vector<std::vector<char32>> *
dest) {
119 dest->push_back(std::vector<char32>());
128static bool CmpPairSecond(
const std::pair<int, int> &p1,
const std::pair<int, int> &p2) {
129 return p1.second < p2.second;
136 std::unordered_map<int, int> histogram;
143 UScriptCode script_code = uscript_getScript(
ch, err);
145 script_code == USCRIPT_MYANMAR) {
146 if (script_code == USCRIPT_MYANMAR) {
152 if (!histogram.empty()) {
153 int base = std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)->first;
171 (unicode & 0x7f) == 0x4d) ||
179 return (0x1cd0 <= unicode && unicode < 0x1d00) || (0xa8e0 <= unicode && unicode <= 0xa8f7) ||
180 (0x951 <= unicode && unicode <= 0x954);
191 codes_.reserve(text.size());
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
static const char32 kSinhalaVirama
static const char32 kZeroWidthNonJoiner
static const char32 kKhmerVirama
virtual CharClass UnicodeToCharClass(char32 ch) const =0
static const char32 kJavaneseVirama
std::vector< char32 > output_
static const char32 kInvalid
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
static const char32 kRightToLeftMark
static bool IsVedicAccent(char32 unicode)
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
static const char32 kLeftToRightMark
static bool IsVirama(char32 unicode)
static const int kIndicCodePageSize
static const char32 kZeroWidthSpace
bool IsSubscriptScript() const
std::vector< IndicPair > codes_
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 > > *dest)
void ComputeClassCodes(const std::vector< char32 > &text)
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 > > *dest)
static const char32 kZeroWidthJoiner
static const char32 kMaxJavaneseUnicode
static const char32 kMaxSinhalaUnicode
static const char32 kMyanmarVirama
static const char32 kMinIndicUnicode
std::vector< std::vector< char32 > > parts_
virtual bool ConsumeGraphemeIfValid()=0