23#define UNI_MAX_LEGAL_UTF32 0x0010FFFF
39 for (total_len = 0; total_len < len; total_len += step) {
48 for (
i = 1;
i < step; ++
i) {
49 if ((
utf8_str[total_len +
i] & 0xc0) != 0x80) {
61 chars[total_len++] = 0;
69 const int bytemask = 0xBF;
70 const int bytemark = 0x80;
76 chars[0] =
static_cast<char>(unicode);
77 }
else if (unicode < 0x800) {
80 chars[1] =
static_cast<char>((unicode | bytemark) & bytemask);
82 chars[0] =
static_cast<char>(unicode | 0xc0);
83 }
else if (unicode < 0x10000) {
85 chars[2] =
static_cast<char>((unicode | bytemark) & bytemask);
87 chars[1] =
static_cast<char>((unicode | bytemark) & bytemask);
89 chars[0] =
static_cast<char>(unicode | 0xe0);
92 chars[3] =
static_cast<char>((unicode | bytemark) & bytemask);
94 chars[2] =
static_cast<char>((unicode | bytemark) & bytemask);
96 chars[1] =
static_cast<char>((unicode | bytemark) & bytemask);
98 chars[0] =
static_cast<char>(unicode | 0xf0);
106 static const int utf8_offsets[5] = {0, 0, 0x3080, 0xE2080, 0x3C82080};
109 const char *src = chars;
115 uni +=
static_cast<unsigned char>(*src++);
119 uni +=
static_cast<unsigned char>(*src++);
123 uni +=
static_cast<unsigned char>(*src++);
127 uni +=
static_cast<unsigned char>(*src++);
129 uni -= utf8_offsets[len];
136 char *str =
new char[len + 1];
137 memcpy(str, chars, len);
144 static const char utf8_bytes[256] = {
145 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
146 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
147 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
148 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
149 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
150 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
151 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
152 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
153 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0};
155 return utf8_bytes[
static_cast<unsigned char>(*utf8_str)];
162 tprintf(
"ERROR: Illegal UTF8 encountered.\n");
163 for (
int i = 0;
i < 5 && it_[
i] !=
'\0'; ++
i) {
164 tprintf(
"Index %d char = 0x%x\n",
i, it_[
i]);
176 tprintf(
"WARNING: Illegal UTF8 encountered\n");
187 tprintf(
"WARNING: Illegal UTF8 encountered\n");
188 utf8_output[0] =
' ';
191 strncpy(utf8_output, it_, len);
199 tprintf(
"WARNING: Illegal UTF8 encountered\n");
221 const int utf8_length = strlen(
utf8_str);
222 std::vector<char32> unicodes;
223 unicodes.reserve(utf8_length);
227 unicodes.push_back(*it);
#define UNI_MAX_LEGAL_UTF32
void tprintf(const char *format,...)
static const_iterator begin(const char *utf8_str, int byte_length)
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
const char * utf8() const
static const_iterator end(const char *utf8_str, int byte_length)
static int utf8_step(const char *utf8_str)
const_iterator & operator++()
int get_utf8(char *buf) const