31static int CodepointDistance(
const char *start,
const char *end) {
34 for (
const char *
p = start;
p < end; ++
p) {
35 n += (*
reinterpret_cast<const signed char *
>(
p) >= -0x40);
40static int CodepointCount(
const char *utf8,
int len) {
41 return CodepointDistance(utf8, utf8 + len);
46 return CodepointDistance(first.it_,
last.it_);
51static int ConvertToInterchangeValid(
char *start,
int len) {
66 char *
const in = start;
68 char *
const end = start + len;
73 memmove(out, start, good);
101void UnicodeText::Repr::reserve(
int new_capacity) {
103 if (capacity_ >= new_capacity && ours_)
107 capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
108 char *new_data =
new char[capacity_];
112 memcpy(new_data, data_, size_);
121void UnicodeText::Repr::resize(
int new_size) {
125 if (!ours_ || new_size > capacity_)
128 if (size_ < new_size)
129 memset(data_ + size_, 0, new_size - size_);
137void UnicodeText::Repr::clear() {
141 size_ = capacity_ = 0;
145void UnicodeText::Repr::Copy(
const char *data,
int size) {
147 memcpy(data_, data, size);
150void UnicodeText::Repr::TakeOwnershipOf(
char *data,
int size,
int capacity) {
157 capacity_ = capacity;
161void UnicodeText::Repr::PointTo(
const char *data,
int size) {
164 data_ =
const_cast<char *
>(data);
170void UnicodeText::Repr::append(
const char *bytes,
int byte_length) {
171 reserve(size_ + byte_length);
172 memcpy(data_ + size_, bytes, byte_length);
173 size_ += byte_length;
176#ifdef INCLUDE_TENSORFLOW
177string UnicodeText::Repr::DebugString()
const {
178 return tensorflow::strings::Printf(
"{Repr %p data=%p size=%d capacity=%d %s}",
this, data_, size_,
179 capacity_, ours_ ?
"Owned" :
"Alias");
198 CHECK(first <=
last) <<
" Incompatible iterators";
199 repr_.append(first.it_,
last.it_ - first.it_);
203 CHECK(first <=
last) <<
" Incompatible iterators";
204 return string(first.it_,
last.it_ - first.it_);
217 repr_.Copy(src.repr_.data_, src.repr_.size_);
222 repr_.Copy(buffer, byte_length);
224 LOG(
WARNING) <<
"UTF-8 buffer is not interchange-valid.";
225 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
230UnicodeText &UnicodeText::UnsafeCopyUTF8(
const char *buffer,
int byte_length) {
231 repr_.Copy(buffer, byte_length);
238 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
240 LOG(
WARNING) <<
"UTF-8 buffer is not interchange-valid.";
241 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
246UnicodeText &UnicodeText::UnsafeTakeOwnershipOfUTF8(
char *buffer,
int byte_length,
248 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
256 repr_.PointTo(buffer, byte_length);
258 LOG(
WARNING) <<
"UTF-8 buffer is not interchange-valid.";
259 repr_.Copy(buffer, byte_length);
260 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
265UnicodeText &UnicodeText::UnsafePointToUTF8(
const char *buffer,
int byte_length) {
266 repr_.PointTo(buffer, byte_length);
271 repr_.PointTo(src.repr_.data_, src.repr_.size_);
276 CHECK(first <=
last) <<
" Incompatible iterators";
284 repr_.append(u.repr_.data_, u.repr_.size_);
289 CHECK(first <=
last) <<
" Incompatible iterators";
290 repr_.append(first.it_,
last.it_ - first.it_);
294UnicodeText &UnicodeText::UnsafeAppendUTF8(
const char *utf8,
int len) {
295 repr_.append(utf8, len);
305 return UnsafeFind(look, start_pos);
309 return UnsafeFind(look,
begin());
313 const_iterator start_pos)
const {
316#ifdef INCLUDE_TENSORFLOW
321#ifdef INCLUDE_TENSORFLOW
324 StringPiece::size_type found = StringPiece::npos;
325 if (found == StringPiece::npos)
333#ifdef INCLUDE_TENSORFLOW
340 StringPiece looking_for(
"\xEF\xBF\xBD", 3);
362 repr_.append(buf, len);
364 LOG(
WARNING) <<
"Unicode value 0x" << std::hex << c <<
" is not valid for interchange";
365 repr_.append(
" ", 1);
368 LOG(
WARNING) <<
"Illegal Unicode value: 0x" << std::hex << c;
369 repr_.append(
" ", 1);
374 return CodepointCount(repr_.data_, repr_.size_);
380 if (lhs.repr_.size_ != rhs.repr_.size_)
382 return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
385#ifdef INCLUDE_TENSORFLOW
387 return tensorflow::strings::Printf(
"{UnicodeText %p chars=%d repr=%s}",
this,
size(),
388 repr_.DebugString().c_str());
417 return lhs.it_ < rhs.it_;
427 unsigned char byte1 = it_[0];
431 unsigned char byte2 = it_[1];
433 return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
435 unsigned char byte3 = it_[2];
437 return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
439 unsigned char byte4 = it_[3];
440 return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) | ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
455 utf8_output[0] = it_[0];
456 if ((it_[0] & 0xff) < 0x80)
458 utf8_output[1] = it_[1];
459 if ((it_[0] & 0xff) < 0xE0)
461 utf8_output[2] = it_[2];
462 if ((it_[0] & 0xff) < 0xF0)
464 utf8_output[3] = it_[3];
473 if ((it_[0] & 0xff) < 0x80) {
475 }
else if ((it_[0] & 0xff) < 0xE0) {
477 }
else if ((it_[0] & 0xff) < 0xF0) {
488 const char *
end = start + len;
495#ifdef INCLUDE_TENSORFLOW
497 return tensorflow::strings::Printf(
"{iter %p}", it_);
506 tensorflow::strings::Appendf(&s,
"%X ", *it++);
bool operator<(const UnicodeText::const_iterator &lhs, const UnicodeText::const_iterator &rhs)
bool operator==(const UnicodeText &lhs, const UnicodeText &rhs)
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first, const UnicodeText::const_iterator &last)
string CodepointString(const UnicodeText &t)
int runetochar(char *str, const Rune *rune)
int isvalidcharntorune(const char *str, int length, Rune *rune, int *consumed)
#define CHECK_GE(test, value)
#define CHECK_LE(test, value)
bool IsInterchangeValid(char32 c)
bool IsValidCodepoint(char32 c)
int SpanInterchangeValid(const char *begin, int byte_length)
int OneCharLen(const char *src)
static string UTF8Substring(const const_iterator &first, const const_iterator &last)
void push_back(char32 codepoint)
const_iterator MakeIterator(const char *p) const
UnicodeText & CopyUTF8(const char *utf8_buffer, int byte_length)
const_iterator find(const UnicodeText &look, const_iterator start_pos) const
UnicodeText & append(ForwardIterator first, const ForwardIterator last)
UnicodeText & Copy(const UnicodeText &src)
UnicodeText & PointTo(const UnicodeText &src)
string DebugString() const
const_iterator end() const
friend class const_iterator
UnicodeText & operator=(const UnicodeText &src)
UnicodeText & PointToUTF8(const char *utf8_buffer, int byte_length)
UnicodeText & TakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity)
bool HasReplacementChar() const
const_iterator begin() const
const char * utf8_data() const
string DebugString() const
const_iterator & operator++()
const_iterator & operator--()
ptrdiff_t difference_type
int get_utf8(char *buf) const
const_iterator & operator=(const const_iterator &other)
string get_utf8_string() const
const char * utf8_data() const