tesseract v5.3.3.20231005
unicodetext.cc
Go to the documentation of this file.
1
17#include "include_gunit.h"
19
20#include <string.h> // for memcpy, NULL, memcmp, etc
21#include <algorithm> // for max
22
23//#include "base/logging.h" // for operator<<, CHECK, etc
24//#include "base/stringprintf.h" // for StringPrintf, StringAppendF
25//#include "strings/stringpiece.h" // for StringPiece, etc
26
27#include "third_party/utf/utf.h" // for isvalidcharntorune, etc
28#include "util/utf8/unilib.h" // for IsInterchangeValid, etc
29#include "util/utf8/unilib_utf8_utils.h" // for OneCharLen
30
// Returns the number of code points encoded in the UTF-8 byte range
// [start, end).
//
// Every byte that is not a UTF-8 trail (continuation) byte starts a new code
// point, so counting those bytes counts the code points. If `end` is not past
// `start` the loop body never runs and the result is 0.
static int CodepointDistance(const char *start, const char *end) {
  int n = 0;
  for (const char *p = start; p < end; ++p) {
    // Trail bytes have the bit pattern 10xxxxxx. Testing the top two bits on
    // the unsigned byte value avoids depending on how `char` is signed.
    n += ((static_cast<unsigned char>(*p) & 0xC0) != 0x80);
  }
  return n;
}
39
40static int CodepointCount(const char *utf8, int len) {
41 return CodepointDistance(utf8, utf8 + len);
42}
43
46 return CodepointDistance(first.it_, last.it_);
47}
48
49// ---------- Utility ----------
50
51static int ConvertToInterchangeValid(char *start, int len) {
52 // This routine is called only when we've discovered that a UTF-8 buffer
53 // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
54 // was not interchange valid. This indicates a bug in the caller, and
55 // a LOG(WARNING) is done in that case.
56 // This is similar to CoerceToInterchangeValid, but it replaces each
57 // structurally valid byte with a space, and each non-interchange
58 // character with a space, even when that character requires more
59 // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
60 // structurally valid UTF8, but U+FDD0 is not an interchange-valid
61 // code point. The result should contain one space, not three.
62 //
63 // Since the conversion never needs to write more data than it
64 // reads, it is safe to change the buffer in place. It returns the
65 // number of bytes written.
66 char *const in = start;
67 char *out = start;
68 char *const end = start + len;
69 while (start < end) {
70 int good = UniLib::SpanInterchangeValid(start, end - start);
71 if (good > 0) {
72 if (out != start) {
73 memmove(out, start, good);
74 }
75 out += good;
76 start += good;
77 if (start == end) {
78 break;
79 }
80 }
81 // Is the current string invalid UTF8 or just non-interchange UTF8?
82 char32 rune;
83 int n;
84 if (isvalidcharntorune(start, end - start, &rune, &n)) {
85 // structurally valid UTF8, but not interchange valid
86 start += n; // Skip over the whole character.
87 } else { // bad UTF8
88 start += 1; // Skip over just one byte
89 }
90 *out++ = ' ';
91 }
92 return out - in;
93}
94
95// *************** Data representation **********
96
97// Note: the copy constructor is undefined.
98
99// After reserve(), resize(), or clear(), we're an owner, not an alias.
100
101void UnicodeText::Repr::reserve(int new_capacity) {
102 // If there's already enough capacity, and we're an owner, do nothing.
103 if (capacity_ >= new_capacity && ours_)
104 return;
105
106 // Otherwise, allocate a new buffer.
107 capacity_ = std::max(new_capacity, (3 * capacity_) / 2 + 20);
108 char *new_data = new char[capacity_];
109
110 // If there is an old buffer, copy it into the new buffer.
111 if (data_) {
112 memcpy(new_data, data_, size_);
113 if (ours_)
114 delete[] data_; // If we owned the old buffer, free it.
115 }
116 data_ = new_data;
117 ours_ = true; // We own the new buffer.
118 // size_ is unchanged.
119}
120
121void UnicodeText::Repr::resize(int new_size) {
122 if (new_size == 0) {
123 clear();
124 } else {
125 if (!ours_ || new_size > capacity_)
126 reserve(new_size);
127 // Clear the memory in the expanded part.
128 if (size_ < new_size)
129 memset(data_ + size_, 0, new_size - size_);
130 size_ = new_size;
131 ours_ = true;
132 }
133}
134
135// This implementation of clear() deallocates the buffer if we're an owner.
136// That's not strictly necessary; we could just set size_ to 0.
137void UnicodeText::Repr::clear() {
138 if (ours_)
139 delete[] data_;
140 data_ = nullptr;
141 size_ = capacity_ = 0;
142 ours_ = true;
143}
144
145void UnicodeText::Repr::Copy(const char *data, int size) {
146 resize(size);
147 memcpy(data_, data, size);
148}
149
150void UnicodeText::Repr::TakeOwnershipOf(char *data, int size, int capacity) {
151 if (data == data_)
152 return; // We already own this memory. (Weird case.)
153 if (ours_ && data_)
154 delete[] data_; // If we owned the old buffer, free it.
155 data_ = data;
156 size_ = size;
157 capacity_ = capacity;
158 ours_ = true;
159}
160
161void UnicodeText::Repr::PointTo(const char *data, int size) {
162 if (ours_ && data_)
163 delete[] data_; // If we owned the old buffer, free it.
164 data_ = const_cast<char *>(data);
165 size_ = size;
166 capacity_ = size;
167 ours_ = false;
168}
169
170void UnicodeText::Repr::append(const char *bytes, int byte_length) {
171 reserve(size_ + byte_length);
172 memcpy(data_ + size_, bytes, byte_length);
173 size_ += byte_length;
174}
175
176#ifdef INCLUDE_TENSORFLOW
177string UnicodeText::Repr::DebugString() const {
178 return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}", this, data_, size_,
179 capacity_, ours_ ? "Owned" : "Alias");
180}
181#endif
182
183// *************** UnicodeText ******************
184
185// ----- Constructors -----
186
187// Default constructor
189
190// Copy constructor
192 Copy(src);
193}
194
195// Substring constructor
198 CHECK(first <= last) << " Incompatible iterators";
199 repr_.append(first.it_, last.it_ - first.it_);
200}
201
203 CHECK(first <= last) << " Incompatible iterators";
204 return string(first.it_, last.it_ - first.it_);
205}
206
207// ----- Copy -----
208
210 if (this != &src) {
211 Copy(src);
212 }
213 return *this;
214}
215
217 repr_.Copy(src.repr_.data_, src.repr_.size_);
218 return *this;
219}
220
221UnicodeText &UnicodeText::CopyUTF8(const char *buffer, int byte_length) {
222 repr_.Copy(buffer, byte_length);
223 if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
224 LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
225 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
226 }
227 return *this;
228}
229
230UnicodeText &UnicodeText::UnsafeCopyUTF8(const char *buffer, int byte_length) {
231 repr_.Copy(buffer, byte_length);
232 return *this;
233}
234
235// ----- TakeOwnershipOf -----
236
237UnicodeText &UnicodeText::TakeOwnershipOfUTF8(char *buffer, int byte_length, int byte_capacity) {
238 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
239 if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
240 LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
241 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
242 }
243 return *this;
244}
245
246UnicodeText &UnicodeText::UnsafeTakeOwnershipOfUTF8(char *buffer, int byte_length,
247 int byte_capacity) {
248 repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
249 return *this;
250}
251
252// ----- PointTo -----
253
254UnicodeText &UnicodeText::PointToUTF8(const char *buffer, int byte_length) {
255 if (UniLib::IsInterchangeValid(buffer, byte_length)) {
256 repr_.PointTo(buffer, byte_length);
257 } else {
258 LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
259 repr_.Copy(buffer, byte_length);
260 repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
261 }
262 return *this;
263}
264
265UnicodeText &UnicodeText::UnsafePointToUTF8(const char *buffer, int byte_length) {
266 repr_.PointTo(buffer, byte_length);
267 return *this;
268}
269
271 repr_.PointTo(src.repr_.data_, src.repr_.size_);
272 return *this;
273}
274
276 CHECK(first <= last) << " Incompatible iterators";
277 repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data());
278 return *this;
279}
280
281// ----- Append -----
282
284 repr_.append(u.repr_.data_, u.repr_.size_);
285 return *this;
286}
287
289 CHECK(first <= last) << " Incompatible iterators";
290 repr_.append(first.it_, last.it_ - first.it_);
291 return *this;
292}
293
294UnicodeText &UnicodeText::UnsafeAppendUTF8(const char *utf8, int len) {
295 repr_.append(utf8, len);
296 return *this;
297}
298
299// ----- substring searching -----
300
302 const_iterator start_pos) const {
303 CHECK_GE(start_pos.utf8_data(), utf8_data());
304 CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
305 return UnsafeFind(look, start_pos);
306}
307
309 return UnsafeFind(look, begin());
310}
311
312UnicodeText::const_iterator UnicodeText::UnsafeFind(const UnicodeText &look,
313 const_iterator start_pos) const {
314 // Due to the magic of the UTF8 encoding, searching for a sequence of
315 // letters is equivalent to substring search.
316#ifdef INCLUDE_TENSORFLOW
317 StringPiece searching(utf8_data(), utf8_length());
318 StringPiece look_piece(look.utf8_data(), look.utf8_length());
319#endif
320 LOG(FATAL) << "Not implemented";
321#ifdef INCLUDE_TENSORFLOW
322 // StringPiece::size_type found =
323 // searching.find(look_piece, start_pos.utf8_data() - utf8_data());
324 StringPiece::size_type found = StringPiece::npos;
325 if (found == StringPiece::npos)
326 return end();
327 return const_iterator(utf8_data() + found);
328#else
329 return end();
330#endif
331}
332
333#ifdef INCLUDE_TENSORFLOW
335 // Equivalent to:
336 // UnicodeText replacement_char;
337 // replacement_char.push_back(0xFFFD);
338 // return find(replacement_char) != end();
339 StringPiece searching(utf8_data(), utf8_length());
340 StringPiece looking_for("\xEF\xBF\xBD", 3);
341 LOG(FATAL) << "Not implemented";
342 // return searching.find(looking_for) != StringPiece::npos;
343 return false;
344}
345#endif
346
347// ----- other methods -----
348
349// Clear operator
351 repr_.clear();
352}
353
354// Destructor
356
359 char buf[UTFmax];
360 int len = runetochar(buf, &c);
361 if (UniLib::IsInterchangeValid(buf, len)) {
362 repr_.append(buf, len);
363 } else {
364 LOG(WARNING) << "Unicode value 0x" << std::hex << c << " is not valid for interchange";
365 repr_.append(" ", 1);
366 }
367 } else {
368 LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
369 repr_.append(" ", 1);
370 }
371}
372
373int UnicodeText::size() const {
374 return CodepointCount(repr_.data_, repr_.size_);
375}
376
377bool operator==(const UnicodeText &lhs, const UnicodeText &rhs) {
378 if (&lhs == &rhs)
379 return true;
380 if (lhs.repr_.size_ != rhs.repr_.size_)
381 return false;
382 return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0;
383}
384
385#ifdef INCLUDE_TENSORFLOW
386string UnicodeText::DebugString() const {
387 return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}", this, size(),
388 repr_.DebugString().c_str());
389}
390#endif
391
392// ******************* UnicodeText::const_iterator *********************
393
394// The implementation of const_iterator would be nicer if it
395// inherited from boost::iterator_facade
396// (http://boost.org/libs/iterator/doc/iterator_facade.html).
397
399
401
403 if (&other != this)
404 it_ = other.it_;
405 return *this;
406}
407
409 return const_iterator(repr_.data_);
410}
411
413 return const_iterator(repr_.data_ + repr_.size_);
414}
415
417 return lhs.it_ < rhs.it_;
418}
419
421 // (We could call chartorune here, but that does some
422 // error-checking, and we're guaranteed that our data is valid
423 // UTF-8. Also, we expect this routine to be called very often. So
424 // for speed, we do the calculation ourselves.)
425
426 // Convert from UTF-8
427 unsigned char byte1 = it_[0];
428 if (byte1 < 0x80)
429 return byte1;
430
431 unsigned char byte2 = it_[1];
432 if (byte1 < 0xE0)
433 return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);
434
435 unsigned char byte3 = it_[2];
436 if (byte1 < 0xF0)
437 return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
438
439 unsigned char byte4 = it_[3];
440 return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) | ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
441}
442
444 it_ += UniLib::OneCharLen(it_);
445 return *this;
446}
447
449 while (UniLib::IsTrailByte(*--it_))
450 ;
451 return *this;
452}
453
454int UnicodeText::const_iterator::get_utf8(char *utf8_output) const {
455 utf8_output[0] = it_[0];
456 if ((it_[0] & 0xff) < 0x80)
457 return 1;
458 utf8_output[1] = it_[1];
459 if ((it_[0] & 0xff) < 0xE0)
460 return 2;
461 utf8_output[2] = it_[2];
462 if ((it_[0] & 0xff) < 0xF0)
463 return 3;
464 utf8_output[3] = it_[3];
465 return 4;
466}
467
469 return string(utf8_data(), utf8_length());
470}
471
473 if ((it_[0] & 0xff) < 0x80) {
474 return 1;
475 } else if ((it_[0] & 0xff) < 0xE0) {
476 return 2;
477 } else if ((it_[0] & 0xff) < 0xF0) {
478 return 3;
479 } else {
480 return 4;
481 }
482}
483
485 CHECK(p != nullptr);
486 const char *start = utf8_data();
487 int len = utf8_length();
488 const char *end = start + len;
489 CHECK(p >= start);
490 CHECK(p <= end);
491 CHECK(p == end || !UniLib::IsTrailByte(*p));
492 return const_iterator(p);
493}
494
495#ifdef INCLUDE_TENSORFLOW
497 return tensorflow::strings::Printf("{iter %p}", it_);
498}
499
500// *************************** Utilities *************************
501
502string CodepointString(const UnicodeText &t) {
503 string s;
505 while (it != end)
506 tensorflow::strings::Appendf(&s, "%X ", *it++);
507 return s;
508}
509#endif
signed int char32
@ LOG
@ FATAL
Definition: log.h:28
@ WARNING
Definition: log.h:28
bool operator<(const UnicodeText::const_iterator &lhs, const UnicodeText::const_iterator &rhs)
Definition: unicodetext.cc:416
bool operator==(const UnicodeText &lhs, const UnicodeText &rhs)
Definition: unicodetext.cc:377
UnicodeText::const_iterator::difference_type distance(const UnicodeText::const_iterator &first, const UnicodeText::const_iterator &last)
Definition: unicodetext.cc:44
string CodepointString(const UnicodeText &t)
int runetochar(char *str, const Rune *rune)
Definition: rune.c:244
int isvalidcharntorune(const char *str, int length, Rune *rune, int *consumed)
Definition: rune.c:239
@ UTFmax
Definition: utf.h:22
const char * p
#define CHECK_GE(test, value)
Definition: include_gunit.h:80
#define CHECK(condition)
Definition: include_gunit.h:76
#define CHECK_LE(test, value)
Definition: include_gunit.h:83
LIST last(LIST var_list)
Definition: oldlist.cpp:153
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33
bool IsValidCodepoint(char32 c)
int SpanInterchangeValid(const char *begin, int byte_length)
Definition: unilib.cc:39
bool IsTrailByte(char x)
int OneCharLen(const char *src)
static string UTF8Substring(const const_iterator &first, const const_iterator &last)
Definition: unicodetext.cc:202
void push_back(char32 codepoint)
Definition: unicodetext.cc:357
const_iterator MakeIterator(const char *p) const
Definition: unicodetext.cc:484
UnicodeText & CopyUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:221
const_iterator find(const UnicodeText &look, const_iterator start_pos) const
Definition: unicodetext.cc:301
UnicodeText & append(ForwardIterator first, const ForwardIterator last)
Definition: unicodetext.h:163
UnicodeText & Copy(const UnicodeText &src)
Definition: unicodetext.cc:216
UnicodeText & PointTo(const UnicodeText &src)
Definition: unicodetext.cc:270
string DebugString() const
const_iterator end() const
Definition: unicodetext.cc:412
friend class const_iterator
Definition: unicodetext.h:348
UnicodeText & operator=(const UnicodeText &src)
Definition: unicodetext.cc:209
UnicodeText & PointToUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:254
UnicodeText & TakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity)
Definition: unicodetext.cc:237
int utf8_length() const
Definition: unicodetext.h:308
int size() const
Definition: unicodetext.cc:373
bool HasReplacementChar() const
const_iterator begin() const
Definition: unicodetext.cc:408
const char * utf8_data() const
Definition: unicodetext.h:305
void clear()
Definition: unicodetext.cc:350
const_iterator & operator++()
Definition: unicodetext.cc:443
const_iterator & operator--()
Definition: unicodetext.cc:448
int get_utf8(char *buf) const
Definition: unicodetext.cc:454
const_iterator & operator=(const const_iterator &other)
Definition: unicodetext.cc:402
string get_utf8_string() const
Definition: unicodetext.cc:468
const char * utf8_data() const
Definition: unicodetext.h:244