tesseract v5.3.3.20231005
unicodetext.h
Go to the documentation of this file.
1
17#ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_
18#define UTIL_UTF8_PUBLIC_UNICODETEXT_H_
19
20#include <stddef.h> // for NULL, ptrdiff_t
21#include <iterator> // for bidirectional_iterator_tag, etc
22#include <string> // for string
23#include <utility> // for pair
24
25#include "syntaxnet/base.h"
26
27// ***************************** UnicodeText **************************
28//
// A UnicodeText object is a container for a sequence of Unicode
// codepoint values. It has a default constructor, a copy constructor,
// and an assignment operator.
31// Data can be appended to it from another UnicodeText, from
32// iterators, or from a single codepoint.
33//
34// The internal representation of the text is UTF-8. Since UTF-8 is a
35// variable-width format, UnicodeText does not provide random access
36// to the text, and changes to the text are permitted only at the end.
37//
38// The UnicodeText class defines a const_iterator. The dereferencing
39// operator (*) returns a codepoint (char32). The iterator is a
40// bidirectional, read-only iterator. It becomes invalid if the text
41// is changed.
42//
43// There are methods for appending and retrieving UTF-8 data directly.
44// The 'utf8_data' method returns a const char* that contains the
45// UTF-8-encoded version of the text; 'utf8_length' returns the number
// of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores up
// to 4 bytes of UTF-8 data in a char array and returns the number of
// bytes that it stored.
49//
50// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
51// 0x10FFFF], but UnicodeText has the additional restriction that it
52// can contain only those characters that are valid for interchange on
53// the Web. This excludes all of the control codes except for carriage
54// return, line feed, and horizontal tab. It also excludes
55// non-characters, but codepoints that are in the Private Use regions
56// are allowed, as are codepoints that are unassigned. (See the
57// Unicode reference for details.) The function UniLib::IsInterchangeValid
58// can be used as a test for this property.
59//
60// UnicodeTexts are safe. Every method that constructs or modifies a
61// UnicodeText tests for interchange-validity, and will substitute a
62// space for the invalid data. Such cases are reported via
63// LOG(WARNING).
64//
65// MEMORY MANAGEMENT: copy, take ownership, or point to
66//
67// A UnicodeText is either an "owner", meaning that it owns the memory
68// for the data buffer and will free it when the UnicodeText is
69// destroyed, or it is an "alias", meaning that it does not.
70//
71// There are three methods for storing UTF-8 data in a UnicodeText:
72//
73// CopyUTF8(buffer, len) copies buffer.
74//
75// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
76//
77// PointToUTF8(buffer, size) creates an alias pointing to buffer.
78//
79// All three methods perform a validity check on the buffer. There are
80// private, "unsafe" versions of these functions that bypass the
81// validity check. They are used internally and by friend-functions
82// that are handling UTF-8 data that has already been validated.
83//
84// The purpose of an alias is to avoid making an unnecessary copy of a
85// UTF-8 buffer while still providing access to the Unicode values
86// within that text through iterators or the fast scanners that are
87// based on UTF-8 state tables. The lifetime of an alias must not
88// exceed the lifetime of the buffer from which it was constructed.
89//
90// The semantics of an alias might be described as "copy on write or
91// repair." The source data is never modified. If push_back() or
92// append() is called on an alias, a copy of the data will be created,
93// and the UnicodeText will become an owner. If clear() is called on
94// an alias, it becomes an (empty) owner.
95//
96// The copy constructor and the assignment operator produce an owner.
97// That is, after direct initialization ("UnicodeText x(y);") or copy
98// initialization ("UnicodeText x = y;") x will be an owner, even if y
99// was an alias. The assignment operator ("x = y;") also produces an
100// owner unless x and y are the same object and y is an alias.
101//
102// Aliases should be used with care. If the source from which an alias
103// was created is freed, or if the contents are changed, while the
104// alias is still in use, fatal errors could result. But it can be
105// quite useful to have a UnicodeText "window" through which to see a
106// UTF-8 buffer without having to pay the price of making a copy.
107//
108// UTILITIES
109//
110// The interfaces in util/utf8/public/textutils.h provide higher-level
111// utilities for dealing with UnicodeTexts, including routines for
112// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
113// strings, creating strings from UnicodeTexts, normalizing text for
114// efficient matching or display, and others.
115
117public:
118 class const_iterator;
119
121
122 // Constructors. These always produce owners.
123 UnicodeText(); // Create an empty text.
124 UnicodeText(const UnicodeText &src); // copy constructor
125 // Construct a substring (copies the data).
126 UnicodeText(const const_iterator &first, const const_iterator &last);
127
128 // Assignment operator. This copies the data and produces an owner
129 // unless this == &src, e.g., "x = x;", which is a no-op.
130 UnicodeText &operator=(const UnicodeText &src);
131
132 // x.Copy(y) copies the data from y into x.
133 UnicodeText &Copy(const UnicodeText &src);
134 inline UnicodeText &assign(const UnicodeText &src) {
135 return Copy(src);
136 }
137
138 // x.PointTo(y) changes x so that it points to y's data.
139 // It does not copy y or take ownership of y's data.
140 UnicodeText &PointTo(const UnicodeText &src);
142
143 ~UnicodeText();
144
145 void clear(); // Clear text.
146 bool empty() const {
147 return repr_.size_ == 0;
148 } // Test if text is empty.
149
150 // Add a codepoint to the end of the text.
151 // If the codepoint is not interchange-valid, add a space instead
152 // and log a warning.
153 void push_back(char32 codepoint);
154
155 // Generic appending operation.
156 // iterator_traits<ForwardIterator>::value_type must be implicitly
157 // convertible to char32. Typical uses of this method might include:
158 // char32 chars[] = {0x1, 0x2, ...};
159 // vector<char32> more_chars = ...;
160 // utext.append(chars, chars+arraysize(chars));
161 // utext.append(more_chars.begin(), more_chars.end());
162 template <typename ForwardIterator>
163 UnicodeText &append(ForwardIterator first, const ForwardIterator last) {
164 while (first != last) {
165 push_back(*first++);
166 }
167 return *this;
168 }
169
170 // A specialization of the generic append() method.
171 UnicodeText &append(const const_iterator &first, const const_iterator &last);
172
173 // An optimization of append(source.begin(), source.end()).
174 UnicodeText &append(const UnicodeText &source);
175
176 int size() const; // the number of Unicode characters (codepoints)
177
178 friend bool operator==(const UnicodeText &lhs, const UnicodeText &rhs);
179 friend bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs);
180
182 typedef const_iterator CI;
183
184 public:
185 typedef std::bidirectional_iterator_tag iterator_category;
187 typedef ptrdiff_t difference_type;
188 typedef void pointer; // (Not needed.)
189 typedef const char32 reference; // (Needed for const_reverse_iterator)
190
191 // Iterators are default-constructible.
193
194 // It's safe to make multiple passes over a UnicodeText.
195 const_iterator(const const_iterator &other);
197
198 char32 operator*() const; // Dereference
199
200 const_iterator &operator++(); // Advance (++iter)
201 const_iterator operator++(int) { // (iter++)
202 const_iterator result(*this);
203 ++*this;
204 return result;
205 }
206
207 const_iterator &operator--(); // Retreat (--iter)
208 const_iterator operator--(int) { // (iter--)
209 const_iterator result(*this);
210 --*this;
211 return result;
212 }
213
214 // We love relational operators.
215 friend bool operator==(const CI &lhs, const CI &rhs) {
216 return lhs.it_ == rhs.it_;
217 }
218 friend bool operator!=(const CI &lhs, const CI &rhs) {
219 return !(lhs == rhs);
220 }
221 friend bool operator<(const CI &lhs, const CI &rhs);
222 friend bool operator>(const CI &lhs, const CI &rhs) {
223 return rhs < lhs;
224 }
225 friend bool operator<=(const CI &lhs, const CI &rhs) {
226 return !(rhs < lhs);
227 }
228 friend bool operator>=(const CI &lhs, const CI &rhs) {
229 return !(lhs < rhs);
230 }
231
232 friend difference_type distance(const CI &first, const CI &last);
233
234 // UTF-8-specific methods
235 // Store the UTF-8 encoding of the current codepoint into buf,
236 // which must be at least 4 bytes long. Return the number of
237 // bytes written.
238 int get_utf8(char *buf) const;
239 // Return the UTF-8 character that the iterator points to.
240 string get_utf8_string() const;
241 // Return the byte length of the UTF-8 character the iterator points to.
242 int utf8_length() const;
243 // Return the iterator's pointer into the UTF-8 data.
244 const char *utf8_data() const {
245 return it_;
246 }
247
248 string DebugString() const;
249
250 private:
251 friend class UnicodeText;
252 friend class UnicodeTextUtils;
254 explicit const_iterator(const char *it) : it_(it) {}
255
256 const char *it_;
257 };
258
259 const_iterator begin() const;
260 const_iterator end() const;
261
262 class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
263 public:
265 : std::reverse_iterator<const_iterator>(it) {}
266 const char *utf8_data() const {
267 const_iterator tmp_it = base();
268 return (--tmp_it).utf8_data();
269 }
270 int get_utf8(char *buf) const {
271 const_iterator tmp_it = base();
272 return (--tmp_it).get_utf8(buf);
273 }
274 string get_utf8_string() const {
275 const_iterator tmp_it = base();
276 return (--tmp_it).get_utf8_string();
277 }
278 int utf8_length() const {
279 const_iterator tmp_it = base();
280 return (--tmp_it).utf8_length();
281 }
282 };
284 return const_reverse_iterator(end());
285 }
288 }
289
290 // Substring searching. Returns the beginning of the first
291 // occurrence of "look", or end() if not found.
292 const_iterator find(const UnicodeText &look, const_iterator start_pos) const;
293 // Equivalent to find(look, begin())
294 const_iterator find(const UnicodeText &look) const;
295
296 // Returns whether this contains the character U+FFFD. This can
297 // occur, for example, if the input to Encodings::Decode() had byte
298 // sequences that were invalid in the source encoding.
299 bool HasReplacementChar() const;
300
301 // UTF-8-specific methods
302 //
303 // Return the data, length, and capacity of UTF-8-encoded version of
304 // the text. Length and capacity are measured in bytes.
305 const char *utf8_data() const {
306 return repr_.data_;
307 }
308 int utf8_length() const {
309 return repr_.size_;
310 }
311 int utf8_capacity() const {
312 return repr_.capacity_;
313 }
314
315 // Return the UTF-8 data as a string.
316 static string UTF8Substring(const const_iterator &first, const const_iterator &last);
317
318 // There are three methods for initializing a UnicodeText from UTF-8
319 // data. They vary in details of memory management. In all cases,
320 // the data is tested for interchange-validity. If it is not
321 // interchange-valid, a LOG(WARNING) is issued, and each
322 // structurally invalid byte and each interchange-invalid codepoint
323 // is replaced with a space.
324
325 // x.CopyUTF8(buf, len) copies buf into x.
326 UnicodeText &CopyUTF8(const char *utf8_buffer, int byte_length);
327
328 // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
329 // buf. buf is not copied.
330 UnicodeText &TakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity);
331
332 // x.PointToUTF8(buf,len) changes x so that it points to buf
333 // ("becomes an alias"). It does not take ownership or copy buf.
334 // If the buffer is not valid, this has the same effect as
335 // CopyUTF8(utf8_buffer, byte_length).
336 UnicodeText &PointToUTF8(const char *utf8_buffer, int byte_length);
337
338 // Occasionally it is necessary to use functions that operate on the
339 // pointer returned by utf8_data(). MakeIterator(p) provides a way
340 // to get back to the UnicodeText level. It uses CHECK to ensure
341 // that p is a pointer within this object's UTF-8 data, and that it
342 // points to the beginning of a character.
343 const_iterator MakeIterator(const char *p) const;
344
345 string DebugString() const;
346
347private:
348 friend class const_iterator;
349 friend class UnicodeTextUtils;
350
// Repr: a byte-string. Holds a data pointer together with its size and
// capacity, plus a flag recording whether this object owns the buffer.
class Repr {
public:
  char *data_ = nullptr;
  int size_ = 0;
  int capacity_ = 0;
  bool ours_ = true; // Do we own data_?

  // A new Repr has no buffer, zero size and capacity, and is marked as
  // owning its (null) data.
  Repr() {}

  // Frees the buffer with delete[], but only when this Repr owns it.
  ~Repr() {
    if (!ours_) {
      return;
    }
    delete[] data_;
  }

  void clear();
  void reserve(int capacity);
  void resize(int size);

  void append(const char *bytes, int byte_length);
  void Copy(const char *data, int size);
  void TakeOwnershipOf(char *data, int size, int capacity);
  void PointTo(const char *data, int size);

  string DebugString() const;

private:
  // Copy construction and copy assignment are declared private, so they
  // are not part of the usable interface.
  Repr(const Repr &other);
  Repr &operator=(const Repr &);
};
379
380 Repr repr_;
381
382 // UTF-8-specific private methods.
383 // These routines do not perform a validity check when compiled
384 // in opt mode.
385 // It is an error to call these methods with UTF-8 data that
386 // is not interchange-valid.
387 //
388 UnicodeText &UnsafeCopyUTF8(const char *utf8_buffer, int byte_length);
389 UnicodeText &UnsafeTakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity);
390 UnicodeText &UnsafePointToUTF8(const char *utf8_buffer, int byte_length);
391 UnicodeText &UnsafeAppendUTF8(const char *utf8_buffer, int byte_length);
392 const_iterator UnsafeFind(const UnicodeText &look, const_iterator start_pos) const;
393};
394
395bool operator==(const UnicodeText &lhs, const UnicodeText &rhs);
396
397inline bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs) {
398 return !(lhs == rhs);
399}
400
401// UnicodeTextRange is a pair of iterators, useful for specifying text
402// segments. If the iterators are ==, the segment is empty.
403typedef pair<UnicodeText::const_iterator, UnicodeText::const_iterator> UnicodeTextRange;
404
406 return r.first == r.second;
407}
408
409// *************************** Utilities *************************
410
411// A factory function for creating a UnicodeText from a buffer of
412// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It
413// is an "owner.")
414//
415// Each byte that is structurally invalid will be replaced with a
416// space. Each codepoint that is interchange-invalid will also be
417// replaced with a space, even if the codepoint was represented with a
418// multibyte sequence in the UTF-8 data.
419//
420inline UnicodeText MakeUnicodeTextAcceptingOwnership(char *utf8_buffer, int byte_length,
421 int byte_capacity) {
422 return UnicodeText().TakeOwnershipOfUTF8(utf8_buffer, byte_length, byte_capacity);
423}
424
425// A factory function for creating a UnicodeText from a buffer of
426// UTF-8 data. The new UnicodeText does not take ownership of the
427// buffer. (It is an "alias.")
428//
430 int byte_length) {
431 return UnicodeText().PointToUTF8(utf8_buffer, byte_length);
432}
433
434// Create a UnicodeText from a UTF-8 string or buffer.
435//
436// If do_copy is true, then a copy of the string is made. The copy is
437// owned by the resulting UnicodeText object and will be freed when
438// the object is destroyed. This UnicodeText object is referred to
439// as an "owner."
440//
441// If do_copy is false, then no copy is made. The resulting
442// UnicodeText object does NOT take ownership of the string; in this
443// case, the lifetime of the UnicodeText object must not exceed the
444// lifetime of the string. This Unicodetext object is referred to as
445// an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership.
446//
447// If the input string does not contain valid UTF-8, then a copy is
448// made (as if do_copy were true) and coerced to valid UTF-8 by
449// replacing each invalid byte with a space.
450//
451inline UnicodeText UTF8ToUnicodeText(const char *utf8_buf, int len, bool do_copy) {
452 UnicodeText t;
453 if (do_copy) {
454 t.CopyUTF8(utf8_buf, len);
455 } else {
456 t.PointToUTF8(utf8_buf, len);
457 }
458 return t;
459}
460
461inline UnicodeText UTF8ToUnicodeText(const string &utf_string, bool do_copy) {
462 return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy);
463}
464
465inline UnicodeText UTF8ToUnicodeText(const char *utf8_buf, int len) {
466 return UTF8ToUnicodeText(utf8_buf, len, true);
467}
468inline UnicodeText UTF8ToUnicodeText(const string &utf8_string) {
469 return UTF8ToUnicodeText(utf8_string, true);
470}
471
472// Return a string containing the UTF-8 encoded version of all the
473// Unicode characters in t.
474inline string UnicodeTextToUTF8(const UnicodeText &t) {
475 return string(t.utf8_data(), t.utf8_length());
476}
477
// ArraySizeHelper is declared only so that arraysize can be defined: it
// takes a reference to an array of Count elements and returns a reference
// to a char array of the same length. It needs no implementation, because
// only its type is used (inside sizeof).
template <typename Element, size_t Count>
char (&ArraySizeHelper(Element (&array)[Count]))[Count];
#define arraysize(array) (sizeof(ArraySizeHelper(array)))
484
485// For debugging. Return a string of integers, written in uppercase
486// hex (%X), corresponding to the codepoints within the text. Each
487// integer is followed by a space. E.g., "61 62 6A 3005 ".
489
490#endif // UTIL_UTF8_PUBLIC_UNICODETEXT_H_
signed int char32
char(& ArraySizeHelper(T(&array)[N]))[N]
UnicodeText UTF8ToUnicodeText(const char *utf8_buf, int len, bool do_copy)
Definition: unicodetext.h:451
pair< UnicodeText::const_iterator, UnicodeText::const_iterator > UnicodeTextRange
Definition: unicodetext.h:403
string CodepointString(const UnicodeText &t)
UnicodeText MakeUnicodeTextAcceptingOwnership(char *utf8_buffer, int byte_length, int byte_capacity)
Definition: unicodetext.h:420
UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(const char *utf8_buffer, int byte_length)
Definition: unicodetext.h:429
bool operator==(const UnicodeText &lhs, const UnicodeText &rhs)
Definition: unicodetext.cc:377
string UnicodeTextToUTF8(const UnicodeText &t)
Definition: unicodetext.h:474
bool UnicodeTextRangeIsEmpty(const UnicodeTextRange &r)
Definition: unicodetext.h:405
bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs)
Definition: unicodetext.h:397
const char * p
STL namespace.
LIST last(LIST var_list)
Definition: oldlist.cpp:153
static string UTF8Substring(const const_iterator &first, const const_iterator &last)
Definition: unicodetext.cc:202
void push_back(char32 codepoint)
Definition: unicodetext.cc:357
const_iterator MakeIterator(const char *p) const
Definition: unicodetext.cc:484
UnicodeText & CopyUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:221
const_iterator find(const UnicodeText &look, const_iterator start_pos) const
Definition: unicodetext.cc:301
char32 value_type
Definition: unicodetext.h:120
UnicodeText & assign(const UnicodeText &src)
Definition: unicodetext.h:134
UnicodeText & append(ForwardIterator first, const ForwardIterator last)
Definition: unicodetext.h:163
friend class UnicodeTextUtils
Definition: unicodetext.h:349
UnicodeText & Copy(const UnicodeText &src)
Definition: unicodetext.cc:216
const_reverse_iterator rend() const
Definition: unicodetext.h:286
UnicodeText & PointTo(const UnicodeText &src)
Definition: unicodetext.cc:270
bool empty() const
Definition: unicodetext.h:146
string DebugString() const
int utf8_capacity() const
Definition: unicodetext.h:311
friend bool operator==(const UnicodeText &lhs, const UnicodeText &rhs)
Definition: unicodetext.cc:377
const_iterator end() const
Definition: unicodetext.cc:412
friend class const_iterator
Definition: unicodetext.h:348
UnicodeText & operator=(const UnicodeText &src)
Definition: unicodetext.cc:209
UnicodeText & PointToUTF8(const char *utf8_buffer, int byte_length)
Definition: unicodetext.cc:254
UnicodeText & TakeOwnershipOfUTF8(char *utf8_buffer, int byte_length, int byte_capacity)
Definition: unicodetext.cc:237
int utf8_length() const
Definition: unicodetext.h:308
int size() const
Definition: unicodetext.cc:373
bool HasReplacementChar() const
const_iterator begin() const
Definition: unicodetext.cc:408
friend bool operator!=(const UnicodeText &lhs, const UnicodeText &rhs)
Definition: unicodetext.h:397
const char * utf8_data() const
Definition: unicodetext.h:305
const_reverse_iterator rbegin() const
Definition: unicodetext.h:283
void clear()
Definition: unicodetext.cc:350
friend bool operator>(const CI &lhs, const CI &rhs)
Definition: unicodetext.h:222
friend bool operator<=(const CI &lhs, const CI &rhs)
Definition: unicodetext.h:225
const_iterator operator++(int)
Definition: unicodetext.h:201
friend class UTF8StateTableProperty
Definition: unicodetext.h:253
const_iterator operator--(int)
Definition: unicodetext.h:208
friend difference_type distance(const CI &first, const CI &last)
Definition: unicodetext.cc:44
std::bidirectional_iterator_tag iterator_category
Definition: unicodetext.h:185
const_iterator & operator++()
Definition: unicodetext.cc:443
const_iterator & operator--()
Definition: unicodetext.cc:448
int get_utf8(char *buf) const
Definition: unicodetext.cc:454
const_iterator & operator=(const const_iterator &other)
Definition: unicodetext.cc:402
friend bool operator>=(const CI &lhs, const CI &rhs)
Definition: unicodetext.h:228
string get_utf8_string() const
Definition: unicodetext.cc:468
const char * utf8_data() const
Definition: unicodetext.h:244
friend bool operator!=(const CI &lhs, const CI &rhs)
Definition: unicodetext.h:218
friend bool operator<(const CI &lhs, const CI &rhs)
Definition: unicodetext.cc:416
friend bool operator==(const CI &lhs, const CI &rhs)
Definition: unicodetext.h:215
const char * utf8_data() const
Definition: unicodetext.h:266
const_reverse_iterator(const_iterator it)
Definition: unicodetext.h:264