tesseract v5.3.3.20231005
unilib.h
Go to the documentation of this file.
1
17// Routines to do manipulation of Unicode characters or text
18//
19// The StructurallyValid routines accept buffers of arbitrary bytes.
20// For CoerceToStructurallyValid(), the input buffer and output buffers may
21// point to exactly the same memory.
22//
23// In all other cases, the UTF-8 string must be structurally valid and
24// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF.
25// Debug builds take a fatal error for invalid UTF-8 input.
26// The input and output buffers may not overlap at all.
27//
28// The char32 routines are here only for convenience; they convert to UTF-8
29// internally and use the UTF-8 routines.
30
31#ifndef UTIL_UTF8_UNILIB_H__
32#define UTIL_UTF8_UNILIB_H__
33
34#include <string>
35#include "syntaxnet/base.h"
36
37// We export OneCharLen, IsValidCodepoint, and IsTrailByte from here,
38// but they are defined in unilib_utf8_utils.h.
39//#include "util/utf8/public/unilib_utf8_utils.h" // IWYU pragma: export
40
41namespace UniLib {
42
43// Returns the length in bytes of the prefix of src that is all
44// interchange valid UTF-8
45int SpanInterchangeValid(const char *src, int byte_length);
46inline int SpanInterchangeValid(const std::string &src) {
47 return SpanInterchangeValid(src.data(), src.size());
48}
49
50// Returns true if the source is all interchange valid UTF-8
51// "Interchange valid" is a stronger than structurally valid --
52// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
53bool IsInterchangeValid(char32 codepoint);
54inline bool IsInterchangeValid(const char *src, int byte_length) {
55 return (byte_length == SpanInterchangeValid(src, byte_length));
56}
57inline bool IsInterchangeValid(const std::string &src) {
58 return IsInterchangeValid(src.data(), src.size());
59}
60
61} // namespace UniLib
62
63#endif // UTIL_UTF8_UNILIB_H__
signed int char32
Definition: unilib.cc:24
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33
int SpanInterchangeValid(const char *begin, int byte_length)
Definition: unilib.cc:39