tesseract v5.3.3.20231005
UniLib Namespace Reference

Functions

bool IsInterchangeValid (char32 c)
 
int SpanInterchangeValid (const char *begin, int byte_length)
 
int SpanInterchangeValid (const std::string &src)
 
bool IsInterchangeValid (const char *src, int byte_length)
 
bool IsInterchangeValid (const std::string &src)
 
bool IsValidCodepoint (char32 c)
 
int OneCharLen (const char *src)
 
bool IsTrailByte (char x)
 

Detailed Description

Copyright 2010 Google Inc.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Function Documentation

◆ IsInterchangeValid() [1/3]

bool UniLib::IsInterchangeValid ( char32  c)

Definition at line 33 of file unilib.cc.

33 {
34 return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) ||
35 (c >= 0x7F && c <= 0x9F) || (c >= 0xD800 && c <= 0xDFFF) ||
36 (c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE);
37}

◆ IsInterchangeValid() [2/3]

bool UniLib::IsInterchangeValid ( const char *  src,
int  byte_length 
)
inline

Definition at line 54 of file unilib.h.

54 {
55 return (byte_length == SpanInterchangeValid(src, byte_length));
56}
int SpanInterchangeValid(const char *begin, int byte_length)
Definition: unilib.cc:39

◆ IsInterchangeValid() [3/3]

bool UniLib::IsInterchangeValid ( const std::string &  src)
inline

Definition at line 57 of file unilib.h.

57 {
58 return IsInterchangeValid(src.data(), src.size());
59}
bool IsInterchangeValid(const char *src, int byte_length)
Definition: unilib.h:54

◆ IsTrailByte()

bool UniLib::IsTrailByte ( char  x)
inline

Definition at line 58 of file unilib_utf8_utils.h.

58 {
59 // return (x & 0xC0) == 0x80;
60 // Since trail bytes are always in [0x80, 0xBF], we can optimize:
61 return static_cast<signed char>(x) < -0x40;
62}

◆ IsValidCodepoint()

bool UniLib::IsValidCodepoint ( char32  c)
inline

Definition at line 31 of file unilib_utf8_utils.h.

31 {
32 return (static_cast<uint32_t>(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF);
33}

◆ OneCharLen()

int UniLib::OneCharLen ( const char *  src)
inline

Definition at line 53 of file unilib_utf8_utils.h.

53 {
54 return "\1\1\1\1\1\1\1\1\1\1\1\1\2\2\3\4"[(*src & 0xFF) >> 4];
55}

◆ SpanInterchangeValid() [1/2]

int UniLib::SpanInterchangeValid ( const char *  begin,
int  byte_length 
)

Definition at line 39 of file unilib.cc.

39 {
40 char32 rune;
41 const char *p = begin;
42 const char *end = begin + byte_length;
43 while (p < end) {
44 int bytes_consumed = charntorune(&rune, p, end - p);
45 // We want to accept Runeerror == U+FFFD as a valid char, but it is used
46 // by chartorune to indicate error. Luckily, the real codepoint is size 3
47 // while errors return bytes_consumed <= 1.
48 if ((rune == Runeerror && bytes_consumed <= 1) || !IsInterchangeValid(rune)) {
49 break; // Found
50 }
51 p += bytes_consumed;
52 }
53 return p - begin;
54}
signed int char32
int charntorune(Rune *rune, const char *str, int length)
Definition: rune.c:64
@ Runeerror
Definition: utf.h:25
const char * p
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33

◆ SpanInterchangeValid() [2/2]

int UniLib::SpanInterchangeValid ( const std::string &  src)
inline

Definition at line 46 of file unilib.h.

46 {
47 return SpanInterchangeValid(src.data(), src.size());
48}