tesseract v5.3.3.20231005
unilib.cc
Go to the documentation of this file.
1
17// Author: sligocki@google.com (Shawn Ligocki)
18
19#include "util/utf8/unilib.h"
20
21#include "syntaxnet/base.h"
22#include "third_party/utf/utf.h"
23
24namespace UniLib {
25
26// Codepoints not allowed for interchange are:
27// C0 (ASCII) controls: U+0000 to U+001F, excluding Horizontal Tab
28// (HT, U+0009), Line-Feed (LF, U+000A), Form Feed (FF, U+000C) and
29// Carriage-Return (CR, U+000D)
30// Delete (DEL, U+007F) and C1 controls: U+0080 to U+009F
31// Surrogates: U+D800 to U+DFFF
32// Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx
34 return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) ||
35 (c >= 0x7F && c <= 0x9F) || (c >= 0xD800 && c <= 0xDFFF) ||
36 (c >= 0xFDD0 && c <= 0xFDEF) || (c & 0xFFFE) == 0xFFFE);
37}
38
39int SpanInterchangeValid(const char *begin, int byte_length) {
40 char32 rune;
41 const char *p = begin;
42 const char *end = begin + byte_length;
43 while (p < end) {
44 int bytes_consumed = charntorune(&rune, p, end - p);
45 // We want to accept Runeerror == U+FFFD as a valid char, but it is used
46 // by chartorune to indicate error. Luckily, the real codepoint is size 3
47 // while errors return bytes_consumed <= 1.
48 if ((rune == Runeerror && bytes_consumed <= 1) || !IsInterchangeValid(rune)) {
49 break; // Found
50 }
51 p += bytes_consumed;
52 }
53 return p - begin;
54}
55
56} // namespace UniLib
signed int char32
int charntorune(Rune *rune, const char *str, int length)
Definition: rune.c:64
@ Runeerror
Definition: utf.h:25
const char * p
Definition: unilib.cc:24
bool IsInterchangeValid(char32 c)
Definition: unilib.cc:33
int SpanInterchangeValid(const char *begin, int byte_length)
Definition: unilib.cc:39