tesseract v5.3.3.20231005
tesseract::UNICHAR Class Reference

#include <unichar.h>

Classes

class  const_iterator
 

Public Member Functions

 UNICHAR ()
 
 UNICHAR (const char *utf8_str, int len)
 
 UNICHAR (int unicode)
 
int first_uni () const
 
int utf8_len () const
 
const char * utf8 () const
 
char * utf8_str () const
 

Static Public Member Functions

static int utf8_step (const char *utf8_str)
 
static const_iterator begin (const char *utf8_str, int byte_length)
 
static const_iterator end (const char *utf8_str, int byte_length)
 
static std::vector< char32 > UTF8ToUTF32 (const char *utf8_str)
 
static std::string UTF32ToUTF8 (const std::vector< char32 > &str32)
 

Detailed Description

Definition at line 55 of file unichar.h.

Constructor & Destructor Documentation

◆ UNICHAR() [1/3]

tesseract::UNICHAR::UNICHAR ( )
inline

Definition at line 57 of file unichar.h.

57 {
58 memset(chars, 0, UNICHAR_LEN);
59 }
#define UNICHAR_LEN
Definition: unichar.h:31

◆ UNICHAR() [2/3]

tesseract::UNICHAR::UNICHAR ( const char *  utf8_str,
int  len 
)

Definition at line 31 of file unichar.cpp.

31 {
32 int total_len = 0;
33 int step = 0;
34 if (len < 0) {
35 for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) {
36 ;
37 }
38 }
39 for (total_len = 0; total_len < len; total_len += step) {
40 step = utf8_step(utf8_str + total_len);
41 if (total_len + step > UNICHAR_LEN) {
42 break; // Too long.
43 }
44 if (step == 0) {
45 break; // Illegal first byte.
46 }
47 int i;
48 for (i = 1; i < step; ++i) {
49 if ((utf8_str[total_len + i] & 0xc0) != 0x80) {
50 break;
51 }
52 }
53 if (i < step) {
54 break; // Illegal surrogate
55 }
56 }
57 memcpy(chars, utf8_str, total_len);
58 if (total_len < UNICHAR_LEN) {
59 chars[UNICHAR_LEN - 1] = total_len;
60 while (total_len < UNICHAR_LEN - 1) {
61 chars[total_len++] = 0;
62 }
63 }
64}
char * utf8_str() const
Definition: unichar.cpp:134
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:143

◆ UNICHAR() [3/3]

tesseract::UNICHAR::UNICHAR ( int  unicode)
explicit

Definition at line 68 of file unichar.cpp.

68 {
69 const int bytemask = 0xBF;
70 const int bytemark = 0x80;
71
72 if (unicode < 0x80) {
73 chars[UNICHAR_LEN - 1] = 1;
74 chars[2] = 0;
75 chars[1] = 0;
76 chars[0] = static_cast<char>(unicode);
77 } else if (unicode < 0x800) {
78 chars[UNICHAR_LEN - 1] = 2;
79 chars[2] = 0;
80 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
81 unicode >>= 6;
82 chars[0] = static_cast<char>(unicode | 0xc0);
83 } else if (unicode < 0x10000) {
84 chars[UNICHAR_LEN - 1] = 3;
85 chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
86 unicode >>= 6;
87 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
88 unicode >>= 6;
89 chars[0] = static_cast<char>(unicode | 0xe0);
90 } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
91 chars[UNICHAR_LEN - 1] = 4;
92 chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
93 unicode >>= 6;
94 chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
95 unicode >>= 6;
96 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
97 unicode >>= 6;
98 chars[0] = static_cast<char>(unicode | 0xf0);
99 } else {
100 memset(chars, 0, UNICHAR_LEN);
101 }
102}
#define UNI_MAX_LEGAL_UTF32
Definition: unichar.cpp:23

Member Function Documentation

◆ begin()

UNICHAR::const_iterator tesseract::UNICHAR::begin ( const char *  utf8_str,
int  byte_length 
)
static

Definition at line 209 of file unichar.cpp.

209 {
210 return UNICHAR::const_iterator(utf8_str);
211}

◆ end()

UNICHAR::const_iterator tesseract::UNICHAR::end ( const char *  utf8_str,
int  byte_length 
)
static

Definition at line 213 of file unichar.cpp.

213 {
214 return UNICHAR::const_iterator(utf8_str + len);
215}

◆ first_uni()

int tesseract::UNICHAR::first_uni ( ) const

Definition at line 105 of file unichar.cpp.

105 {
106 static const int utf8_offsets[5] = {0, 0, 0x3080, 0xE2080, 0x3C82080};
107 int uni = 0;
108 int len = utf8_step(chars);
109 const char *src = chars;
110
111 switch (len) {
112 default:
113 break;
114 case 4:
115 uni += static_cast<unsigned char>(*src++);
116 uni <<= 6;
117 // Fall through.
118 case 3:
119 uni += static_cast<unsigned char>(*src++);
120 uni <<= 6;
121 // Fall through.
122 case 2:
123 uni += static_cast<unsigned char>(*src++);
124 uni <<= 6;
125 // Fall through.
126 case 1:
127 uni += static_cast<unsigned char>(*src++);
128 }
129 uni -= utf8_offsets[len];
130 return uni;
131}

◆ UTF32ToUTF8()

std::string tesseract::UNICHAR::UTF32ToUTF8 ( const std::vector< char32 > &  str32)
static

Definition at line 237 of file unichar.cpp.

237 {
238 std::string utf8_str;
239 for (char32 ch : str32) {
240 UNICHAR uni_ch(ch);
241 int step;
242 if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {
243 utf8_str.append(uni_ch.utf8(), step);
244 } else {
245 return "";
246 }
247 }
248 return utf8_str;
249}
signed int char32

◆ utf8()

const char * tesseract::UNICHAR::utf8 ( ) const
inline

Definition at line 81 of file unichar.h.

81 {
82 return chars;
83 }

◆ utf8_len()

int tesseract::UNICHAR::utf8_len ( ) const
inline

Definition at line 75 of file unichar.h.

75 {
76 int len = chars[UNICHAR_LEN - 1];
77 return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN;
78 }

◆ utf8_step()

int tesseract::UNICHAR::utf8_step ( const char *  utf8_str)
static

Definition at line 143 of file unichar.cpp.

143 {
144 static const char utf8_bytes[256] = {
145 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
146 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
147 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
148 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
149 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
150 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
151 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
152 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
153 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0};
154
155 return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
156}

◆ utf8_str()

char * tesseract::UNICHAR::utf8_str ( ) const

Definition at line 134 of file unichar.cpp.

134 {
135 int len = utf8_len();
136 char *str = new char[len + 1];
137 memcpy(str, chars, len);
138 str[len] = 0;
139 return str;
140}
int utf8_len() const
Definition: unichar.h:75

◆ UTF8ToUTF32()

std::vector< char32 > tesseract::UNICHAR::UTF8ToUTF32 ( const char *  utf8_str)
static

Definition at line 220 of file unichar.cpp.

220 {
221 const int utf8_length = strlen(utf8_str);
222 std::vector<char32> unicodes;
223 unicodes.reserve(utf8_length);
224 const_iterator end_it(end(utf8_str, utf8_length));
225 for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
226 if (it.is_legal()) {
227 unicodes.push_back(*it);
228 } else {
229 unicodes.clear();
230 return unicodes;
231 }
232 }
233 return unicodes;
234}
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:209
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:213

The documentation for this class was generated from the following files: