tesseract v5.3.3.20231005
unichar.cpp
Go to the documentation of this file.
1
2// File: unichar.cpp
3// Description: Unicode character/ligature class.
4// Author: Ray Smith
5//
6// (C) Copyright 2006, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#include <tesseract/unichar.h>
20#include "errcode.h"
21#include "tprintf.h"
22
23#define UNI_MAX_LEGAL_UTF32 0x0010FFFF
24
25namespace tesseract {
26
27// Construct from a utf8 string. If len<0 then the string is null terminated.
28// If the string is too long to fit in the UNICHAR then it takes only what
29// will fit. Checks for illegal input and stops at an illegal sequence.
30// The resulting UNICHAR may be empty.
31UNICHAR::UNICHAR(const char *utf8_str, int len) {
32 int total_len = 0;
33 int step = 0;
34 if (len < 0) {
35 for (len = 0; len < UNICHAR_LEN && utf8_str[len] != 0; ++len) {
36 ;
37 }
38 }
39 for (total_len = 0; total_len < len; total_len += step) {
40 step = utf8_step(utf8_str + total_len);
41 if (total_len + step > UNICHAR_LEN) {
42 break; // Too long.
43 }
44 if (step == 0) {
45 break; // Illegal first byte.
46 }
47 int i;
48 for (i = 1; i < step; ++i) {
49 if ((utf8_str[total_len + i] & 0xc0) != 0x80) {
50 break;
51 }
52 }
53 if (i < step) {
54 break; // Illegal surrogate
55 }
56 }
57 memcpy(chars, utf8_str, total_len);
58 if (total_len < UNICHAR_LEN) {
59 chars[UNICHAR_LEN - 1] = total_len;
60 while (total_len < UNICHAR_LEN - 1) {
61 chars[total_len++] = 0;
62 }
63 }
64}
65
66// Construct from a single UCS4 character. Illegal values are ignored,
67// resulting in an empty UNICHAR.
68UNICHAR::UNICHAR(int unicode) {
69 const int bytemask = 0xBF;
70 const int bytemark = 0x80;
71
72 if (unicode < 0x80) {
73 chars[UNICHAR_LEN - 1] = 1;
74 chars[2] = 0;
75 chars[1] = 0;
76 chars[0] = static_cast<char>(unicode);
77 } else if (unicode < 0x800) {
78 chars[UNICHAR_LEN - 1] = 2;
79 chars[2] = 0;
80 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
81 unicode >>= 6;
82 chars[0] = static_cast<char>(unicode | 0xc0);
83 } else if (unicode < 0x10000) {
84 chars[UNICHAR_LEN - 1] = 3;
85 chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
86 unicode >>= 6;
87 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
88 unicode >>= 6;
89 chars[0] = static_cast<char>(unicode | 0xe0);
90 } else if (unicode <= UNI_MAX_LEGAL_UTF32) {
91 chars[UNICHAR_LEN - 1] = 4;
92 chars[3] = static_cast<char>((unicode | bytemark) & bytemask);
93 unicode >>= 6;
94 chars[2] = static_cast<char>((unicode | bytemark) & bytemask);
95 unicode >>= 6;
96 chars[1] = static_cast<char>((unicode | bytemark) & bytemask);
97 unicode >>= 6;
98 chars[0] = static_cast<char>(unicode | 0xf0);
99 } else {
100 memset(chars, 0, UNICHAR_LEN);
101 }
102}
103
104// Get the first character as UCS-4.
106 static const int utf8_offsets[5] = {0, 0, 0x3080, 0xE2080, 0x3C82080};
107 int uni = 0;
108 int len = utf8_step(chars);
109 const char *src = chars;
110
111 switch (len) {
112 default:
113 break;
114 case 4:
115 uni += static_cast<unsigned char>(*src++);
116 uni <<= 6;
117 // Fall through.
118 case 3:
119 uni += static_cast<unsigned char>(*src++);
120 uni <<= 6;
121 // Fall through.
122 case 2:
123 uni += static_cast<unsigned char>(*src++);
124 uni <<= 6;
125 // Fall through.
126 case 1:
127 uni += static_cast<unsigned char>(*src++);
128 }
129 uni -= utf8_offsets[len];
130 return uni;
131}
132
133// Get a terminated UTF8 string: Must delete[] it after use.
134char *UNICHAR::utf8_str() const {
135 int len = utf8_len();
136 char *str = new char[len + 1];
137 memcpy(str, chars, len);
138 str[len] = 0;
139 return str;
140}
141
142// Get the number of bytes in the first character of the given utf8 string.
143int UNICHAR::utf8_step(const char *utf8_str) {
144 static const char utf8_bytes[256] = {
145 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
146 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
147 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
148 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
149 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
150 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
151 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
152 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
153 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0};
154
155 return utf8_bytes[static_cast<unsigned char>(*utf8_str)];
156}
157
159 ASSERT_HOST(it_ != nullptr);
160 int step = utf8_step(it_);
161 if (step == 0) {
162 tprintf("ERROR: Illegal UTF8 encountered.\n");
163 for (int i = 0; i < 5 && it_[i] != '\0'; ++i) {
164 tprintf("Index %d char = 0x%x\n", i, it_[i]);
165 }
166 step = 1;
167 }
168 it_ += step;
169 return *this;
170}
171
173 ASSERT_HOST(it_ != nullptr);
174 const int len = utf8_step(it_);
175 if (len == 0) {
176 tprintf("WARNING: Illegal UTF8 encountered\n");
177 return ' ';
178 }
179 UNICHAR uch(it_, len);
180 return uch.first_uni();
181}
182
183int UNICHAR::const_iterator::get_utf8(char *utf8_output) const {
184 ASSERT_HOST(it_ != nullptr);
185 const int len = utf8_step(it_);
186 if (len == 0) {
187 tprintf("WARNING: Illegal UTF8 encountered\n");
188 utf8_output[0] = ' ';
189 return 1;
190 }
191 strncpy(utf8_output, it_, len);
192 return len;
193}
194
196 ASSERT_HOST(it_ != nullptr);
197 const int len = utf8_step(it_);
198 if (len == 0) {
199 tprintf("WARNING: Illegal UTF8 encountered\n");
200 return 1;
201 }
202 return len;
203}
204
206 return utf8_step(it_) > 0;
207}
208
211}
212
214 return UNICHAR::const_iterator(utf8_str + len);
215}
216
217// Converts a utf-8 string to a vector of unicodes.
218// Returns an empty vector if the input contains invalid UTF-8.
219/* static */
220std::vector<char32> UNICHAR::UTF8ToUTF32(const char *utf8_str) {
221 const int utf8_length = strlen(utf8_str);
222 std::vector<char32> unicodes;
223 unicodes.reserve(utf8_length);
224 const_iterator end_it(end(utf8_str, utf8_length));
225 for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) {
226 if (it.is_legal()) {
227 unicodes.push_back(*it);
228 } else {
229 unicodes.clear();
230 return unicodes;
231 }
232 }
233 return unicodes;
234}
235
236// Returns an empty string if the input contains an invalid unicode.
237std::string UNICHAR::UTF32ToUTF8(const std::vector<char32> &str32) {
238 std::string utf8_str;
239 for (char32 ch : str32) {
240 UNICHAR uni_ch(ch);
241 int step;
242 if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {
243 utf8_str.append(uni_ch.utf8(), step);
244 } else {
245 return "";
246 }
247 }
248 return utf8_str;
249}
250
251} // namespace tesseract
#define UNICHAR_LEN
Definition: unichar.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:54
#define UNI_MAX_LEGAL_UTF32
Definition: unichar.cpp:23
void tprintf(const char *format,...)
Definition: tprintf.cpp:41
signed int char32
Definition: unichar.h:49
char * utf8_str() const
Definition: unichar.cpp:134
int first_uni() const
Definition: unichar.cpp:105
static const_iterator begin(const char *utf8_str, int byte_length)
Definition: unichar.cpp:209
static std::vector< char32 > UTF8ToUTF32(const char *utf8_str)
Definition: unichar.cpp:220
int utf8_len() const
Definition: unichar.h:75
static std::string UTF32ToUTF8(const std::vector< char32 > &str32)
Definition: unichar.cpp:237
const char * utf8() const
Definition: unichar.h:81
static const_iterator end(const char *utf8_str, int byte_length)
Definition: unichar.cpp:213
static int utf8_step(const char *utf8_str)
Definition: unichar.cpp:143
const_iterator & operator++()
Definition: unichar.cpp:158
int get_utf8(char *buf) const
Definition: unichar.cpp:183