tesseract v5.3.3.20231005
utf.h File Reference
#include <stdint.h>

Go to the source code of this file.

Typedefs

typedef signed int Rune
 

Enumerations

enum  {
  UTFmax = 4 , Runesync = 0x80 , Runeself = 0x80 , Runeerror = 0xFFFD ,
  Runemax = 0x10FFFF
}
 

Functions

int runetochar (char *s, const Rune *r)
 
int chartorune (Rune *r, const char *s)
 
int charntorune (Rune *r, const char *s, int n)
 
int isvalidcharntorune (const char *str, int n, Rune *r, int *consumed)
 
int runelen (Rune r)
 
int runenlen (const Rune *r, int n)
 
int fullrune (const char *s, int n)
 
int utflen (const char *s)
 
int utfnlen (const char *s, long n)
 
const char * utfrune (const char *s, Rune r)
 
const char * utfrrune (const char *s, Rune r)
 
const char * utfutf (const char *s1, const char *s2)
 
char * utfecpy (char *s1, char *es1, const char *s2)
 
Rune * runestrcat (Rune *s1, const Rune *s2)
 
Rune * runestrncat (Rune *s1, const Rune *s2, long n)
 
const Rune * runestrchr (const Rune *s, Rune c)
 
int runestrcmp (const Rune *s1, const Rune *s2)
 
int runestrncmp (const Rune *s1, const Rune *s2, long n)
 
Rune * runestrcpy (Rune *s1, const Rune *s2)
 
Rune * runestrncpy (Rune *s1, const Rune *s2, long n)
 
Rune * runestrecpy (Rune *s1, Rune *es1, const Rune *s2)
 
Rune * runestrdup (const Rune *s)
 
const Rune * runestrrchr (const Rune *s, Rune c)
 
long runestrlen (const Rune *s)
 
const Rune * runestrstr (const Rune *s1, const Rune *s2)
 
Rune toupperrune (Rune r)
 
Rune tolowerrune (Rune r)
 
Rune totitlerune (Rune r)
 
int isupperrune (Rune r)
 
int islowerrune (Rune r)
 
int istitlerune (Rune r)
 
int isalpharune (Rune r)
 
int isdigitrune (Rune r)
 
int isideographicrune (Rune r)
 
int isspacerune (Rune r)
 

Typedef Documentation

◆ Rune

typedef signed int Rune

Definition at line 19 of file utf.h.

Enumeration Type Documentation

◆ anonymous enum

anonymous enum
Enumerator
UTFmax 
Runesync 
Runeself 
Runeerror 
Runemax 

Definition at line 21 of file utf.h.

21 {
22 UTFmax = 4, /* maximum bytes per rune */
23 Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
24 Runeself = 0x80, /* rune and UTF sequences are the same (<) */
25 Runeerror = 0xFFFD, /* decoding error in UTF */
26 Runemax = 0x10FFFF, /* maximum rune value */
27 };
@ Runemax
Definition: utf.h:26
@ Runesync
Definition: utf.h:23
@ UTFmax
Definition: utf.h:22
@ Runeerror
Definition: utf.h:25
@ Runeself
Definition: utf.h:24

Function Documentation

◆ charntorune()

int charntorune ( Rune *  r,
const char *  s,
int  n 
)

Definition at line 64 of file rune.c.

64 {
65 int c, c1, c2, c3;
66 long l;
67
68 /* When we're not allowed to read anything */
69 if (length <= 0) {
70 goto badlen;
71 }
72
73 /*
74 * one character sequence (7-bit value)
75 * 00000-0007F => T1
76 */
77 c = *(uchar *)str;
78 if (c < Tx) {
79 *rune = c;
80 return 1;
81 }
82
83 // If we can't read more than one character we must stop
84 if (length <= 1) {
85 goto badlen;
86 }
87
88 /*
89 * two character sequence (11-bit value)
90 * 0080-07FF => T2 Tx
91 */
92 c1 = *(uchar *)(str + 1) ^ Tx;
93 if (c1 & Testx)
94 goto bad;
95 if (c < T3) {
96 if (c < T2)
97 goto bad;
98 l = ((c << Bitx) | c1) & Rune2;
99 if (l <= Rune1)
100 goto bad;
101 *rune = l;
102 return 2;
103 }
104
105 // If we can't read more than two characters we must stop
106 if (length <= 2) {
107 goto badlen;
108 }
109
110 /*
111 * three character sequence (16-bit value)
112 * 0800-FFFF => T3 Tx Tx
113 */
114 c2 = *(uchar *)(str + 2) ^ Tx;
115 if (c2 & Testx)
116 goto bad;
117 if (c < T4) {
118 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
119 if (l <= Rune2)
120 goto bad;
121 *rune = l;
122 return 3;
123 }
124
125 if (length <= 3)
126 goto badlen;
127
128 /*
129 * four character sequence (21-bit value)
130 * 10000-1FFFFF => T4 Tx Tx Tx
131 */
132 c3 = *(uchar *)(str + 3) ^ Tx;
133 if (c3 & Testx)
134 goto bad;
135 if (c < T5) {
136 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
137 if (l <= Rune3)
138 goto bad;
139 if (l > Runemax)
140 goto bad;
141 *rune = l;
142 return 4;
143 }
144
145 // Support for 5-byte or longer UTF-8 would go here, but
146 // since we don't have that, we'll just fall through to bad.
147
148 /*
149 * bad decoding
150 */
151 bad:
152 *rune = Bad;
153 return 1;
154 badlen:
155 *rune = Bad;
156 return 0;
157}
unsigned char uchar
Definition: utfdef.h:8
@ T4
Definition: rune.c:31
@ Testx
Definition: rune.c:41
@ T3
Definition: rune.c:30
@ Rune4
Definition: rune.c:37
@ Rune3
Definition: rune.c:36
@ T5
Definition: rune.c:32
@ T2
Definition: rune.c:29
@ Rune2
Definition: rune.c:35
@ Rune1
Definition: rune.c:34
@ Tx
Definition: rune.c:28
@ Bad
Definition: rune.c:43
@ Bitx
Definition: rune.c:21

◆ chartorune()

int chartorune ( Rune *  r,
const char *  s 
)

Definition at line 163 of file rune.c.

163 {
164 int c, c1, c2, c3;
165 long l;
166
167 /*
168 * one character sequence
169 * 00000-0007F => T1
170 */
171 c = *(uchar *)str;
172 if (c < Tx) {
173 *rune = c;
174 return 1;
175 }
176
177 /*
178 * two character sequence
179 * 0080-07FF => T2 Tx
180 */
181 c1 = *(uchar *)(str + 1) ^ Tx;
182 if (c1 & Testx)
183 goto bad;
184 if (c < T3) {
185 if (c < T2)
186 goto bad;
187 l = ((c << Bitx) | c1) & Rune2;
188 if (l <= Rune1)
189 goto bad;
190 *rune = l;
191 return 2;
192 }
193
194 /*
195 * three character sequence
196 * 0800-FFFF => T3 Tx Tx
197 */
198 c2 = *(uchar *)(str + 2) ^ Tx;
199 if (c2 & Testx)
200 goto bad;
201 if (c < T4) {
202 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
203 if (l <= Rune2)
204 goto bad;
205 *rune = l;
206 return 3;
207 }
208
209 /*
210 * four character sequence (21-bit value)
211 * 10000-1FFFFF => T4 Tx Tx Tx
212 */
213 c3 = *(uchar *)(str + 3) ^ Tx;
214 if (c3 & Testx)
215 goto bad;
216 if (c < T5) {
217 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
218 if (l <= Rune3)
219 goto bad;
220 if (l > Runemax)
221 goto bad;
222 *rune = l;
223 return 4;
224 }
225
226 /*
227 * Support for 5-byte or longer UTF-8 would go here, but
228 * since we don't have that, we'll just fall through to bad.
229 */
230
231 /*
232 * bad decoding
233 */
234 bad:
235 *rune = Bad;
236 return 1;
237}

◆ fullrune()

int fullrune ( const char *  s,
int  n 
)

Definition at line 326 of file rune.c.

326 {
327 if (n > 0) {
328 int c = *(uchar *)str;
329 if (c < Tx)
330 return 1;
331 if (n > 1) {
332 if (c < T3)
333 return 1;
334 if (n > 2) {
335 if (c < T4 || n > 3)
336 return 1;
337 }
338 }
339 }
340 return 0;
341}

◆ isalpharune()

int isalpharune ( Rune  r)

◆ isdigitrune()

int isdigitrune ( Rune  r)

◆ isideographicrune()

int isideographicrune ( Rune  r)

◆ islowerrune()

int islowerrune ( Rune  r)

◆ isspacerune()

int isspacerune ( Rune  r)

◆ istitlerune()

int istitlerune ( Rune  r)

◆ isupperrune()

int isupperrune ( Rune  r)

◆ isvalidcharntorune()

int isvalidcharntorune ( const char *  str,
int  n,
Rune *  r,
int *  consumed 
)

Definition at line 239 of file rune.c.

239 {
240 *consumed = charntorune(rune, str, length);
241 return *rune != Runeerror || *consumed == 3;
242}
int charntorune(Rune *rune, const char *str, int length)
Definition: rune.c:64

◆ runelen()

int runelen ( Rune  r)

Definition at line 299 of file rune.c.

299 {
300 char str[10];
301
302 return runetochar(str, &rune);
303}
int runetochar(char *str, const Rune *rune)
Definition: rune.c:244

◆ runenlen()

int runenlen ( const Rune *  r,
int  n 
)

Definition at line 305 of file rune.c.

305 {
306 int nb;
307 ulong c; /* Rune is signed, so use unsigned for range check. */
308
309 nb = 0;
310 while (nrune--) {
311 c = *r++;
312 if (c <= Rune1)
313 nb++;
314 else if (c <= Rune2)
315 nb += 2;
316 else if (c <= Rune3)
317 nb += 3;
318 else if (c <= Runemax)
319 nb += 4;
320 else
321 nb += 3; /* Runeerror = 0xFFFD, see runetochar */
322 }
323 return nb;
324}
unsigned long ulong
Definition: utfdef.h:11

◆ runestrcat()

Rune * runestrcat ( Rune *  s1,
const Rune *  s2 
)

◆ runestrchr()

const Rune * runestrchr ( const Rune *  s,
Rune  c 
)

◆ runestrcmp()

int runestrcmp ( const Rune *  s1,
const Rune *  s2 
)

◆ runestrcpy()

Rune * runestrcpy ( Rune *  s1,
const Rune *  s2 
)

◆ runestrdup()

Rune * runestrdup ( const Rune *  s)

◆ runestrecpy()

Rune * runestrecpy ( Rune *  s1,
Rune *  es1,
const Rune *  s2 
)

◆ runestrlen()

long runestrlen ( const Rune *  s)

◆ runestrncat()

Rune * runestrncat ( Rune *  s1,
const Rune *  s2,
long  n 
)

◆ runestrncmp()

int runestrncmp ( const Rune *  s1,
const Rune *  s2,
long  n 
)

◆ runestrncpy()

Rune * runestrncpy ( Rune *  s1,
const Rune *  s2,
long  n 
)

◆ runestrrchr()

const Rune * runestrrchr ( const Rune *  s,
Rune  c 
)

◆ runestrstr()

const Rune * runestrstr ( const Rune *  s1,
const Rune *  s2 
)

◆ runetochar()

int runetochar ( char *  s,
const Rune *  r 
)

Definition at line 244 of file rune.c.

244 {
245 /* Runes are signed, so convert to unsigned for range check. */
246 unsigned long c;
247
248 /*
249 * one character sequence
250 * 00000-0007F => 00-7F
251 */
252 c = *rune;
253 if (c <= Rune1) {
254 str[0] = c;
255 return 1;
256 }
257
258 /*
259 * two character sequence
260 * 0080-07FF => T2 Tx
261 */
262 if (c <= Rune2) {
263 str[0] = T2 | (c >> 1 * Bitx);
264 str[1] = Tx | (c & Maskx);
265 return 2;
266 }
267
268 /*
269 * If the Rune is out of range, convert it to the error rune.
270 * Do this test here because the error rune encodes to three bytes.
271 * Doing it earlier would duplicate work, since an out of range
272 * Rune wouldn't have fit in one or two bytes.
273 */
274 if (c > Runemax)
275 c = Runeerror;
276
277 /*
278 * three character sequence
279 * 0800-FFFF => T3 Tx Tx
280 */
281 if (c <= Rune3) {
282 str[0] = T3 | (c >> 2 * Bitx);
283 str[1] = Tx | ((c >> 1 * Bitx) & Maskx);
284 str[2] = Tx | (c & Maskx);
285 return 3;
286 }
287
288 /*
289 * four character sequence (21-bit value)
290 * 10000-1FFFFF => T4 Tx Tx Tx
291 */
292 str[0] = T4 | (c >> 3 * Bitx);
293 str[1] = Tx | ((c >> 2 * Bitx) & Maskx);
294 str[2] = Tx | ((c >> 1 * Bitx) & Maskx);
295 str[3] = Tx | (c & Maskx);
296 return 4;
297}
@ Maskx
Definition: rune.c:40

◆ tolowerrune()

Rune tolowerrune ( Rune  r)

◆ totitlerune()

Rune totitlerune ( Rune  r)

◆ toupperrune()

Rune toupperrune ( Rune  r)

◆ utfecpy()

char * utfecpy ( char *  s1,
char *  es1,
const char *  s2 
)

◆ utflen()

int utflen ( const char *  s)

◆ utfnlen()

int utfnlen ( const char *  s,
long  n 
)

◆ utfrrune()

const char * utfrrune ( const char *  s,
Rune  r 
)

◆ utfrune()

const char * utfrune ( const char *  s,
Rune  r 
)

◆ utfutf()

const char * utfutf ( const char *  s1,
const char *  s2 
)