tesseract v5.3.3.20231005
utf.h
Go to the documentation of this file.
1/*
2 * The authors of this software are Rob Pike and Ken Thompson.
3 * Copyright (c) 2002 by Lucent Technologies.
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose without fee is hereby granted, provided that this entire notice
6 * is included in all copies of any software which is or includes a copy
7 * or modification of this software and in all copies of the supporting
8 * documentation for such software.
9 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
10 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
11 * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
12 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
13 */
14#ifndef _UTFH_
15#define _UTFH_ 1
16
17#include <stdint.h>
18
19typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/
20
21enum {
22 UTFmax = 4, /* maximum number of bytes in the UTF encoding of one rune */
23 Runesync = 0x80, /* byte values less than this (<) cannot represent part of a UTF sequence */
24 Runeself = 0x80, /* for values less than this (<), a rune and its UTF sequence are the same */
25 Runeerror = 0xFFFD, /* rune value reported for a decoding error in UTF */
26 Runemax = 0x10FFFF, /* maximum rune value */
27};
28
29#ifdef __cplusplus
30extern "C" {
31#endif
32
33/*
34 * rune routines
35 */
36
37/*
38 * These routines were written by Rob Pike and Ken Thompson
39 * and first appeared in Plan 9.
40 * SEE ALSO
41 * utf (7)
42 * tcs (1)
43 */
44
45// runetochar copies (encodes) one rune, pointed to by r, to at most
46// UTFmax bytes starting at s and returns the number of bytes generated.
47
48int runetochar(char *s, const Rune *r);
49
50// chartorune copies (decodes) at most UTFmax bytes starting at s to
51// one rune, pointed to by r, and returns the number of bytes consumed.
52// If the input is not exactly in UTF format, chartorune will set *r
53// to Runeerror and return 1.
54//
55// Note: There is no special case for a "null-terminated" string. A
56// string whose first byte has the value 0 is the UTF8 encoding of the
57// Unicode value 0 (i.e., ASCII NUL). A byte value of 0 is illegal
58// anywhere else in a UTF sequence.
59
60int chartorune(Rune *r, const char *s);
61
62// charntorune is like chartorune, except that it will access at most
63// n bytes of s. If the UTF sequence is incomplete within n bytes,
64// charntorune will set *r to Runeerror and return 0. If it is complete
65// but not in UTF format, it will set *r to Runeerror and return 1.
66//
67// Added 2004-09-24 by Wei-Hwa Huang
68
69int charntorune(Rune *r, const char *s, int n);
70
71// isvalidcharntorune(str, n, r, consumed)
72// is a convenience function that calls "*consumed = charntorune(r, str, n)"
73// and returns an int (logically boolean) indicating whether the first
74// n bytes of str was a valid and complete UTF sequence.
75
76int isvalidcharntorune(const char *str, int n, Rune *r, int *consumed);
77
78// runelen returns the number of bytes required to convert r into UTF.
79
80int runelen(Rune r);
81
82// runenlen returns the number of bytes required to convert the n
83// runes pointed to by r into UTF.
84
85int runenlen(const Rune *r, int n);
86
87// fullrune returns 1 if the string s of length n is long enough to be
88// decoded by chartorune, and 0 otherwise. This does not guarantee
89// that the string contains a legal UTF encoding. This routine is used
90// by programs that obtain input one byte at a time and need to know
91// when a full rune has arrived.
92
93int fullrune(const char *s, int n);
94
95// The following routines are analogous to the corresponding string
96// routines with "utf" substituted for "str", and "rune" substituted
97// for "chr".
98
99// utflen returns the number of runes that are represented by the UTF
100// string s. (cf. strlen)
101
102int utflen(const char *s);
103
104// utfnlen returns the number of complete runes that are represented
105// by the first n bytes of the UTF string s. If the last few bytes of
106// the string contain an incompletely coded rune, utfnlen will not
107// count them; in this way, it differs from utflen, which includes
108// every byte of the string. (cf. strnlen)
109
110int utfnlen(const char *s, long n);
111
112// utfrune returns a pointer to the first occurrence of rune r in the
113// UTF string s, or 0 if r does not occur in the string. The NUL
114// byte terminating a string is considered to be part of the string s.
115// (cf. strchr)
116
117const char *utfrune(const char *s, Rune r);
118
119// utfrrune returns a pointer to the last occurrence of rune r in the
120// UTF string s, or 0 if r does not occur in the string. The NUL
121// byte terminating a string is considered to be part of the string s.
122// (cf. strrchr)
123
124const char *utfrrune(const char *s, Rune r);
125
126// utfutf returns a pointer to the first occurrence of the UTF string
127// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the
128// null string, utfutf returns s1. (cf. strstr)
129
130const char *utfutf(const char *s1, const char *s2);
131
132// utfecpy copies UTF sequences until a null sequence has been copied,
133// but writes no sequences beyond es1. If any sequences are copied,
134// s1 is terminated by a null sequence, and a pointer to that sequence
135// is returned. Otherwise, the original s1 is returned. (cf. strecpy)
136
137char *utfecpy(char *s1, char *es1, const char *s2);
138
139// These functions are rune-string analogues of the corresponding
140// functions in strcat (3).
141//
142// These routines first appeared in Plan 9.
143// SEE ALSO
144// memmove (3)
145// rune (3)
146// strcat (3)
147//
148// BUGS: The outcome of overlapping moves varies among implementations.
149
150Rune *runestrcat(Rune *s1, const Rune *s2);
151Rune *runestrncat(Rune *s1, const Rune *s2, long n);
152
153const Rune *runestrchr(const Rune *s, Rune c);
154
155int runestrcmp(const Rune *s1, const Rune *s2);
156int runestrncmp(const Rune *s1, const Rune *s2, long n);
157
158Rune *runestrcpy(Rune *s1, const Rune *s2);
159Rune *runestrncpy(Rune *s1, const Rune *s2, long n);
160Rune *runestrecpy(Rune *s1, Rune *es1, const Rune *s2);
161
163
164const Rune *runestrrchr(const Rune *s, Rune c);
165long runestrlen(const Rune *s);
166const Rune *runestrstr(const Rune *s1, const Rune *s2);
167
168// The following routines test types and modify cases for Unicode
169// characters. Unicode defines some characters as letters and
170// specifies three cases: upper, lower, and title. Mappings among the
171// cases are also defined, although they are not exhaustive: some
172// upper case letters have no lower case mapping, and so on. Unicode
173// also defines several character properties, a subset of which are
174// checked by these routines. These routines are based on Unicode
175// version 3.0.0.
176//
177// NOTE: The routines are implemented in C, so the boolean functions
178// (e.g., isupperrune) return 0 for false and 1 for true.
179//
180//
181// toupperrune, tolowerrune, and totitlerune are the Unicode case
182// mappings. These routines return the character unchanged if it has
183// no defined mapping.
184
188
189// isupperrune tests for upper case characters, including Unicode
190// upper case letters and targets of the toupper mapping. islowerrune
191// and istitlerune are defined analogously.
192
196
197// isalpharune tests for Unicode letters; this includes ideographs in
198// addition to alphabetic characters.
199
201
202// isdigitrune tests for digits. Non-digit numbers, such as Roman
203// numerals, are not included.
204
206
207// isideographicrune tests for ideographic characters and numbers, as
208// defined by the Unicode standard.
209
211
212// isspacerune tests for whitespace characters, including "C" locale
213// whitespace, Unicode defined whitespace, and the "zero-width
214// non-break space" character.
215
217
218// (The comments in this file were copied from the manpage files rune.3,
219// isalpharune.3, and runestrcat.3. Some formatting changes were also made
220// to conform to Google style. /JRM 11/11/05)
221
222#ifdef __cplusplus
223}
224#endif
225
226#endif
const Rune * runestrstr(const Rune *s1, const Rune *s2)
int utfnlen(const char *s, long n)
int runestrcmp(const Rune *s1, const Rune *s2)
int charntorune(Rune *r, const char *s, int n)
Definition: rune.c:64
int chartorune(Rune *r, const char *s)
Definition: rune.c:163
Rune tolowerrune(Rune r)
int istitlerune(Rune r)
int isvalidcharntorune(const char *str, int n, Rune *r, int *consumed)
Definition: rune.c:239
int isdigitrune(Rune r)
int islowerrune(Rune r)
int runestrncmp(const Rune *s1, const Rune *s2, long n)
const char * utfutf(const char *s1, const char *s2)
int runenlen(const Rune *r, int n)
Definition: rune.c:305
int fullrune(const char *s, int n)
Definition: rune.c:326
@ Runemax
Definition: utf.h:26
@ Runesync
Definition: utf.h:23
@ UTFmax
Definition: utf.h:22
@ Runeerror
Definition: utf.h:25
@ Runeself
Definition: utf.h:24
long runestrlen(const Rune *s)
const char * utfrune(const char *s, Rune r)
signed int Rune
Definition: utf.h:19
Rune * runestrncat(Rune *s1, const Rune *s2, long n)
Rune * runestrdup(const Rune *s)
int isupperrune(Rune r)
Rune toupperrune(Rune r)
const char * utfrrune(const char *s, Rune r)
const Rune * runestrrchr(const Rune *s, Rune c)
int isideographicrune(Rune r)
Rune * runestrcat(Rune *s1, const Rune *s2)
const Rune * runestrchr(const Rune *s, Rune c)
int isspacerune(Rune r)
Rune * runestrncpy(Rune *s1, const Rune *s2, long n)
int isalpharune(Rune r)
int runelen(Rune r)
Definition: rune.c:299
Rune totitlerune(Rune r)
Rune * runestrecpy(Rune *s1, Rune *es1, const Rune *s2)
int utflen(const char *s)
int runetochar(char *s, const Rune *r)
Definition: rune.c:244
Rune * runestrcpy(Rune *s1, const Rune *s2)
char * utfecpy(char *s1, char *es1, const char *s2)