tesseract v5.3.3.20231005
unichar.h
Go to the documentation of this file.
1// SPDX-License-Identifier: Apache-2.0
2// File: unichar.h
3// Description: Unicode character/ligature class.
4// Author: Ray Smith
5//
6// (C) Copyright 2006, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16
17#ifndef TESSERACT_CCUTIL_UNICHAR_H_
18#define TESSERACT_CCUTIL_UNICHAR_H_
19
20#include "export.h"
21
22#include <memory.h>
23#include <cstring>
24#include <string>
25#include <vector>
26
27namespace tesseract {
28
29// Maximum number of characters that can be stored in a UNICHAR. Must be
30// at least 4. Must not exceed 31 without changing the coding of length.
31#define UNICHAR_LEN 30
32
33// A UNICHAR_ID is the unique id of a unichar.
34using UNICHAR_ID = int;
35
36// A variable to indicate an invalid or uninitialized unichar id.
37static const int INVALID_UNICHAR_ID = -1;
38// A special unichar that corresponds to INVALID_UNICHAR_ID.
39static const char INVALID_UNICHAR[] = "__INVALID_UNICHAR__";
40
42 DIR_NEUTRAL = 0, // Text contains only neutral characters.
43 DIR_LEFT_TO_RIGHT = 1, // Text contains no Right-to-Left characters.
44 DIR_RIGHT_TO_LEFT = 2, // Text contains no Left-to-Right characters.
45 DIR_MIX = 3, // Text contains a mixture of left-to-right
46 // and right-to-left characters.
47};
48
49using char32 = signed int;
50
51// The UNICHAR class holds a single classification result. This may be
52// a single Unicode character (stored as between 1 and 4 utf8 bytes) or
53// multiple Unicode characters representing the NFKC expansion of a ligature
54// such as fi, ffl etc. These are also stored as utf8.
56public:
58 memset(chars, 0, UNICHAR_LEN);
59 }
60
61 // Construct from a utf8 string. If len<0 then the string is null terminated.
62 // If the string is too long to fit in the UNICHAR then it takes only what
63 // will fit.
64 UNICHAR(const char *utf8_str, int len);
65
66 // Construct from a single UCS4 character.
67 explicit UNICHAR(int unicode);
68
69 // Default copy constructor and operator= are OK.
70
71 // Get the first character as UCS-4.
72 int first_uni() const;
73
74 // Get the length of the UTF8 string in bytes (never more than UNICHAR_LEN).
75 int utf8_len() const {
76 int len = chars[UNICHAR_LEN - 1]; // The last byte doubles as the length field.
77 return len >= 0 && len < UNICHAR_LEN ? len : UNICHAR_LEN; // Out of range: the last byte is real data, so the buffer is full.
78 }
79
80 // Get the UTF8 bytes. The result is NOT null-terminated; use utf8_len() for its length.
81 const char *utf8() const {
82 return chars; // Pointer to the internal buffer, not a copy.
83 }
84
85 // Get a terminated UTF8 string: Must delete[] it after use.
86 char *utf8_str() const;
87
88 // Get the number of bytes in the first character of the given utf8 string.
89 static int utf8_step(const char *utf8_str);
90
91 // A class to simplify iterating over and accessing elements of a UTF8
92 // string. Note that unlike the UNICHAR class, const_iterator does NOT COPY or
93 // take ownership of the underlying byte array. It also does not permit
94 // modification of the array (as the name suggests).
95 //
96 // Example:
97 // for (UNICHAR::const_iterator it = UNICHAR::begin(str, str_len);
98 // it != UNICHAR::end(str, str_len);
99 // ++it) {
100 // printf("UCS-4 symbol code = %d\n", *it);
101 // char buf[5];
102 // int char_len = it.get_utf8(buf); buf[char_len] = '\0';
103 // printf("Char = %s\n", buf);
104 // }
106 using CI = const_iterator;
107
108 public:
109 // Step to the next UTF8 character.
110 // If the current position is at an illegal UTF8 character, then print an
111 // error message and step by one byte. If the current position is at a
112 // null character, don't step past it.
113 const_iterator &operator++();
114
115 // Return the UCS-4 value at the current position.
116 // If the current position is at an illegal UTF8 value, return a single
117 // space character.
118 int operator*() const;
119
120 // Store the UTF-8 encoding of the current codepoint into buf, which must be
121 // at least 4 bytes long. Return the number of bytes written.
122 // If the current position is at an illegal UTF8 value, writes a single
123 // space character and returns 1.
124 // Note that this method does not null-terminate the buffer.
125 int get_utf8(char *buf) const;
126 // Returns the number of bytes of the current codepoint. Returns 1 if the
127 // current position is at an illegal UTF8 value.
128 int utf8_len() const;
129 // Returns true if the UTF-8 encoding at the current position is legal.
130 bool is_legal() const;
131
132 // Return the pointer into the underlying string at the current position.
133 const char *utf8_data() const {
134 return it_; // Hands back the stored position itself; nothing is copied.
135 }
136
137 // Iterator equality: two iterators are equal when they point at the same byte.
138 friend bool operator==(const CI &lhs, const CI &rhs) {
139 return lhs.it_ == rhs.it_; // Compares positions (addresses), not the characters there.
140 }
141 friend bool operator!=(const CI &lhs, const CI &rhs) {
142 return !(lhs == rhs); // Defined as the negation of operator== so the two always agree.
143 }
144
145 private:
146 friend class UNICHAR;
147 explicit const_iterator(const char *it) : it_(it) {} // Private: only the friend class UNICHAR can construct one.
148
149 const char *it_; // Pointer into the string.
150 };
151
152 // Create a start/end iterator pointing to a string. Note that these methods
153 // are static and do NOT create a copy or take ownership of the underlying
154 // array.
155 static const_iterator begin(const char *utf8_str, int byte_length);
156 static const_iterator end(const char *utf8_str, int byte_length);
157
158 // Converts a utf-8 string to a vector of unicodes.
159 // Returns an empty vector if the input contains invalid UTF-8.
160 static std::vector<char32> UTF8ToUTF32(const char *utf8_str);
161 // Converts a vector of unicodes to a utf8 string.
162 // Returns an empty string if the input contains an invalid unicode.
163 static std::string UTF32ToUTF8(const std::vector<char32> &str32);
164
165private:
166 // A UTF-8 representation of 1 or more Unicode characters.
167 // The last element (chars[UNICHAR_LEN - 1]) is a length if
168 // its value is in [0, UNICHAR_LEN), otherwise it is a genuine character.
169 char chars[UNICHAR_LEN]{};
170};
171
172} // namespace tesseract
173
174#endif // TESSERACT_CCUTIL_UNICHAR_H_
#define UNICHAR_LEN
Definition: unichar.h:31
signed int char32
Definition: unichar.h:49
int32_t operator*(const ICOORD &op1, const ICOORD &op2)
Definition: points.h:428
StrongScriptDirection
Definition: unichar.h:41
@ DIR_MIX
Definition: unichar.h:45
@ DIR_LEFT_TO_RIGHT
Definition: unichar.h:43
@ DIR_RIGHT_TO_LEFT
Definition: unichar.h:44
@ DIR_NEUTRAL
Definition: unichar.h:42
int UNICHAR_ID
Definition: unichar.h:34
int utf8_len() const
Definition: unichar.h:75
const char * utf8() const
Definition: unichar.h:81
const char * utf8_data() const
Definition: unichar.h:133
friend bool operator!=(const CI &lhs, const CI &rhs)
Definition: unichar.h:141
friend bool operator==(const CI &lhs, const CI &rhs)
Definition: unichar.h:138
#define TESS_API
Definition: export.h:32