All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
normstrngs.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: normstrngs.h
3  * Description: Utilities to normalize and manipulate UTF-32 and
4  * UTF-8 strings.
5  * Author: Ranjith Unnikrishnan
6  * Created: Thu July 4 2013
7  *
8  * (C) Copyright 2013, Google Inc.
9  * Licensed under the Apache License, Version 2.0 (the "License");
10  * you may not use this file except in compliance with the License.
11  * You may obtain a copy of the License at
12  * http://www.apache.org/licenses/LICENSE-2.0
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  **********************************************************************/
20 
21 #ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_
22 #define TESSERACT_CCUTIL_NORMSTRNGS_H_
23 
24 #include "genericvector.h"
25 #include "strngs.h"
26 
27 typedef signed int char32;
28 
29 namespace tesseract {
30 
31 // UTF-8 to UTF-32 conversion function.
32 void UTF8ToUTF32(const char* utf8_str, GenericVector<char32>* str32);
33 
34 // UTF-32 to UTF-8 conversion function.
35 void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str);
36 
37 // Normalize a single char32 using NFKC + OCR-specific transformations.
38 // NOTE that proper NFKC may require multiple characters as input. The
39 // assumption of this function is that the input is already as fully composed
40 // as it can be, but may require some compatibility normalizations or just
41 // OCR evaluation related normalizations.
42 void NormalizeChar32(char32 ch, GenericVector<char32>* str);
43 
44 // Normalize a UTF8 string. Same as above, but for UTF8-encoded strings, that
45 // can contain multiple UTF32 code points.
46 STRING NormalizeUTF8String(const char* str8);
47 
48 // Apply just the OCR-specific normalizations and return the normalized char.
49 char32 OCRNormalize(char32 ch);
50 
51 // Returns true if the OCRNormalized ch1 and ch2 are the same.
52 bool IsOCREquivalent(char32 ch1, char32 ch2);
53 
54 // Returns true if the value lies in the range of valid Unicode code points.
55 bool IsValidCodepoint(const char32 ch);
56 
57 // Returns true if a code point has the White_Space Unicode property.
58 bool IsWhitespace(const char32 ch);
59 // Returns true if every char in the given (null-terminated) string has the
60 // White_Space Unicode property.
61 bool IsUTF8Whitespace(const char* text);
62 
63 // Returns the length in bytes of the prefix of 'text' that has the
64 // White_Space Unicode property.
65 int SpanUTF8Whitespace(const char* text);
66 
67 // Returns the length in bytes of the prefix of 'text' that does NOT have the
68 // White_Space Unicode property.
69 int SpanUTF8NotWhitespace(const char* text);
70 
71 // Returns true if the char is interchange valid i.e. no C0 or C1 control codes
72 // (other than CR LF HT FF) and no non-characters.
73 bool IsInterchangeValid(const char32 ch);
74 // Same as above but restricted to 7-bit ASCII.
75 bool IsInterchangeValid7BitAscii(const char32 ch);
76 
77 // Convert a full-width character (code point) to its half-width equivalent.
78 char32 FullwidthToHalfwidth(const char32 ch);
79 
80 } // namespace tesseract
81 
82 #endif // TESSERACT_CCUTIL_NORMSTRNGS_H_
void UTF8ToUTF32(const char *utf8_str, GenericVector< char32 > *str32)
Definition: normstrngs.cpp:31
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:182
char32 OCRNormalize(char32 ch)
Definition: normstrngs.cpp:156
int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:186
int SpanUTF8NotWhitespace(const char *text)
Definition: normstrngs.cpp:197
void NormalizeChar32(char32 ch, GenericVector< char32 > *str)
Definition: normstrngs.cpp:131
signed int char32
Definition: normstrngs.h:27
STRING NormalizeUTF8String(const char *str8)
Definition: normstrngs.cpp:116
void UTF32ToUTF8(const GenericVector< char32 > &str32, STRING *utf8_str)
Definition: normstrngs.cpp:45
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:208
Definition: strngs.h:44
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:239
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:170
bool IsOCREquivalent(char32 ch1, char32 ch2)
Definition: normstrngs.cpp:166
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:176
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:232