tesseract v5.3.3.20231005
normstrngs.h
Go to the documentation of this file.
1/**********************************************************************
2 * File: normstrngs.h
3 * Description: Utilities to normalize and manipulate UTF-32 and
4 * UTF-8 strings.
5 * Author: Ranjith Unnikrishnan
6 * Created: Thu July 4 2013
7 *
8 * (C) Copyright 2013, Google Inc.
9 * Licensed under the Apache License, Version 2.0 (the "License");
10 * you may not use this file except in compliance with the License.
11 * You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
18 *
19 **********************************************************************/
20
21#ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_
22#define TESSERACT_CCUTIL_NORMSTRNGS_H_
23
24#include "export.h"
25
26#include "validator.h"
27
28#include <string>
29#include <vector>
30
31namespace tesseract {
32
33// The standard unicode normalizations.
34enum class UnicodeNormMode {
35 kNFD,
36 kNFC,
37 kNFKD,
38 kNFKC,
39};
40
41// To normalize away differences in punctuation that are ambiguous, like
42// curly quotes and different widths of dash.
43enum class OCRNorm {
44 kNone,
45 kNormalize,
46};
47
48// To validate and normalize away some subtle differences that can occur in
49// Indic scripts, eg ensuring that an explicit virama is always followed by
50// a zero-width non-joiner.
51enum class GraphemeNorm {
52 kNone,
53 kNormalize,
54};
55
56// Normalizes a UTF8 string according to the given modes. Returns true on
57// success. If false is returned, some failure or invalidity was present, and
58// the result string is produced on a "best effort" basis.
59TESS_UNICHARSET_TRAINING_API
60bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
61 GraphemeNorm grapheme_normalize, const char *str8,
62 std::string *normalized);
63
64// Normalizes a UTF8 string according to the given modes and splits into
65// graphemes according to g_mode. Returns true on success. If false is returned,
66// some failure or invalidity was present, and the result string is produced on
67// a "best effort" basis.
68TESS_UNICHARSET_TRAINING_API
69bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
70 GraphemeNormMode g_mode, bool report_errors, const char *str8,
71 std::vector<std::string> *graphemes);
72
73// Applies just the OCR-specific normalizations and returns the normalized char.
74char32 OCRNormalize(char32 ch);
75
76// Returns true if the OCRNormalized ch1 and ch2 are the same.
77bool IsOCREquivalent(char32 ch1, char32 ch2);
78
79// Returns true if the value lies in the range of valid unicodes.
80bool IsValidCodepoint(const char32 ch);
81
82// Returns true if a code point has the White_Space Unicode property.
83TESS_UNICHARSET_TRAINING_API
84bool IsWhitespace(const char32 ch);
85
86// Returns true if every char in the given (null-terminated) string has the
87// White_Space Unicode property.
88TESS_UNICHARSET_TRAINING_API
89bool IsUTF8Whitespace(const char *text);
90
91// Returns the length in bytes of the prefix of 'text' whose characters have
92// the White_Space unicode property.
93TESS_UNICHARSET_TRAINING_API
94unsigned int SpanUTF8Whitespace(const char *text);
95
96// Returns the length in bytes of the prefix of 'text' whose characters DO NOT
97// have the White_Space unicode property.
98TESS_UNICHARSET_TRAINING_API
99unsigned int SpanUTF8NotWhitespace(const char *text);
100
101// Returns true if the char is interchange valid i.e. no C0 or C1 control codes
102// (other than CR LF HT FF) and no non-characters.
103TESS_UNICHARSET_TRAINING_API
104bool IsInterchangeValid(const char32 ch);
105
106// Same as above but restricted to 7-bit ASCII.
107TESS_UNICHARSET_TRAINING_API
108bool IsInterchangeValid7BitAscii(const char32 ch);
109
110// Converts a full-width character (code point) to half-width.
111TESS_UNICHARSET_TRAINING_API
112char32 FullwidthToHalfwidth(const char32 ch);
113
114} // namespace tesseract
115
116#endif // TESSERACT_CCUTIL_NORMSTRNGS_H_
signed int char32
bool IsOCREquivalent(char32 ch1, char32 ch2)
Definition: normstrngs.cpp:219
GraphemeNormMode
Definition: validator.h:36
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:228
char32 OCRNormalize(char32 ch)
Definition: normstrngs.cpp:208
unsigned int SpanUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:237
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:261
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNormMode g_mode, bool report_errors, const char *str8, std::vector< std::string > *graphemes)
Definition: normstrngs.cpp:179
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, GraphemeNorm grapheme_normalize, const char *str8, std::string *normalized)
Definition: normstrngs.cpp:152
bool IsInterchangeValid7BitAscii(const char32 ch)
Definition: normstrngs.cpp:276
char32 FullwidthToHalfwidth(const char32 ch)
Definition: normstrngs.cpp:282
unsigned int SpanUTF8NotWhitespace(const char *text)
Definition: normstrngs.cpp:249
bool IsValidCodepoint(const char32 ch)
Definition: normstrngs.cpp:223
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:233