tesseract v5.3.3.20231005
unicharmap.cpp
Go to the documentation of this file.
1
2// File: unicharmap.cpp
3// Description: Unicode character/ligature to integer id class.
4// Author: Thomas Kielbus
5//
6// (C) Copyright 2006, Google Inc.
7// Licensed under the Apache License, Version 2.0 (the "License");
8// you may not use this file except in compliance with the License.
9// You may obtain a copy of the License at
10// http://www.apache.org/licenses/LICENSE-2.0
11// Unless required by applicable law or agreed to in writing, software
12// distributed under the License is distributed on an "AS IS" BASIS,
13// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14// See the License for the specific language governing permissions and
15// limitations under the License.
16//
18
19#include "unicharmap.h"
20
21#include <tesseract/unichar.h>
22
23#include <cassert>
24
25namespace tesseract {
26
27UNICHARMAP::UNICHARMAP() : nodes(nullptr) {}
28
30 delete[] nodes;
31}
32
33// Search the given unichar representation in the tree, using length characters
34// from it maximum. Each character in the string is interpreted as an index in
35// an array of nodes.
36UNICHAR_ID UNICHARMAP::unichar_to_id(const char *const unichar_repr, int length) const {
37 UNICHARMAP_NODE *current_nodes = nodes;
38
39 assert(*unichar_repr != '\0');
40 assert(length > 0 && length <= UNICHAR_LEN);
41
42 int index = 0;
43 if (length <= 0 || unichar_repr[index] == '\0') {
44 return INVALID_UNICHAR_ID;
45 }
46 do {
47 if (index + 1 >= length || unichar_repr[index + 1] == '\0') {
48 return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
49 }
50 current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
51 ++index;
52 } while (true);
53}
54
55// Search the given unichar representation in the tree, creating the possibly
56// missing nodes. Once the right place has been found, insert the given id and
57// update the inserted flag to keep track of the insert. Each character in the
58// string is interpreted as an index in an array of nodes.
59void UNICHARMAP::insert(const char *const unichar_repr, UNICHAR_ID id) {
60 const char *current_char = unichar_repr;
61 if (*current_char == '\0') {
62 return;
63 }
64 UNICHARMAP_NODE **current_nodes_pointer = &nodes;
65 do {
66 if (*current_nodes_pointer == nullptr) {
67 *current_nodes_pointer = new UNICHARMAP_NODE[256];
68 }
69 if (current_char[1] == '\0') {
70 (*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].id = id;
71 return;
72 }
73 current_nodes_pointer =
74 &((*current_nodes_pointer)[static_cast<unsigned char>(*current_char)].children);
75 ++current_char;
76 } while (true);
77}
78
79// Search the given unichar representation in the tree, using length characters
80// from it maximum. Each character in the string is interpreted as an index in
81// an array of nodes. Stop once the tree does not have anymore nodes or once we
82// found the right unichar_repr.
83bool UNICHARMAP::contains(const char *const unichar_repr, int length) const {
84 if (unichar_repr == nullptr || *unichar_repr == '\0') {
85 return false;
86 }
87 if (length <= 0 || length > UNICHAR_LEN) {
88 return false;
89 }
90 int index = 0;
91 if (unichar_repr[index] == '\0') {
92 return false;
93 }
94 UNICHARMAP_NODE *current_nodes = nodes;
95
96 while (current_nodes != nullptr && index + 1 < length && unichar_repr[index + 1] != '\0') {
97 current_nodes = current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
98 ++index;
99 }
100 return current_nodes != nullptr && (index + 1 >= length || unichar_repr[index + 1] == '\0') &&
101 current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
102}
103
104// Return the minimum number of characters that must be used from this string
105// to obtain a match in the UNICHARMAP.
106int UNICHARMAP::minmatch(const char *const unichar_repr) const {
107 const char *current_char = unichar_repr;
108 if (*current_char == '\0') {
109 return 0;
110 }
111 UNICHARMAP_NODE *current_nodes = nodes;
112
113 while (current_nodes != nullptr && *current_char != '\0') {
114 if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0) {
115 return current_char + 1 - unichar_repr;
116 }
117 current_nodes = current_nodes[static_cast<unsigned char>(*current_char)].children;
118 ++current_char;
119 }
120 return 0;
121}
122
124 delete[] nodes;
125 nodes = nullptr;
126}
127
128UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() : children(nullptr), id(-1) {}
129
130// Recursively delete the children
131UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
132 delete[] children;
133}
134
135} // namespace tesseract
#define UNICHAR_LEN
Definition: unichar.h:31
int UNICHAR_ID
Definition: unichar.h:34
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:83
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:59
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:36
int minmatch(const char *const unichar_repr) const
Definition: unicharmap.cpp:106