tesseract  4.00.00dev
unicharmap.cpp
Go to the documentation of this file.
1 // File: unicharmap.cpp
3 // Description: Unicode character/ligature to integer id class.
4 // Author: Thomas Kielbus
5 // Created: Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include <assert.h>
21 #include "unichar.h"
22 #include "host.h"
23 #include "unicharmap.h"
24 
26 nodes(0) {
27 }
28 
30  delete[] nodes;
31 }
32 
33 // Search the given unichar representation in the tree. Each character in the
34 // string is interpreted as an index in an array of nodes.
35 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
36  const char* current_char = unichar_repr;
37  UNICHARMAP_NODE* current_nodes = nodes;
38 
39  assert(*unichar_repr != '\0');
40 
41  do {
42  if (*(current_char + 1) == '\0')
43  return current_nodes[static_cast<unsigned char>(*current_char)].id;
44  current_nodes =
45  current_nodes[static_cast<unsigned char>(*current_char)].children;
46  ++current_char;
47  } while (true);
48 }
49 
50 // Search the given unichar representation in the tree, using length characters
51 // from it maximum. Each character in the string is interpreted as an index in
52 // an array of nodes.
53 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
54  int length) const {
55  const char* current_char = unichar_repr;
56  UNICHARMAP_NODE* current_nodes = nodes;
57 
58  assert(*unichar_repr != '\0');
59  assert(length > 0 && length <= UNICHAR_LEN);
60 
61  do {
62  if (length == 1 || *(current_char + 1) == '\0')
63  return current_nodes[static_cast<unsigned char>(*current_char)].id;
64  current_nodes =
65  current_nodes[static_cast<unsigned char>(*current_char)].children;
66  ++current_char;
67  --length;
68  } while (true);
69 }
70 
71 // Search the given unichar representation in the tree, creating the possibly
72 // missing nodes. Once the right place has been found, insert the given id and
73 // update the inserted flag to keep track of the insert. Each character in the
74 // string is interpreted as an index in an array of nodes.
75 void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
76  const char* current_char = unichar_repr;
77  UNICHARMAP_NODE** current_nodes_pointer = &nodes;
78 
79  assert(*unichar_repr != '\0');
80  assert(id >= 0);
81 
82  do {
83  if (*current_nodes_pointer == 0)
84  *current_nodes_pointer = new UNICHARMAP_NODE[256];
85  if (*(current_char + 1) == '\0') {
86  (*current_nodes_pointer)
87  [static_cast<unsigned char>(*current_char)].id = id;
88  return;
89  }
90  current_nodes_pointer =
91  &((*current_nodes_pointer)
92  [static_cast<unsigned char>(*current_char)].children);
93  ++current_char;
94  } while (true);
95 }
96 
97 // Search the given unichar representation in the tree. Each character in the
98 // string is interpreted as an index in an array of nodes. Stop once the tree
99 // does not have anymore nodes or once we found the right unichar_repr.
100 bool UNICHARMAP::contains(const char* const unichar_repr) const {
101  if (unichar_repr == NULL || *unichar_repr == '\0') return false;
102 
103  const char* current_char = unichar_repr;
104  UNICHARMAP_NODE* current_nodes = nodes;
105 
106  while (current_nodes != 0 && *(current_char + 1) != '\0') {
107  current_nodes =
108  current_nodes[static_cast<unsigned char>(*current_char)].children;
109  ++current_char;
110  }
111  return current_nodes != 0 && *(current_char + 1) == '\0' &&
112  current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
113 }
114 
115 // Search the given unichar representation in the tree, using length characters
116 // from it maximum. Each character in the string is interpreted as an index in
117 // an array of nodes. Stop once the tree does not have anymore nodes or once we
118 // found the right unichar_repr.
119 bool UNICHARMAP::contains(const char* const unichar_repr,
120  int length) const {
121  if (unichar_repr == NULL || *unichar_repr == '\0') return false;
122  if (length <= 0 || length > UNICHAR_LEN) return false;
123 
124  const char* current_char = unichar_repr;
125  UNICHARMAP_NODE* current_nodes = nodes;
126 
127  while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
128  current_nodes =
129  current_nodes[static_cast<unsigned char>(*current_char)].children;
130  --length;
131  ++current_char;
132  }
133  return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
134  current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
135 }
136 
137 // Return the minimum number of characters that must be used from this string
138 // to obtain a match in the UNICHARMAP.
139 int UNICHARMAP::minmatch(const char* const unichar_repr) const {
140  const char* current_char = unichar_repr;
141  UNICHARMAP_NODE* current_nodes = nodes;
142 
143  while (current_nodes != NULL && *current_char != '\0') {
144  if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
145  return current_char + 1 - unichar_repr;
146  current_nodes =
147  current_nodes[static_cast<unsigned char>(*current_char)].children;
148  ++current_char;
149  }
150  return 0;
151 }
152 
154  delete[] nodes;
155  nodes = 0;
156 }
157 
158 UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() :
159 children(0),
160 id(-1) {
161 }
162 
163 // Recursively delete the children
164 UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
165  delete[] children;
166 }
int UNICHAR_ID
Definition: unichar.h:33
bool contains(const char *const unichar_repr) const
Definition: unicharmap.cpp:100
#define UNICHAR_LEN
Definition: unichar.h:30
void clear()
Definition: unicharmap.cpp:153
int minmatch(const char *const unichar_repr) const
Definition: unicharmap.cpp:139
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:75
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharmap.cpp:35