tesseract  4.00.00dev
unicharmap.cpp
Go to the documentation of this file.
1 // File: unicharmap.cpp
3 // Description: Unicode character/ligature to integer id class.
4 // Author: Thomas Kielbus
5 // Created: Wed Jun 28 17:05:01 PDT 2006
6 //
7 // (C) Copyright 2006, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include <assert.h>
21 #include "unichar.h"
22 #include "host.h"
23 #include "unicharmap.h"
24 
26 nodes(0) {
27 }
28 
30  if (nodes != 0)
31  delete[] nodes;
32 }
33 
34 // Search the given unichar representation in the tree, using length characters
35 // from it maximum. Each character in the string is interpreted as an index in
36 // an array of nodes.
37 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
38  int length) const {
39  UNICHARMAP_NODE* current_nodes = nodes;
40 
41  assert(*unichar_repr != '\0');
42  assert(length > 0 && length <= UNICHAR_LEN);
43 
44  int index = 0;
45  if (index >= length || unichar_repr[index] == '\0') return INVALID_UNICHAR_ID;
46  do {
47  if (index + 1 >= length || unichar_repr[index + 1] == '\0')
48  return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
49  current_nodes =
50  current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
51  ++index;
52  } while (true);
53 }
54 
55 // Search the given unichar representation in the tree, creating the possibly
56 // missing nodes. Once the right place has been found, insert the given id and
57 // update the inserted flag to keep track of the insert. Each character in the
58 // string is interpreted as an index in an array of nodes.
59 void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
60  const char* current_char = unichar_repr;
61  if (*current_char == '\0') return;
62  UNICHARMAP_NODE** current_nodes_pointer = &nodes;
63  do {
64  if (*current_nodes_pointer == 0)
65  *current_nodes_pointer = new UNICHARMAP_NODE[256];
66  if (current_char[1] == '\0') {
67  (*current_nodes_pointer)
68  [static_cast<unsigned char>(*current_char)].id = id;
69  return;
70  }
71  current_nodes_pointer =
72  &((*current_nodes_pointer)
73  [static_cast<unsigned char>(*current_char)].children);
74  ++current_char;
75  } while (true);
76 }
77 
78 // Search the given unichar representation in the tree, using length characters
79 // from it maximum. Each character in the string is interpreted as an index in
80 // an array of nodes. Stop once the tree does not have anymore nodes or once we
81 // found the right unichar_repr.
82 bool UNICHARMAP::contains(const char* const unichar_repr,
83  int length) const {
84  if (unichar_repr == NULL || *unichar_repr == '\0') return false;
85  if (length <= 0 || length > UNICHAR_LEN) return false;
86  int index = 0;
87  if (index >= length || unichar_repr[index] == '\0') return false;
88  UNICHARMAP_NODE* current_nodes = nodes;
89 
90  while (current_nodes != 0 && index + 1 < length &&
91  unichar_repr[index + 1] != '\0') {
92  current_nodes =
93  current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
94  ++index;
95  }
96  return current_nodes != 0 &&
97  (index + 1 >= length || unichar_repr[index + 1] == '\0') &&
98  current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
99 }
100 
101 // Return the minimum number of characters that must be used from this string
102 // to obtain a match in the UNICHARMAP.
103 int UNICHARMAP::minmatch(const char* const unichar_repr) const {
104  const char* current_char = unichar_repr;
105  if (*current_char == '\0') return 0;
106  UNICHARMAP_NODE* current_nodes = nodes;
107 
108  while (current_nodes != NULL && *current_char != '\0') {
109  if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
110  return current_char + 1 - unichar_repr;
111  current_nodes =
112  current_nodes[static_cast<unsigned char>(*current_char)].children;
113  ++current_char;
114  }
115  return 0;
116 }
117 
119  if (nodes != 0)
120  {
121  delete[] nodes;
122  nodes = 0;
123  }
124 }
125 
126 UNICHARMAP::UNICHARMAP_NODE::UNICHARMAP_NODE() :
127 children(0),
128 id(-1) {
129 }
130 
131 // Recursively delete the children
132 UNICHARMAP::UNICHARMAP_NODE::~UNICHARMAP_NODE() {
133  if (children != 0) {
134  delete[] children;
135  }
136 }
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:37
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:59
void clear()
Definition: unicharmap.cpp:118
#define UNICHAR_LEN
Definition: unichar.h:31
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:82
int minmatch(const char *const unichar_repr) const
Definition: unicharmap.cpp:103
int UNICHAR_ID
Definition: unichar.h:35