tesseract v5.3.3.20231005
bitvector.cpp
Go to the documentation of this file.
1// Copyright 2011 Google Inc. All Rights Reserved.
2// Author: rays@google.com (Ray Smith)
4// File: bitvector.cpp
5// Description: Class replacement for BITVECTOR.
6// Author: Ray Smith
7//
8// (C) Copyright 2011, Google Inc.
9// Licensed under the Apache License, Version 2.0 (the "License");
10// you may not use this file except in compliance with the License.
11// You may obtain a copy of the License at
12// http://www.apache.org/licenses/LICENSE-2.0
13// Unless required by applicable law or agreed to in writing, software
14// distributed under the License is distributed on an "AS IS" BASIS,
15// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16// See the License for the specific language governing permissions and
17// limitations under the License.
18//
20
21#include "bitvector.h"
22#include <algorithm>
23#include <cstring>
24#include "helpers.h"
25#include "serialis.h" // for tesseract::Serialize
26
27namespace tesseract {
28
29// Fast lookup table to get the first least significant set bit in a byte.
30// For zero, the table has 255, but since it is a special case, most code
31// that uses this table will check for zero before looking up lsb_index_.
32const uint8_t BitVector::lsb_index_[256] = {
33 255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,
34 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0,
35 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1,
36 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0,
37 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,
38 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0,
39 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1,
40 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0,
41 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
42
43// Fast lookup table to get the residual bits after zeroing the first (lowest)
44// set bit in a byte.
45const uint8_t BitVector::lsb_eroded_[256] = {
46 0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6, 0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e,
47 0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16, 0x10, 0x18, 0x18, 0x1a, 0x18, 0x1c, 0x1c, 0x1e,
48 0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26, 0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e,
49 0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36, 0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e,
50 0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46, 0x40, 0x48, 0x48, 0x4a, 0x48, 0x4c, 0x4c, 0x4e,
51 0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56, 0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e,
52 0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66, 0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e,
53 0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76, 0x70, 0x78, 0x78, 0x7a, 0x78, 0x7c, 0x7c, 0x7e,
54 0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86, 0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e,
55 0x80, 0x90, 0x90, 0x92, 0x90, 0x94, 0x94, 0x96, 0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e,
56 0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6, 0xa0, 0xa8, 0xa8, 0xaa, 0xa8, 0xac, 0xac, 0xae,
57 0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6, 0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe,
58 0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6, 0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce,
59 0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6, 0xd0, 0xd8, 0xd8, 0xda, 0xd8, 0xdc, 0xdc, 0xde,
60 0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6, 0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee,
61 0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6, 0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe};
62
63// Fast lookup table to give the number of set bits in a byte.
64const int BitVector::hamming_table_[256] = {
65 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
66 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
67 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
68 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
69 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
70 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
71 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
72 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
73
75 array_ = src.array_;
76 bit_size_ = src.bit_size_;
77 return *this;
78}
79
80// Initializes the array to length * false.
81void BitVector::Init(int length) {
82 Alloc(length);
84}
85
86// Writes to the given file. Returns false in case of error.
87bool BitVector::Serialize(FILE *fp) const {
88 if (!tesseract::Serialize(fp, &bit_size_)) {
89 return false;
90 }
91 int wordlen = WordLength();
92 return tesseract::Serialize(fp, &array_[0], wordlen);
93}
94
95// Reads from the given file. Returns false in case of error.
96// If swap is true, assumes a big/little-endian swap is needed.
97bool BitVector::DeSerialize(bool swap, FILE *fp) {
98 uint32_t new_bit_size;
99 if (!tesseract::DeSerialize(fp, &new_bit_size)) {
100 return false;
101 }
102 if (swap) {
103 ReverseN(&new_bit_size, sizeof(new_bit_size));
104 }
105 Alloc(new_bit_size);
106 int wordlen = WordLength();
107 if (!tesseract::DeSerialize(fp, &array_[0], wordlen)) {
108 return false;
109 }
110 if (swap) {
111 for (int i = 0; i < wordlen; ++i) {
112 ReverseN(&array_[i], sizeof(array_[i]));
113 }
114 }
115 return true;
116}
117
119 memset(&array_[0], 0, ByteLength());
120}
122 memset(&array_[0], ~0, ByteLength());
123}
124
125// Returns the index of the next set bit after the given index.
126// Useful for quickly iterating through the set bits in a sparse vector.
127int BitVector::NextSetBit(int prev_bit) const {
128 // Move on to the next bit.
129 int next_bit = prev_bit + 1;
130 if (next_bit >= bit_size_) {
131 return -1;
132 }
133 // Check the remains of the word containing the next_bit first.
134 int next_word = WordIndex(next_bit);
135 int bit_index = next_word * kBitFactor;
136 int word_end = bit_index + kBitFactor;
137 uint32_t word = array_[next_word];
138 uint8_t byte = word & 0xff;
139 while (bit_index < word_end) {
140 if (bit_index + 8 > next_bit && byte != 0) {
141 while (bit_index + lsb_index_[byte] < next_bit && byte != 0) {
142 byte = lsb_eroded_[byte];
143 }
144 if (byte != 0) {
145 return bit_index + lsb_index_[byte];
146 }
147 }
148 word >>= 8;
149 bit_index += 8;
150 byte = word & 0xff;
151 }
152 // next_word didn't contain a 1, so find the next word with set bit.
153 ++next_word;
154 int wordlen = WordLength();
155 while (next_word < wordlen && (word = array_[next_word]) == 0) {
156 ++next_word;
157 bit_index += kBitFactor;
158 }
159 if (bit_index >= bit_size_) {
160 return -1;
161 }
162 // Find the first non-zero byte within the word.
163 while ((word & 0xff) == 0) {
164 word >>= 8;
165 bit_index += 8;
166 }
167 return bit_index + lsb_index_[word & 0xff];
168}
169
170// Returns the number of set bits in the vector.
172 int wordlen = WordLength();
173 int total_bits = 0;
174 for (int w = 0; w < wordlen; ++w) {
175 uint32_t word = array_[w];
176 for (int i = 0; i < 4; ++i) {
177 total_bits += hamming_table_[word & 0xff];
178 word >>= 8;
179 }
180 }
181 return total_bits;
182}
183
184// Logical in-place operations on whole bit vectors. Tries to do something
185// sensible if they aren't the same size, but they should be really.
187 int length = std::min(WordLength(), other.WordLength());
188 for (int w = 0; w < length; ++w) {
189 array_[w] |= other.array_[w];
190 }
191}
193 int length = std::min(WordLength(), other.WordLength());
194 for (int w = 0; w < length; ++w) {
195 array_[w] &= other.array_[w];
196 }
197 for (int w = WordLength() - 1; w >= length; --w) {
198 array_[w] = 0;
199 }
200}
202 int length = std::min(WordLength(), other.WordLength());
203 for (int w = 0; w < length; ++w) {
204 array_[w] ^= other.array_[w];
205 }
206}
207// Set subtraction *this = v1 - v2.
208void BitVector::SetSubtract(const BitVector &v1, const BitVector &v2) {
209 Alloc(v1.size());
210 int length = std::min(v1.WordLength(), v2.WordLength());
211 for (int w = 0; w < length; ++w) {
212 array_[w] = v1.array_[w] ^ (v1.array_[w] & v2.array_[w]);
213 }
214 for (int w = WordLength() - 1; w >= length; --w) {
215 array_[w] = v1.array_[w];
216 }
217}
218
219// Allocates memory for a vector of the given length.
220// Reallocates if the array is a different size, larger or smaller.
221void BitVector::Alloc(int length) {
222 int initial_wordlength = WordLength();
223 bit_size_ = length;
224 int new_wordlength = WordLength();
225 if (new_wordlength != initial_wordlength) {
226 array_.resize(new_wordlength);
227 }
228}
229
230} // namespace tesseract.
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:184
bool DeSerialize(bool swap, FILE *fp, std::vector< T > &data)
Definition: helpers.h:205
bool Serialize(FILE *fp, const std::vector< T > &data)
Definition: helpers.h:236
void Init(int length)
Definition: bitvector.cpp:81
static const uint8_t lsb_index_[256]
Definition: bitvector.h:38
void operator^=(const BitVector &other)
Definition: bitvector.cpp:201
static const int hamming_table_[256]
Definition: bitvector.h:43
BitVector & operator=(const BitVector &src)
Definition: bitvector.cpp:74
void SetSubtract(const BitVector &v1, const BitVector &v2)
Definition: bitvector.cpp:208
void operator&=(const BitVector &other)
Definition: bitvector.cpp:192
int NumSetBits() const
Definition: bitvector.cpp:171
bool DeSerialize(bool swap, FILE *fp)
Definition: bitvector.cpp:97
int NextSetBit(int prev_bit) const
Definition: bitvector.cpp:127
void operator|=(const BitVector &other)
Definition: bitvector.cpp:186
int size() const
Definition: bitvector.h:62
bool Serialize(FILE *fp) const
Definition: bitvector.cpp:87
static const uint8_t lsb_eroded_[256]
Definition: bitvector.h:41