tesseract v5.3.3.20231005
weightmatrix.cpp
///////////////////////////////////////////////////////////////////////
// File: weightmatrix.cpp
// Description: Hides distinction between float/int implementations.
// Author: Ray Smith
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
///////////////////////////////////////////////////////////////////////

#include "weightmatrix.h"

#include <cassert> // for assert
#include "intsimdmatrix.h"
#include "simddetect.h" // for DotProduct
#include "statistc.h"
#include "tprintf.h" // for tprintf

namespace tesseract {

#if defined(ANDROID)
static inline TFloat log2(TFloat n) {
  return log(n) / log(2.0);
}
#endif // ANDROID

// Number of iterations after which the correction effectively becomes unity.
const int kAdamCorrectionIterations = 200000;
// Epsilon in Adam to prevent division by zero.
const TFloat kAdamEpsilon = 1e-8;

// Utility functions convert between double and float arrays.
#ifdef FAST_FLOAT
static void DoubleToFloat(const GENERIC_2D_ARRAY<double> &src, GENERIC_2D_ARRAY<float> &dst) {
  const auto dim1 = src.dim1();
  const auto dim2 = src.dim2();
  dst.ResizeNoInit(dim1, dim2);
  for (int i = 0; i < dim1; ++i) {
    const auto *src_i = src[i];
    auto *dst_i = dst[i];
    for (int j = 0; j < dim2; ++j) {
      dst_i[j] = static_cast<float>(src_i[j]);
    }
  }
}
#endif

static void FloatToDouble(const GENERIC_2D_ARRAY<float> &src, GENERIC_2D_ARRAY<double> &dst) {
  const auto dim1 = src.dim1();
  const auto dim2 = src.dim2();
  dst.ResizeNoInit(dim1, dim2);
  for (int i = 0; i < dim1; ++i) {
    const auto *src_i = src[i];
    auto *dst_i = dst[i];
    for (int j = 0; j < dim2; ++j) {
      dst_i[j] = static_cast<double>(src_i[j]);
    }
  }
}

static bool DeSerialize(TFile *fp, GENERIC_2D_ARRAY<TFloat> &tfloat_array) {
#ifdef FAST_FLOAT
  GENERIC_2D_ARRAY<double> double_array;
  if (!double_array.DeSerialize(fp)) {
    return false;
  }
  DoubleToFloat(double_array, tfloat_array);
  return true;
#else
  return tfloat_array.DeSerialize(fp);
#endif
}

static bool Serialize(TFile *fp, const GENERIC_2D_ARRAY<TFloat> &tfloat_array) {
#ifdef FAST_FLOAT
  GENERIC_2D_ARRAY<double> double_array;
  FloatToDouble(tfloat_array, double_array);
  return double_array.Serialize(fp);
#else
  return tfloat_array.Serialize(fp);
#endif
}

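// Note that, whether TFloat is float (FAST_FLOAT) or double, the helpers above
// always write the weights to disk and read them back as doubles, so serialized
// models stay compatible between the two build configurations.
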
// Computes matrix.vector v = Wu.
// u is of size W.dim2() - add_bias_fwd and the output v is of size
// W.dim1() - skip_bias_back.
// If add_bias_fwd, u is imagined to have an extra element at the end with value
// 1, to implement the bias weight.
// If skip_bias_back, we are actually performing the backwards product on a
// transposed matrix, so we need to drop the v output corresponding to the last
// element in dim1.
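// For example, with a 4x3 w and add_bias_fwd, u holds 2 real inputs and
// v[i] = w[i][0] * u[0] + w[i][1] * u[1] + w[i][2], where the last term is
// the bias; with skip_bias_back, only w.dim1() - 1 outputs are written.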
static inline void MatrixDotVectorInternal(const GENERIC_2D_ARRAY<TFloat> &w, bool add_bias_fwd,
                                           bool skip_bias_back, const TFloat *u, TFloat *v) {
  int num_results = w.dim1() - skip_bias_back;
  int extent = w.dim2() - add_bias_fwd;
  for (int i = 0; i < num_results; ++i) {
    const TFloat *wi = w[i];
    TFloat total = DotProduct(wi, u, extent);
    if (add_bias_fwd) {
      total += wi[extent]; // The bias value.
    }
    v[i] = total;
  }
}

// Copies the whole input transposed, converted to TFloat, into *this.
void TransposedArray::Transpose(const GENERIC_2D_ARRAY<TFloat> &input) {
  int width = input.dim1();
  int num_features = input.dim2();
  ResizeNoInit(num_features, width);
  for (int t = 0; t < width; ++t) {
    WriteStrided(t, input[t]);
  }
}

// Destructor.
// It is defined here, so the compiler can create a single vtable
// instead of weak vtables in every compilation unit.
TransposedArray::~TransposedArray() = default;

// Sets up the network for training. Initializes the weights to random values
// of scale `weight_range`, picked according to the random number generator
// `randomizer`.
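// With a null randomizer the weights are simply left at zero; otherwise each
// weight is drawn with randomizer->SignedRand(weight_range), i.e. a random
// value of magnitude at most weight_range.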
int WeightMatrix::InitWeightsFloat(int no, int ni, bool use_adam, float weight_range,
                                   TRand *randomizer) {
  int_mode_ = false;
  wf_.Resize(no, ni, 0.0);
  if (randomizer != nullptr) {
    for (int i = 0; i < no; ++i) {
      for (int j = 0; j < ni; ++j) {
        wf_[i][j] = randomizer->SignedRand(weight_range);
      }
    }
  }
  use_adam_ = use_adam;
  InitBackward();
  return ni * no;
}

// Changes the number of outputs to the size of the given code_map, copying
// the old weight matrix entries for each output from code_map[output] where
// non-negative, and using the mean (over all outputs) of the existing weights
// for all outputs with negative code_map entries. Returns the new number of
// weights.
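// For example, code_map = {0, 2, -1} keeps old outputs 0 and 2 as new outputs
// 0 and 1, and initializes a brand-new output 2 with the mean of all old
// output weights.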
int WeightMatrix::RemapOutputs(const std::vector<int> &code_map) {
  GENERIC_2D_ARRAY<TFloat> old_wf(wf_);
  int old_no = wf_.dim1();
  int new_no = code_map.size();
  int ni = wf_.dim2();
  std::vector<TFloat> means(ni, 0.0);
  for (int c = 0; c < old_no; ++c) {
    const TFloat *weights = wf_[c];
    for (int i = 0; i < ni; ++i) {
      means[i] += weights[i];
    }
  }
  for (auto &mean : means) {
    mean /= old_no;
  }
  wf_.Resize(new_no, ni, 0.0);
  InitBackward();
  for (int dest = 0; dest < new_no; ++dest) {
    int src = code_map[dest];
    const TFloat *src_data = src >= 0 ? old_wf[src] : means.data();
    memcpy(wf_[dest], src_data, ni * sizeof(*src_data));
  }
  return ni * new_no;
}

// Converts a float network to an int network. Each set of input weights that
// corresponds to a single output weight is converted independently:
// Compute the max absolute value of the weight set.
// Scale so the max absolute value becomes INT8_MAX.
// Round to integer.
// Store a multiplicative scale factor (as a TFloat) that will reproduce
// the original value, subject to rounding errors.
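// For example, if the largest |weight| in a row is 0.5, then scale = 0.5 / 127;
// the 0.5 entry itself is stored as 127, and 127 * scale reproduces 0.5 exactly,
// while other entries come back to within half a quantization step. The value
// pushed onto scales_ is scale / 127; that extra 1/127 factor is removed again
// when the scales are written to disk (see Serialize below).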
void WeightMatrix::ConvertToInt() {
  wi_.ResizeNoInit(wf_.dim1(), wf_.dim2());
  scales_.reserve(wi_.dim1());
  int dim2 = wi_.dim2();
  for (int t = 0; t < wi_.dim1(); ++t) {
    TFloat *f_line = wf_[t];
    int8_t *i_line = wi_[t];
    TFloat max_abs = 0;
    for (int f = 0; f < dim2; ++f) {
      TFloat abs_val = fabs(f_line[f]);
      if (abs_val > max_abs) {
        max_abs = abs_val;
      }
    }
    TFloat scale = max_abs / INT8_MAX;
    scales_.push_back(scale / INT8_MAX);
    if (scale == 0.0) {
      scale = 1.0;
    }
    for (int f = 0; f < dim2; ++f) {
      i_line[f] = IntCastRounded(f_line[f] / scale);
    }
  }
  wf_.Resize(1, 1, 0.0);
  int_mode_ = true;
  if (IntSimdMatrix::intSimdMatrix) {
    int32_t rounded_num_out;
    IntSimdMatrix::intSimdMatrix->Init(wi_, shaped_w_, rounded_num_out);
    scales_.resize(rounded_num_out);
  }
}

// Allocates any needed memory for running Backward, and zeroes the deltas,
// thus eliminating any existing momentum.
void WeightMatrix::InitBackward() {
  int no = int_mode_ ? wi_.dim1() : wf_.dim1();
  int ni = int_mode_ ? wi_.dim2() : wf_.dim2();
  dw_.Resize(no, ni, 0.0);
  updates_.Resize(no, ni, 0.0);
  wf_t_.Transpose(wf_);
  if (use_adam_) {
    dw_sq_sum_.Resize(no, ni, 0.0);
  }
}

// Flag on mode to indicate that this weightmatrix uses int8_t.
const int kInt8Flag = 1;
// Flag on mode to indicate that this weightmatrix uses Adam.
const int kAdamFlag = 4;
// Flag on mode to indicate that this weightmatrix uses double. Set
// independently of kInt8Flag as even in int mode the scales can
// be float or double.
const int kDoubleFlag = 128;
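// For example, a float-format matrix trained with Adam serializes
// mode = kAdamFlag | kDoubleFlag = 132, while an int8_t inference matrix
// (without Adam) serializes mode = kInt8Flag | kDoubleFlag = 129.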

// Writes to the given file. Returns false in case of error.
bool WeightMatrix::Serialize(bool training, TFile *fp) const {
  // For backward compatibility, add kDoubleFlag to mode to indicate the doubles
  // format, without errs, so we can detect and read old format weight matrices.
  uint8_t mode = (int_mode_ ? kInt8Flag : 0) | (use_adam_ ? kAdamFlag : 0) | kDoubleFlag;
  if (!fp->Serialize(&mode)) {
    return false;
  }
  if (int_mode_) {
    if (!wi_.Serialize(fp)) {
      return false;
    }
    uint32_t size = scales_.size();
    if (!fp->Serialize(&size)) {
      return false;
    }
    for (auto scale : scales_) {
      // The scales stored in memory have an extra factor applied to them
      // to allow faster operation. We have to remove that factor here
      // before writing to disc.
      double value = scale * INT8_MAX;
      if (!fp->Serialize(&value)) {
        return false;
      }
    }
  } else {
    if (!tesseract::Serialize(fp, wf_)) {
      return false;
    }
    if (training) {
      if (!tesseract::Serialize(fp, updates_)) {
        return false;
      }
      if (use_adam_ && !tesseract::Serialize(fp, dw_sq_sum_)) {
        return false;
      }
    }
  }
  return true;
}

// Reads from the given file. Returns false in case of error.
bool WeightMatrix::DeSerialize(bool training, TFile *fp) {
  uint8_t mode;
  if (!fp->DeSerialize(&mode)) {
    return false;
  }
  int_mode_ = (mode & kInt8Flag) != 0;
  use_adam_ = (mode & kAdamFlag) != 0;
  if ((mode & kDoubleFlag) == 0) {
    return DeSerializeOld(training, fp);
  }
  if (int_mode_) {
    if (!wi_.DeSerialize(fp)) {
      return false;
    }
    uint32_t size;
    if (!fp->DeSerialize(&size)) {
      return false;
    }
#ifdef FAST_FLOAT
    scales_.reserve(size);
    for (auto n = size; n > 0; n--) {
      double val;
      if (!fp->DeSerialize(&val)) {
        return false;
      }
      scales_.push_back(val / INT8_MAX);
    }
#else
    scales_.resize(size);
    if (!fp->DeSerialize(&scales_[0], size)) {
      return false;
    }
    for (auto &scale : scales_) {
      scale /= INT8_MAX;
    }
#endif
    if (IntSimdMatrix::intSimdMatrix) {
      int32_t rounded_num_out;
      IntSimdMatrix::intSimdMatrix->Init(wi_, shaped_w_, rounded_num_out);
      scales_.resize(rounded_num_out);
    }
  } else {
    if (!tesseract::DeSerialize(fp, wf_)) {
      return false;
    }
    if (training) {
      InitBackward();
      if (!tesseract::DeSerialize(fp, updates_)) {
        return false;
      }
      if (use_adam_) {
        if (!tesseract::DeSerialize(fp, dw_sq_sum_)) {
          return false;
        }
      }
    }
  }
  return true;
}

// As DeSerialize, but reads an old (float) format WeightMatrix for
// backward compatibility.
bool WeightMatrix::DeSerializeOld(bool training, TFile *fp) {
#ifdef FAST_FLOAT
  // Not implemented.
  ASSERT_HOST(!"not implemented");
  return false;
#else
  if (int_mode_) {
    if (!wi_.DeSerialize(fp)) {
      return false;
    }
    std::vector<float> old_scales;
    if (!fp->DeSerialize(old_scales)) {
      return false;
    }
    scales_.reserve(old_scales.size());
    for (float old_scale : old_scales) {
      scales_.push_back(old_scale);
    }
  } else {
    GENERIC_2D_ARRAY<float> float_array;
    if (!float_array.DeSerialize(fp)) {
      return false;
    }
    FloatToDouble(float_array, wf_);
  }
  if (training) {
    InitBackward();
    GENERIC_2D_ARRAY<float> float_array;
    if (!float_array.DeSerialize(fp)) {
      return false;
    }
    FloatToDouble(float_array, updates_);
    // Errs was only used in int training, which is now dead.
    if (!float_array.DeSerialize(fp)) {
      return false;
    }
  }
  return true;
#endif
}

// Computes matrix.vector v = Wu.
// u is of size W.dim2() - 1 and the output v is of size W.dim1().
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
// Asserts that the call matches what we have.
void WeightMatrix::MatrixDotVector(const TFloat *u, TFloat *v) const {
  assert(!int_mode_);
  MatrixDotVectorInternal(wf_, true, false, u, v);
}

void WeightMatrix::MatrixDotVector(const int8_t *u, TFloat *v) const {
  assert(int_mode_);
  if (IntSimdMatrix::intSimdMatrix) {
    IntSimdMatrix::intSimdMatrix->matrixDotVectorFunction(wi_.dim1(), wi_.dim2(), &shaped_w_[0],
                                                          &scales_[0], u, v);
  } else {
    IntSimdMatrix::MatrixDotVector(wi_, scales_, u, v);
  }
}

// MatrixDotVector for peep weights, MultiplyAccumulate adds the
// component-wise products of *this[0] and v to inout.
void WeightMatrix::MultiplyAccumulate(const TFloat *v, TFloat *inout) {
  assert(!int_mode_);
  assert(wf_.dim1() == 1);
  int n = wf_.dim2();
  const TFloat *u = wf_[0];
  for (int i = 0; i < n; ++i) {
    inout[i] += u[i] * v[i];
  }
}

// Computes vector.matrix v = uW.
// u is of size W.dim1() and the output v is of size W.dim2() - 1.
// The last result is discarded, as v is assumed to have an imaginary
// last value of 1, as with MatrixDotVector.
void WeightMatrix::VectorDotMatrix(const TFloat *u, TFloat *v) const {
  assert(!int_mode_);
  MatrixDotVectorInternal(wf_t_, false, true, u, v);
}

// Fills dw_[i][j] with the dot product u[i][] . v[j][], using elements from
// u and v. In terms of the neural network, u is the gradients and v is the
// inputs.
// Note that (matching MatrixDotVector) v[last][] is missing, presumed 1.0.
// Runs parallel if requested. Note that u and v must be transposed.
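// In matrix terms this computes dw_[i][j] as the sum over the num_samples
// timesteps t of u[i][t] * v[j][t], i.e. the outer product of gradients and
// inputs accumulated over the batch, with the final column of dw_ holding the
// bias gradient (the plain sum of u over the samples).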
void WeightMatrix::SumOuterTransposed(const TransposedArray &u, const TransposedArray &v,
                                      bool in_parallel) {
  assert(!int_mode_);
  int num_outputs = dw_.dim1();
  assert(u.dim1() == num_outputs);
  assert(u.dim2() == v.dim2());
  int num_inputs = dw_.dim2() - 1;
  int num_samples = u.dim2();
  // v is missing the last element in dim1.
  assert(v.dim1() == num_inputs);
#ifdef _OPENMP
# pragma omp parallel for num_threads(4) if (in_parallel)
#endif
  for (int i = 0; i < num_outputs; ++i) {
    TFloat *dwi = dw_[i];
    const TFloat *ui = u[i];
    for (int j = 0; j < num_inputs; ++j) {
      dwi[j] = DotProduct(ui, v[j], num_samples);
    }
    // The last element of v is missing, presumed 1.0f.
    TFloat total = 0;
    for (int k = 0; k < num_samples; ++k) {
      total += ui[k];
    }
    dwi[num_inputs] = total;
  }
}

// Updates the weights using the given learning rate and momentum.
// num_samples is the quotient to be used in the Adam computation iff
// use_adam_ is true.
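// The learning_rate scaling below corresponds to Adam's bias correction,
// sqrt(1 - beta2^t) / (1 - beta1^t), with adam_beta playing the role of beta2
// and momentum that of beta1; once num_samples reaches
// kAdamCorrectionIterations the factor is effectively 1 and is skipped.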
void WeightMatrix::Update(float learning_rate, float momentum, float adam_beta, int num_samples) {
  assert(!int_mode_);
  if (use_adam_ && momentum > 0.0f && num_samples > 0 && num_samples < kAdamCorrectionIterations) {
    learning_rate *= sqrt(1.0f - pow(adam_beta, num_samples));
    learning_rate /= 1.0f - pow(momentum, num_samples);
  }
  if (use_adam_ && num_samples > 0 && momentum > 0.0f) {
    dw_sq_sum_.SumSquares(dw_, adam_beta);
    dw_ *= learning_rate * (1.0f - momentum);
    updates_ *= momentum;
    updates_ += dw_;
    wf_.AdamUpdate(updates_, dw_sq_sum_, learning_rate * kAdamEpsilon);
  } else {
    dw_ *= learning_rate;
    updates_ += dw_;
    if (momentum > 0.0f) {
      wf_ += updates_;
    }
    if (momentum >= 0.0f) {
      updates_ *= momentum;
    }
  }
  wf_t_.Transpose(wf_);
}

// Adds the dw_ in other to the dw_ in *this.
void WeightMatrix::AddDeltas(const WeightMatrix &other) {
  assert(dw_.dim1() == other.dw_.dim1());
  assert(dw_.dim2() == other.dw_.dim2());
  dw_ += other.dw_;
}

// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
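// For example, if a weight's update is +0.1 in *this and -0.2 in other, the
// product -0.02 contributes 0.02 to *changed; if both updates are +0.1, the
// product 0.01 is added to *same.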
void WeightMatrix::CountAlternators(const WeightMatrix &other, TFloat *same,
                                    TFloat *changed) const {
  int num_outputs = updates_.dim1();
  int num_inputs = updates_.dim2();
  assert(num_outputs == other.updates_.dim1());
  assert(num_inputs == other.updates_.dim2());
  for (int i = 0; i < num_outputs; ++i) {
    const TFloat *this_i = updates_[i];
    const TFloat *other_i = other.updates_[i];
    for (int j = 0; j < num_inputs; ++j) {
      TFloat product = this_i[j] * other_i[j];
      if (product < 0.0) {
        *changed -= product;
      } else {
        *same += product;
      }
    }
  }
}

// Helper computes an integer histogram bucket for a weight and adds it
// to the histogram.
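// Bucket b collects weights whose -log2(|weight|) rounds to b, so e.g.
// |weight| = 0.25 lands in bucket 2; zero weights go in the last bucket.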
const int kHistogramBuckets = 16;
static void HistogramWeight(TFloat weight, STATS *histogram) {
  int bucket = kHistogramBuckets - 1;
  if (weight != 0.0) {
    TFloat logval = -log2(fabs(weight));
    bucket = ClipToRange(IntCastRounded(logval), 0, kHistogramBuckets - 1);
  }
  histogram->add(bucket, 1);
}

void WeightMatrix::Debug2D(const char *msg) {
  STATS histogram(0, kHistogramBuckets - 1);
  if (int_mode_) {
    for (int i = 0; i < wi_.dim1(); ++i) {
      for (int j = 0; j < wi_.dim2(); ++j) {
        HistogramWeight(wi_[i][j] * scales_[i], &histogram);
      }
    }
  } else {
    for (int i = 0; i < wf_.dim1(); ++i) {
      for (int j = 0; j < wf_.dim2(); ++j) {
        HistogramWeight(wf_[i][j], &histogram);
      }
    }
  }
  tprintf("%s\n", msg);
  histogram.print();
}

} // namespace tesseract.