weightmatrix.cpp
// File:        weightmatrix.cpp
// Description: Hides distinction between float/int implementations.
// Author:      Ray Smith
// Created:     Tue Jun 17 11:46:20 PST 2014
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "weightmatrix.h"

#include "dotproductavx.h"
#include "dotproductsse.h"
#include "simddetect.h"
#include "statistc.h"
#include "tprintf.h"

namespace tesseract {

// Copies the whole input transposed, converted to double, into *this.
void TransposedArray::Transpose(const GENERIC_2D_ARRAY<double>& input) {
  int width = input.dim1();
  int num_features = input.dim2();
  ResizeNoInit(num_features, width);
  for (int t = 0; t < width; ++t) WriteStrided(t, input[t]);
}

// Sets up the network for training. Initializes weights with random values of
// scale `weight_range` picked according to the random number generator
// `randomizer`.
int WeightMatrix::InitWeightsFloat(int no, int ni, bool ada_grad,
                                   float weight_range, TRand* randomizer) {
  int_mode_ = false;
  wf_.Resize(no, ni, 0.0);
  if (randomizer != NULL) {
    for (int i = 0; i < no; ++i) {
      for (int j = 0; j < ni; ++j) {
        wf_[i][j] = randomizer->SignedRand(weight_range);
      }
    }
  }
  use_ada_grad_ = ada_grad;
  InitBackward();
  return ni * no;
}

// Converts a float network to an int network. Each set of input weights that
// corresponds to a single output weight is converted independently:
// Compute the max absolute value of the weight set.
// Scale so the max absolute value becomes MAX_INT8.
// Round to integer.
// Store a multiplicative scale factor (as a double) that will reproduce
// the original value, subject to rounding errors.
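// For example (illustrative numbers, not taken from the code): if a row's max
// absolute weight is 1.27, the scale is 1.27 / 127 = 0.01, so a weight of 0.5
// is stored as the int8 value 50 and later recovered as 50 * 0.01 = 0.5.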
void WeightMatrix::ConvertToInt() {
  wi_.ResizeNoInit(wf_.dim1(), wf_.dim2());
  scales_.init_to_size(wi_.dim1(), 0.0);
  int dim2 = wi_.dim2();
  for (int t = 0; t < wi_.dim1(); ++t) {
    double* f_line = wf_[t];
    inT8* i_line = wi_[t];
    double max_abs = 0.0;
    for (int f = 0; f < dim2; ++f) {
      double abs_val = fabs(f_line[f]);
      if (abs_val > max_abs) max_abs = abs_val;
    }
    double scale = max_abs / MAX_INT8;
    scales_[t] = scale;
    if (scale == 0.0) scale = 1.0;
    for (int f = 0; f < dim2; ++f) {
      i_line[f] = IntCastRounded(f_line[f] / scale);
    }
  }
  wf_.Resize(1, 1, 0.0);
  int_mode_ = true;
}

// Allocates any needed memory for running Backward, and zeroes the deltas,
// thus eliminating any existing momentum.
void WeightMatrix::InitBackward() {
  int no = int_mode_ ? wi_.dim1() : wf_.dim1();
  int ni = int_mode_ ? wi_.dim2() : wf_.dim2();
  dw_.Resize(no, ni, 0.0);
  updates_.Resize(no, ni, 0.0);
  wf_t_.Transpose(wf_);
  if (use_ada_grad_) dw_sq_sum_.Resize(no, ni, 0.0);
}

// Flag on mode to indicate that this weightmatrix uses inT8.
const int kInt8Flag = 1;
// Flag on mode to indicate that this weightmatrix uses ada grad.
const int kAdaGradFlag = 4;
// Flag on mode to indicate that this weightmatrix uses double. Set
// independently of kInt8Flag as even in int mode the scales can
// be float or double.
const int kDoubleFlag = 128;
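// For example, a float-mode matrix serializes mode = 128 (kDoubleFlag), an
// int-mode matrix serializes 129 (kInt8Flag | kDoubleFlag), and adding AdaGrad
// gives 133. Files written before kDoubleFlag existed have that bit clear,
// which is how DeSerialize below detects the old format.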

// Writes to the given file. Returns false in case of error.
bool WeightMatrix::Serialize(bool training, TFile* fp) const {
  // For backward compatibility, add kDoubleFlag to mode to indicate the doubles
  // format, without errs, so we can detect and read old format weight matrices.
  uinT8 mode = (int_mode_ ? kInt8Flag : 0) |
               (use_ada_grad_ ? kAdaGradFlag : 0) | kDoubleFlag;
  if (fp->FWrite(&mode, sizeof(mode), 1) != 1) return false;
  if (int_mode_) {
    if (!wi_.Serialize(fp)) return false;
    if (!scales_.Serialize(fp)) return false;
  } else {
    if (!wf_.Serialize(fp)) return false;
    if (training && !updates_.Serialize(fp)) return false;
    if (training && use_ada_grad_ && !dw_sq_sum_.Serialize(fp)) return false;
  }
  return true;
}

// Reads from the given file. Returns false in case of error.
bool WeightMatrix::DeSerialize(bool training, TFile* fp) {
  uinT8 mode = 0;
  if (fp->FRead(&mode, sizeof(mode), 1) != 1) return false;
  int_mode_ = (mode & kInt8Flag) != 0;
  use_ada_grad_ = (mode & kAdaGradFlag) != 0;
  if ((mode & kDoubleFlag) == 0) return DeSerializeOld(training, fp);
  if (int_mode_) {
    if (!wi_.DeSerialize(fp)) return false;
    if (!scales_.DeSerialize(fp)) return false;
  } else {
    if (!wf_.DeSerialize(fp)) return false;
    if (training) {
      InitBackward();
      if (!updates_.DeSerialize(fp)) return false;
      if (use_ada_grad_ && !dw_sq_sum_.DeSerialize(fp)) return false;
    }
  }
  return true;
}

// As DeSerialize, but reads an old (float) format WeightMatrix for
// backward compatibility.
bool WeightMatrix::DeSerializeOld(bool training, TFile* fp) {
  GENERIC_2D_ARRAY<float> float_array;
  if (int_mode_) {
    if (!wi_.DeSerialize(fp)) return false;
    GenericVector<float> old_scales;
    if (!old_scales.DeSerialize(fp)) return false;
    scales_.resize_no_init(old_scales.size());
    for (int i = 0; i < old_scales.size(); ++i) scales_[i] = old_scales[i];
  } else {
    if (!float_array.DeSerialize(fp)) return false;
    FloatToDouble(float_array, &wf_);
  }
  if (training) {
    InitBackward();
    if (!float_array.DeSerialize(fp)) return false;
    FloatToDouble(float_array, &updates_);
    // Errs was only used in int training, which is now dead.
    if (!float_array.DeSerialize(fp)) return false;
  }
  return true;
}

// Computes matrix.vector v = Wu.
// u is of size W.dim2() - 1 and the output v is of size W.dim1().
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
// Asserts that the call matches what we have.
void WeightMatrix::MatrixDotVector(const double* u, double* v) const {
  ASSERT_HOST(!int_mode_);
  MatrixDotVectorInternal(wf_, true, false, u, v);
}

void WeightMatrix::MatrixDotVector(const inT8* u, double* v) const {
  ASSERT_HOST(int_mode_);
  int num_out = wi_.dim1();
  int num_in = wi_.dim2() - 1;
  for (int i = 0; i < num_out; ++i) {
    const inT8* Wi = wi_[i];
    int total = 0;
    if (SIMDDetect::IsSSEAvailable()) {
      total = IntDotProductSSE(u, Wi, num_in);
    } else {
      for (int j = 0; j < num_in; ++j) total += Wi[j] * u[j];
    }
    // Add in the bias and correct for integer values.
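    // Dividing by MAX_INT8 undoes the scaling assumed to have been applied to
    // the int8 input u, and multiplying by scales_[i] restores the magnitude
    // removed from this row's weights in ConvertToInt; the int8 bias weight
    // Wi[num_in] is added before that final rescaling.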
    v[i] = (static_cast<double>(total) / MAX_INT8 + Wi[num_in]) * scales_[i];
  }
}

// MatrixDotVector for peep weights, MultiplyAccumulate adds the
// component-wise products of *this[0] and v to inout.
void WeightMatrix::MultiplyAccumulate(const double* v, double* inout) {
  ASSERT_HOST(!int_mode_);
  ASSERT_HOST(wf_.dim1() == 1);
  int n = wf_.dim2();
  const double* u = wf_[0];
  for (int i = 0; i < n; ++i) {
    inout[i] += u[i] * v[i];
  }
}

// Computes vector.matrix v = uW.
// u is of size W.dim1() and the output v is of size W.dim2() - 1.
// The last result is discarded, as v is assumed to have an imaginary
// last value of 1, as with MatrixDotVector.
void WeightMatrix::VectorDotMatrix(const double* u, double* v) const {
  ASSERT_HOST(!int_mode_);
  MatrixDotVectorInternal(wf_t_, false, true, u, v);
}

// Fills dw_[i][j] with the dot product u[i][] . v[j][], using elements from
// u and v. In terms of the neural network, u is the gradients and v is the
// inputs.
// Note that (matching MatrixDotVector) v[last][] is missing, presumed 1.0.
// Runs parallel if requested. Note that u and v must be transposed.
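// Each row dw_[i] thus receives num_inputs dot products taken over the
// num_samples columns, plus a final entry holding the bias gradient, which is
// simply the sum of u[i][] over the samples (the presumed-1.0 input).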
void WeightMatrix::SumOuterTransposed(const TransposedArray& u,
                                      const TransposedArray& v,
                                      bool in_parallel) {
  ASSERT_HOST(!int_mode_);
  int num_outputs = dw_.dim1();
  ASSERT_HOST(u.dim1() == num_outputs);
  ASSERT_HOST(u.dim2() == v.dim2());
  int num_inputs = dw_.dim2() - 1;
  int num_samples = u.dim2();
  // v is missing the last element in dim1.
  ASSERT_HOST(v.dim1() == num_inputs);
#ifdef _OPENMP
#pragma omp parallel for num_threads(4) if (in_parallel)
#endif
  for (int i = 0; i < num_outputs; ++i) {
    double* dwi = dw_[i];
    const double* ui = u[i];
    for (int j = 0; j < num_inputs; ++j) {
      dwi[j] = DotProduct(ui, v[j], num_samples);
    }
    // The last element of v is missing, presumed 1.0f.
    double total = 0.0;
    for (int k = 0; k < num_samples; ++k) total += ui[k];
    dwi[num_inputs] = total;
  }
}

// Updates the weights using the given learning rate and momentum.
// num_samples is the quotient to be used in the adagrad computation iff
// use_ada_grad_ is true.
void WeightMatrix::Update(double learning_rate, double momentum,
                          int num_samples) {
  ASSERT_HOST(!int_mode_);
  if (use_ada_grad_ && num_samples > 0) {
    dw_sq_sum_.SumSquares(dw_);
    dw_.AdaGradScaling(dw_sq_sum_, num_samples);
  }
  dw_ *= learning_rate;
  updates_ += dw_;
  if (momentum > 0.0) wf_ += updates_;
  if (momentum >= 0.0) updates_ *= momentum;
  wf_t_.Transpose(wf_);
}
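// In effect this is classical momentum: the step applied on each call is
// momentum * previous_step + learning_rate * dw_ (after the optional AdaGrad
// scaling), accumulated in updates_. Note that the weights themselves are only
// adjusted when momentum > 0.0.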

// Adds the dw_ in other to the dw_ in *this.
void WeightMatrix::AddDeltas(const WeightMatrix& other) {
  ASSERT_HOST(dw_.dim1() == other.dw_.dim1());
  ASSERT_HOST(dw_.dim2() == other.dw_.dim2());
  dw_ += other.dw_;
}

// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
void WeightMatrix::CountAlternators(const WeightMatrix& other, double* same,
                                    double* changed) const {
  int num_outputs = updates_.dim1();
  int num_inputs = updates_.dim2();
  ASSERT_HOST(num_outputs == other.updates_.dim1());
  ASSERT_HOST(num_inputs == other.updates_.dim2());
  for (int i = 0; i < num_outputs; ++i) {
    const double* this_i = updates_[i];
    const double* other_i = other.updates_[i];
    for (int j = 0; j < num_inputs; ++j) {
      double product = this_i[j] * other_i[j];
      if (product < 0.0)
        *changed -= product;
      else
        *same += product;
    }
  }
}

// Helper computes an integer histogram bucket for a weight and adds it
// to the histogram.
const int kHistogramBuckets = 16;
static void HistogramWeight(double weight, STATS* histogram) {
  int bucket = kHistogramBuckets - 1;
  if (weight != 0.0) {
    double logval = -log2(fabs(weight));
    bucket = ClipToRange(IntCastRounded(logval), 0, kHistogramBuckets - 1);
  }
  histogram->add(bucket, 1);
}
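// For example, a weight of 0.25 gives -log2(0.25) = 2 and lands in bucket 2;
// weights of magnitude 1.0 or more fall into bucket 0, while zero and very
// small weights fall into the last bucket.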

void WeightMatrix::Debug2D(const char* msg) {
  STATS histogram(0, kHistogramBuckets);
  if (int_mode_) {
    for (int i = 0; i < wi_.dim1(); ++i) {
      for (int j = 0; j < wi_.dim2(); ++j) {
        HistogramWeight(wi_[i][j] * scales_[i], &histogram);
      }
    }
  } else {
    for (int i = 0; i < wf_.dim1(); ++i) {
      for (int j = 0; j < wf_.dim2(); ++j) {
        HistogramWeight(wf_[i][j], &histogram);
      }
    }
  }
  tprintf("%s\n", msg);
  histogram.print();
}

// Computes and returns the dot product of the two n-vectors u and v.
/* static */
double WeightMatrix::DotProduct(const double* u, const double* v, int n) {
  // Note: because the order of addition is different among the 3 DotProduct
  // functions, the results can (and do) vary slightly (although they agree
  // to within about 4e-15). This produces different results when running
  // training, despite all random inputs being precisely equal.
  // To get consistent results, use just one of these DotProduct functions.
  // On a test multi-layer network, serial is 57% slower than sse, and avx
  // is about 8% faster than sse. This suggests that the time is memory
  // bandwidth constrained and could benefit from holding the reused vector
  // in AVX registers.
/*
omp simd code
real 4m17,294s
user 12m39,344s
sys 0m2,252s

real 4m22,403s
user 12m53,408s
sys 0m2,116s

old code
real 2m52,396s
user 7m42,624s
sys 0m2,008s
*/

#ifndef _OPENMP
  if (SIMDDetect::IsAVXAvailable()) return DotProductAVX(u, v, n);
  if (SIMDDetect::IsSSEAvailable()) return DotProductSSE(u, v, n);
#endif
  double total = 0.0;
#ifdef _OPENMP
#pragma omp simd
#endif
  for (int k = 0; k < n; ++k) total += u[k] * v[k];
  return total;
}

// Utility function converts an array of float to the corresponding array
// of double.
/* static */
void WeightMatrix::FloatToDouble(const GENERIC_2D_ARRAY<float>& wf,
                                 GENERIC_2D_ARRAY<double>* wd) {
  int dim1 = wf.dim1();
  int dim2 = wf.dim2();
  wd->ResizeNoInit(dim1, dim2);
  for (int i = 0; i < dim1; ++i) {
    const float* wfi = wf[i];
    double* wdi = (*wd)[i];
    for (int j = 0; j < dim2; ++j) wdi[j] = static_cast<double>(wfi[j]);
  }
}

// Computes matrix.vector v = Wu.
// u is of size W.dim2() - add_bias_fwd and the output v is of size
// W.dim1() - skip_bias_back.
// If add_bias_fwd, u is imagined to have an extra element at the end with
// value 1, to implement the bias weight.
// If skip_bias_back, we are actually performing the backwards product on a
// transposed matrix, so we need to drop the v output corresponding to the last
// element in dim1.
void WeightMatrix::MatrixDotVectorInternal(const GENERIC_2D_ARRAY<double>& w,
                                           bool add_bias_fwd,
                                           bool skip_bias_back, const double* u,
                                           double* v) {
  int num_results = w.dim1() - skip_bias_back;
  int extent = w.dim2() - add_bias_fwd;
#ifdef _OPENMP
#pragma omp simd
#endif
  for (int i = 0; i < num_results; ++i) {
    const double* wi = w[i];
#ifndef OLD
    double total = 0.0;
    for (int k = 0; k < extent; ++k) total += wi[k] * u[k];
#else
    double total = DotProduct(wi, u, extent);
#endif
    if (add_bias_fwd) total += wi[extent];  // The bias value.
    v[i] = total;
  }
}

}  // namespace tesseract.
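
// Illustrative call sequence (a sketch only; no, ni, randomizer, inputs and
// outputs are hypothetical locals, not defined in this file):
//   tesseract::WeightMatrix w;
//   w.InitWeightsFloat(no, ni + 1, /*ada_grad=*/false, 0.1f, &randomizer);
//   w.MatrixDotVector(inputs, outputs);  // float-mode forward pass.
//   w.ConvertToInt();                    // quantize weights for inference.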