tesseract 4.00.00dev
weightmatrix.cpp
// File:        weightmatrix.cpp
// Description: Hides distinction between float/int implementations.
// Author:      Ray Smith
// Created:     Tue Jun 17 11:46:20 PST 2014
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "weightmatrix.h"

#include "dotproductavx.h"
#include "dotproductsse.h"
#include "intsimdmatrix.h"
#include "simddetect.h"
#include "statistc.h"
#include "tprintf.h"

namespace tesseract {

// Number of iterations after which the correction effectively becomes unity.
const int kAdamCorrectionIterations = 200000;
// Epsilon in Adam to prevent division by zero.
const double kAdamEpsilon = 1e-8;
// Copies the whole input transposed, converted to double, into *this.
void TransposedArray::Transpose(const GENERIC_2D_ARRAY<double>& input) {
  int width = input.dim1();
  int num_features = input.dim2();
  ResizeNoInit(num_features, width);
  for (int t = 0; t < width; ++t) WriteStrided(t, input[t]);
}
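
// Usage sketch (a hypothetical example, not part of the original file):
// a 2x3 input, i.e. width = 2 timesteps of 3 features each, becomes a
// 3x2 transpose in which element (f, t) == input[t][f].
static void TransposeExample() {
  GENERIC_2D_ARRAY<double> input;
  input.Resize(2, 3, 0.0);
  input[0][0] = 1.0; input[0][1] = 2.0; input[0][2] = 3.0;
  input[1][0] = 4.0; input[1][1] = 5.0; input[1][2] = 6.0;
  TransposedArray t_array;
  t_array.Transpose(input);
  // Now t_array.dim1() == 3, t_array.dim2() == 2, and t_array[2][1] == 6.0.
}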

// Sets up the network for training. Initializes weights using weights of
// scale `range` picked according to the random number generator `randomizer`.
int WeightMatrix::InitWeightsFloat(int no, int ni, bool use_adam,
                                   float weight_range, TRand* randomizer) {
  int_mode_ = false;
  wf_.Resize(no, ni, 0.0);
  if (randomizer != NULL) {
    for (int i = 0; i < no; ++i) {
      for (int j = 0; j < ni; ++j) {
        wf_[i][j] = randomizer->SignedRand(weight_range);
      }
    }
  }
  use_adam_ = use_adam;
  InitBackward();
  return ni * no;
}
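
// Usage sketch (hypothetical, not part of the original file): sets up a
// 4x3 matrix, i.e. 4 outputs fed by 2 real inputs plus the implicit bias
// input, with initial weights uniform in [-0.1, 0.1].
static void InitExample(TRand* randomizer) {
  WeightMatrix w;
  // Returns ni * no == 12: each output row holds 2 input weights + 1 bias.
  w.InitWeightsFloat(4, 3, /*use_adam=*/true, 0.1f, randomizer);
  w.ConvertToInt();  // Optional: quantize for inference; wf_ is discarded.
}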

// Changes the number of outputs to the size of the given code_map, copying
// the old weight matrix entries for each output from code_map[output] where
// non-negative, and uses the mean (over all outputs) of the existing weights
// for all outputs with negative code_map entries. Returns the new number of
// weights.
int WeightMatrix::RemapOutputs(const std::vector<int>& code_map) {
  GENERIC_2D_ARRAY<double> old_wf(wf_);
  int old_no = wf_.dim1();
  int new_no = code_map.size();
  int ni = wf_.dim2();
  std::vector<double> means(ni, 0.0);
  for (int c = 0; c < old_no; ++c) {
    const double* weights = wf_[c];
    for (int i = 0; i < ni; ++i) means[i] += weights[i];
  }
  for (double& mean : means) mean /= old_no;
  wf_.ResizeNoInit(new_no, ni);
  InitBackward();
  for (int dest = 0; dest < new_no; ++dest) {
    int src = code_map[dest];
    const double* src_data = src >= 0 ? old_wf[src] : means.data();
    memcpy(wf_[dest], src_data, ni * sizeof(*src_data));
  }
  return ni * new_no;
}
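
// Worked example (illustrative): with 3 old outputs and
// code_map = {0, 2, -1, 1}, the remapped matrix has 4 rows: rows 0, 1 and 3
// copy old rows 0, 2 and 1 respectively, while row 2 (code_map entry -1)
// receives the column-wise mean of all 3 old rows. The return value is
// 4 * ni.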

// Converts a float network to an int network. Each set of input weights that
// corresponds to a single output weight is converted independently:
// Compute the max absolute value of the weight set.
// Scale so the max absolute value becomes MAX_INT8.
// Round to integer.
// Store a multiplicative scale factor (as a double) that will reproduce
// the original value, subject to rounding errors.
void WeightMatrix::ConvertToInt() {
  wi_.ResizeNoInit(wf_.dim1(), wf_.dim2());
  scales_.init_to_size(wi_.dim1(), 0.0);
  int dim2 = wi_.dim2();
  for (int t = 0; t < wi_.dim1(); ++t) {
    double* f_line = wf_[t];
    inT8* i_line = wi_[t];
    double max_abs = 0.0;
    for (int f = 0; f < dim2; ++f) {
      double abs_val = fabs(f_line[f]);
      if (abs_val > max_abs) max_abs = abs_val;
    }
    double scale = max_abs / MAX_INT8;
    scales_[t] = scale;
    if (scale == 0.0) scale = 1.0;
    for (int f = 0; f < dim2; ++f) {
      i_line[f] = IntCastRounded(f_line[f] / scale);
    }
  }
  wf_.Resize(1, 1, 0.0);
  int_mode_ = true;
  multiplier_.reset(IntSimdMatrix::GetFastestMultiplier());
  if (multiplier_ != nullptr) multiplier_->Init(wi_);
}
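
// Worked example (illustrative): a row of weights {0.5, -1.27, 0.02} has
// max_abs = 1.27, so scale = 1.27 / 127 = 0.01 and the stored inT8 row is
// {50, -127, 2}. Multiplying products back by the saved scale reproduces
// the float results to within rounding error.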

// Allocates any needed memory for running Backward, and zeroes the deltas,
// thus eliminating any existing momentum.
void WeightMatrix::InitBackward() {
  int no = int_mode_ ? wi_.dim1() : wf_.dim1();
  int ni = int_mode_ ? wi_.dim2() : wf_.dim2();
  dw_.Resize(no, ni, 0.0);
  updates_.Resize(no, ni, 0.0);
  wf_t_.Transpose(wf_);
  if (use_adam_) dw_sq_sum_.Resize(no, ni, 0.0);
}

// Flag on mode to indicate that this weightmatrix uses inT8.
const int kInt8Flag = 1;
// Flag on mode to indicate that this weightmatrix uses adam.
const int kAdamFlag = 4;
// Flag on mode to indicate that this weightmatrix uses double. Set
// independently of kInt8Flag as even in int mode the scales can
// be float or double.
const int kDoubleFlag = 128;

// Writes to the given file. Returns false in case of error.
bool WeightMatrix::Serialize(bool training, TFile* fp) const {
  // For backward compatibility, add kDoubleFlag to mode to indicate the
  // doubles format, without errs, so we can detect and read old format
  // weight matrices.
  uinT8 mode =
      (int_mode_ ? kInt8Flag : 0) | (use_adam_ ? kAdamFlag : 0) | kDoubleFlag;
  if (fp->FWrite(&mode, sizeof(mode), 1) != 1) return false;
  if (int_mode_) {
    if (!wi_.Serialize(fp)) return false;
    if (!scales_.Serialize(fp)) return false;
  } else {
    if (!wf_.Serialize(fp)) return false;
    if (training && !updates_.Serialize(fp)) return false;
    if (training && use_adam_ && !dw_sq_sum_.Serialize(fp)) return false;
  }
  return true;
}
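
// For illustration, the mode byte written above for an int-mode matrix
// trained with Adam is 1 | 4 | 128 == 133; files written before kDoubleFlag
// existed never have the 128 bit set, which is how DeSerialize below
// detects the old float format.
static_assert((kInt8Flag | kAdamFlag | kDoubleFlag) == 133,
              "mode byte for an int-mode Adam matrix in the current format");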

// Reads from the given file. Returns false in case of error.
bool WeightMatrix::DeSerialize(bool training, TFile* fp) {
  uinT8 mode = 0;
  if (fp->FRead(&mode, sizeof(mode), 1) != 1) return false;
  int_mode_ = (mode & kInt8Flag) != 0;
  use_adam_ = (mode & kAdamFlag) != 0;
  if ((mode & kDoubleFlag) == 0) return DeSerializeOld(training, fp);
  if (int_mode_) {
    if (!wi_.DeSerialize(fp)) return false;
    if (!scales_.DeSerialize(fp)) return false;
    multiplier_.reset(IntSimdMatrix::GetFastestMultiplier());
    if (multiplier_ != nullptr) multiplier_->Init(wi_);
  } else {
    if (!wf_.DeSerialize(fp)) return false;
    if (training) {
      InitBackward();
      if (!updates_.DeSerialize(fp)) return false;
      if (use_adam_ && !dw_sq_sum_.DeSerialize(fp)) return false;
    }
  }
  return true;
}

// As DeSerialize, but reads an old (float) format WeightMatrix for
// backward compatibility.
bool WeightMatrix::DeSerializeOld(bool training, TFile* fp) {
  GENERIC_2D_ARRAY<float> float_array;
  if (int_mode_) {
    if (!wi_.DeSerialize(fp)) return false;
    GenericVector<float> old_scales;
    if (!old_scales.DeSerialize(fp)) return false;
    scales_.resize_no_init(old_scales.size());
    for (int i = 0; i < old_scales.size(); ++i) scales_[i] = old_scales[i];
  } else {
    if (!float_array.DeSerialize(fp)) return false;
    FloatToDouble(float_array, &wf_);
  }
  if (training) {
    InitBackward();
    if (!float_array.DeSerialize(fp)) return false;
    FloatToDouble(float_array, &updates_);
    // Errs was only used in int training, which is now dead.
    if (!float_array.DeSerialize(fp)) return false;
  }
  return true;
}

// Computes matrix.vector v = Wu.
// u is of size W.dim2() - 1 and the output v is of size W.dim1().
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
// Asserts that the call matches what we have.
void WeightMatrix::MatrixDotVector(const double* u, double* v) const {
  ASSERT_HOST(!int_mode_);
  MatrixDotVectorInternal(wf_, true, false, u, v);
}

void WeightMatrix::MatrixDotVector(const inT8* u, double* v) const {
  ASSERT_HOST(int_mode_);
  ASSERT_HOST(multiplier_ != nullptr);
  multiplier_->MatrixDotVector(wi_, scales_, u, v);
}

// MatrixDotVector for peep weights, MultiplyAccumulate adds the
// component-wise products of *this[0] and v to inout.
void WeightMatrix::MultiplyAccumulate(const double* v, double* inout) {
  ASSERT_HOST(!int_mode_);
  ASSERT_HOST(wf_.dim1() == 1);
  int n = wf_.dim2();
  const double* u = wf_[0];
  for (int i = 0; i < n; ++i) {
    inout[i] += u[i] * v[i];
  }
}
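
// Worked example (illustrative): with *this holding the single row
// {0.5, 2.0} and v = {4.0, 3.0}, the element-wise products {2.0, 6.0} are
// added to inout.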

// Computes vector.matrix v = uW.
// u is of size W.dim1() and the output v is of size W.dim2() - 1.
// The last result is discarded, as v is assumed to have an imaginary
// last value of 1, as with MatrixDotVector.
void WeightMatrix::VectorDotMatrix(const double* u, double* v) const {
  ASSERT_HOST(!int_mode_);
  MatrixDotVectorInternal(wf_t_, false, true, u, v);
}

// Fills dw_[i][j] with the dot product u[i][] . v[j][], using elements from
// u and v. In terms of the neural network, u is the gradients and v is the
// inputs.
// Note that (matching MatrixDotVector) v[last][] is missing, presumed 1.0.
// Runs parallel if requested. Note that u and v must be transposed.
void WeightMatrix::SumOuterTransposed(const TransposedArray& u,
                                      const TransposedArray& v,
                                      bool in_parallel) {
  ASSERT_HOST(!int_mode_);
  int num_outputs = dw_.dim1();
  ASSERT_HOST(u.dim1() == num_outputs);
  ASSERT_HOST(u.dim2() == v.dim2());
  int num_inputs = dw_.dim2() - 1;
  int num_samples = u.dim2();
  // v is missing the last element in dim1.
  ASSERT_HOST(v.dim1() == num_inputs);
#ifdef _OPENMP
#pragma omp parallel for num_threads(4) if (in_parallel)
#endif
  for (int i = 0; i < num_outputs; ++i) {
    double* dwi = dw_[i];
    const double* ui = u[i];
    for (int j = 0; j < num_inputs; ++j) {
      dwi[j] = DotProduct(ui, v[j], num_samples);
    }
    // The last element of v is missing, presumed 1.0f.
    double total = 0.0;
    for (int k = 0; k < num_samples; ++k) total += ui[k];
    dwi[num_inputs] = total;
  }
}
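
// In matrix terms, the loop above computes dw = u v^T summed over the
// batch: dw_[i][j] = sum over samples s of u[i][s] * v[j][s], plus a bias
// column dw_[i][num_inputs] = sum over s of u[i][s], since the missing
// last input is presumed to be the constant 1.0.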

// Updates the weights using the given learning rate and momentum.
// num_samples is the quotient to be used in the adam computation iff
// use_adam_ is true.
void WeightMatrix::Update(double learning_rate, double momentum,
                          double adam_beta, int num_samples) {
  ASSERT_HOST(!int_mode_);
  if (use_adam_ && num_samples > 0 && num_samples < kAdamCorrectionIterations) {
    learning_rate *= sqrt(1.0 - pow(adam_beta, num_samples));
    learning_rate /= 1.0 - pow(momentum, num_samples);
  }
  if (use_adam_ && num_samples > 0 && momentum > 0.0) {
    dw_sq_sum_.SumSquares(dw_, adam_beta);
    dw_ *= learning_rate * (1.0 - momentum);
    updates_ *= momentum;
    updates_ += dw_;
    wf_.AdamUpdate(updates_, dw_sq_sum_, learning_rate * kAdamEpsilon);
  } else {
    dw_ *= learning_rate;
    updates_ += dw_;
    if (momentum > 0.0) wf_ += updates_;
    if (momentum >= 0.0) updates_ *= momentum;
  }
  wf_t_.Transpose(wf_);
}
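
// Worked example (illustrative), taking adam_beta = 0.999 and
// momentum = 0.9 (Adam's usual beta2/beta1): at num_samples = 100 the
// correction above scales the rate by sqrt(1 - 0.999^100) / (1 - 0.9^100)
// ~ 0.309, so early updates use roughly 0.31x the nominal learning rate;
// beyond kAdamCorrectionIterations the correction is skipped, as by then
// it is effectively unity.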

// Adds the dw_ in other to the dw_ in *this.
void WeightMatrix::AddDeltas(const WeightMatrix& other) {
  ASSERT_HOST(dw_.dim1() == other.dw_.dim1());
  ASSERT_HOST(dw_.dim2() == other.dw_.dim2());
  dw_ += other.dw_;
}

// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
void WeightMatrix::CountAlternators(const WeightMatrix& other, double* same,
                                    double* changed) const {
  int num_outputs = updates_.dim1();
  int num_inputs = updates_.dim2();
  ASSERT_HOST(num_outputs == other.updates_.dim1());
  ASSERT_HOST(num_inputs == other.updates_.dim2());
  for (int i = 0; i < num_outputs; ++i) {
    const double* this_i = updates_[i];
    const double* other_i = other.updates_[i];
    for (int j = 0; j < num_inputs; ++j) {
      double product = this_i[j] * other_i[j];
      if (product < 0.0)
        *changed -= product;
      else
        *same += product;
    }
  }
}
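
// Worked example (illustrative): if updates_[i] = {0.5, -0.2} here and
// {0.4, 0.3} in other, the products are 0.2 and -0.06, so *same gains 0.2
// (agreeing directions) and *changed gains 0.06 (opposing directions).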

// Helper computes an integer histogram bucket for a weight and adds it
// to the histogram.
const int kHistogramBuckets = 16;
static void HistogramWeight(double weight, STATS* histogram) {
  int bucket = kHistogramBuckets - 1;
  if (weight != 0.0) {
    double logval = -log2(fabs(weight));
    bucket = ClipToRange(IntCastRounded(logval), 0, kHistogramBuckets - 1);
  }
  histogram->add(bucket, 1);
}
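
// Worked examples (illustrative): the bucket is round(-log2(|weight|)),
// clipped to [0, kHistogramBuckets - 1]:
//   weight = 1.0   -> -log2(1.0)  = 0      -> bucket 0
//   weight = 0.25  -> -log2(0.25) = 2      -> bucket 2
//   weight = 3e-6  -> -log2(3e-6) ~ 18.3   -> clipped to bucket 15
//   weight = 0.0   -> bucket 15, shared with the smallest weights.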

void WeightMatrix::Debug2D(const char* msg) {
  STATS histogram(0, kHistogramBuckets);
  if (int_mode_) {
    for (int i = 0; i < wi_.dim1(); ++i) {
      for (int j = 0; j < wi_.dim2(); ++j) {
        HistogramWeight(wi_[i][j] * scales_[i], &histogram);
      }
    }
  } else {
    for (int i = 0; i < wf_.dim1(); ++i) {
      for (int j = 0; j < wf_.dim2(); ++j) {
        HistogramWeight(wf_[i][j], &histogram);
      }
    }
  }
  tprintf("%s\n", msg);
  histogram.print();
}

// Computes and returns the dot product of the two n-vectors u and v.
/* static */
double WeightMatrix::DotProduct(const double* u, const double* v, int n) {
  // Note: because the order of addition is different among the 3 DotProduct
  // functions, the results can (and do) vary slightly (although they agree
  // to within about 4e-15). This produces different results when running
  // training, despite all random inputs being precisely equal.
  // To get consistent results, use just one of these DotProduct functions.
  // On a test multi-layer network, serial is 57% slower than sse, and avx
  // is about 8% faster than sse. This suggests that the time is memory
  // bandwidth constrained and could benefit from holding the reused vector
  // in AVX registers.
  if (SIMDDetect::IsAVXAvailable()) return DotProductAVX(u, v, n);
  if (SIMDDetect::IsSSEAvailable()) return DotProductSSE(u, v, n);
  double total = 0.0;
  for (int k = 0; k < n; ++k) total += u[k] * v[k];
  return total;
}

// Utility function converts an array of float to the corresponding array
// of double.
/* static */
void WeightMatrix::FloatToDouble(const GENERIC_2D_ARRAY<float>& wf,
                                 GENERIC_2D_ARRAY<double>* wd) {
  int dim1 = wf.dim1();
  int dim2 = wf.dim2();
  wd->ResizeNoInit(dim1, dim2);
  for (int i = 0; i < dim1; ++i) {
    const float* wfi = wf[i];
    double* wdi = (*wd)[i];
    for (int j = 0; j < dim2; ++j) wdi[j] = static_cast<double>(wfi[j]);
  }
}

// Computes matrix.vector v = Wu.
// u is of size W.dim2() - add_bias_fwd and the output v is of size
// W.dim1() - skip_bias_back.
// If add_bias_fwd, u is imagined to have an extra element at the end with
// value 1, to implement the bias weight.
// If skip_bias_back, we are actually performing the backwards product on a
// transposed matrix, so we need to drop the v output corresponding to the
// last element in dim1.
void WeightMatrix::MatrixDotVectorInternal(const GENERIC_2D_ARRAY<double>& w,
                                           bool add_bias_fwd,
                                           bool skip_bias_back, const double* u,
                                           double* v) {
  int num_results = w.dim1() - skip_bias_back;
  int extent = w.dim2() - add_bias_fwd;
  for (int i = 0; i < num_results; ++i) {
    const double* wi = w[i];
    double total = DotProduct(wi, u, extent);
    if (add_bias_fwd) total += wi[extent];  // The bias value.
    v[i] = total;
  }
}
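
// Worked example (illustrative, forward case): with row w[i] = {2, 3, 5}
// and u = {1, 2}, extent is 2, so v[i] = 2*1 + 3*2 + 5 = 13; the last
// weight in the row serves as the bias for the implicit input of 1.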

}  // namespace tesseract.