tesseract-ocr.github.io/4.0.0-beta.1/a00044_source.html

 // File:        intsimdmatrix.h
 // Description: Base class for 8-bit int SIMD matrix multipliers.
 // Author:      Ray Smith
 // Created:     Tue Aug 15 07:37:20 PST 2017
 //
 // (C) Copyright 2017, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #ifndef TESSERACT_ARCH_INTSIMDMATRIX_H_
 #define TESSERACT_ARCH_INTSIMDMATRIX_H_

 #include <stdint.h>
 #include <vector>
 #include "genericvector.h"
 #include "matrix.h"

 namespace tesseract {

 // Base class for a SIMD function to multiply a matrix by a vector, with sources
 // of 8-bit signed integer, and result in a double, after appropriate scaling.
 // Assumes a specific method of multiplication that can be applied to any size
 // and number of SIMD registers as follows:
 // int32_t results are computed with num_outputs_per_register_ in each of
 // max_output_registers_ result registers, repeatedly until it would make too
 // many results, then the number of registers is halved, and so on down to a
 // single result register. The last calculation only outputs the required number
 // of results instead of writing beyond the bounds. E.g. if the matrix has 75
 // outputs, num_outputs_per_register_ = 4, and max_output_registers_ = 8:
 // Step 1: 8x4=32 results are computed,
 // Step 2: 8x4=32 again, total 64,
 // Step 3: 2x4=8 (since 8x4 is too many, so is 4x4), total 72,
 // Step 4: 1x3, total 75.
 // Each step above is computed using a PartialFunc, which runs over the input
 // vector once. The input is read one registerful of num_inputs_per_register_
 // at a time (presumably 4x num_outputs_per_register_ since they are int8_t)
 // so the inputs MUST BE PADDED to a multiple of num_inputs_per_register_.
 // Since it is slow (on Intel at least) to horizontally add in a register,
 // provision is made to process num_inputs_per_group_ inputs at a time, with
 // the group being replicated num_input_groups_ times and multiplied by a
 // num_inputs_per_group_ by num_input_groups_ rectangle of the weights matrix.
 // This is most convenient if num_inputs_per_group_ is 4, and the product
 // sign-extends and sums 8x8=16 bit results to 32 bits, adding 4 adjacent
 // results in the process, but it doesn't have to be implemented that way.
 // The weights are re-ordered by Init() to be used sequentially by the above
 // algorithm, followed by the biases, so they can be added at the end.
 // The base class computes the base C++ implementation.
 // NOTE that, although the subclasses execute on different SIMD hardware, no
 // virtual methods are needed, as the constructor sets up everything that
 // is required to allow the base class implementation to do all the work.
 class IntSimdMatrix {
  public:
   // A constructor is expected to set the data members to indicate the sizes;
   // this base constructor sets every one of them to 1.
   // NOTE: It is public only so that tests can use it.
   IntSimdMatrix()
       : num_outputs_per_register_(1), max_output_registers_(1),
         num_inputs_per_register_(1), num_inputs_per_group_(1),
         num_input_groups_(1) {}

   // Factory: makes and returns the IntSimdMatrix (sub)class of the best
   // available type for the current architecture.
   static IntSimdMatrix* GetFastestMultiplier();

   // Builds a reshaped copy of the weight matrix w. Does nothing when there
   // are no partial_funcs_.
   void Init(const GENERIC_2D_ARRAY<int8_t>& w);

   // Returns size rounded up to a multiple of the input register size
   // (measured in int8_t).
   int RoundInputs(int size) const {
     return IntSimdMatrix::Roundup(size, num_inputs_per_register_);
   }
   // Returns size rounded up to a multiple of the output register size
   // (measured in int32_t).
   int RoundOutputs(int size) const {
     return IntSimdMatrix::Roundup(size, num_outputs_per_register_);
   }

   // Matrix-vector product v = Wu.
   // u has W.dim2() - 1 elements and the output v has W.dim1() elements.
   // The bias is implemented by treating u as if it had one more element, of
   // value 1, at the end; that element is not actually stored in u.
   // When there are no partial_funcs_, the base C++ implementation is used.
   // NOTE: The input vector (u) must be padded to the size given by
   // RoundInputs above, because the input is over-read to the extent of that
   // padding. There are no alignment requirements.
   void MatrixDotVector(const GENERIC_2D_ARRAY<int8_t>& w,
                        const GenericVector<double>& scales, const int8_t* u,
                        double* v) const;

  protected:
   // Signature of a function that computes part of a matrix.vector
   // multiplication. w holds the weights in the very specific order described
   // for this class; it is multiplied by u, of length num_in, and the integer
   // results are scaled by the corresponding member of scales to give v.
   // A fixed amount of w and scales is consumed, and that amount is not
   // available to the caller. At most num_out outputs are written to v.
   typedef void (*PartialFunc)(const int8_t* w, const double* scales,
                               const int8_t* u, int num_in, int num_out,
                               double* v);

   // Returns input rounded up to a multiple of factor.
   static int Roundup(int input, int factor) {
     const int num_multiples = (input + factor - 1) / factor;
     return num_multiples * factor;
   }

   // How many 32 bit outputs each register holds.
   int num_outputs_per_register_;
   // Upper limit on the number of registers used to hold outputs.
   int max_output_registers_;
   // How many 8 bit inputs the inputs register holds.
   int num_inputs_per_register_;
   // How many inputs there are in each weight group.
   int num_inputs_per_group_;
   // How many groups of inputs are to be broadcast.
   int num_input_groups_;
   // The weights matrix, reorganized in whatever way suits this instance.
   std::vector<int8_t> shaped_w_;
   // The sequence of functions that each compute a partial result.
   std::vector<PartialFunc> partial_funcs_;
 };

 }  // namespace tesseract

 #endif  // TESSERACT_ARCH_INTSIMDMATRIX_H_
genericvector.h

tesseract::IntSimdMatrix::num_outputs_per_register_
int num_outputs_per_register_
Definition: intsimdmatrix.h:118

tesseract::IntSimdMatrix::MatrixDotVector
void MatrixDotVector(const GENERIC_2D_ARRAY< int8_t > &w, const GenericVector< double > &scales, const int8_t *u, double *v) const
Definition: intsimdmatrix.cpp:94

tesseract::IntSimdMatrix::RoundInputs
int RoundInputs(int size) const
Definition: intsimdmatrix.h:80

matrix.h

tesseract
Definition: baseapi.cpp:84

tesseract::IntSimdMatrix::num_inputs_per_register_
int num_inputs_per_register_
Definition: intsimdmatrix.h:122

tesseract::IntSimdMatrix::PartialFunc
void(* PartialFunc)(const int8_t *w, const double *scales, const int8_t *u, int num_in, int num_out, double *v)
Definition: intsimdmatrix.h:108

tesseract::IntSimdMatrix::IntSimdMatrix
IntSimdMatrix()
Definition: intsimdmatrix.h:64

tesseract::IntSimdMatrix::partial_funcs_
std::vector< PartialFunc > partial_funcs_
Definition: intsimdmatrix.h:130

tesseract::IntSimdMatrix::Init
void Init(const GENERIC_2D_ARRAY< int8_t > &w)
Definition: intsimdmatrix.cpp:44

GenericVector< double >

GENERIC_2D_ARRAY
Definition: matrix.h:47

tesseract::IntSimdMatrix::RoundOutputs
int RoundOutputs(int size) const
Definition: intsimdmatrix.h:84

tesseract::IntSimdMatrix::shaped_w_
std::vector< int8_t > shaped_w_
Definition: intsimdmatrix.h:128

tesseract::IntSimdMatrix
Definition: intsimdmatrix.h:60

tesseract::IntSimdMatrix::num_input_groups_
int num_input_groups_
Definition: intsimdmatrix.h:126

tesseract::IntSimdMatrix::max_output_registers_
int max_output_registers_
Definition: intsimdmatrix.h:120

tesseract::IntSimdMatrix::Roundup
static int Roundup(int input, int factor)
Definition: intsimdmatrix.h:113

tesseract::IntSimdMatrix::GetFastestMultiplier
static IntSimdMatrix * GetFastestMultiplier()
Definition: intsimdmatrix.cpp:29

tesseract::IntSimdMatrix::num_inputs_per_group_
int num_inputs_per_group_
Definition: intsimdmatrix.h:124