TensorCostModel.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
12 
13 // IWYU pragma: private
14 #include "./InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
26 // Class storing the cost of evaluating a tensor expression in terms of the
 27 // estimated number of operand bytes loaded, bytes stored, and compute cycles.
28 class TensorOpCost {
29  public:
30  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
31  // model based on minimal reciprocal throughput numbers from Intel or
32  // Agner Fog's tables would be better than what is there now.
33  template <typename ArgType>
36  }
37  template <typename ArgType>
40  }
41  template <typename ArgType>
44  }
45  template <typename ArgType>
48  }
49  template <typename SrcType, typename TargetType>
52  }
53 
57 
58  EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized,
59  double packet_size)
62  compute_cycles_(vectorized ? compute_cycles / packet_size : compute_cycles) {
66  }
67 
71  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost,
72  double compute_cost) const {
73  return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + compute_cost * compute_cycles_;
74  }
75 
76  // Drop memory access component. Intended for cases when memory accesses are
77  // sequential or are completely masked by computations.
79  bytes_loaded_ = 0;
80  bytes_stored_ = 0;
81  }
82 
83  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
89  }
90 
91  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
97  }
98 
100  bytes_loaded_ += rhs.bytes_loaded();
101  bytes_stored_ += rhs.bytes_stored();
103  return *this;
104  }
105 
107  bytes_loaded_ *= rhs;
108  bytes_stored_ *= rhs;
109  compute_cycles_ *= rhs;
110  return *this;
111  }
112 
114  lhs += rhs;
115  return lhs;
116  }
118  lhs *= rhs;
119  return lhs;
120  }
122  rhs *= lhs;
123  return rhs;
124  }
125 
126  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
127  return os << "[bytes_loaded = " << tc.bytes_loaded() << ", bytes_stored = " << tc.bytes_stored()
128  << ", compute_cycles = " << tc.compute_cycles() << "]";
129  }
130 
131  private:
135 };
136 
 137 // TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
138 // in [1:max_threads] instead of just switching multi-threading off for small
139 // work units.
140 template <typename Device>
142  public:
143  // Scaling from Eigen compute cost to device cycles.
144  static const int kDeviceCyclesPerComputeCycle = 1;
145 
146  // Costs in device cycles.
147  static const int kStartupCycles = 100000;
148  static const int kPerThreadCycles = 100000;
149  static const int kTaskSize = 40000;
150 
151  // Returns the number of threads in [1:max_threads] to use for
152  // evaluating an expression with the given output size and cost per
153  // coefficient.
154  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost& cost_per_coeff,
155  int max_threads) {
156  double cost = totalCost(output_size, cost_per_coeff);
157  double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
158  // Make sure we don't invoke undefined behavior when we convert to an int.
159  threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
160  return numext::mini(max_threads, numext::maxi<int>(1, static_cast<int>(threads)));
161  }
162 
163  // taskSize assesses parallel task size.
164  // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
165  // granularity needs to be increased to mitigate parallelization overheads.
166  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size, const TensorOpCost& cost_per_coeff) {
167  return totalCost(output_size, cost_per_coeff) / kTaskSize;
168  }
169 
170  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size,
171  const TensorOpCost& cost_per_coeff) {
172  // Cost of memory fetches from L2 cache. 64 is typical cache line size.
173  // 11 is L2 cache latency on Haswell.
174  // We don't know whether data is in L1, L2 or L3. But we are most interested
175  // in single-threaded computational time around 100us-10ms (smaller time
176  // is too small for parallelization, larger time is not interesting
177  // either because we are probably using all available threads already).
178  // And for the target time range, L2 seems to be what matters. Data set
179  // fitting into L1 is too small to take noticeable time. Data set fitting
180  // only into L3 presumably will take more than 10ms to load and process.
181  const double kLoadCycles = 1.0 / 64 * 11;
182  const double kStoreCycles = 1.0 / 64 * 11;
183  // Scaling from Eigen compute cost to device cycles.
184  return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle);
185  }
186 };
187 
188 } // namespace Eigen
189 
190 #endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define eigen_assert(x)
Definition: Macros.h:910
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
Definition: TensorCostModel.h:141
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost &cost_per_coeff, int max_threads)
Definition: TensorCostModel.h:154
static const int kDeviceCyclesPerComputeCycle
Definition: TensorCostModel.h:144
static const int kPerThreadCycles
Definition: TensorCostModel.h:148
static const int kStartupCycles
Definition: TensorCostModel.h:147
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size, const TensorOpCost &cost_per_coeff)
Definition: TensorCostModel.h:166
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size, const TensorOpCost &cost_per_coeff)
Definition: TensorCostModel.h:170
static const int kTaskSize
Definition: TensorCostModel.h:149
Definition: TensorCostModel.h:28
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(const TensorOpCost &rhs) const
Definition: TensorCostModel.h:84
EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
Definition: TensorCostModel.h:55
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost & operator+=(const TensorOpCost &rhs)
Definition: TensorCostModel.h:99
double bytes_loaded_
Definition: TensorCostModel.h:132
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost & operator*=(double rhs)
Definition: TensorCostModel.h:106
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(const TensorOpCost &rhs) const
Definition: TensorCostModel.h:92
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(TensorOpCost lhs, const TensorOpCost &rhs)
Definition: TensorCostModel.h:113
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const
Definition: TensorCostModel.h:69
double bytes_stored_
Definition: TensorCostModel.h:133
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(double lhs, TensorOpCost rhs)
Definition: TensorCostModel.h:121
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost()
Definition: TensorCostModel.h:34
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost()
Definition: TensorCostModel.h:46
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost()
Definition: TensorCostModel.h:38
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(TensorOpCost lhs, double rhs)
Definition: TensorCostModel.h:117
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost()
Definition: TensorCostModel.h:50
EIGEN_DEVICE_FUNC TensorOpCost()
Definition: TensorCostModel.h:54
friend std::ostream & operator<<(std::ostream &os, const TensorOpCost &tc)
Definition: TensorCostModel.h:126
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const
Definition: TensorCostModel.h:68
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost()
Definition: TensorCostModel.h:42
double compute_cycles_
Definition: TensorCostModel.h:134
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const
Definition: TensorCostModel.h:70
EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized, double packet_size)
Definition: TensorCostModel.h:58
EIGEN_DEVICE_FUNC void dropMemoryCost()
Definition: TensorCostModel.h:78
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost, double compute_cost) const
Definition: TensorCostModel.h:71
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool() isfinite(const Eigen::bfloat16 &h)
Definition: BFloat16.h:752
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T &x, const T &y)
Definition: MathFunctions.h:926
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
Definition: MathFunctions.h:920
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
Definition: NumTraits.h:172
Definition: XprHelper.h:205