Eigen::TensorCostModel< Device > Class Template Reference

#include <TensorCostModel.h>

Static Public Member Functions

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads (double output_size, const TensorOpCost &cost_per_coeff, int max_threads)
 
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize (double output_size, const TensorOpCost &cost_per_coeff)
 
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost (double output_size, const TensorOpCost &cost_per_coeff)
 

Static Public Attributes

static const int kDeviceCyclesPerComputeCycle = 1
 
static const int kStartupCycles = 100000
 
static const int kPerThreadCycles = 100000
 
static const int kTaskSize = 40000
 

Member Function Documentation

◆ numThreads()

template<typename Device >
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int Eigen::TensorCostModel< Device >::numThreads ( double  output_size,
const TensorOpCost & cost_per_coeff,
int  max_threads 
)
inline static
155  {
156  double cost = totalCost(output_size, cost_per_coeff);
157  double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
158  // Make sure we don't invoke undefined behavior when we convert to an int.
159  threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
160  return numext::mini(max_threads, numext::maxi<int>(1, static_cast<int>(threads)));
161  }
static const int kPerThreadCycles
Definition: TensorCostModel.h:148
static const int kStartupCycles
Definition: TensorCostModel.h:147
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size, const TensorOpCost &cost_per_coeff)
Definition: TensorCostModel.h:170
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
Definition: MathFunctions.h:920

References Eigen::TensorCostModel< Device >::kPerThreadCycles, Eigen::TensorCostModel< Device >::kStartupCycles, Eigen::numext::mini(), and Eigen::TensorCostModel< Device >::totalCost().

◆ taskSize()

template<typename Device >
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double Eigen::TensorCostModel< Device >::taskSize ( double  output_size,
const TensorOpCost & cost_per_coeff 
)
inline static
166  {
167  return totalCost(output_size, cost_per_coeff) / kTaskSize;
168  }
static const int kTaskSize
Definition: TensorCostModel.h:149

References Eigen::TensorCostModel< Device >::kTaskSize, and Eigen::TensorCostModel< Device >::totalCost().

◆ totalCost()

template<typename Device >
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double Eigen::TensorCostModel< Device >::totalCost ( double  output_size,
const TensorOpCost & cost_per_coeff 
)
inline static
171  {
172  // Cost of memory fetches from L2 cache. 64 is typical cache line size.
173  // 11 is L2 cache latency on Haswell.
174  // We don't know whether data is in L1, L2 or L3. But we are most interested
175  // in single-threaded computational time around 100us-10ms (smaller time
176  // is too small for parallelization, larger time is not interesting
177  // either because we are probably using all available threads already).
178  // And for the target time range, L2 seems to be what matters. Data set
179  // fitting into L1 is too small to take noticeable time. Data set fitting
180  // only into L3 presumably will take more than 10ms to load and process.
181  const double kLoadCycles = 1.0 / 64 * 11;
182  const double kStoreCycles = 1.0 / 64 * 11;
183  // Scaling from Eigen compute cost to device cycles.
184  return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle);
185  }
static const int kDeviceCyclesPerComputeCycle
Definition: TensorCostModel.h:144

References Eigen::TensorCostModel< Device >::kDeviceCyclesPerComputeCycle, and Eigen::TensorOpCost::total_cost().

Referenced by Eigen::TensorCostModel< Device >::numThreads(), and Eigen::TensorCostModel< Device >::taskSize().

Member Data Documentation

◆ kDeviceCyclesPerComputeCycle

template<typename Device >
const int Eigen::TensorCostModel< Device >::kDeviceCyclesPerComputeCycle = 1
static

◆ kPerThreadCycles

template<typename Device >
const int Eigen::TensorCostModel< Device >::kPerThreadCycles = 100000
static

◆ kStartupCycles

template<typename Device >
const int Eigen::TensorCostModel< Device >::kStartupCycles = 100000
static

◆ kTaskSize

template<typename Device >
const int Eigen::TensorCostModel< Device >::kTaskSize = 40000
static

The documentation for this class was generated from the following file: