#include <TensorCostModel.h>

Static Public Member Functions
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int	numThreads (double output_size, const TensorOpCost &cost_per_coeff, int max_threads)

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double	taskSize (double output_size, const TensorOpCost &cost_per_coeff)

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double	totalCost (double output_size, const TensorOpCost &cost_per_coeff)

Static Public Attributes
static const int	kDeviceCyclesPerComputeCycle = 1

static const int	kStartupCycles = 100000

static const int	kPerThreadCycles = 100000

static const int	kTaskSize = 40000

Member Function Documentation

◆ numThreads()

template<typename Device >

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int Eigen::TensorCostModel< Device >::numThreads	(	double	output_size,
		const TensorOpCost &	cost_per_coeff,
		int	max_threads
	)

inline static

                                                                                {
     double cost = totalCost(output_size, cost_per_coeff);
     double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
     // Make sure we don't invoke undefined behavior when we convert to an int.
     threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
     return numext::mini(max_threads, numext::maxi<int>(1, static_cast<int>(threads)));
   }

References Eigen::TensorCostModel< Device >::kPerThreadCycles, Eigen::TensorCostModel< Device >::kStartupCycles, Eigen::numext::mini(), and Eigen::TensorCostModel< Device >::totalCost().

◆ taskSize()

template<typename Device >

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double Eigen::TensorCostModel< Device >::taskSize	(	double	output_size,
		const TensorOpCost &	cost_per_coeff
	)

inline static

                                                                                                                        {
     return totalCost(output_size, cost_per_coeff) / kTaskSize;
   }

References Eigen::TensorCostModel< Device >::kTaskSize, and Eigen::TensorCostModel< Device >::totalCost().

◆ totalCost()

template<typename Device >

static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double Eigen::TensorCostModel< Device >::totalCost	(	double	output_size,
		const TensorOpCost &	cost_per_coeff
	)

inline static

                                                                                                     {
     // Cost of memory fetches from L2 cache. 64 is typical cache line size.
     // 11 is L2 cache latency on Haswell.
     // We don't know whether data is in L1, L2 or L3. But we are most interested
     // in single-threaded computational time around 100us-10ms (smaller time
     // is too small for parallelization, larger time is not interesting
     // either because we are probably using all available threads already).
     // And for the target time range, L2 seems to be what matters. Data set
     // fitting into L1 is too small to take noticeable time. Data set fitting
     // only into L3 presumably will take more than 10ms to load and process.
     const double kLoadCycles = 1.0 / 64 * 11;
     const double kStoreCycles = 1.0 / 64 * 11;
     // Scaling from Eigen compute cost to device cycles.
     return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle);
   }