TensorCostModel.h
Go to the documentation of this file.
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
12 
13 // IWYU pragma: private
14 #include "./InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
26 // Class storing the cost of evaluating a tensor expression in terms of the
 27 // estimated number of operand bytes loaded, bytes stored, and compute cycles.
28 class TensorOpCost {
29  public:
30  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
31  // model based on minimal reciprocal throughput numbers from Intel or
32  // Agner Fog's tables would be better than what is there now.
33  template <typename ArgType>
36  }
37  template <typename ArgType>
40  }
41  template <typename ArgType>
44  }
45  template <typename ArgType>
48  }
49  template <typename SrcType, typename TargetType>
52  }
53 
57 
58  EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized,
59  double packet_size)
62  compute_cycles_(vectorized ? compute_cycles / packet_size : compute_cycles) {
66  }
67 
71  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost,
72  double compute_cost) const {
73  return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + compute_cost * compute_cycles_;
74  }
75 
76  // Drop memory access component. Intended for cases when memory accesses are
77  // sequential or are completely masked by computations.
79  bytes_loaded_ = 0;
80  bytes_stored_ = 0;
81  }
82 
83  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
89  }
90 
91  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
97  }
98 
100  bytes_loaded_ += rhs.bytes_loaded();
101  bytes_stored_ += rhs.bytes_stored();
103  return *this;
104  }
105 
107  bytes_loaded_ *= rhs;
108  bytes_stored_ *= rhs;
109  compute_cycles_ *= rhs;
110  return *this;
111  }
112 
114  lhs += rhs;
115  return lhs;
116  }
118  lhs *= rhs;
119  return lhs;
120  }
122  rhs *= lhs;
123  return rhs;
124  }
125 
126  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
127  return os << "[bytes_loaded = " << tc.bytes_loaded() << ", bytes_stored = " << tc.bytes_stored()
128  << ", compute_cycles = " << tc.compute_cycles() << "]";
129  }
130 
131  private:
135 };
136 
 137 // TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
138 // in [1:max_threads] instead of just switching multi-threading off for small
139 // work units.
140 template <typename Device>
142  public:
143  // Scaling from Eigen compute cost to device cycles.
144  static const int kDeviceCyclesPerComputeCycle = 1;
145 
146  // Costs in device cycles.
147  static const int kStartupCycles = 100000;
148  static const int kPerThreadCycles = 100000;
149  static const int kTaskSize = 40000;
150 
151  // Returns the number of threads in [1:max_threads] to use for
152  // evaluating an expression with the given output size and cost per
153  // coefficient.
154  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost& cost_per_coeff,
155  int max_threads) {
156  double cost = totalCost(output_size, cost_per_coeff);
157  double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
158  // Make sure we don't invoke undefined behavior when we convert to an int.
159  threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
160  return numext::mini(max_threads, numext::maxi<int>(1, static_cast<int>(threads)));
161  }
162 
163  // taskSize assesses parallel task size.
164  // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
165  // granularity needs to be increased to mitigate parallelization overheads.
166  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size, const TensorOpCost& cost_per_coeff) {
167  return totalCost(output_size, cost_per_coeff) / kTaskSize;
168  }
169 
170  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size,
171  const TensorOpCost& cost_per_coeff) {
172  // Cost of memory fetches from L2 cache. 64 is typical cache line size.
173  // 11 is L2 cache latency on Haswell.
174  // We don't know whether data is in L1, L2 or L3. But we are most interested
175  // in single-threaded computational time around 100us-10ms (smaller time
176  // is too small for parallelization, larger time is not interesting
177  // either because we are probably using all available threads already).
178  // And for the target time range, L2 seems to be what matters. Data set
179  // fitting into L1 is too small to take noticeable time. Data set fitting
180  // only into L3 presumably will take more than 10ms to load and process.
181  const double kLoadCycles = 1.0 / 64 * 11;
182  const double kStoreCycles = 1.0 / 64 * 11;
183  // Scaling from Eigen compute cost to device cycles.
184  return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle);
185  }
186 };
187 
188 } // namespace Eigen
189 
190 #endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define eigen_assert(x)
Definition: Macros.h:910
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
Definition: TensorCostModel.h:141
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost &cost_per_coeff, int max_threads)
Definition: TensorCostModel.h:154
static const int kDeviceCyclesPerComputeCycle
Definition: TensorCostModel.h:144
static const int kPerThreadCycles
Definition: TensorCostModel.h:148
static const int kStartupCycles
Definition: TensorCostModel.h:147
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size, const TensorOpCost &cost_per_coeff)
Definition: TensorCostModel.h:166
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size, const TensorOpCost &cost_per_coeff)
Definition: TensorCostModel.h:170
static const int kTaskSize
Definition: TensorCostModel.h:149
Definition: TensorCostModel.h:28
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(const TensorOpCost &rhs) const
Definition: TensorCostModel.h:84
EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
Definition: TensorCostModel.h:55
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost & operator+=(const TensorOpCost &rhs)
Definition: TensorCostModel.h:99
double bytes_loaded_
Definition: TensorCostModel.h:132
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost & operator*=(double rhs)
Definition: TensorCostModel.h:106
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(const TensorOpCost &rhs) const
Definition: TensorCostModel.h:92
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(TensorOpCost lhs, const TensorOpCost &rhs)
Definition: TensorCostModel.h:113
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const
Definition: TensorCostModel.h:69
double bytes_stored_
Definition: TensorCostModel.h:133
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(double lhs, TensorOpCost rhs)
Definition: TensorCostModel.h:121
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost()
Definition: TensorCostModel.h:34
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost()
Definition: TensorCostModel.h:46
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost()
Definition: TensorCostModel.h:38
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(TensorOpCost lhs, double rhs)
Definition: TensorCostModel.h:117
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost()
Definition: TensorCostModel.h:50
EIGEN_DEVICE_FUNC TensorOpCost()
Definition: TensorCostModel.h:54
friend std::ostream & operator<<(std::ostream &os, const TensorOpCost &tc)
Definition: TensorCostModel.h:126
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const
Definition: TensorCostModel.h:68
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost()
Definition: TensorCostModel.h:42
double compute_cycles_
Definition: TensorCostModel.h:134
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const
Definition: TensorCostModel.h:70
EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized, double packet_size)
Definition: TensorCostModel.h:58
EIGEN_DEVICE_FUNC void dropMemoryCost()
Definition: TensorCostModel.h:78
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost, double compute_cost) const
Definition: TensorCostModel.h:71
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool() isfinite(const Eigen::bfloat16 &h)
Definition: BFloat16.h:752
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T &x, const T &y)
Definition: MathFunctions.h:926
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
Definition: MathFunctions.h:920
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
Definition: NumTraits.h:172
Definition: XprHelper.h:205