10 #ifndef EIGEN_CORE_THREAD_POOL_DEVICE_H
11 #define EIGEN_CORE_THREAD_POOL_DEVICE_H
43 using Task = std::function<void()>;
46 eigen_assert(threadCostThreshold >= 0.0f &&
"threadCostThreshold must be non-negative");
50 template <
int PacketSize>
52 eigen_assert(cost >= 0.0f &&
"cost must be non-negative");
55 float totalCost =
static_cast<float>(numOps) * cost;
57 if (idealThreads <
static_cast<float>(actualThreads)) {
59 actualThreads =
numext::mini(actualThreads,
static_cast<int>(idealThreads));
66 #if EIGEN_COMP_MSVC && !EIGEN_COMP_CLANG
67 #define EIGEN_PARALLEL_FOR_INLINE
69 #define EIGEN_PARALLEL_FOR_INLINE EIGEN_STRONG_INLINE
72 template <
typename UnaryFunctor,
int PacketSize>
78 eigen_assert(
size % PacketSize == 0 &&
"this function assumes size is a multiple of PacketSize");
80 Task right = [
this, mid,
end, &
f, &barrier, level]() {
81 parallelForImpl<UnaryFunctor, PacketSize>(mid,
end,
f, barrier, level);
90 template <
typename BinaryFunctor,
int PacketSize>
96 Index outerSize = outerEnd - outerBegin;
98 Index outerMid = outerBegin + (outerSize >> 1);
99 Task right = [
this, &
f, &barrier, outerMid, outerEnd, innerBegin, innerEnd, level]() {
100 parallelForImpl<BinaryFunctor, PacketSize>(outerMid, outerEnd, innerBegin, innerEnd,
f, barrier, level);
105 Index innerSize = innerEnd - innerBegin;
106 eigen_assert(innerSize % PacketSize == 0 &&
"this function assumes innerSize is a multiple of PacketSize");
108 Task right = [
this, &
f, &barrier, outerBegin, outerEnd, innerMid, innerEnd, level]() {
109 parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerMid, innerEnd,
f, barrier, level);
115 for (
Index outer = outerBegin; outer < outerEnd; outer++)
116 for (
Index inner = innerBegin; inner < innerEnd; inner += PacketSize)
f(outer, inner);
120 #undef EIGEN_PARALLEL_FOR_INLINE
122 template <
typename UnaryFunctor,
int PacketSize>
125 int maxLevel = calculateLevels<PacketSize>(
size, cost);
126 Barrier barrier(1 << maxLevel);
127 parallelForImpl<UnaryFunctor, PacketSize>(begin,
end,
f, barrier, maxLevel);
131 template <
typename BinaryFunctor,
int PacketSize>
133 Index innerEnd, BinaryFunctor&
f,
float cost) {
134 Index outerSize = outerEnd - outerBegin;
135 Index innerSize = innerEnd - innerBegin;
137 int maxLevel = calculateLevels<PacketSize>(
size, cost);
138 Barrier barrier(1 << maxLevel);
139 parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerBegin, innerEnd,
f, barrier, maxLevel);
153 template <
typename Kernel>
162 template <
typename Kernel>
165 struct AssignmentFunctor :
public Kernel {
168 this->assignCoeffByOuterInner(outer, inner);
173 const Index innerSize = kernel.innerSize();
174 const Index outerSize = kernel.outerSize();
175 constexpr
float cost =
static_cast<float>(XprEvaluationCost);
176 AssignmentFunctor functor(kernel);
177 device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, 0, innerSize, functor, cost);
181 template <
typename Kernel>
185 struct AssignmentFunctor :
public Kernel {
192 const Index outerSize = kernel.outerSize();
193 AssignmentFunctor functor(kernel);
194 constexpr
float cost =
static_cast<float>(XprEvaluationCost) *
static_cast<float>(InnerSize);
195 device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, functor, cost);
199 template <
typename Kernel>
203 SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
204 DstAlignment = Kernel::AssignmentTraits::DstAlignment;
205 struct AssignmentFunctor :
public Kernel {
208 this->
template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
212 const Index innerSize = kernel.innerSize();
213 const Index outerSize = kernel.outerSize();
214 const float cost =
static_cast<float>(XprEvaluationCost) *
static_cast<float>(innerSize);
215 AssignmentFunctor functor(kernel);
216 device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, 0, innerSize, functor, cost);
220 template <
typename Kernel>
225 SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
226 DstAlignment = Kernel::AssignmentTraits::DstAlignment,
227 InnerSize = DstXprType::InnerSizeAtCompileTime;
228 struct AssignmentFunctor :
public Kernel {
235 const Index outerSize = kernel.outerSize();
236 constexpr
float cost =
static_cast<float>(XprEvaluationCost) *
static_cast<float>(InnerSize);
237 AssignmentFunctor functor(kernel);
238 device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, functor, cost);
242 template <
typename Kernel>
247 struct PacketAssignmentFunctor :
public Kernel {
250 this->
template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
253 struct ScalarAssignmentFunctor :
public Kernel {
256 const Index innerSize = this->innerSize();
258 for (
Index inner = packetAccessSize; inner < innerSize; inner++) this->assignCoeffByOuterInner(outer, inner);
262 const Index outerSize = kernel.outerSize();
263 const Index innerSize = kernel.innerSize();
265 constexpr
float packetCost =
static_cast<float>(XprEvaluationCost);
266 const float scalarCost =
static_cast<float>(XprEvaluationCost) *
static_cast<float>(innerSize - packetAccessSize);
267 PacketAssignmentFunctor packetFunctor(kernel);
268 ScalarAssignmentFunctor scalarFunctor(kernel);
269 device.template parallelFor<PacketAssignmentFunctor, PacketSize>(0, outerSize, 0, packetAccessSize, packetFunctor,
271 device.template parallelFor<ScalarAssignmentFunctor, 1>(0, outerSize, scalarFunctor, scalarCost);
275 template <
typename Kernel>
278 struct AssignmentFunctor :
public Kernel {
284 constexpr
float cost =
static_cast<float>(XprEvaluationCost);
285 AssignmentFunctor functor(kernel);
286 device.template parallelFor<AssignmentFunctor, 1>(0,
size, functor, cost);
290 template <
typename Kernel>
295 RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
297 DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment,
299 : Kernel::AssignmentTraits::DstAlignment,
300 SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
301 struct AssignmentFunctor :
public Kernel {
304 this->
template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
309 const Index alignedStart =
310 DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(),
size);
315 constexpr
float cost =
static_cast<float>(XprEvaluationCost);
316 AssignmentFunctor functor(kernel);
317 device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost);
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_PARALLEL_FOR_INLINE
Definition: CoreThreadPoolDevice.h:69
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define eigen_assert(x)
Definition: Macros.h:910
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
Scalar Scalar int size
Definition: benchVecAdd.cpp:17
SCALAR Scalar
Definition: bench_gemm.cpp:45
void Wait()
Definition: Barrier.h:43
void Notify()
Definition: Barrier.h:28
Definition: NonBlockingThreadPool.h:19
void Schedule(std::function< void()> fn) EIGEN_OVERRIDE
Definition: NonBlockingThreadPool.h:120
int NumThreads() const EIGEN_FINAL
Definition: NonBlockingThreadPool.h:205
static int f(const TensorMap< Tensor< int, 3 > > &tensor)
Definition: cxx11_tensor_map.cpp:237
static constexpr lastp1_t end
Definition: IndexedViewHelper.h:79
@ InnerVectorizedTraversal
Definition: Constants.h:284
@ DefaultTraversal
Definition: Constants.h:279
@ InnerUnrolling
Definition: Constants.h:303
@ NoUnrolling
Definition: Constants.h:301
int log2_ceil(const BitsType &x)
Definition: MathFunctions.h:758
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T &x, const T &y)
Definition: MathFunctions.h:926
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_CONSTEXPR T round_down(T a, U b)
Definition: MathFunctions.h:1266
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T &x, const T &y)
Definition: MathFunctions.h:920
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
Definition: Eigen_Colamd.h:49
CwiseBinaryOp< internal::scalar_sum_op< double, double >, const CpyMatrixXd, const CpyMatrixXd > XprType
Definition: nestbyvalue.cpp:15
Definition: CoreThreadPoolDevice.h:42
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoreThreadPoolDevice(ThreadPool &pool, float threadCostThreshold=3e-5f)
Definition: CoreThreadPoolDevice.h:44
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index outerBegin, Index outerEnd, Index innerBegin, Index innerEnd, BinaryFunctor &f, float cost)
Definition: CoreThreadPoolDevice.h:132
EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index begin, Index end, UnaryFunctor &f, Barrier &barrier, int level)
Definition: CoreThreadPoolDevice.h:73
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index begin, Index end, UnaryFunctor &f, float cost)
Definition: CoreThreadPoolDevice.h:123
ThreadPool & m_pool
Definition: CoreThreadPoolDevice.h:143
std::function< void()> Task
Definition: CoreThreadPoolDevice.h:43
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int calculateLevels(Index size, float cost) const
Definition: CoreThreadPoolDevice.h:51
EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index outerBegin, Index outerEnd, Index innerBegin, Index innerEnd, BinaryFunctor &f, Barrier &barrier, int level)
Definition: CoreThreadPoolDevice.h:91
float m_costFactor
Definition: CoreThreadPoolDevice.h:146
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
Definition: AssignEvaluator.h:220
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
Definition: AssignEvaluator.h:281
Definition: CoreThreadPoolDevice.h:154
typename Kernel::SrcEvaluatorType SrcEvaluatorType
Definition: CoreThreadPoolDevice.h:155
typename SrcEvaluatorType::XprType SrcXprType
Definition: CoreThreadPoolDevice.h:157
typename Kernel::DstEvaluatorType DstEvaluatorType
Definition: CoreThreadPoolDevice.h:156
static constexpr Index Cost
Definition: CoreThreadPoolDevice.h:159
typename DstEvaluatorType::XprType DstXprType
Definition: CoreThreadPoolDevice.h:158
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarAssignmentFunctor(Kernel &kernel)
Definition: CoreThreadPoolDevice.h:254
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer)
Definition: CoreThreadPoolDevice.h:255
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel &kernel, CoreThreadPoolDevice &device)
Definition: CoreThreadPoolDevice.h:172
Definition: CoreThreadPoolDevice.h:291
typename Kernel::Scalar Scalar
Definition: CoreThreadPoolDevice.h:292
typename Kernel::PacketType PacketType
Definition: CoreThreadPoolDevice.h:293
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel &kernel, CoreThreadPoolDevice &device)
Definition: CoreThreadPoolDevice.h:307
Definition: CoreThreadPoolDevice.h:221
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel &kernel, CoreThreadPoolDevice &device)
Definition: CoreThreadPoolDevice.h:234
typename Kernel::DstEvaluatorType::XprType DstXprType
Definition: CoreThreadPoolDevice.h:223
typename Kernel::PacketType PacketType
Definition: CoreThreadPoolDevice.h:222
Definition: CoreThreadPoolDevice.h:276
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel &kernel, CoreThreadPoolDevice &device)
Definition: CoreThreadPoolDevice.h:282
typename Kernel::DstEvaluatorType::XprType DstXprType
Definition: CoreThreadPoolDevice.h:183
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel &kernel, CoreThreadPoolDevice &device)
Definition: CoreThreadPoolDevice.h:191
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner)
Definition: CoreThreadPoolDevice.h:249
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketAssignmentFunctor(Kernel &kernel)
Definition: CoreThreadPoolDevice.h:248
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel &kernel)
Definition: CoreThreadPoolDevice.h:229
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer)
Definition: CoreThreadPoolDevice.h:230
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel &kernel)
Definition: CoreThreadPoolDevice.h:279
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index)
Definition: CoreThreadPoolDevice.h:280
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index)
Definition: CoreThreadPoolDevice.h:303
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel &kernel)
Definition: CoreThreadPoolDevice.h:302
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel &kernel)
Definition: CoreThreadPoolDevice.h:186
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer)
Definition: CoreThreadPoolDevice.h:187
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner)
Definition: CoreThreadPoolDevice.h:167
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel &kernel)
Definition: CoreThreadPoolDevice.h:166
typename Kernel::PacketType PacketType
Definition: CoreThreadPoolDevice.h:201
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel &kernel, CoreThreadPoolDevice &device)
Definition: CoreThreadPoolDevice.h:211
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner)
Definition: CoreThreadPoolDevice.h:207
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel &kernel)
Definition: CoreThreadPoolDevice.h:206
Definition: CoreThreadPoolDevice.h:243
typename Kernel::Scalar Scalar
Definition: CoreThreadPoolDevice.h:244
typename Kernel::PacketType PacketType
Definition: CoreThreadPoolDevice.h:245
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel &kernel, CoreThreadPoolDevice &device)
Definition: CoreThreadPoolDevice.h:261
Definition: DeviceWrapper.h:88
Definition: XprHelper.h:263
Definition: GenericPacketMath.h:108
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel &, Index, Index)
Definition: AssignEvaluator.h:363
Definition: GenericPacketMath.h:134