#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {
namespace internal {
/**
 * Evaluating a TensorBroadcastingOp coefficient-by-coefficient (or packet-by-packet)
 * is very expensive. If an expression contains at least one broadcast op, block
 * (tiled) evaluation is preferred.
 */
template <typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};

template <typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};

template <typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};

template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum {
    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
            ExpressionHasTensorBroadcastingOp<RhsXprType>::value
  };
};

template <typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};
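// Illustrative sketch (not part of the original header): the trait recurses through
// assign / cwise-unary / cwise-binary nodes, so a broadcast anywhere on the right-hand
// side of an assignment is detected. The concrete template arguments below are chosen
// only for illustration.
static_assert(!ExpressionHasTensorBroadcastingOp<Tensor<float, 2> >::value,
              "a plain tensor does not broadcast");
static_assert(ExpressionHasTensorBroadcastingOp<
                  const TensorBroadcastingOp<DSizes<Eigen::Index, 2>, Tensor<float, 2> > >::value,
              "a broadcast node is detected, steering the executor toward tiled evaluation");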
/**
 * Default strategy: the expression is evaluated with a single cpu thread,
 * without vectorization and block evaluation.
 */
template <typename Expression, typename Device, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  // Instantiating this template with a non-default device means this header was
  // included without defining the corresponding device macro.
  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                "EIGEN_USE_SYCL before including Eigen headers.");

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Device& device = DefaultDevice()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      for (StorageIndex i = 0; i < size; ++i) evaluator.evalScalar(i);
    }
    evaluator.cleanup();
  }
};
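// Illustrative usage sketch (not part of the original header): without EIGEN_USE_THREADS,
// EIGEN_USE_GPU or EIGEN_USE_SYCL defined, a plain tensor assignment is evaluated by this
// default executor, one coefficient (or packet) at a time on the calling thread.
//
//   Eigen::Tensor<float, 2> a(256, 256), b(256, 256);
//   a.setRandom();
//   // operator= builds a TensorAssignOp and hands it to
//   // internal::TensorExecutor<..., DefaultDevice, ...>::run(assign_op, DefaultDevice()).
//   b = a * a + a;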
/**
 * Default async execution strategy is not implemented; asynchronous execution is
 * currently only available for ThreadPoolDevice (see below).
 */
template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};
/**
 * Process all the data with a single cpu thread, using vectorized instructions.
 */
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true, /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize =
          unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;

      // Give the compiler a strong opportunity to unroll the loop, four packets at a time.
      const StorageIndex UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) evaluator.evalPacket(i + j * PacketSize);
      }
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (StorageIndex i = VectorizedSize; i < size; ++i) evaluator.evalScalar(i);
    }
    evaluator.cleanup();
  }
};
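// Worked example (illustrative, not part of the original header): for size = 1003
// coefficients and PacketSize = 8 (e.g. float with AVX),
//   UnrolledSize   = (1003 / 32) * 32 = 992
//   VectorizedSize = (1003 /  8) *  8 = 1000
// so coefficients [0, 992) are evaluated four packets per iteration, [992, 1000) one
// packet per iteration, and the remaining [1000, 1003) fall back to evalScalar.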
/**
 * Process all the data with a single cpu thread, using blocks of data. By sizing a
 * block to fit in L1 cache we get better cache performance.
 */
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable, /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index StorageIndex;
  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  static constexpr int NumDims = traits<Expression>::NumDimensions;

  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                                        const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex> BlockMapper;
    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex> TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice> TensorBlockScratch;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      // Query the expression tree for its desired block size/shape.
      const TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements();
      const BlockMapper block_mapper(typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
                                     requirements);

      // Share a single scratch memory allocator between all evaluated blocks.
      TensorBlockScratch scratch(device);
      for (StorageIndex i = 0; i < block_mapper.blockCount(); ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};
#ifdef EIGEN_USE_THREADS

// Encapsulates all the data needed to launch work for one particular tiling strategy.
template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost, size_t b_aligned_size)
      : block_mapper(b_mapper), cost(b_cost), aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigate through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};
// Computes the block evaluation parameters (block mapper, per-block cost and aligned
// block size) used by the tiled ThreadPoolDevice executors below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query the expression tree for its desired block size/shape.
  TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements();

  // Update the target block size based on the cost model.
  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);

  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()), requirements);

  size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(
      EIGEN_MAX_ALIGN_BYTES, static_cast<int>(sizeof(typename Evaluator::Scalar)));
  const size_t aligned_blocksize =
      align * numext::div_ceil<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

  return {block_mapper, requirements.cost_per_coeff * block_size, aligned_blocksize};
}
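// Worked example (illustrative, not part of the original header): for float scalars
// (4 bytes), a block of 1013 coefficients and an assumed alignment of 64 bytes,
//   block_size * sizeof(Scalar) = 1013 * 4 = 4052 bytes
//   div_ceil(4052, 64)          = 64
//   aligned_blocksize           = 64 * 64 = 4096 bytes
// i.e. the per-block buffer is rounded up to the next multiple of the alignment.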
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
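// Illustrative sketch (not part of the original header): EvalRange is the unit of work
// the ThreadPoolDevice hands to each task; evaluating two sub-ranges by hand would look
// like the following (Expression, expr and device are placeholders for illustration).
//
//   TensorEvaluator<Expression, ThreadPoolDevice> evaluator(expr, device);
//   if (evaluator.evalSubExprsIfNeeded(nullptr)) {
//     EvalRange<decltype(evaluator), Index, false>::run(&evaluator, 0, 1000);     // coeffs [0, 1000)
//     EvalRange<decltype(evaluator), Index, false>::run(&evaluator, 1000, 2000);  // coeffs [1000, 2000)
//   }
//   evaluator.cleanup();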
template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable=*/true> {
  static constexpr int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      eigen_assert(firstIdx % PacketSize == 0);
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Give the compiler a strong opportunity to unroll the loop, four packets at a time.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align block size to packet size and account for unrolling in run() above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase the block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};
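// Worked example (illustrative, not part of the original header): with PacketSize = 8,
// 16 * PacketSize = 128, so
//   alignBlockSize(1000) = (1000 + 31) & ~31 = 1024   (rounded to 4 * PacketSize)
//   alignBlockSize(100)  = (100 + 7)   & ~7  = 104    (small block: rounded to PacketSize only)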
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(
          size, evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize,
          [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&evaluator, firstIdx, lastIdx);
          });
    }
    evaluator.cleanup();
  }
};
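// Illustrative usage sketch (not part of the original header): with EIGEN_USE_THREADS
// defined, assigning through a ThreadPoolDevice routes the expression to the executor
// above, which shards [0, size) across the pool via device.parallelFor.
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);
//   Eigen::Tensor<float, 2> a(1024, 1024), b(1024, 1024), c(1024, 1024);
//   a.setRandom();
//   b.setRandom();
//   c.device(device) = a + b;  // evaluated by TensorExecutor<..., ThreadPoolDevice, ...>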
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice> TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(evaluator);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, eval_block);
      }
    }
    evaluator.cleanup();
  }
};
template <typename Expression, typename DoneCallback, bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device, DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
      device.parallelForAsync(
          size, ctx->evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize,
          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
          },
          [ctx]() { delete ctx; });
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool, DoneCallback done)
        : evaluator(expr, thread_pool), on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    Evaluator evaluator;

   private:
    DoneCallback on_done;
  };
};
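// Illustrative usage sketch (not part of the original header), assuming the asynchronous
// assignment entry point TensorBase::device(device, done) forwards to
// TensorAsyncExecutor::runAsync as above. The done callback runs only after the executor
// context has been destroyed, i.e. after evaluator.cleanup().
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, 4);
//   Eigen::Tensor<float, 1> a(1 << 20), b(1 << 20), c(1 << 20);
//   a.setRandom();
//   b.setRandom();
//   Eigen::Barrier done_barrier(1);
//   c.device(device, [&done_barrier]() { done_barrier.Notify(); }) = a + b;
//   done_barrier.Wait();  // block until the asynchronous evaluation has completed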
template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback, Vectorizable,
                          /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice> TensorBlockScratch;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device, DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
          TensorBlockDesc desc = ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(), ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool, DoneCallback done)
        : device(thread_pool), evaluator(expr, thread_pool), on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};
#endif  // EIGEN_USE_THREADS

// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(EIGEN_GPUCC)

// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sum_will_overflow(Index lhs, Index rhs) {
  const Index highest = NumTraits<Index>::highest();
  const Index lowest = NumTraits<Index>::lowest();
  if (lhs > 0 && rhs > 0) {
    return lhs > highest - rhs ? 1 : 0;
  } else if (lhs < 0 && rhs < 0) {
    return lhs < lowest - rhs ? -1 : 0;
  } else {
    return 0;
  }
}

// Returns lhs + rhs, saturating to the highest/lowest representable value on
// overflow/underflow respectively.
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index saturate_add(Index lhs, Index rhs) {
  const Index highest = NumTraits<Index>::highest();
  const Index lowest = NumTraits<Index>::lowest();
  int overflow = sum_will_overflow(lhs, rhs);
  return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
}
// A functor that adds step_size to a given index, saturating to avoid overflow. When
// overflow would happen, the index is clamped to the last representable value, which
// still terminates the loop that uses it.
template <typename Index>
class SafeStep {
 public:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SafeStep(Index lastIdx, Index step_size)
      : can_overflow_(sum_will_overflow(lastIdx, step_size)), step_size_(step_size) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index operator()(Index index) const {
    return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
  }

 private:
  const bool can_overflow_;
  const Index step_size_;
};
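// Worked example (illustrative, not part of the original header): with StorageIndex = int
// (highest = 2147483647), lastIdx = 2147483646 and step_size = 256, sum_will_overflow()
// returns 1, so SafeStep uses saturate_add. A thread at i = 2147483500 evaluates its
// coefficient and then steps to saturate_add(2147483500, 256) = 2147483647 >= lastIdx,
// so its loop exits cleanly instead of wrapping around to a negative index.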
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx,
                                                        StorageIndex lastIdx, StorageIndex step_size) {
    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
    for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx,
                                                        StorageIndex lastIdx, StorageIndex step_size) {
    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    // Use the vector path.
    SafeStep<StorageIndex> safe_vectorized_step(vectorized_size, vectorized_step_size);
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i = safe_vectorized_step(i)) {
      eval.evalPacket(i);
    }

    // Use the scalar path for the remainder.
    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
         i = safe_step(i)) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
__global__ void __launch_bounds__(1024) EigenMetaKernel(Evaluator eval, StorageIndex size) {
  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}
/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {
    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks = static_cast<int>(
        numext::mini<int64_t>(device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor(),
                              NumTraits<StorageIndex>::highest()) /
        block_size);
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Create at least one block to avoid crashing when evaluating empty (null) tensors.
    const int num_blocks = numext::maxi<int>(
        numext::mini<int>(max_blocks, static_cast<int>(numext::div_ceil<StorageIndex>(size, block_size))), 1);

    LAUNCH_GPU_KERNEL((EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
                      num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
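// Worked example (illustrative, not part of the original header): on a device reporting
// maxGpuThreadsPerBlock() == 1024, getNumGpuMultiProcessors() == 80 and
// maxGpuThreadsPerMultiProcessor() == 2048, evaluating 1 << 20 coefficients gives
//   block_size = 1024
//   max_blocks = (80 * 2048) / 1024 = 160
//   div_ceil(1048576, 1024)        = 1024
//   num_blocks = max(min(160, 1024), 1) = 160
// so the kernel launches 160 blocks of 1024 threads, and each thread strides through the
// tensor by step_size = blockDim.x * gridDim.x = 163840 coefficients.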
#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU

#ifdef EIGEN_USE_SYCL
template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;

  template <typename Scratch>
  EIGEN_STRONG_INLINE ExecExprFunctorKernel(const Scratch, Evaluator evaluator_, const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { compute(itemID); }

  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_STRONG_INLINE std::enable_if_t<!is_vec> compute(const cl::sycl::nd_item<1>& itemID) const {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }

  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_STRONG_INLINE std::enable_if_t<is_vec> compute(const cl::sycl::nd_item<1>& itemID) const {
    const Index vectorizedRange = (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;

  static EIGEN_STRONG_INLINE void run(const Expression& expr, const Eigen::SyclDevice& dev) {
    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType, Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;

      dev.template nullary_kernel_launcher<typename Evaluator::CoeffReturnType, ExecExprFunctorKernel<Evaluator> >(
             evaluator, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
             Index(1), range)
          .wait();
    }
    evaluator.cleanup();
  }
};

#endif  // EIGEN_USE_SYCL

}  // end namespace internal
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H