TensorExecutor.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
12 
13 // IWYU pragma: private
14 #include "./InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 /**
19  * \class TensorExecutor
20  * \ingroup CXX11_Tensor_Module
21  *
22  * \brief The tensor executor class.
23  *
24  * This class is responsible for launching the evaluation of an expression on
25  * the specified computing device.
26  *
27  * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
28  *                      instructions)
29  * @tparam Tiling       can use block based tensor evaluation
30  *                      (see TensorBlock.h)
31  */
32 namespace internal {
33 
34 /**
35  * Evaluating TensorBroadcastingOp via the coefficient or packet path is extremely
36  * expensive. If an expression has at least one broadcast op in it, and it
37  * supports block based evaluation, we always prefer it, even for small
38  * tensors. For all other tileable ops, the block evaluation overhead for small
39  * tensors (which fit into L1) is too large, and we fall back on vectorized
40  * evaluation.
41  */
42 // TODO(ezhulenev): Add specializations for all other types of Tensor ops.
43 
44 template <typename Expression>
45 struct ExpressionHasTensorBroadcastingOp {
46  enum { value = false };
47 };
48 
49 template <typename LhsXprType, typename RhsXprType>
50 struct ExpressionHasTensorBroadcastingOp<const TensorAssignOp<LhsXprType, RhsXprType> > {
51  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
52 };
53 
54 template <typename UnaryOp, typename XprType>
55 struct ExpressionHasTensorBroadcastingOp<const TensorCwiseUnaryOp<UnaryOp, XprType> > {
56  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
57 };
58 
59 template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
60 struct ExpressionHasTensorBroadcastingOp<const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
61  enum {
62  value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value || ExpressionHasTensorBroadcastingOp<RhsXprType>::value
63  };
64 };
65 
66 template <typename Broadcast, typename XprType>
67 struct ExpressionHasTensorBroadcastingOp<const TensorBroadcastingOp<Broadcast, XprType> > {
68  enum { value = true };
69 };
70 
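// Example (illustrative only, not part of the original file): for an assignment
// whose right-hand side contains a broadcast the trait reports true at compile
// time, and false otherwise. `Lhs`, `Rhs`, `Bcast` and `Op` below are
// placeholder types used purely for illustration.
//
//   static_assert(ExpressionHasTensorBroadcastingOp<
//                     const TensorAssignOp<Lhs, const TensorBroadcastingOp<Bcast, Rhs> > >::value,
//                 "broadcast found through the assignment");
//   static_assert(!ExpressionHasTensorBroadcastingOp<
//                     const TensorAssignOp<Lhs, const TensorCwiseUnaryOp<Op, Rhs> > >::value,
//                 "no broadcast anywhere in the expression tree");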
71 // -------------------------------------------------------------------------- //
72 
73 /**
74  * Default strategy: the expression is evaluated sequentially with a single CPU
75  * thread, without vectorization and block evaluation.
76  */
77 template <typename Expression, typename Device, bool Vectorizable, TiledEvaluation Tiling>
78 class TensorExecutor {
79  public:
80  typedef typename Expression::Index StorageIndex;
81 
82  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
83  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
84  // violation. If this template is instantiated with a non-default device, it
85  // means that this header file was included without defining
86  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
87  static_assert(std::is_same<Device, DefaultDevice>::value,
88  "Default executor instantiated with non-default device. "
89  "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
90  "EIGEN_USE_SYCL before including Eigen headers.");
91 
92  static EIGEN_STRONG_INLINE void run(const Expression& expr, const Device& device = DefaultDevice()) {
93  TensorEvaluator<Expression, Device> evaluator(expr, device);
94  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
95  if (needs_assign) {
96  const StorageIndex size = array_prod(evaluator.dimensions());
97  for (StorageIndex i = 0; i < size; ++i) {
98  evaluator.evalScalar(i);
99  }
100  }
101  evaluator.cleanup();
102  }
103 };
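// Example (illustrative, not part of the original file): a tensor assignment
// with no explicit device goes through this default single-threaded executor.
// To get the ThreadPoolDevice/GpuDevice/SyclDevice specializations defined
// below, the matching macro must be defined before the include, as the
// static_assert above enforces.
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
//   a.setRandom();
//   b.setRandom();
//   c = a + b;  // evaluated by TensorExecutor<..., DefaultDevice, ...>::run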
104 
105 /**
106  * Default async execution strategy is not implemented. Currently it's only
107  * available for ThreadPoolDevice.
108  */
109 template <typename Expression, typename Device, typename DoneCallback, bool Vectorizable, TiledEvaluation Tiling>
110 class TensorAsyncExecutor {};
111 
112 /**
113  * Process all the data with a single cpu thread, using vectorized instructions.
114  */
115 template <typename Expression>
116 class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
117  /*Tiling=*/TiledEvaluation::Off> {
118  public:
119  typedef typename Expression::Index StorageIndex;
120 
121  static EIGEN_STRONG_INLINE void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
122  TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
123  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
124  if (needs_assign) {
125  const StorageIndex size = array_prod(evaluator.dimensions());
126  const int PacketSize =
127      unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
128 
129  // Give the compiler a strong hint to unroll the loop, but don't insist:
130  // if evaluating a single coefficient is expensive, the compiler should not
131  // unroll the loop at the expense of inlining.
132  const StorageIndex UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
133  for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
134  for (StorageIndex j = 0; j < 4; j++) {
135  evaluator.evalPacket(i + j * PacketSize);
136  }
137  }
138  const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
139  for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
140  evaluator.evalPacket(i);
141  }
142  for (StorageIndex i = VectorizedSize; i < size; ++i) {
143  evaluator.evalScalar(i);
144  }
145  }
146  evaluator.cleanup();
147  }
148 };
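// Worked example (illustrative numbers): with PacketSize == 4 and size == 103
// the loops above split the index range as follows:
//   UnrolledSize   = (103 / 16) * 16 = 96  -> indices [0, 96) in chunks of 4 packets
//   VectorizedSize = (103 / 4) * 4 = 100   -> indices [96, 100) one packet at a time
//   scalar tail                            -> indices [100, 103)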
149 
150 /**
151  * Process all the data with a single cpu thread, using blocks of data. By
152  * sizing a block to fit L1 cache we get better cache performance.
153  */
154 template <typename Expression, bool Vectorizable>
155 class TensorExecutor<Expression, DefaultDevice, Vectorizable,
156  /*Tiling=*/TiledEvaluation::On> {
157  public:
158  typedef typename traits<Expression>::Scalar Scalar;
159  typedef std::remove_const_t<Scalar> ScalarNoConst;
160 
161  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
162  typedef typename traits<Expression>::Index StorageIndex;
163 
164  static constexpr int NumDims = traits<Expression>::NumDimensions;
165 
166  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr,
167  const DefaultDevice& device = DefaultDevice()) {
168  typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex> TensorBlockMapper;
169 
170  typedef internal::TensorBlockDescriptor<NumDims, StorageIndex> TensorBlockDesc;
171  typedef internal::TensorBlockScratchAllocator<DefaultDevice> TensorBlockScratch;
172 
173  Evaluator evaluator(expr, device);
174 
175  // TODO(ezhulenev): Do not use tiling for small tensors?
176  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
177 
178  if (needs_assign) {
179  // Query expression tree for desired block size/shape.
180  const TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements();
181 
182  const TensorBlockMapper block_mapper(typename TensorBlockDesc::Dimensions(evaluator.dimensions()), requirements);
183 
184  // Share scratch memory allocator between all blocks.
185  TensorBlockScratch scratch(device);
186 
187  const StorageIndex total_block_count = block_mapper.blockCount();
188  for (StorageIndex i = 0; i < total_block_count; ++i) {
189  TensorBlockDesc desc = block_mapper.blockDescriptor(i);
190  evaluator.evalBlock(desc, scratch);
191  scratch.reset();
192  }
193  }
194  evaluator.cleanup();
195  }
196 };
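// Sketch of the tiling above (illustrative, hypothetical numbers): for a
// 100x100 tensor where the cost model picks 64x64 blocks, blockCount() == 4
// and the loop evaluates the tiles
//   [0,64)x[0,64), [64,100)x[0,64), [0,64)x[64,100), [64,100)x[64,100)
// one after another, calling scratch.reset() between tiles so per-block
// temporaries are reused instead of reallocated.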
197 
198 /**
199  * Multicore strategy: the index space is partitioned and each partition is
200  * executed on a single core.
201  *
202  * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread
203  *     pool, and will block the caller thread until all tasks are finished.
204  *
205  * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to
206  *     the ThreadPoolDevice managed thread pool, and will return immediately.
207  *     It will call the 'done' callback after all tasks are finished.
208  */
209 #ifdef EIGEN_USE_THREADS
210 
211 template <typename TensorBlockMapper>
212 struct TensorExecutorTilingContext {
213  TensorExecutorTilingContext() = default;
214  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, const TensorOpCost& b_cost, size_t b_aligned_size)
215  : block_mapper(b_mapper), cost(b_cost), aligned_blocksize(b_aligned_size) {}
216 
217  TensorBlockMapper block_mapper; // navigate through blocks
218  TensorOpCost cost; // cost of computing a single block
219  size_t aligned_blocksize; // block size after memory alignment
220 };
221 
222 // Computes block evaluation parameters and allocates a temporary memory buffer
223 // for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
224 template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
225 TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(const Evaluator& evaluator) {
226  // Query expression tree for desired block size/shape.
227  TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements();
228 
229  // Update target block size based on cost model.
230  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, requirements.cost_per_coeff);
231  requirements.size = static_cast<size_t>(1.0 / taskSize);
232 
233  TensorBlockMapper block_mapper(typename TensorBlockMapper::Dimensions(evaluator.dimensions()), requirements);
234 
235  size_t block_size = block_mapper.blockTotalSize();
236  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
237  const size_t aligned_blocksize =
238  align * numext::div_ceil<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
239 
240  return {block_mapper, requirements.cost_per_coeff * block_size, aligned_blocksize};
241 }
242 
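// Worked example (illustrative numbers): for a block of 1000 float
// coefficients with EIGEN_MAX_ALIGN_BYTES == 64:
//   block_size * sizeof(float) = 4000 bytes
//   aligned_blocksize = 64 * div_ceil(4000, 64) = 64 * 63 = 4032 bytes
// i.e. each block's scratch buffer is rounded up to a multiple of the maximum
// alignment.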
243 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
244 struct EvalRange {
245  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) {
246  Evaluator evaluator = *evaluator_in;
247  eigen_assert(lastIdx >= firstIdx);
248  for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
249  evaluator.evalScalar(i);
250  }
251  }
252 
253  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
254 };
255 
256 template <typename Evaluator, typename StorageIndex>
257 struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
258  static constexpr int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
259 
260  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) {
261  Evaluator evaluator = *evaluator_in;
262  eigen_assert(lastIdx >= firstIdx);
263  StorageIndex i = firstIdx;
264  if (lastIdx - firstIdx >= PacketSize) {
265  eigen_assert(firstIdx % PacketSize == 0);
266  StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
267  // Give the compiler a strong hint to unroll the loop, but don't insist:
268  // if evaluating a single coefficient is expensive, the compiler should not
269  // unroll the loop at the expense of inlining.
270  for (; i <= last_chunk_offset; i += 4 * PacketSize) {
271  for (StorageIndex j = 0; j < 4; j++) {
272  evaluator.evalPacket(i + j * PacketSize);
273  }
274  }
275  last_chunk_offset = lastIdx - PacketSize;
276  for (; i <= last_chunk_offset; i += PacketSize) {
277  evaluator.evalPacket(i);
278  }
279  }
280  for (; i < lastIdx; ++i) {
281  evaluator.evalScalar(i);
282  }
283  }
284 
285  static StorageIndex alignBlockSize(StorageIndex size) {
286  // Align block size to packet size and account for unrolling in run above.
287  if (size >= 16 * PacketSize) {
288  return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
289  }
290  // Aligning to 4 * PacketSize would increase block size by more than 25%.
291  return (size + PacketSize - 1) & ~(PacketSize - 1);
292  }
293 };
294 
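// Worked example (illustrative): with PacketSize == 8, alignBlockSize rounds a
// parallelFor block size up to a packet multiple:
//   size == 1000 (>= 16 * PacketSize) -> (1000 + 31) & ~31 = 1024  (multiple of 4 * PacketSize)
//   size == 100  (<  16 * PacketSize) -> (100 + 7) & ~7 = 104      (multiple of PacketSize)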
295 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
296 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
297  public:
298  typedef typename Expression::Index StorageIndex;
299 
300  static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) {
301  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
302  typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
303 
304  Evaluator evaluator(expr, device);
305  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
306  if (needs_assign) {
307  const StorageIndex size = array_prod(evaluator.dimensions());
308  device.parallelFor(
309  size, evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize,
310  [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&evaluator, firstIdx, lastIdx); });
311  }
312  evaluator.cleanup();
313  }
314 };
315 
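// Example (illustrative, not part of the original file): the multithreaded
// executor is reached by assigning through a ThreadPoolDevice; EIGEN_USE_THREADS
// must be defined before the include.
//
//   #define EIGEN_USE_THREADS
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::ThreadPool pool(4);                 // 4 worker threads
//   Eigen::ThreadPoolDevice device(&pool, 4);
//
//   Eigen::Tensor<float, 2> a(1024, 1024), b(1024, 1024), c(1024, 1024);
//   a.setRandom();
//   b.setRandom();
//   c.device(device) = a * a + b;  // parallelFor over the flattened index space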
316 template <typename Expression, bool Vectorizable>
317 class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
318  /*Tiling=*/TiledEvaluation::On> {
319  public:
320  typedef typename traits<Expression>::Index IndexType;
321  typedef typename traits<Expression>::Scalar Scalar;
322  typedef std::remove_const_t<Scalar> ScalarNoConst;
323 
324  static constexpr int NumDims = traits<Expression>::NumDimensions;
325 
326  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
327  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
328  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
329 
330  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
331  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice> TensorBlockScratch;
332 
333  static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) {
334  Evaluator evaluator(expr, device);
335 
336  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
337  if (needs_assign) {
338  const TilingContext tiling =
339  internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(evaluator);
340 
341  auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, IndexType lastBlockIdx) {
342  TensorBlockScratch scratch(device);
343 
344  for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
345  TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
346  evaluator.evalBlock(desc, scratch);
347  scratch.reset();
348  }
349  };
350 
351  // Evaluate small expressions directly as a single block.
352  if (tiling.block_mapper.blockCount() == 1) {
353  TensorBlockScratch scratch(device);
354  TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
355  evaluator.evalBlock(desc, scratch);
356  } else {
357  device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, eval_block);
358  }
359  }
360  evaluator.cleanup();
361  }
362 };
363 
364 template <typename Expression, typename DoneCallback, bool Vectorizable, TiledEvaluation Tiling>
365 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback, Vectorizable, Tiling> {
366  public:
367  typedef typename Expression::Index StorageIndex;
368  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
369 
370  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) {
371  TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done));
372 
373  const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
374  if (!need_assign) {
375  delete ctx;
376  return;
377  }
378 
379  typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
380  const StorageIndex size = array_prod(ctx->evaluator.dimensions());
381  device.parallelForAsync(
382  size, ctx->evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize,
383  [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); },
384  [ctx]() { delete ctx; });
385  };
386 
387  ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
388  }
389 
390  private:
391  struct TensorAsyncExecutorContext {
392  TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done)
393  : evaluator(expr, thread_pool), on_done(std::move(done)) {}
394 
395  ~TensorAsyncExecutorContext() {
396  evaluator.cleanup();
397  on_done();
398  }
399 
400  Evaluator evaluator;
401 
402  private:
403  DoneCallback on_done;
404  };
405 };
406 
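// Example (illustrative; assumes the asynchronous assignment API, i.e. the
// done-callback overload of device()): the call returns immediately and the
// callback fires once the whole expression has been evaluated.
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, 4);
//   Eigen::Barrier done(1);
//
//   c.device(device, [&done]() { done.Notify(); }) = a * a + b;
//   // ... do other work while the evaluation runs ...
//   done.Wait();  // block only when the result is actually needed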
407 template <typename Expression, typename DoneCallback, bool Vectorizable>
408 class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback, Vectorizable, /*Tileable*/ TiledEvaluation::On> {
409  public:
410  typedef typename traits<Expression>::Index IndexType;
411  typedef typename traits<Expression>::Scalar Scalar;
412  typedef std::remove_const_t<Scalar> ScalarNoConst;
413 
414  static constexpr int NumDims = traits<Expression>::NumDimensions;
415 
416  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
417  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
418  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
419 
420  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
421  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice> TensorBlockScratch;
422 
423  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) {
424  TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done));
425 
426  const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
427  if (!need_assign) {
428  delete ctx;
429  return;
430  }
431 
432  ctx->tiling = internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
433 
434  auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
435  TensorBlockScratch scratch(ctx->device);
436 
437  for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
438  TensorBlockDesc desc = ctx->tiling.block_mapper.blockDescriptor(block_idx);
439  ctx->evaluator.evalBlock(desc, scratch);
440  scratch.reset();
441  }
442  };
443 
444  // Evaluate small expressions directly as a single block.
445  if (ctx->tiling.block_mapper.blockCount() == 1) {
446  TensorBlockScratch scratch(ctx->device);
447  TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
448  ctx->evaluator.evalBlock(desc, scratch);
449  delete ctx;
450  } else {
451  ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(), ctx->tiling.cost, eval_block,
452  [ctx]() { delete ctx; });
453  }
454  };
455 
456  ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
457  }
458 
459  private:
460  struct TensorAsyncExecutorContext {
461  TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done)
462  : device(thread_pool), evaluator(expr, thread_pool), on_done(std::move(done)) {}
463 
464  ~TensorAsyncExecutorContext() {
465  evaluator.cleanup();
466  on_done();
467  }
468 
469  const ThreadPoolDevice& device;
470  Evaluator evaluator;
471  TilingContext tiling;
472 
473  private:
474  DoneCallback on_done;
475  };
476 };
477 
478 #endif // EIGEN_USE_THREADS
479 
480 // GPU: the evaluation of the expression is offloaded to a GPU.
481 #if defined(EIGEN_USE_GPU)
482 
483 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
484 class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
485  public:
486  typedef typename Expression::Index StorageIndex;
487  static void run(const Expression& expr, const GpuDevice& device);
488 };
489 
490 #if defined(EIGEN_GPUCC)
491 // Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.
492 template <typename Index>
493 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs, Index rhs) {
494  const Index highest = NumTraits<Index>::highest();
495  const Index lowest = NumTraits<Index>::lowest();
496  if (lhs > 0 && rhs > 0) {
497  return lhs > highest - rhs ? 1 : 0;
498  } else if (lhs < 0 && rhs < 0) {
499  return lhs < lowest - rhs ? -1 : 0;
500  } else {
501  return 0;
502  }
503 }
504 
505 // Returns lhs + rhs, saturating to the highest/lowest representable value on
506 // overflow/underflow respectively.
507 template <typename Index>
508 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
509  const Index highest = NumTraits<Index>::highest();
510  const Index lowest = NumTraits<Index>::lowest();
511  int overflow = sum_will_overflow(lhs, rhs);
512  return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
513 }
514 
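// Worked example (illustrative): for a 32-bit signed Index,
// highest == 2147483647, so
//   saturate_add(2147483640, 100) == 2147483647  (would overflow; clamp to highest)
//   saturate_add(2147483640, 5)   == 2147483645  (no overflow; plain addition)
// This keeps the grid-stride loops below from wrapping around when the last
// step would run past the end of the index space.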
515 // A functor that adds step_size to a given index, saturating to avoid
516 // overflow/underflow. If overflow/underflow is not possible, regular addition
517 // is used (for efficiency).
518 template <typename Index>
519 struct SafeStep {
520  // lastIdx is one past the end of the possible indexes.
521  // step_size is the value that will be added to the given index when the
522  // functor is called.
523  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size)
524  : can_overflow_(sum_will_overflow(lastIdx, step_size)), step_size_(step_size) {}
525 
526  // Adds step_size to index, saturating on overflow (if overflow is possible).
527  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const {
528  return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
529  }
530 
531  private:
532  const bool can_overflow_;
533  const Index step_size_;
534 };
535 
536 template <typename Evaluator, typename StorageIndex, bool Vectorizable>
537 struct EigenMetaKernelEval {
538  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
539  StorageIndex step_size) {
540  SafeStep<StorageIndex> safe_step(lastIdx, step_size);
541  for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
542  eval.evalScalar(i);
543  }
544  }
545 };
546 
547 template <typename Evaluator, typename StorageIndex>
548 struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
549  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
550  StorageIndex step_size) {
551  const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
552  const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
553  const StorageIndex vectorized_step_size = step_size * PacketSize;
554 
555  SafeStep<StorageIndex> safe_vectorized_step(vectorized_size, vectorized_step_size);
556  // Use the vector path
557  for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size; i = safe_vectorized_step(i)) {
558  eval.evalPacket(i);
559  }
560  SafeStep<StorageIndex> safe_step(lastIdx, step_size);
561  for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx; i = safe_step(i)) {
562  eval.evalScalar(i);
563  }
564  }
565 };
566 
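// Worked example (illustrative): PacketSize == 4, two GPU threads
// (step_size == 2), lastIdx == 11:
//   vectorized_size = (11 / 4) * 4 = 8, vectorized_step_size = 2 * 4 = 8
//   thread 0 (firstIdx == 0): evalPacket(0), then evalScalar(8), evalScalar(10)
//   thread 1 (firstIdx == 1): evalPacket(4), then evalScalar(9)
// The packet loop covers [0, 8) and the scalar tail covers [8, 11) without
// overlap between threads.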
567 template <typename Evaluator, typename StorageIndex>
568 __global__ void __launch_bounds__(1024) EigenMetaKernel(Evaluator eval, StorageIndex size) {
569  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
570  const StorageIndex step_size = blockDim.x * gridDim.x;
571 
572  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
573  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
574 }
575 
576 /*static*/
577 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
578 EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(const Expression& expr,
579                                                                                           const GpuDevice& device) {
580  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
581  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
582  if (needs_assign) {
583  const int block_size = device.maxGpuThreadsPerBlock();
584  const int max_blocks = static_cast<int>(
585  numext::mini<int64_t>(device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor(),
586  NumTraits<StorageIndex>::highest()) /
587  block_size);
588  const StorageIndex size = array_prod(evaluator.dimensions());
589  // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
590  const int num_blocks = numext::maxi<int>(
591  numext::mini<int>(max_blocks, static_cast<int>(numext::div_ceil<StorageIndex>(size, block_size))), 1);
592 
593  LAUNCH_GPU_KERNEL((EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>), num_blocks, block_size,
594  0, device, evaluator, size);
595  }
596  evaluator.cleanup();
597 }
598 
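// Worked example (illustrative numbers): on a device reporting
// maxGpuThreadsPerBlock() == 1024, 80 multiprocessors and 2048 threads per
// multiprocessor, a tensor of 1,000,000 coefficients gives
//   block_size = 1024
//   max_blocks = (80 * 2048) / 1024 = 160
//   num_blocks = max(min(160, div_ceil(1000000, 1024)), 1) = min(160, 977) = 160
// so the kernel runs as <<<160, 1024>>> and each thread strides the index
// space by 160 * 1024.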
599 #endif // EIGEN_GPUCC
600 #endif // EIGEN_USE_GPU
601 
602 // SYCL Executor policy
603 #ifdef EIGEN_USE_SYCL
604 
605 template <typename Evaluator>
606 struct ExecExprFunctorKernel {
607  typedef typename Evaluator::Index Index;
608  Evaluator evaluator;
609  const Index range;
610  template <typename Scratch>
611  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(const Scratch, Evaluator evaluator_, const Index range_)
612  : evaluator(evaluator_), range(range_) {}
613 
614  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { compute(itemID); }
615  template <bool is_vec = Evaluator::PacketAccess>
616  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec> compute(const cl::sycl::nd_item<1>& itemID) const {
617  Index gId = static_cast<Index>(itemID.get_global_linear_id());
618  Index total_threads = itemID.get_global_range(0);
619 
620  for (Index i = gId; i < range; i += total_threads) {
621  evaluator.evalScalar(i);
622  }
623  }
624  template <bool is_vec = Evaluator::PacketAccess>
625  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec> compute(const cl::sycl::nd_item<1>& itemID) const {
626  const Index vectorizedRange = (range / Evaluator::PacketSize) * Evaluator::PacketSize;
627  Index gId = static_cast<Index>(itemID.get_global_linear_id());
628  const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
629  const Index start = Evaluator::PacketSize * gId;
630  for (Index i = start; i < vectorizedRange; i += step) {
631  evaluator.evalPacket(i);
632  }
633  gId += vectorizedRange;
634  for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
635  evaluator.evalScalar(i);
636  }
637  }
638 };
639 
640 template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
641 class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
642  public:
643  typedef typename Expression::Index Index;
644  static EIGEN_STRONG_INLINE void run(const Expression& expr, const Eigen::SyclDevice& dev) {
645  typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
646  Evaluator evaluator(expr, dev);
647  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
648  if (needs_assign) {
649  Index range, GRange, tileSize;
650  Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
651  total_size = (total_size == 0) ? 1 : total_size;
652  const int PacketSize = Eigen::PacketType<typename Evaluator::CoeffReturnType, Eigen::SyclDevice>::size;
653  Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
654  dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
655  range = total_size;
656 
657  dev.template nullary_kernel_launcher<typename Evaluator::CoeffReturnType, ExecExprFunctorKernel<Evaluator> >(
658  evaluator, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
659  range)
660  .wait();
661  }
662  evaluator.cleanup();
663  }
664 };
665 
666 #endif
667 
668 } // end namespace internal
669 
670 } // end namespace Eigen
671 
672 #endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H