TensorScanSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorScanSycl.h
 *
 * \brief:
 * Tensor Scan SYCL implements an extended version of
 * "Efficient parallel scan algorithms for GPUs" for Tensor operations.
 * The algorithm requires up to 3 stages (and consequently 3 kernel
 * launches), depending on the size of the tensor. In the first kernel
 * (ScanKernelFunctor), each thread within a work-group individually
 * scans the elements allocated to it, in order to reduce the total
 * number of blocks; all threads within the work-group then reduce the
 * associated blocks into a temporary buffer. In the second kernel
 * (ScanKernelFunctor instantiated with scan_step::second), the
 * temporary buffer is given as input, and all threads within a
 * work-group scan and reduce the boundaries between the blocks
 * (generated by the previous kernel), writing the data back to the
 * temporary buffer. If the second kernel is required, a third and
 * final kernel (ScanAdjustmentKernelFunctor) merges the adjusted
 * partial results into the output buffer.
 * The original algorithm for the parallel prefix sum can be found here:
 *
 * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient
 * parallel scan algorithms for GPUs." NVIDIA, Santa Clara, CA,
 * Tech. Rep. NVR-2008-003 1, no. 1 (2008): 1-17.
 *****************************************************************/
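
/*
 * Usage sketch (illustrative only, not part of the original header): a scan
 * expression evaluated on a SyclDevice dispatches into the ScanLauncher
 * specialization at the bottom of this file. The names gpu_in, gpu_out and
 * the selector below are assumptions made for this example.
 *
 *   cl::sycl::default_selector selector;
 *   Eigen::QueueInterface queue_interface(selector);
 *   Eigen::SyclDevice sycl_device(&queue_interface);
 *   float *gpu_in = static_cast<float *>(sycl_device.allocate(32 * 64 * sizeof(float)));
 *   float *gpu_out = static_cast<float *>(sycl_device.allocate(32 * 64 * sizeof(float)));
 *   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(gpu_in, 32, 64);
 *   Eigen::TensorMap<Eigen::Tensor<float, 2>> out(gpu_out, 32, 64);
 *   out.device(sycl_device) = in.cumsum(1);  // inclusive scan along dimension 1
 */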

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace TensorSycl {
namespace internal {

#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
#endif

template <typename index_t>
struct ScanParameters {
  // must be power of 2
  static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
  const index_t total_size;
  const index_t non_scan_size;
  const index_t scan_size;
  const index_t non_scan_stride;
  const index_t scan_stride;
  const index_t panel_threads;
  const index_t group_threads;
  const index_t block_threads;
  const index_t elements_per_group;
  const index_t elements_per_block;
  const index_t loop_range;

  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
                 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
                 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
      : total_size(total_size_),
        non_scan_size(non_scan_size_),
        scan_size(scan_size_),
        non_scan_stride(non_scan_stride_),
        scan_stride(scan_stride_),
        panel_threads(panel_threads_),
        group_threads(group_threads_),
        block_threads(block_threads_),
        elements_per_group(elements_per_group_),
        elements_per_block(elements_per_block_),
        loop_range(loop_range_) {}
};

enum class scan_step { first, second };
template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
          scan_step stp>
struct ScanKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;

  LocalAccessor scratch;
  Evaluator dev_eval;
  OutAccessor out_ptr;
  OutAccessor tmp_ptr;
  const ScanParameters<Index> scanParameters;
  Op accumulator;
  const bool inclusive;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
                                                          OutAccessor out_accessor_, OutAccessor temp_accessor_,
                                                          const ScanParameters<Index> scanParameters_, Op accumulator_,
                                                          const bool inclusive_)
      : scratch(scratch_),
        dev_eval(dev_eval_),
        out_ptr(out_accessor_),
        tmp_ptr(temp_accessor_),
        scanParameters(scanParameters_),
        accumulator(accumulator_),
        inclusive(inclusive_) {}

  template <scan_step sst = stp, typename Input>
  std::enable_if_t<sst == scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(
      const Input &inpt, Index global_id) const {
    return inpt.coeff(global_id);
  }

  template <scan_step sst = stp, typename Input>
  std::enable_if_t<sst != scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(
      const Input &inpt, Index global_id) const {
    return inpt[global_id];
  }

  template <scan_step sst = stp, typename InclusiveOp>
  std::enable_if_t<sst == scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(
      InclusiveOp inclusive_op) const {
    inclusive_op();
  }

  template <scan_step sst = stp, typename InclusiveOp>
  std::enable_if_t<sst != scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(
      InclusiveOp) const {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;
      // we put one element per packet in scratch_mem
      const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
      const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
      CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
      CoeffReturnType inclusive_scan;
      // the actual panel size is scan_size * non_scan_size.
      // elements_per_panel is rounded up to a power of 2 for the binary tree
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      // This only takes effect when the size is bigger than elements_per_block
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      Index next_elements = 0;
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
                             (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
                           (global_id < scanParameters.total_size))
                              ? read(dev_eval, global_id)
                              : accumulator.initialize();
        next_elements += scanParameters.scan_stride;
      }
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
        }
      });
      // This for loop must run exactly twice (ScanPerThread / PacketSize == 2)
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        Index private_offset = 1;
        // build sum in place up the tree
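        // (Illustrative, assuming PacketSize == 4 and a sum accumulator: the
        // up-sweep turns [a, b, c, d] into [a, a+b, c, a+b+c+d].)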
        EIGEN_UNROLL_LOOP
        for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[bi] = accumulator.finalize(accum);
          }
          private_offset *= 2;
        }
        scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
            private_scan[PacketSize - 1 + packetIndex];
        private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
        // traverse down tree & build scan
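        // (Illustrative, continuing the example above: after the last element
        // is cleared, the down-sweep turns [a, a+b, c, 0] into the exclusive
        // prefix scan [0, a, a+b, a+b+c].)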
        EIGEN_UNROLL_LOOP
        for (Index d = 1; d < PacketSize; d *= 2) {
          private_offset >>= 1;
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[ai] = private_scan[bi];
            private_scan[bi] = accumulator.finalize(accum);
          }
        }
      }

      Index offset = 1;
      // build sum in place up the tree
      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch[ai], &accum);
          accumulator.reduce(scratch[bi], &accum);
          scratch[bi] = accumulator.finalize(accum);
        }
        offset *= 2;
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // next step optimisation
      if (local_id == 0) {
        if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
                                    scanParameters.non_scan_size +
                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
                                block_id;
          tmp_ptr[temp_id] = scratch[scratch_stride - 1 + scratch_offset];
        }
        // clear the last element
        scratch[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
      }
      // traverse down tree & build scan
      for (Index d = 1; d < scratch_stride; d *= 2) {
        offset >>= 1;
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch[ai], &accum);
          accumulator.reduce(scratch[bi], &accum);
          scratch[ai] = scratch[bi];
          scratch[bi] = accumulator.finalize(accum);
        }
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // This for loop must run exactly twice (ScanPerThread / PacketSize == 2)
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        EIGEN_UNROLL_LOOP
        for (Index i = 0; i < PacketSize; i++) {
          CoeffReturnType accum = private_scan[packetIndex + i];
          accumulator.reduce(scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
          private_scan[packetIndex + i] = accumulator.finalize(accum);
        }
      }
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
          private_scan[0] = accumulator.finalize(inclusive_scan);
        }
      });
      next_elements = 0;
      // write out the scanned private results
      EIGEN_UNROLL_LOOP
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
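          // For an exclusive scan, element i is written as-is; for an inclusive
          // scan the result is shifted by one, with slot ScanPerThread - 1
          // wrapping onto slot 0 (which holds the inclusive total).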
          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
          out_ptr[global_id] = private_scan[private_id];
        }
        next_elements += scanParameters.scan_stride;
      }
    }  // end for loop
  }
};

template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
struct ScanAdjustmentKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
  InAccessor in_ptr;
  OutAccessor out_ptr;
  const ScanParameters<Index> scanParameters;
  Op accumulator;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
                                                                    OutAccessor out_accessor_,
                                                                    const ScanParameters<Index> scanParameters_,
                                                                    Op accumulator_)
      : in_ptr(in_accessor_), out_ptr(out_accessor_), scanParameters(scanParameters_), accumulator(accumulator_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;

      // the actual panel size is scan_size * non_scan_size.
      // elements_per_panel is rounded up to a power of 2 for the binary tree
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      // This only takes effect when the size is bigger than elements_per_block
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;

      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
      const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
      CoeffReturnType adjust_val = in_ptr[in_id];

      Index next_elements = 0;
      EIGEN_UNROLL_LOOP
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
          CoeffReturnType accum = adjust_val;
          accumulator.reduce(out_ptr[global_id], &accum);
          out_ptr[global_id] = accumulator.finalize(accum);
        }
        next_elements += scanParameters.scan_stride;
      }
    }
  }
};

template <typename Index>
struct ScanInfo {
  const Index &total_size;
  const Index &scan_size;
  const Index &panel_size;
  const Index &non_scan_size;
  const Index &scan_stride;
  const Index &non_scan_stride;

  Index max_elements_per_block;
  Index block_size;
  Index panel_threads;
  Index group_threads;
  Index block_threads;
  Index elements_per_group;
  Index elements_per_block;
  Index loop_range;
  Index global_range;
  Index local_range;
  const Eigen::SyclDevice &dev;
  EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
                               const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
                               const Eigen::SyclDevice &dev_)
      : total_size(total_size_),
        scan_size(scan_size_),
        panel_size(panel_size_),
        non_scan_size(non_scan_size_),
        scan_stride(scan_stride_),
        non_scan_stride(non_scan_stride_),
        dev(dev_) {
    // must be power of 2
    local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
                           Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));

    max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;

    elements_per_group =
        dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
    const Index elements_per_panel = elements_per_group * non_scan_size;
    elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
    panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
    group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
    block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
    block_size = elements_per_group / elements_per_block;
#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
    const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
#else
    const Index max_threads = panel_threads * panel_size;
#endif
    global_range = roundUp(max_threads, local_range);
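    // Illustrative sizing, assuming ScanPerThread == 8 and local_range == 256:
    // scan_size = 1000 rounds up to elements_per_group = 1024, which fits in
    // max_elements_per_block = 2048, so elements_per_block = 1024 and
    // block_size = 1 (the adjustment kernel is not needed in that case).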
    loop_range = Index(
        std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
  }
  inline ScanParameters<Index> get_scan_parameter() {
    return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
                                 group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
  }
  inline cl::sycl::nd_range<1> get_thread_range() {
    return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
  }
};

template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
struct SYCLAdjustBlockOffset {
  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
                                                           Reducer &accumulator, const Index total_size,
                                                           const Index scan_size, const Index panel_size,
                                                           const Index non_scan_size, const Index scan_stride,
                                                           const Index non_scan_stride, const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);

    typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
        AdjustFunctor;
    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFunctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
                                                                       scan_info.max_elements_per_block,
                                                                       scan_info.get_scan_parameter(), accumulator)
        .wait();
  }
};

template <typename CoeffReturnType, scan_step stp>
struct ScanLauncher_impl {
  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
                                             const Index total_size, const Index scan_size, const Index panel_size,
                                             const Index non_scan_size, const Index scan_stride,
                                             const Index non_scan_stride, const bool inclusive,
                                             const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
    const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
    CoeffReturnType *temp_pointer =
        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);

    typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
           in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
           scan_info.get_scan_parameter(), accumulator, inclusive)
        .wait();

    if (scan_info.block_size > 1) {
      ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
          non_scan_size, Index(1), scan_info.block_size, false, dev);

      SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
          non_scan_stride, dev);
    }
    dev.deallocate_temp(temp_pointer);
  }
};

}  // namespace internal
}  // namespace TensorSycl
namespace internal {
template <typename Self, typename Reducer, bool vectorize>
struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
  typedef typename Self::Index Index;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  void operator()(Self &self, EvaluatorPointerType data) const {
    const Index total_size = internal::array_prod(self.dimensions());
    const Index scan_size = self.size();
    const Index scan_stride = self.stride();
    // this is the scan op (can be sum or ...)
    auto accumulator = self.accumulator();
    auto inclusive = !self.exclusive();
    auto consume_dim = self.consume_dim();
    auto dev = self.device();

    auto dims = self.inner().dimensions();

    Index non_scan_size = 1;
    Index panel_size = 1;
    if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < consume_dim; i++) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim + 1; i < Self::NumDims; i++) {
        panel_size *= dims[i];
      }
    } else {
      for (int i = Self::NumDims - 1; i > consume_dim; i--) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim - 1; i >= 0; i--) {
        panel_size *= dims[i];
      }
    }
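    // If the scan dimension is strided, the contiguous dimension belongs to the
    // non-scan extent (stride 1); if the scan runs along the contiguous
    // dimension, consecutive non-scan elements are scan_size apart.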
    const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
    auto eval_impl = self.inner();
    TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
        eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
        inclusive, dev);
  }
};
}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP