TensorReductionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 * This is the specialization of the reduction operation. A two-phase reduction
 * approach is used because the GPU has no global-memory synchronization across
 * different work-groups/thread blocks. Two kernels are therefore needed to
 * reduce the data: in the first phase each work-group/thread block reduces its
 * portion of the input locally and writes its partial result to global memory.
 * In the second phase (global reduction) a single work-group/thread block
 * reduces the intermediate results into one element.
 * An NVIDIA presentation explaining the optimized two-phase reduction
 * algorithm on GPUs:
 * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
 *
 *****************************************************************/

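// Illustrative usage sketch (not part of this header; the exact device-setup
// API may differ between Eigen and SYCL versions). It shows how the two-phase
// kernels below are typically reached: a reduction expression evaluated on an
// Eigen::SyclDevice dispatches them internally.
//
//   Eigen::QueueInterface queue_interface(cl::sycl::default_selector_v);
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   float *gpu_in = static_cast<float *>(sycl_device.allocate(n * sizeof(float)));
//   float *gpu_out = static_cast<float *>(sycl_device.allocate(sizeof(float)));
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> in(gpu_in, n);
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> out(gpu_out);
//   out.device(sycl_device) = in.sum();  // two-phase full reduction runs here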
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace TensorSycl {
namespace internal {

template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
  typedef Op type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &) {
    return accumulator;
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                           const Index &scale) {
    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
    return quotient_op(accumulator, CoeffReturnType(scale));
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &scale) {
    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
  }
};

template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
          Index local_range>
struct SecondStepFullReducer {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
  typedef typename OpDef::type Op;
  LocalAccessor scratch;
  InputAccessor aI;
  OutputAccessor outAcc;
  Op op;
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    // Our empirical research shows that the best performance is achieved when
    // there is only one element per thread to reduce in the second step; the
    // second-step reduction time is then almost negligible. Hence, the input
    // size of the second step is fixed to the local size, so each thread reads
    // exactly one element. The algorithm must be changed if the number of
    // elements reduced per thread in the second step is greater than one,
    // otherwise the result will be wrong.
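    // For example, with a local range of 8 the loop below visits offsets
    // 4, 2, 1: after the first pass scratch[0..3] each hold a pairwise
    // reduction, after the second pass scratch[0..1] do, and after the final
    // pass thread 0 holds the reduction of all 8 scratch entries.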
    const Index localid = itemID.get_local_id(0);
    auto aInPtr = aI + localid;
    auto aOutPtr = outAcc;
    CoeffReturnType *scratchptr = scratch.get_pointer();
    CoeffReturnType accumulator = *aInPtr;

    scratchptr[localid] = op.finalize(accumulator);
    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratchptr[localid + offset], &accumulator);
        scratchptr[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) *aOutPtr = op.finalize(accumulator);
  }
};

// Full reduction, first phase. In this version vectorization is enabled and the reduction
// accepts any generic reducer op, e.g. max, min, sum, mean, iamax, iamin, etc.
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
      OpDef;

  typedef typename OpDef::type Op;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::PacketReturnType PacketReturnType;
  typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), PacketReturnType,
                             CoeffReturnType>
      OutType;
  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;
  Evaluator evaluator;
  EvaluatorPointerType final_output;
  Index rng;
  Op op;

  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
                             Index rng_, OpType op_)
      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    Index start = Evaluator::PacketSize * globalid;
    // vectorizable parts
    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    for (Index i = start; i < VectorizedRange; i += step) {
      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    }
    globalid += VectorizedRange;
    // non vectorizable parts
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.template reducePacket<PacketReturnType>(
          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
              evaluator.impl().coeff(i), op.initialize()),
          &packetAccumulator);
    }
    scratch[localid] = packetAccumulator =
        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
    // reduction parts. The local size is always a power of 2.
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] =
          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    }
  }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    // scalar (non-vectorizable) parts
    CoeffReturnType accumulator = op.initialize();
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.reduce(evaluator.impl().coeff(i), &accumulator);
    }
    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);

    // reduction parts. The local size is always a power of 2.
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratch[localid + offset], &accumulator);
        scratch[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    }
  }
};

template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  template <typename Scratch>
  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                 Index range_, Index num_values_to_reduce_)
      : evaluator(evaluator_),
        output_accessor(output_accessor_),
        functor(OpDef::get_op(functor_)),
        range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    // This is to bypass the stateful condition in the Eigen MeanReducer.
    Op non_const_functor;
    std::memcpy(&non_const_functor, &functor, sizeof(Op));
    auto output_accessor_ptr = output_accessor;
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }

 private:
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op functor;
  Index range;
  Index num_values_to_reduce;
};

enum class reduction_dim { inner_most, outer_most };
// default is preserver
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAcc;
  ScratchAcc scratch;
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op op;
  const Index preserve_elements_num_groups;
  const Index reduce_elements_num_groups;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
      : scratch(scratch_),
        evaluator(evaluator_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        preserve_elements_num_groups(preserve_elements_num_groups_),
        reduce_elements_num_groups(reduce_elements_num_groups_),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                 CoeffReturnType &accumulator) const {
    if (globalPId >= num_coeffs_to_preserve) {
      return;
    }
    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
    Index localOffset = globalRId;

    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
    const Index per_thread_global_stride =
        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
      localOffset += per_thread_local_stride;
      global_offset += per_thread_global_stride;
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                           : itemID.get_group(0) / reduce_elements_num_groups;
    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                           : itemID.get_group(0) % reduce_elements_num_groups;

    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    CoeffReturnType *scratchPtr = scratch.get_pointer();
    auto outPtr = output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    CoeffReturnType accumulator = op.initialize();

    element_wise_reduce(globalRId, globalPId, accumulator);

    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
        accumulator;
    if (rt == reduction_dim::inner_most) {
      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    }

    /* Apply the reduction operation between the current local
     * id and the one on the other half of the vector. */
    auto out_scratch_ptr =
        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    itemID.barrier(cl::sycl::access::fence_space::local_space);
    if (rt == reduction_dim::inner_most) {
      accumulator = *out_scratch_ptr;
    }
    // The local LocalThreadSizeR is always a power of 2.
    EIGEN_UNROLL_LOOP
    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
      if (rLocalThreadId < offset) {
        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
        // The result has already been divided for the mean reducer in the
        // previous reduction, so there is no need to divide it again.
        *out_scratch_ptr = op.finalize(accumulator);
      }
      /* All threads collectively read from global memory into local.
       * The barrier ensures all threads' IO is resolved before
       * execution continues (strictly speaking, all threads within
       * a single work-group - there is no co-ordination between
       * work-groups, only work-items). */
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }

    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
      outPtr[globalPId] = op.finalize(accumulator);
    }
  }
};

template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAccessor;
  InputAccessor input_accessor;
  OutputAccessor output_accessor;
  Op op;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
                                                                   OutputAccessor output_accessor_, OpType op_,
                                                                   const Index num_coeffs_to_preserve_,
                                                                   const Index num_coeffs_to_reduce_)
      : input_accessor(input_accessor_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index globalId = itemID.get_global_id(0);

    if (globalId >= num_coeffs_to_preserve) return;

    auto in_ptr = input_accessor + globalId;

    OutScalar accumulator = op.initialize();
    // num_coeffs_to_reduce is not bigger than 256
    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
      op.reduce(*in_ptr, &accumulator);
      in_ptr += num_coeffs_to_preserve;
    }
    output_accessor[globalId] = op.finalize(accumulator);
  }
};

template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
  static EIGEN_CONSTEXPR bool BC = BC_;
};

template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;

  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);

    // getPowerOfTwo makes sure the local range is a power of 2 and <=
    // maxSyclThreadPerBlock; this helps us avoid an extra check in the
    // kernel.
    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");

    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
    // In this step we force the code to be at most a 2-step reduction:
    // Our empirical research shows that if each thread reduces at least 64
    // elements individually, we get better performance. However, this can
    // change on different platforms. It also shows that for the inner_most
    // dim reducer it is better to have 8 groups in the reduce dimension for
    // sizes > 1024 to achieve the best performance.
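    // Worked example (illustrative figures only, assuming
    // EIGEN_SYCL_LOCAL_THREAD_DIM0 = EIGEN_SYCL_LOCAL_THREAD_DIM1 = 16, i.e.
    // localRange = 256): for num_coeffs_to_preserve = 1000, roundUpP = 1008
    // and pNumGroups = 63. If the device reports 64 compute units (rounded to
    // a power of two), rGroups = (64 + 62) / 63 = 2, so a reduction with
    // num_coeffs_to_reduce = 100000 (> 64 * 256) uses rNumGroups = 2 and
    // launches globalRange = 63 * 2 * 256 work-items.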
    const Index reductionPerThread = 64;
    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    const Index globalRange = pNumGroups * rNumGroups * localRange;

    EIGEN_CONSTEXPR Index scratchSize =
        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    if (rNumGroups > 1) {
      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
             self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
             num_coeffs_to_reduce)
          .wait();
      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
          SecondStepPartialReductionKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
             temp_accessor, output,
             cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)),
             Index(1), reducer, num_coeffs_to_preserve, rNumGroups)
          .wait();
      self.device().deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
             self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
             num_coeffs_to_reduce)
          .wait();
    }
    return false;
  }
};
}  // namespace internal
}  // namespace TensorSycl

namespace internal {

template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");
    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    // In this step we force the code to be at most a 2-step reduction:
    // Our empirical research shows that if each thread reduces at least 512
    // elements individually, we get better performance.
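    // Worked example (illustrative figures only, assuming
    // EIGEN_SYCL_LOCAL_THREAD_DIM0 = EIGEN_SYCL_LOCAL_THREAD_DIM1 = 16, i.e.
    // local_range = 256): for inputSize = 1048576 and reductionPerThread =
    // 2048, reductionGroup = getPowerOfTwo(2, true) = 2, num_work_group =
    // min(2, 256) = 2 and global_range = 512, so the second-step kernel only
    // has to combine two partial results.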
    const Index reductionPerThread = 2048;
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
    const Index global_range = num_work_group * local_range;

    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    if (num_work_group > 1) {
      CoeffReturnType *temp_pointer =
          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                      local_range, inputSize, reducer)
          .wait();
      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                          EvaluatorPointerType, Index, local_range>
          GenericRKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
             tmp_global_accessor, data,
             cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)),
             num_work_group, reducer)
          .wait();
      dev.deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                      reducer)
          .wait();
    }
  }
};
// vectorizable inner_most dim preserver
// col reduction
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
// row reduction
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};

// ArgMax uses this kernel for partial reduction.
// TODO(@mehdi.goli) come up with a better kernel
// generic partial reduction
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
           self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
           reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1))
        .wait();
    return false;
  }
};

}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP