430 "The Local thread size must be a power of 2 for the reduction "
441 const Index reductionPerThread = 64;
442 Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(),
true);
444 Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
445 const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ?
std::min(rGroups, localRange) : 1;
446 const Index globalRange = pNumGroups * rNumGroups * localRange;
450 auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
451 if (rNumGroups > 1) {
453 dev.allocate_temp(num_coeffs_to_preserve * rNumGroups *
sizeof(
CoeffReturnType)));
455 dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
456 self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
457 num_coeffs_to_reduce)
459 typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
460 SecondStepPartialReductionKernel;
461 dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
463 cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)),
464 Index(1), reducer, num_coeffs_to_preserve, rNumGroups)
466 self.device().deallocate_temp(temp_pointer);
468 dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
469 self,
output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
470 num_coeffs_to_reduce)
#define EIGEN_CONSTEXPR
Definition: Macros.h:758
#define min(a, b)
Definition: datatypes.h:22
void output(std::ostream &outfile, const unsigned &nplot)
Overload output function.
Definition: overloaded_element_body.h:490
Self::EvaluatorPointerType EvaluatorPointerType
Definition: TensorReductionSycl.h:412
Self::Index Index
Definition: TensorReductionSycl.h:415
Self::CoeffReturnType CoeffReturnType
Definition: TensorReductionSycl.h:413
static EIGEN_CONSTEXPR Index LocalThreadSizeR
Definition: TensorReductionSycl.h:406
static EIGEN_CONSTEXPR Index LocalThreadSizeP
Definition: TensorReductionSycl.h:405
static EIGEN_CONSTEXPR bool BC
Definition: TensorReductionSycl.h:407