Eigen::TensorSycl::internal::PartialReducerLauncher< Self, Op, rt > Struct Template Reference

#include <TensorReductionSycl.h>

Public Types

typedef Self::EvaluatorPointerType EvaluatorPointerType
 
typedef Self::CoeffReturnType CoeffReturnType
 
typedef Self::Storage Storage
 
typedef Self::Index Index
 
typedef ReductionPannel< typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true > PannelParameters
 
typedef PartialReductionKernel< Self, Op, PannelParameters, rt > SyclReducerKerneType
 

Static Public Member Functions

static bool run (const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output, Index num_coeffs_to_reduce, Index num_coeffs_to_preserve)
 

Member Typedef Documentation

◆ CoeffReturnType

template<typename Self , typename Op , TensorSycl::internal::reduction_dim rt>
typedef Self::CoeffReturnType Eigen::TensorSycl::internal::PartialReducerLauncher< Self, Op, rt >::CoeffReturnType

◆ EvaluatorPointerType

template<typename Self , typename Op , TensorSycl::internal::reduction_dim rt>
typedef Self::EvaluatorPointerType Eigen::TensorSycl::internal::PartialReducerLauncher< Self, Op, rt >::EvaluatorPointerType

◆ Index

template<typename Self , typename Op , TensorSycl::internal::reduction_dim rt>
typedef Self::Index Eigen::TensorSycl::internal::PartialReducerLauncher< Self, Op, rt >::Index

◆ PannelParameters

template<typename Self , typename Op , TensorSycl::internal::reduction_dim rt>
typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true> Eigen::TensorSycl::internal::PartialReducerLauncher< Self, Op, rt >::PannelParameters

◆ Storage

template<typename Self , typename Op , TensorSycl::internal::reduction_dim rt>
typedef Self::Storage Eigen::TensorSycl::internal::PartialReducerLauncher< Self, Op, rt >::Storage

◆ SyclReducerKerneType

template<typename Self , typename Op , TensorSycl::internal::reduction_dim rt>
typedef PartialReductionKernel<Self, Op, PannelParameters, rt> Eigen::TensorSycl::internal::PartialReducerLauncher< Self, Op, rt >::SyclReducerKerneType

Member Function Documentation

◆ run()

template<typename Self , typename Op , TensorSycl::internal::reduction_dim rt>
static bool Eigen::TensorSycl::internal::PartialReducerLauncher< Self, Op, rt >::run ( const Self &  self,
const Op &  reducer,
const Eigen::SyclDevice &  dev,
EvaluatorPointerType  output,
Index  num_coeffs_to_reduce,
Index  num_coeffs_to_preserve 
)
inline static
422  {
423  Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
424 
425  // getPowerOfTwo makes sure local range is power of 2 and <=
426  // maxSyclThreadPerBlock; this will help us to avoid an extra check in the
427  // kernel
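428  static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
429  (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
430  "The Local thread size must be a power of 2 for the reduction "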
431  "operation");
432 
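433  EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
434  // In this step, we force the code not to be more than 2-step reduction: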
435  // Our empirical research shows that if each thread reduces at least 64
436  // elements individually, we get better performance. However, this can change
437  // on different platforms. In this step we force the code not to be
438  // more than a 2-step reduction: Our empirical research shows that for inner_most
439  // dim reducer, it is better to have 8 group in a reduce dimension for sizes
440  // > 1024 to achieve the best performance.
441  const Index reductionPerThread = 64;
442  Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
443  const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
444  Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
445  const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
446  const Index globalRange = pNumGroups * rNumGroups * localRange;
447 
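448  EIGEN_CONSTEXPR Index scratchSize =
449  PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);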
450  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
451  if (rNumGroups > 1) {
452  CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
453  dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
454  EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
455  dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
456  self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
457  num_coeffs_to_reduce)
458  .wait();
459  typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
460  SecondStepPartialReductionKernel;
461  dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
462  temp_accessor, output,
463  cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)),
464  Index(1), reducer, num_coeffs_to_preserve, rNumGroups)
465  .wait();
466  self.device().deallocate_temp(temp_pointer);
467  } else {
468  dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
469  self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
470  num_coeffs_to_reduce)
471  .wait();
472  }
473  return false;
474  }
#define EIGEN_CONSTEXPR
Definition: Macros.h:758
#define min(a, b)
Definition: datatypes.h:22
void output(std::ostream &outfile, const unsigned &nplot)
Overload output function.
Definition: overloaded_element_body.h:490
Self::EvaluatorPointerType EvaluatorPointerType
Definition: TensorReductionSycl.h:412
Self::Index Index
Definition: TensorReductionSycl.h:415
Self::CoeffReturnType CoeffReturnType
Definition: TensorReductionSycl.h:413
static EIGEN_CONSTEXPR Index LocalThreadSizeR
Definition: TensorReductionSycl.h:406
static EIGEN_CONSTEXPR Index LocalThreadSizeP
Definition: TensorReductionSycl.h:405
static EIGEN_CONSTEXPR bool BC
Definition: TensorReductionSycl.h:407

References Eigen::TensorSycl::internal::ReductionPannel< Index, LTP, LTR, BC_ >::BC, EIGEN_CONSTEXPR, Eigen::TensorSycl::internal::ReductionPannel< Index, LTP, LTR, BC_ >::LocalThreadSizeP, Eigen::TensorSycl::internal::ReductionPannel< Index, LTP, LTR, BC_ >::LocalThreadSizeR, min, and output().


The documentation for this struct was generated from the following file: