TensorReductionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 * This is the specialization of the reduction operation. A two-phase reduction
 * approach is used because the GPU has no global-memory synchronization across
 * different work-groups/thread blocks. Two kernels are therefore needed to
 * reduce the data: in the first phase each work-group/thread block reduces its
 * portion of the input locally and writes its partial result to global memory.
 * In the second phase (global reduction) a single work-group/thread block
 * reduces the intermediate results into one element.
 * An NVIDIA presentation explaining the optimized two-phase reduction
 * algorithm on GPUs:
 * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
 *
 *****************************************************************/

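// Illustrative usage sketch (not part of this header; the exact device-setup
// API may differ between Eigen and SYCL versions). It shows how the two-phase
// kernels below are typically reached: a reduction expression evaluated on an
// Eigen::SyclDevice dispatches them internally.
//
//   Eigen::QueueInterface queue_interface(cl::sycl::default_selector_v);
//   Eigen::SyclDevice sycl_device(&queue_interface);
//   float *gpu_in = static_cast<float *>(sycl_device.allocate(n * sizeof(float)));
//   float *gpu_out = static_cast<float *>(sycl_device.allocate(sizeof(float)));
//   Eigen::TensorMap<Eigen::Tensor<float, 1>> in(gpu_in, n);
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> out(gpu_out);
//   out.device(sycl_device) = in.sum();  // two-phase full reduction runs here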
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace TensorSycl {
namespace internal {

template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
  typedef Op type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &) {
    return accumulator;
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                           const Index &scale) {
    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
    return quotient_op(accumulator, CoeffReturnType(scale));
  }
};

template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &scale) {
    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
  }
};

template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
          Index local_range>
struct SecondStepFullReducer {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
  typedef typename OpDef::type Op;
  LocalAccessor scratch;
  InputAccessor aI;
  OutputAccessor outAcc;
  Op op;
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    // Our empirical research shows that the best performance is achieved when
    // there is only one element per thread to reduce in the second step; the
    // second-step reduction time is then almost negligible. Hence, the input
    // size of the second step is fixed to the local size, so each thread reads
    // exactly one element. The algorithm must be changed if the number of
    // elements reduced per thread in the second step is greater than one,
    // otherwise the result will be wrong.
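    // For example, with a local range of 8 the loop below visits offsets
    // 4, 2, 1: after the first pass scratch[0..3] each hold a pairwise
    // reduction, after the second pass scratch[0..1] do, and after the final
    // pass thread 0 holds the reduction of all 8 scratch entries.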
    const Index localid = itemID.get_local_id(0);
    auto aInPtr = aI + localid;
    auto aOutPtr = outAcc;
    CoeffReturnType *scratchptr = scratch.get_pointer();
    CoeffReturnType accumulator = *aInPtr;

    scratchptr[localid] = op.finalize(accumulator);
    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratchptr[localid + offset], &accumulator);
        scratchptr[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) *aOutPtr = op.finalize(accumulator);
  }
};

// Full reduction, first phase. In this version vectorization is enabled and the reduction
// accepts any generic reducer op, e.g. max, min, sum, mean, iamax, iamin, etc.
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
      OpDef;

  typedef typename OpDef::type Op;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::PacketReturnType PacketReturnType;
  typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), PacketReturnType,
                             CoeffReturnType>
      OutType;
  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;
  Evaluator evaluator;
  EvaluatorPointerType final_output;
  Index rng;
  Op op;

  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
                             Index rng_, OpType op_)
      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    Index start = Evaluator::PacketSize * globalid;
    // vectorizable parts
    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    for (Index i = start; i < VectorizedRange; i += step) {
      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    }
    globalid += VectorizedRange;
    // non vectorizable parts
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.template reducePacket<PacketReturnType>(
          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
              evaluator.impl().coeff(i), op.initialize()),
          &packetAccumulator);
    }
    scratch[localid] = packetAccumulator =
        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
    // reduction parts. The local size is always a power of 2.
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] =
          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    }
  }

  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
      const cl::sycl::nd_item<1> &itemID) const {
    auto output_ptr = final_output;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    // scalar (non-vectorizable) parts
    CoeffReturnType accumulator = op.initialize();
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.reduce(evaluator.impl().coeff(i), &accumulator);
    }
    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);

    // reduction parts. The local size is always a power of 2.
    EIGEN_UNROLL_LOOP
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratch[localid + offset], &accumulator);
        scratch[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    }
  }
};

template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  template <typename Scratch>
  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                 Index range_, Index num_values_to_reduce_)
      : evaluator(evaluator_),
        output_accessor(output_accessor_),
        functor(OpDef::get_op(functor_)),
        range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) const {
    // This is to bypass the stateful condition in the Eigen MeanReducer.
    Op non_const_functor;
    std::memcpy(&non_const_functor, &functor, sizeof(Op));
    auto output_accessor_ptr = output_accessor;
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }

 private:
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op functor;
  Index range;
  Index num_values_to_reduce;
};

enum class reduction_dim { inner_most, outer_most };
// default is preserver
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAcc;
  ScratchAcc scratch;
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op op;
  const Index preserve_elements_num_groups;
  const Index reduce_elements_num_groups;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
      : scratch(scratch_),
        evaluator(evaluator_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        preserve_elements_num_groups(preserve_elements_num_groups_),
        reduce_elements_num_groups(reduce_elements_num_groups_),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                 CoeffReturnType &accumulator) const {
    if (globalPId >= num_coeffs_to_preserve) {
      return;
    }
    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
    Index localOffset = globalRId;

    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
    const Index per_thread_global_stride =
        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
      localOffset += per_thread_local_stride;
      global_offset += per_thread_global_stride;
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                           : itemID.get_group(0) / reduce_elements_num_groups;
    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                           : itemID.get_group(0) % reduce_elements_num_groups;

    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    CoeffReturnType *scratchPtr = scratch.get_pointer();
    auto outPtr = output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    CoeffReturnType accumulator = op.initialize();

    element_wise_reduce(globalRId, globalPId, accumulator);

    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
        accumulator;
    if (rt == reduction_dim::inner_most) {
      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    }

    /* Apply the reduction operation between the current local
     * id and the one on the other half of the vector. */
    auto out_scratch_ptr =
        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    itemID.barrier(cl::sycl::access::fence_space::local_space);
    if (rt == reduction_dim::inner_most) {
      accumulator = *out_scratch_ptr;
    }
    // The local LocalThreadSizeR is always a power of 2.
    EIGEN_UNROLL_LOOP
    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
      if (rLocalThreadId < offset) {
        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
        // The result has already been divided for the mean reducer in the
        // previous reduction, so there is no need to divide it again.
        *out_scratch_ptr = op.finalize(accumulator);
      }
      /* All threads collectively read from global memory into local.
       * The barrier ensures all threads' IO is resolved before
       * execution continues (strictly speaking, all threads within
       * a single work-group - there is no co-ordination between
       * work-groups, only work-items). */
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }

    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
      outPtr[globalPId] = op.finalize(accumulator);
    }
  }
};

template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAccessor;
  InputAccessor input_accessor;
  OutputAccessor output_accessor;
  Op op;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
                                                                   OutputAccessor output_accessor_, OpType op_,
                                                                   const Index num_coeffs_to_preserve_,
                                                                   const Index num_coeffs_to_reduce_)
      : input_accessor(input_accessor_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    const Index globalId = itemID.get_global_id(0);

    if (globalId >= num_coeffs_to_preserve) return;

    auto in_ptr = input_accessor + globalId;

    OutScalar accumulator = op.initialize();
    // num_coeffs_to_reduce is not bigger than 256
    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
      op.reduce(*in_ptr, &accumulator);
      in_ptr += num_coeffs_to_preserve;
    }
    output_accessor[globalId] = op.finalize(accumulator);
  }
};

template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
  static EIGEN_CONSTEXPR bool BC = BC_;
};

template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;

  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);

    // getPowerOfTwo makes sure the local range is a power of 2 and <=
    // maxSyclThreadPerBlock; this helps us avoid an extra check in the
    // kernel.
    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");

    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
    // In this step we force the code to be at most a 2-step reduction:
    // Our empirical research shows that if each thread reduces at least 64
    // elements individually, we get better performance. However, this can
    // change on different platforms. It also shows that for the inner_most
    // dim reducer it is better to have 8 groups in the reduce dimension for
    // sizes > 1024 to achieve the best performance.
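    // Worked example (illustrative figures only, assuming
    // EIGEN_SYCL_LOCAL_THREAD_DIM0 = EIGEN_SYCL_LOCAL_THREAD_DIM1 = 16, i.e.
    // localRange = 256): for num_coeffs_to_preserve = 1000, roundUpP = 1008
    // and pNumGroups = 63. If the device reports 64 compute units (rounded to
    // a power of two), rGroups = (64 + 62) / 63 = 2, so a reduction with
    // num_coeffs_to_reduce = 100000 (> 64 * 256) uses rNumGroups = 2 and
    // launches globalRange = 63 * 2 * 256 work-items.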
    const Index reductionPerThread = 64;
    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    const Index globalRange = pNumGroups * rNumGroups * localRange;

    EIGEN_CONSTEXPR Index scratchSize =
        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    if (rNumGroups > 1) {
      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
             self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
             num_coeffs_to_reduce)
          .wait();
      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
          SecondStepPartialReductionKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
             temp_accessor, output,
             cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)),
             Index(1), reducer, num_coeffs_to_preserve, rNumGroups)
          .wait();
      self.device().deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
             self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
             num_coeffs_to_reduce)
          .wait();
    }
    return false;
  }
};
}  // namespace internal
}  // namespace TensorSycl

namespace internal {

template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");
    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    // In this step we force the code to be at most a 2-step reduction:
    // Our empirical research shows that if each thread reduces at least 512
    // elements individually, we get better performance.
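    // Worked example (illustrative figures only, assuming
    // EIGEN_SYCL_LOCAL_THREAD_DIM0 = EIGEN_SYCL_LOCAL_THREAD_DIM1 = 16, i.e.
    // local_range = 256): for inputSize = 1048576 and reductionPerThread =
    // 2048, reductionGroup = getPowerOfTwo(2, true) = 2, num_work_group =
    // min(2, 256) = 2 and global_range = 512, so the second-step kernel only
    // has to combine two partial results.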
    const Index reductionPerThread = 2048;
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
    const Index global_range = num_work_group * local_range;

    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    if (num_work_group > 1) {
      CoeffReturnType *temp_pointer =
          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                      local_range, inputSize, reducer)
          .wait();
      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                          EvaluatorPointerType, Index, local_range>
          GenericRKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
             tmp_global_accessor, data,
             cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)),
             num_work_group, reducer)
          .wait();
      dev.deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                      reducer)
          .wait();
    }
  }
};
// vectorizable inner_most dim preserver
// col reduction
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
// row reduction
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};

// ArgMax uses this kernel for partial reduction.
// TODO(@mehdi.goli) come up with a better kernel
// generic partial reduction
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
           self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
           reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1))
        .wait();
    return false;
  }
};

}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP