TensorScanSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorScanSycl.h
 *
 * \brief:
 * Tensor Scan SYCL implements an extended version of
 * "Efficient parallel scan algorithms for GPUs" for Tensor operations.
 * The algorithm requires up to 3 stages (and consequently 3 kernel
 * launches), depending on the size of the tensor. In the first kernel
 * (ScanKernelFunctor), each thread within a work-group individually
 * scans the elements allocated to it, in order to reduce the total
 * number of blocks; all threads within the work-group then reduce the
 * associated blocks into a temporary buffer. In the second kernel
 * (ScanKernelFunctor instantiated with scan_step::second), the
 * temporary buffer is given as input, and all threads within a
 * work-group scan and reduce the boundaries between the blocks
 * (generated by the previous kernel), writing the data back to the
 * temporary buffer. If the second kernel is required, a third and
 * final kernel (ScanAdjustmentKernelFunctor) merges the adjusted
 * partial results into the output buffer.
 * The original algorithm for the parallel prefix sum can be found here:
 *
 * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient
 * parallel scan algorithms for GPUs." NVIDIA, Santa Clara, CA,
 * Tech. Rep. NVR-2008-003 1, no. 1 (2008): 1-17.
 *****************************************************************/
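
/*
 * Usage sketch (illustrative only, not part of the original header): a scan
 * expression evaluated on a SyclDevice dispatches into the ScanLauncher
 * specialization at the bottom of this file. The names gpu_in, gpu_out and
 * the selector below are assumptions made for this example.
 *
 *   cl::sycl::default_selector selector;
 *   Eigen::QueueInterface queue_interface(selector);
 *   Eigen::SyclDevice sycl_device(&queue_interface);
 *   float *gpu_in = static_cast<float *>(sycl_device.allocate(32 * 64 * sizeof(float)));
 *   float *gpu_out = static_cast<float *>(sycl_device.allocate(32 * 64 * sizeof(float)));
 *   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(gpu_in, 32, 64);
 *   Eigen::TensorMap<Eigen::Tensor<float, 2>> out(gpu_out, 32, 64);
 *   out.device(sycl_device) = in.cumsum(1);  // inclusive scan along dimension 1
 */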

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace TensorSycl {
namespace internal {

#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
#endif

template <typename index_t>
struct ScanParameters {
  // must be power of 2
  static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
  const index_t total_size;
  const index_t non_scan_size;
  const index_t scan_size;
  const index_t non_scan_stride;
  const index_t scan_stride;
  const index_t panel_threads;
  const index_t group_threads;
  const index_t block_threads;
  const index_t elements_per_group;
  const index_t elements_per_block;
  const index_t loop_range;

  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
                 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
                 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
      : total_size(total_size_),
        non_scan_size(non_scan_size_),
        scan_size(scan_size_),
        non_scan_stride(non_scan_stride_),
        scan_stride(scan_stride_),
        panel_threads(panel_threads_),
        group_threads(group_threads_),
        block_threads(block_threads_),
        elements_per_group(elements_per_group_),
        elements_per_block(elements_per_block_),
        loop_range(loop_range_) {}
};

enum class scan_step { first, second };
template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
          scan_step stp>
struct ScanKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;

  LocalAccessor scratch;
  Evaluator dev_eval;
  OutAccessor out_ptr;
  OutAccessor tmp_ptr;
  const ScanParameters<Index> scanParameters;
  Op accumulator;
  const bool inclusive;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
                                                          OutAccessor out_accessor_, OutAccessor temp_accessor_,
                                                          const ScanParameters<Index> scanParameters_, Op accumulator_,
                                                          const bool inclusive_)
      : scratch(scratch_),
        dev_eval(dev_eval_),
        out_ptr(out_accessor_),
        tmp_ptr(temp_accessor_),
        scanParameters(scanParameters_),
        accumulator(accumulator_),
        inclusive(inclusive_) {}

  template <scan_step sst = stp, typename Input>
  std::enable_if_t<sst == scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(
      const Input &inpt, Index global_id) const {
    return inpt.coeff(global_id);
  }

  template <scan_step sst = stp, typename Input>
  std::enable_if_t<sst != scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(
      const Input &inpt, Index global_id) const {
    return inpt[global_id];
  }

  template <scan_step sst = stp, typename InclusiveOp>
  std::enable_if_t<sst == scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(
      InclusiveOp inclusive_op) const {
    inclusive_op();
  }

  template <scan_step sst = stp, typename InclusiveOp>
  std::enable_if_t<sst != scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(
      InclusiveOp) const {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;
      // we put one element per packet in scratch_mem
      const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
      const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
      CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
      CoeffReturnType inclusive_scan;
      // the actual panel size is scan_size * non_scan_size.
      // elements_per_panel is rounded up to a power of 2 for the binary tree
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      // This only takes effect when the size is bigger than elements_per_block
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      Index next_elements = 0;
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
                             (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
                           (global_id < scanParameters.total_size))
                              ? read(dev_eval, global_id)
                              : accumulator.initialize();
        next_elements += scanParameters.scan_stride;
      }
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
        }
      });
      // This for loop must run exactly twice (ScanPerThread / PacketSize == 2)
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        Index private_offset = 1;
        // build sum in place up the tree
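        // (Illustrative, assuming PacketSize == 4 and a sum accumulator: the
        // up-sweep turns [a, b, c, d] into [a, a+b, c, a+b+c+d].)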
        EIGEN_UNROLL_LOOP
        for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[bi] = accumulator.finalize(accum);
          }
          private_offset *= 2;
        }
        scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
            private_scan[PacketSize - 1 + packetIndex];
        private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
        // traverse down tree & build scan
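        // (Illustrative, continuing the example above: after the last element
        // is cleared, the down-sweep turns [a, a+b, c, 0] into the exclusive
        // prefix scan [0, a, a+b, a+b+c].)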
        EIGEN_UNROLL_LOOP
        for (Index d = 1; d < PacketSize; d *= 2) {
          private_offset >>= 1;
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[ai] = private_scan[bi];
            private_scan[bi] = accumulator.finalize(accum);
          }
        }
      }

      Index offset = 1;
      // build sum in place up the tree
      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch[ai], &accum);
          accumulator.reduce(scratch[bi], &accum);
          scratch[bi] = accumulator.finalize(accum);
        }
        offset *= 2;
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // next step optimisation
      if (local_id == 0) {
        if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
                                    scanParameters.non_scan_size +
                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
                                block_id;
          tmp_ptr[temp_id] = scratch[scratch_stride - 1 + scratch_offset];
        }
        // clear the last element
        scratch[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
      }
      // traverse down tree & build scan
      for (Index d = 1; d < scratch_stride; d *= 2) {
        offset >>= 1;
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch[ai], &accum);
          accumulator.reduce(scratch[bi], &accum);
          scratch[ai] = scratch[bi];
          scratch[bi] = accumulator.finalize(accum);
        }
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // This for loop must run exactly twice (ScanPerThread / PacketSize == 2)
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        EIGEN_UNROLL_LOOP
        for (Index i = 0; i < PacketSize; i++) {
          CoeffReturnType accum = private_scan[packetIndex + i];
          accumulator.reduce(scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
          private_scan[packetIndex + i] = accumulator.finalize(accum);
        }
      }
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
          private_scan[0] = accumulator.finalize(inclusive_scan);
        }
      });
      next_elements = 0;
      // write out the scanned private results
      EIGEN_UNROLL_LOOP
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
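          // For an exclusive scan, element i is written as-is; for an inclusive
          // scan the result is shifted by one, with slot ScanPerThread - 1
          // wrapping onto slot 0 (which holds the inclusive total).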
          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
          out_ptr[global_id] = private_scan[private_id];
        }
        next_elements += scanParameters.scan_stride;
      }
    }  // end for loop
  }
};

template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
struct ScanAdjustmentKernelFunctor {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
  InAccessor in_ptr;
  OutAccessor out_ptr;
  const ScanParameters<Index> scanParameters;
  Op accumulator;
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
                                                                    OutAccessor out_accessor_,
                                                                    const ScanParameters<Index> scanParameters_,
                                                                    Op accumulator_)
      : in_ptr(in_accessor_), out_ptr(out_accessor_), scanParameters(scanParameters_), accumulator(accumulator_) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;

      // the actual panel size is scan_size * non_scan_size.
      // elements_per_panel is rounded up to a power of 2 for the binary tree
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      // This only takes effect when the size is bigger than elements_per_block
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;

      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
      const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
      CoeffReturnType adjust_val = in_ptr[in_id];

      Index next_elements = 0;
      EIGEN_UNROLL_LOOP
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
             scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
          CoeffReturnType accum = adjust_val;
          accumulator.reduce(out_ptr[global_id], &accum);
          out_ptr[global_id] = accumulator.finalize(accum);
        }
        next_elements += scanParameters.scan_stride;
      }
    }
  }
};

template <typename Index>
struct ScanInfo {
  const Index &total_size;
  const Index &scan_size;
  const Index &panel_size;
  const Index &non_scan_size;
  const Index &scan_stride;
  const Index &non_scan_stride;

  Index max_elements_per_block;
  Index block_size;
  Index panel_threads;
  Index group_threads;
  Index block_threads;
  Index elements_per_group;
  Index elements_per_block;
  Index loop_range;
  Index global_range;
  Index local_range;
  const Eigen::SyclDevice &dev;
  EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
                               const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
                               const Eigen::SyclDevice &dev_)
      : total_size(total_size_),
        scan_size(scan_size_),
        panel_size(panel_size_),
        non_scan_size(non_scan_size_),
        scan_stride(scan_stride_),
        non_scan_stride(non_scan_stride_),
        dev(dev_) {
    // must be power of 2
    local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
                           Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));

    max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;

    elements_per_group =
        dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
    const Index elements_per_panel = elements_per_group * non_scan_size;
    elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
    panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
    group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
    block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
    block_size = elements_per_group / elements_per_block;
#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
    const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
#else
    const Index max_threads = panel_threads * panel_size;
#endif
    global_range = roundUp(max_threads, local_range);
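    // Illustrative sizing, assuming ScanPerThread == 8 and local_range == 256:
    // scan_size = 1000 rounds up to elements_per_group = 1024, which fits in
    // max_elements_per_block = 2048, so elements_per_block = 1024 and
    // block_size = 1 (the adjustment kernel is not needed in that case).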
    loop_range = Index(
        std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
  }
  inline ScanParameters<Index> get_scan_parameter() {
    return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
                                 group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
  }
  inline cl::sycl::nd_range<1> get_thread_range() {
    return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
  }
};

template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
struct SYCLAdjustBlockOffset {
  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
                                                           Reducer &accumulator, const Index total_size,
                                                           const Index scan_size, const Index panel_size,
                                                           const Index non_scan_size, const Index scan_stride,
                                                           const Index non_scan_stride, const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);

    typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
        AdjustFunctor;
    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFunctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
                                                                       scan_info.max_elements_per_block,
                                                                       scan_info.get_scan_parameter(), accumulator)
        .wait();
  }
};

template <typename CoeffReturnType, scan_step stp>
struct ScanLauncher_impl {
  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
                                             const Index total_size, const Index scan_size, const Index panel_size,
                                             const Index non_scan_size, const Index scan_stride,
                                             const Index non_scan_stride, const bool inclusive,
                                             const Eigen::SyclDevice &dev) {
    auto scan_info =
        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
    const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
    CoeffReturnType *temp_pointer =
        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);

    typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
           in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
           scan_info.get_scan_parameter(), accumulator, inclusive)
        .wait();

    if (scan_info.block_size > 1) {
      ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
          non_scan_size, Index(1), scan_info.block_size, false, dev);

      SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
          non_scan_stride, dev);
    }
    dev.deallocate_temp(temp_pointer);
  }
};

}  // namespace internal
}  // namespace TensorSycl
namespace internal {
template <typename Self, typename Reducer, bool vectorize>
struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
  typedef typename Self::Index Index;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  void operator()(Self &self, EvaluatorPointerType data) const {
    const Index total_size = internal::array_prod(self.dimensions());
    const Index scan_size = self.size();
    const Index scan_stride = self.stride();
    // this is the scan op (can be sum or ...)
    auto accumulator = self.accumulator();
    auto inclusive = !self.exclusive();
    auto consume_dim = self.consume_dim();
    auto dev = self.device();

    auto dims = self.inner().dimensions();

    Index non_scan_size = 1;
    Index panel_size = 1;
    if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < consume_dim; i++) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim + 1; i < Self::NumDims; i++) {
        panel_size *= dims[i];
      }
    } else {
      for (int i = Self::NumDims - 1; i > consume_dim; i--) {
        non_scan_size *= dims[i];
      }
      for (int i = consume_dim - 1; i >= 0; i--) {
        panel_size *= dims[i];
      }
    }
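    // If the scan dimension is strided, the contiguous dimension belongs to the
    // non-scan extent (stride 1); if the scan runs along the contiguous
    // dimension, consecutive non-scan elements are scan_size apart.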
    const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
    auto eval_impl = self.inner();
    TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
        eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
        inclusive, dev);
  }
};
}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP