TensorContractionSycl.h
1 // This file is part of Eigen, a lightweight C++ template library for linear algebra.
2 //
3 // Mehdi Goli Codeplay Software Ltd.
4 // Ralph Potter Codeplay Software Ltd.
5 // Luke Iwanski Codeplay Software Ltd.
6 // Contact: <eigen@codeplay.com>
7 //
8 // This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not
9 // distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 /*****************************************************************
12  * TensorContractionSycl.h
13  *
14  * \brief:
15  * TensorContractionSycl.h provides various tensor contraction kernels for the SYCL backend
16  *
17  *****************************************************************/
18 
19 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
20 #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
21 
22 // IWYU pragma: private
23 #include "./InternalHeaderCheck.h"
24 
25 namespace Eigen {
26 
27 namespace TensorSycl {
28 namespace internal {
29 
30 #ifndef EIGEN_SYCL_DISABLE_GEMV
45 template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
46 struct TVPanelSize {
47  // LocalThreadSizeC: determines the total number of threads per workgroup for the contracting dimension
48  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
49  // LocalThreadSizeNC: determines the total number of threads per workgroup for the non-contracting dimension
50  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
51  // TileSizeDimNC: determines the tile size for the non-contracting dimension
52  static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
53  // TileSizeDimC: determines the tile size for the contracting dimension
55  // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
57  // WorkLoadPerThreadC: determines workload per thread for loading the contracting dimension
59  // BC : determines whether bank-conflict support is required
60  static EIGEN_CONSTEXPR bool BC = false;
61 };
62 #endif
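// Illustrative example (assumes the default EIGEN_SYCL_LOCAL_THREAD_DIM0/1 of 16; the instantiation below is
// hypothetical): TVPanelSize<float, int, /*NCWindow=*/64, /*CFactor=*/4, /*NCFactor=*/4> resolves to
// LocalThreadSizeC == 16, LocalThreadSizeNC == 16 and TileSizeDimNC == NCWindow / NCFactor == 64 / 4 == 16.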
63 
81 template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
82 struct TTPanelSize {
83  // TileSizeDimK: determines the tile size for dimension K. The packet size is assumed to be taken into account
84  static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK;
85  // WorkLoadPerThreadM : determines workload per thread for loading the M dimension. This can be varied based on
86  // the available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_M macro)
87 #ifndef EIGEN_SYCL_REG_M
88  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
89 #else
90  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
91 #endif
92 // WorkLoadPerThreadN : determines workload per thread for loading the N dimension. This can be varied based on
93 // the available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_N macro)
94 #ifndef EIGEN_SYCL_REG_N
95  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
96 #else
97  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
98 #endif
99  // LocalThreadSizeM: determines the total number of threads per workgroup for the m dimension
100  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
101  // LocalThreadSizeN: determines the total number of threads per workgroup for the n dimension
102  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
103  // TileSizeDimM: determines the tile size for the m dimension
105  // TileSizeDimN: determines the tile size for the n dimension
107  // LoadPerThreadLhs: determines the workload per thread for loading the Lhs tensor. This must be divisible by the packet size
108  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs =
110  // LoadPerThreadRhs: determines the workload per thread for loading the Rhs tensor. This must be divisible by the packet size
111  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs =
113  // BC : determines whether bank-conflict support is required
114  static EIGEN_CONSTEXPR bool BC = true;
115  // DoubleBuffer: determines whether the double buffering technique should be used (this can be disabled by the
116  // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory)
118 #ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
119  false;
120 #else
121  true;
122 #endif
123 };
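// Illustrative example (hypothetical build flags): the register workloads fall back to the template arguments
// unless overridden at compile time. Building with
//   -DEIGEN_SYCL_REG_M=2 -DEIGEN_SYCL_REG_N=8
// makes any TTPanelSize<Scalar, StorageIndex, REG_SIZE_M, REG_SIZE_N, TSDK> report WorkLoadPerThreadM == 2 and
// WorkLoadPerThreadN == 8 regardless of REG_SIZE_M / REG_SIZE_N, while TileSizeDimK stays equal to TSDK.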
124 
125 /* !
126  * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to
127  * specialize the contraction algorithm based on device support for dedicated local memory.
128  */
130 /* !
131  * \brief data_source: an enum class determining the location of the data in a memory hierarchy (global, local, private).
132  */
134 
160 template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper,
161  typename StorageIndex>
162 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<PacketLoad, PacketType> read(
163  const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) {
164  const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex;
165  const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex;
166  return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));
167 }
168 
191 template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex>
192 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!PacketLoad, PacketType> read(
193  const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) {
194  const StorageIndex row = (IsRhs) ? CIndex : NCIndex;
195  const StorageIndex col = (IsRhs) ? NCIndex : CIndex;
196  return tensorMapper(row, col);
197 }
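// Illustrative note (sketch, assuming a coalesced, non-transposed LHS of size M x K with ld == M): the packet
// overload above loads the packet at the linear index NCIndex + CIndex * M, while the scalar overload evaluates
// tensorMapper(NCIndex, CIndex) for the LHS and tensorMapper(CIndex, NCIndex) for the RHS.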
198 
220 template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
221 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<dt != data_source::global_mem, void> write(
222  PacketType &packet_data, DataScalar ptr) {
225  for (int i = 0; i < PacketSize; i++) {
227  ptr += ld;
228  }
229 }
230 
246 template <data_source dt, typename PacketType, typename DataScalar>
249  void>
250  write(PacketType &packet_data, DataScalar *ptr) {
251  ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data);
252 }
253 
267 template <data_source dt, typename PacketType, typename DataScalar>
270  void>
271  write(PacketType &packet_data, DataScalar *ptr) {
272  *ptr = packet_data;
273 }
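// Illustrative note (rough summary): the write() overloads above cover the destinations used by the kernels.
// The first scalarises a packet and stores its PacketSize elements ld apart (local/private tiles), the second
// stores a whole packet contiguously with pstoreu, and the third stores a single scalar; the enable_if
// conditions select the overload from the data_source template parameter and the packet size.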
274 
280 template <bool is_internal>
282  return true;
283 }
284 
290 template <>
292  return cond;
293 }
294 
321 template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
323  static EIGEN_CONSTEXPR bool packet_load = packet_load_;
325  static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
326  typedef std::conditional_t<packet_load, PacketType, OutScalar> OutType;
328  static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
331 };
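// Illustrative note: is_coalesced_layout == !(is_transposed ^ is_rhs), so a non-transposed LHS (false, false)
// and a transposed RHS (true, true) are both treated as coalesced, while a transposed LHS or a non-transposed
// RHS is not.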
332 
372 template <typename StorageIndex>
374  const StorageIndex linearLocalThreadId;
375  const StorageIndex kGroupId;
376  const StorageIndex mGroupOffset;
377  const StorageIndex nGroupOffset;
378  const StorageIndex kGroupOffset;
379  const StorageIndex mLocalOffset;
380  const StorageIndex nLocalOffset;
381  const StorageIndex mGlobalOffset;
382  const StorageIndex nGlobalOffset;
383  StorageIndex kSize;
384  const bool is_internal;
385  // this is used to adjust the last block
387  const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
388  const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
389  const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
390  StorageIndex kSize_, const bool is_internal_)
391  : linearLocalThreadId(linearLocalThreadId_),
392  kGroupId(kGroupId_),
393  mGroupOffset(mGroupOffset_),
394  nGroupOffset(nGroupOffset_),
395  kGroupOffset(kGroupOffset_),
396  mLocalOffset(mLocalOffset_),
397  nLocalOffset(nLocalOffset_),
398  mGlobalOffset(mGlobalOffset_),
399  nGlobalOffset(nGlobalOffset_),
400  kSize(kSize_),
401  is_internal(is_internal_) {}
402 };
403 
454 template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
455  typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable,
456  typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp>
458  public:
459  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
462  Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
467 
468  typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
471 
472  typedef BlockProperties<is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix && Vectorizable,
475 
476  static EIGEN_CONSTEXPR StorageIndex NStride =
477  contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
478 
479  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
480  typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
481  typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
482  typedef std::conditional_t<contraction_tp == contraction_type::local, local_ptr, private_ptr> tile_ptr;
483  static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local
484  ? Properties::TileSizeDimM + Properties::BC
485  : Properties::WorkLoadPerThreadM;
486  static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local
487  ? Properties::TileSizeDimN + Properties::BC
488  : Properties::WorkLoadPerThreadN;
489  static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
490 
503  template <contraction_type, StorageIndex>
504  struct MemHolder {
506  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {}
507  };
511  template <StorageIndex MemSize>
512  struct MemHolder<contraction_type::no_local, MemSize> {
513  OutScalar ptr[MemSize] = {OutScalar{0}};
514  };
537  struct TiledMemory {
542  const std::pair<StorageIndex, StorageIndex> lhs_extract_index;
543  const std::pair<StorageIndex, StorageIndex> rhs_extract_index;
544  template <contraction_type tp = contraction_tp>
546  std::enable_if_t<tp == contraction_type::no_local> * = 0)
551  lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})),
552  rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {}
553 
554  template <contraction_type tp = contraction_tp>
556  local_ptr block_start_ptr,
557  std::enable_if_t<tp == contraction_type::local> * = 0)
558  : lhs_scratch_extract{block_start_ptr},
560  ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)},
561  lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset),
562  rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset),
564  local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)),
566  local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {}
567  };
568 
570  const LhsMapper lhs;
571  const RhsMapper rhs;
572  OutAccessor out_res;
573  const StorageIndex groupSizeM;
574  const StorageIndex groupSizeN;
575  const StorageIndex numTiles;
576  const TripleDim triple_dim;
577 
579  const RhsMapper rhs_, OutAccessor out_res_,
580  const StorageIndex groupSizeM_,
581  const StorageIndex groupSizeN_,
582  const StorageIndex numTiles_,
583  const TripleDim triple_dim_)
584  : scratch(scratch_),
585  lhs(lhs_),
586  rhs(rhs_),
587  out_res(out_res_),
588  groupSizeM(groupSizeM_),
589  groupSizeN(groupSizeN_),
590  numTiles(numTiles_),
591  triple_dim(triple_dim_) {}
592 
594  const RhsMapper rhs_, OutAccessor out_res_,
595  const StorageIndex groupSizeM_,
596  const StorageIndex numTiles_,
597  const TripleDim triple_dim_)
598  : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
599 
600  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
601  const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
602  const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
603  const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
604  const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
605  const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
606  const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
607  const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
608  const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
609  const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
610  const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
611  const StorageIndex nLocalOffset = NStride * nLocalThreadId;
612  const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
613  const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
614 
615  const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
616  StorageIndex kGroupOffset = kGroupId * kSizePerWG;
617  const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
618  triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
619  triple_dim.K - kGroupOffset >= kSizePerWG;
620  // this is used to adjust the last block
621  StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
622  // This is used to find out the last K offset so that kGroupOffset - kSize can compute the k offset for loading the
623  // tile
624  kGroupOffset += kSize;
625 
626  auto thread_properties =
627  ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
628  mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
629 
630  auto out_ptr = out_res + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
631 
632  (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
633  : compute_panel<false>(itemID, thread_properties, out_ptr);
634  }
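// Worked example of the group-id decomposition above (hypothetical sizes): with groupSizeM == 4, groupSizeN == 3
// and IsFinal == false, a work-group with itemID.get_group(0) == 29 yields
//   mGroupId = 29 % 4 = 1,  tmp = 29 / 4 = 7,  nGroupId = 7 % 3 = 1,  kGroupId = 7 / 3 = 2,
// i.e. the second tile in both M and N and the third K slice.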
635  // The compute block computes the contraction for each thread's private block and stores the result in the
636  // privateRes memory of each computation. The compute block function is independent of the local and no-local
637  // concepts, as it only computes the block in each thread's private memory space.
638  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
639  PacketReturnType *privateRes) const {
640  StorageIndex idx = 0;
641  EIGEN_CONSTEXPR StorageIndex lhs_stride =
642  contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
644  for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
645  auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)};
646  StorageIndex lhs_index = 0;
648  for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
649  PacketReturnType lhsPack{};
651  lhs_block_ptr + lhs_index);
652  privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]);
653 
654  lhs_index += lhs_stride;
655  idx++;
656  }
657  }
658  }
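// Illustrative note: compute_block_per_tile is the register-level micro-kernel. For each of the
// WorkLoadPerThreadN rhs values it broadcasts the value into a packet and accumulates
// WorkLoadPerThreadM / PacketSize packet pmadd operations into privateRes, i.e. one call performs
// (WorkLoadPerThreadM * WorkLoadPerThreadN) / PacketSize packet fused multiply-adds per thread for a single k index.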
659  // The store function writes the contraction result computed in each thread's private memory to global
660  // memory. The store function is independent of the local and no-local concepts so that it can be abstracted out in
661  // the base class.
662  template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
663  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
664  StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const {
665  auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
666  return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
667  };
668  // when local memory is not used, M and N are both accessed in a coalesced way. However, when local memory is
669  // available, the K*N block is transposed in local memory to N*K; therefore, each block operates on a
670  // blockId * WorkLoadPerThreadN slice of N
671  EIGEN_CONSTEXPR StorageIndex GlobalNStride =
672  contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
674  for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
675  // output leading dimension
676  StorageIndex outputLD = 0;
677  // When local memory is used the PrivateNStride is always 1, because the coalesced access on N is loaded into local
678  // memory and extracting from local to global is the same as in the non-transposed version. However, when local
679  // memory is not used and RHS is transposed, we packetize the load for RHS.
681  for (StorageIndex nId = 0; nId < PrivateNStride; nId++) {
682  StorageIndex globalRow = mGlobalOffset;
684  for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
685  PacketReturnType privetOut = privateRes[wLPTM];
686  if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) {
687  // Store the final results in C. The C matrix always has M as the first StorageIndex and N as the second
688  // StorageIndex; therefore it always has a coalesced layout
689  write<data_source::global_mem>(privetOut, out_ptr + outputLD + globalRow);
690  } else {
692  for (StorageIndex mId = 0; mId < PacketSize; mId++) {
693  StorageIndex mOffset = globalRow + mId;
694  if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) {
695  out_ptr[mOffset + outputLD] =
697  }
698  }
699  }
700  globalRow += (PacketSize * Properties::LocalThreadSizeM);
701  }
702  outputLD += triple_dim.M;
703  privateRes += Properties::WorkLoadPerThreadM / PacketSize;
704  }
705  out_ptr += (GlobalNStride * outputLD);
706 
707  nGlobalOffset += (PrivateNStride * GlobalNStride);
708  }
709  }
710  // when no local memory is used the following extract_block will be enabled
711  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg,
712  contraction_type contract_tp = contraction_tp>
713  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<contract_tp == contraction_type::no_local> extract_block(
714  const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
715  const StorageIndex &ncOffset, const StorageIndex cOffset) const {
716  EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
717  InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
718  EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
719  InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
720  const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
721 
722  auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
723  return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
724  (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
725  };
726  const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
727  StorageIndex cIndex = cOffset;
728 
730  for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) {
731  StorageIndex ncIndex = ncOffset;
733  for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) {
734  if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) {
735  auto val =
736  read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
737  InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld);
738 
739  write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
741  } else {
743  for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
744  const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
745  const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
746  OutScalar val =
747  (ncInd < NC && cInd < triple_dim.K)
748  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
749  inpt, ncInd, cInd, ld)
750  : OutScalar(0);
751  write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
753  val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) +
754  ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC));
755  }
756  }
757 
758  // if it is the lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So
759  // even if M is not accessed in a coalesced mode, we have to load packet_size elements of m per thread.
760  ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1)
761  ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
762  : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
763  private_ptr += InputBlockProperties::nc_stride;
764  }
765  // the previous for loop (private_ptr += (ncId * nc_stride)) has already advanced the pointer by one WorkLoadPerThreadNC
766  private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
767  cIndex += InputBlockProperties::c_stride;
768  }
769  }
770  template <typename InputBlockProperties, StorageIndex TileSizeDimNC>
771  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract(
772  const StorageIndex &linearLocalThreadId) {
773  const StorageIndex localThreadNC =
774  (InputBlockProperties::is_coalesced_layout)
775  ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
776  : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
777  const StorageIndex localThreadC =
778  (InputBlockProperties::is_coalesced_layout)
779  ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
780  : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
781  return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC);
782  }
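// Worked example of local_id_extract (hypothetical sizes): for a coalesced layout with TileSizeDimNC == 64 and
// nc_stride == 4 (16 threads along the non-contracting dimension), linearLocalThreadId == 37 maps to
// localThreadNC = 37 % 16 = 5 and localThreadC = 37 / 16 = 2.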
783 
784  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
785  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<db && ctp == contraction_type::local> sync_mem(
786  const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
787  db_offset = !db_offset;
788  }
789 
790  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
791  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!db && ctp == contraction_type::local> sync_mem(
792  const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
793  itemID.barrier(cl::sycl::access::fence_space::local_space);
794  }
795 
796  template <contraction_type ctp = contraction_tp>
797  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<ctp == contraction_type::no_local> sync_mem(
798  const cl::sycl::nd_item<1> &, bool &) noexcept {
799  return;
800  }
801 
802  template <bool need_sync, contraction_type ctp = contraction_tp>
803  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<need_sync && ctp == contraction_type::no_local>
804  sync_thread(const cl::sycl::nd_item<1> &
805 #ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
806  itemID
807 #endif
808  ) noexcept {
809 #ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
810  itemID.barrier(cl::sycl::access::fence_space::local_space);
811 #else
812  return;
813 #endif
814  }
815  template <bool need_sync, contraction_type ctp = contraction_tp>
816  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<need_sync && ctp == contraction_type::local>
817  sync_thread(const cl::sycl::nd_item<1> &itemID) {
818  itemID.barrier(cl::sycl::access::fence_space::local_space);
819  }
820  template <bool need_sync>
821  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!need_sync> sync_thread(const cl::sycl::nd_item<1> &) {
822  return;
823  }
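// Illustrative summary of the synchronisation helpers above:
//   sync_mem   : with local memory and double buffering it only flips db_offset; with local memory and no
//                double buffering it issues a local barrier; without local memory it is a no-op.
//   sync_thread: a local barrier when local memory is used (for the no-local path only when
//                EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION is defined); otherwise a no-op.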
824 
825  template <bool is_internal_block>
826  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
827  ThreadProperties<StorageIndex> &thread_properties,
828  TiledMemory &tiled_input_block,
829  PacketReturnType *privateRes,
830  bool &db_offset) const {
831  // Tiling the Rhs block from global to local memory
832  extract_block<RHSBlockProperties, is_internal_block>(
833  rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
834  tiled_input_block.rhs_extract_index,
835  contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
836  thread_properties.kGroupOffset - thread_properties.kSize);
837 
838  sync_thread<contraction_tp == contraction_type::no_local>(itemID);
839 
840  // Tiling the Lhs block from global to local memory
841  extract_block<LHSBlockProperties, is_internal_block>(
842  lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
843  tiled_input_block.lhs_extract_index,
844  contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset,
845  thread_properties.kGroupOffset - thread_properties.kSize);
846 
847  // itemID.barrier(cl::sycl::access::fence_space::local_space);
848  sync_thread<contraction_tp == contraction_type::local>(itemID);
849  // switch to compute mode
850  StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK);
851  StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR);
852  // Loop over the values of a single tile
853  for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) {
854  compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset,
855  tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes);
856  lhs_offset += LSDL;
857  rhs_offset += LSDR;
858  }
859  // computing the K index for the next tile
860  thread_properties.kSize -= Properties::TileSizeDimK;
861  sync_mem(itemID, db_offset);
862  }
863 
864  // when local memory is available the following compute_panel will be enabled
865  template <bool is_internal_block, typename OutPtr>
866  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
867  ThreadProperties<StorageIndex> &thread_properties,
868  OutPtr out_ptr) const {
869  auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
870  // Allocate register space
871  PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
872  PacketReturnType{0}};
873  bool db_offset = 0;
874 
875  while (thread_properties.kSize >= Properties::TileSizeDimK) {
876  compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
877  }
878  if (thread_properties.kSize > 0) {
879  compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
880  }
881 
882  // Storing the final results in the output
883  store<is_internal_block,
884  contraction_tp == contraction_type::local ? static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>(
885  out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset,
886  thread_properties.nGlobalOffset);
887  }
888  // When local memory is available the following extract_block will be enabled
889  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local,
890  contraction_type contract_tp = contraction_tp>
891  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<contract_tp == contraction_type::local> extract_block(
892  const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex> &local_index,
893  const StorageIndex &ncOffset, const StorageIndex cOffset) const {
894  EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
895  InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
896  EIGEN_CONSTEXPR StorageIndex LoadPerThread =
897  InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
898  EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
899  static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
900  (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
901  " LocalOffset must be divisible by stride");
902  const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
903  StorageIndex localThreadNC = local_index.first;
904  StorageIndex localThreadC = local_index.second;
905  auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
906  return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
907  (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
908  };
910  for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) {
911  const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC);
912  const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC);
913  const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
914  if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) {
915  auto val =
916  read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
917  InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld);
918  write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
919  val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
920  (InputBlockProperties::c_stride * localThreadC * LSD));
921  } else {
923  for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
924  const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
925  const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
926  OutScalar val =
927  (nCInd < NC && cInd < triple_dim.K)
928  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
929  inpt, nCInd, cInd, ld)
930  : OutScalar(0);
931 
932  write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
933  val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
934  (InputBlockProperties::is_coalesced_layout ? i : 0) +
935  ((InputBlockProperties::c_stride * localThreadC +
936  (InputBlockProperties::is_coalesced_layout ? 0 : i)) *
937  LSD));
938  }
939  }
940  localThreadNC += (InputBlockProperties::is_coalesced_layout)
941  ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride)
942  : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
943  localThreadC += (InputBlockProperties::is_coalesced_layout)
944  ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride)
945  : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
946  }
947  }
948 };
949 
950 #ifndef EIGEN_SYCL_DISABLE_GEMV
951 
993 template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex,
994  typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal>
996  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
999  Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
1000  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
1001 
1002  static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
1003  KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
1004 
1005  // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make
1006  // sure that the !^ is true. When RHS is a vector, we pass true and true to make sure that the !^ is true.
1009 
1011  const VectorMapper vec;
1012  const TensorMapper mat;
1013  OutAccessor out_res;
1014  const StorageIndex nonContractGroupSize;
1015  const StorageIndex nonContractDim;
1016  const StorageIndex contractDim;
1017 
1019  const TensorMapper mat_, OutAccessor out_res_,
1020  const StorageIndex nonContractGroupSize_,
1021  const StorageIndex nonContractDim_,
1022  const StorageIndex contractDim_)
1023  : scratch(scratch_),
1024  vec(vec_),
1025  mat(mat_),
1026  out_res(out_res_),
1027  nonContractGroupSize(nonContractGroupSize_),
1028  nonContractDim(nonContractDim_),
1029  contractDim(contractDim_) {}
1030 
1031  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
1032  auto scratch_ptr = scratch.get_pointer();
1033  const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
1034  StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
1035  : linearLocalThreadId % Properties::LocalThreadSizeNC;
1036  StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC
1037  : linearLocalThreadId / Properties::LocalThreadSizeNC;
1038  const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize;
1039  const StorageIndex nonContractGroupId =
1040  is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
1041  const StorageIndex contractGroupId =
1042  is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
1043  auto out_ptr = out_res + (IsFinal ? 0 : contractGroupId * nonContractDim);
1044 
1045  const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
1046  const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
1047  auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
1048  const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId;
1049  const StorageIndex globalContractDimOffset = contractGroupOffset + contractId;
1050  auto local_output = scratch_ptr + OutScratchOffset;
1051  const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC &&
1052  contractDim - contractGroupOffset >= Properties::TileSizeDimC;
1053  is_internal
1054  ? compute_panel<true>(itemID, vec, mat, local_output, out_ptr,
1056  scratch_ptr, contractGroupOffset,
1057 #endif
1058  nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
1059  nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex)
1060  : compute_panel<false>(itemID, vec, mat, local_output, out_ptr,
1062  scratch_ptr, contractGroupOffset,
1063 #endif
1064  nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
1065  nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex);
1066  }
1067  template <bool is_internal_block, typename OutPtr>
1069  const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output,
1070  OutPtr out_ptr,
1072  OutScalar *scratch_ptr, const StorageIndex contractGroupOffset,
1073 #endif
1074  const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim,
1075  StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId,
1076  StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) {
1077  OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)};
1078  // Reading the vector
1079 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1080  const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId;
1081  extract_block<VecBlockProperties, is_internal_block, KFactor,
1082  Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId,
1083  vectorOffset, contractDim);
1084 
1085  itemID.barrier(cl::sycl::access::fence_space::local_space);
1086  auto in_scratch_ptr = scratch_ptr + contractId;
1087 #endif
1088 
1089  StorageIndex privateOffsetC = 0;
1091  for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) {
1092  StorageIndex privateOffsetNC = 0;
1093  bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim);
1094 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1095  auto vecScalar = *in_scratch_ptr;
1096 #else
1097  auto vecScalar = (check_boundary<is_internal_block>(contract_conds))
1098  ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC,
1099  is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0))
1100  : OutScalar(0);
1101 #endif
1103  for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1104  auto matScalar = (check_boundary<is_internal_block>(
1105  contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim)))
1106  ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC
1107  : globalNonContractDimOffset + privateOffsetNC,
1108  is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC
1109  : globalContractDimOffset + privateOffsetC)
1110  : OutScalar(0);
1111 
1112  outScalar[j] = ::Eigen::internal::pmadd(matScalar, vecScalar, outScalar[j]);
1113  privateOffsetNC += Properties::LocalThreadSizeNC;
1114  }
1115  privateOffsetC += Properties::LocalThreadSizeC;
1116 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1117  in_scratch_ptr += Properties::LocalThreadSizeC;
1118 #endif
1119  }
1120 
1121  auto out_scratch_ptr = local_output + outScratchIndex;
1122  // Each block of 16*16 elements in shared memory should reduce to 16*1
1124  for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1125  *out_scratch_ptr = outScalar[j];
1126 
1127  out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1128  }
1129  if (is_lhs_vec) {
1130  nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
1131  contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
1132  outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
1133  }
1134 
1135  out_scratch_ptr = local_output + outScratchIndex;
1137  for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1139  for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
1140  itemID.barrier(cl::sycl::access::fence_space::local_space);
1141  if (contractId < offset) {
1142  StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset);
1143  *out_scratch_ptr += out_scratch_ptr[myNeigbourId];
1144  }
1145  }
1146  // moving to the next 16 by 16 block
1147  out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1148  }
1149 
1150  if (contractId == 0) {
1151  out_scratch_ptr = local_output + nonContractId;
1152  StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
1153  out_ptr += global_final_offset;
1155  for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1156  if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) {
1157  auto res = *out_scratch_ptr;
1158 
1159  *out_ptr = res;
1160  out_ptr += Properties::LocalThreadSizeNC;
1161  }
1162  // moving to the next 16 by 16 block to get the next 16 reduced elements
1163  out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1164  if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
1165  }
1166  }
1167  }
1168 
1169  template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input,
1170  typename Local>
1171  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr,
1172  const StorageIndex &linearLocalThreadId,
1173  const StorageIndex &cOffset, const StorageIndex &C) {
1174  local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
1175  StorageIndex cIndex = cOffset;
1176  for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
1177  if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) {
1178  auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
1179  InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
1180  cIndex, StorageIndex(1));
1181  write<StorageIndex, 1, data_source::local_mem>(val, local_ptr);
1182  } else {
1184  for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
1185  OutScalar val =
1186  (cIndex + i < C)
1187  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
1188  inpt, StorageIndex(0), cIndex + i, StorageIndex(1))
1189  : OutScalar(0);
1190  write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i);
1191  }
1192  }
1193  local_ptr += InputBlockProperties::c_stride * GroupSize;
1194  cIndex += InputBlockProperties::c_stride * GroupSize;
1195  }
1196  }
1197 };
1198 #endif
1199 
1200 #ifndef EIGEN_SYCL_DISABLE_SCALAR
1201 
1233 template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
1234  typename RhsMapper, typename StorageIndex, bool Vectorizable>
1236  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
1238  const LhsMapper lhs;
1239  const RhsMapper rhs;
1240  OutAccessor out_res;
1241  const StorageIndex rng;
1242 
1243  EIGEN_DEVICE_FUNC GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_,
1244  OutAccessor out_res_, const StorageIndex rng_)
1245  : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
1246 
1247  EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const {
1248  auto out_ptr = out_res;
1249  OutScalar *scratch_ptr = scratch.get_pointer();
1250 
1251  StorageIndex globalid = itemID.get_global_id(0);
1252  StorageIndex localid = itemID.get_local_id(0);
1253  OutScalar accumulator = OutScalar(0);
1254  for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
1255  accumulator = Eigen::internal::pmadd(lhs(0, i), rhs(i, 0), accumulator);
1256  }
1257  auto out_scratch_ptr = scratch_ptr + localid;
1258  *out_scratch_ptr = accumulator;
1259  for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
1260  itemID.barrier(cl::sycl::access::fence_space::local_space);
1261  if (localid < offset) {
1262  *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
1263  }
1264  }
1265  if (localid == 0) {
1266  out_ptr[itemID.get_group(0)] = accumulator;
1267  }
1268  }
1269 };
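// Illustrative note: the loop above is a power-of-two tree reduction in local memory. With a hypothetical
// work-group of 8 threads holding partial sums s0..s7, the offsets 4, 2, 1 produce s0+s4, s1+s5, ... and
// finally thread 0 holds s0+...+s7, which it writes to out_ptr[itemID.get_group(0)]. When more than one
// work-group is launched, the per-group results are reduced again by a second kernel (see launchSC below).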
1270 #endif
1271 
1272 } // namespace internal
1273 } // namespace TensorSycl
1274 
1275 template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
1277  Eigen::SyclDevice>
1278  : public TensorContractionEvaluatorBase<TensorEvaluator<
1279  const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> {
1281  "SYCL tensor contraction does not support output kernels.");
1282 
1283  typedef Eigen::SyclDevice Device;
1284 
1288  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
1289  typedef typename XprType::Index StorageIndex;
1292  typedef typename Base::Storage Storage;
1294  struct TripleDim {
1298  TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {}
1299  };
1300  enum {
1302  BlockAccess = false,
1303  };
1304 
1306  static constexpr int LDims = Base::LDims;
1307  static constexpr int RDims = Base::RDims;
1308  static constexpr int ContractDims = Base::ContractDims;
1309 
1312 
1314  typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t;
1315  typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t;
1316 
1317  static constexpr int NumDims = LDims + RDims - 2 * ContractDims;
1318 
1320 
1323  typedef std::remove_const_t<typename LeftEvaluator::CoeffReturnType> LhsScalar;
1324  typedef std::remove_const_t<typename RightEvaluator::CoeffReturnType> RhsScalar;
1325 
1328 
1329  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
1330  struct input_mapper_propertis {
1331  static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
1332  static EIGEN_CONSTEXPR bool is_rhs_matrix =
1333  (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
1334  };
1335 
1336  TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {}
1337 
1338  // We need to redefine this method to make nvcc happy
1340  this->m_leftImpl.evalSubExprsIfNeeded(NULL);
1341  this->m_rightImpl.evalSubExprsIfNeeded(NULL);
1342  if (!data) {
1343  this->m_result = this->m_device.get(
1344  static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar))));
1345  data = this->m_result;
1346  }
1347  evalToSycl(data);
1348  return (this->m_result != NULL);
1349  }
1350  const Eigen::SyclDevice &device() const { return this->m_device; }
1351  void evalToSycl(typename Base::EvaluatorPointerType buffer) const {
1352  if (this->m_lhs_inner_dim_contiguous) {
1353  if (this->m_rhs_inner_dim_contiguous) {
1354  if (this->m_rhs_inner_dim_reordered) {
1355  evalTyped<true, true, true, Unaligned>(buffer);
1356  } else {
1357  evalTyped<true, true, false, Unaligned>(buffer);
1358  }
1359  } else {
1360  if (this->m_rhs_inner_dim_reordered) {
1361  evalTyped<true, false, true, Unaligned>(buffer);
1362  } else {
1363  evalTyped<true, false, false, Unaligned>(buffer);
1364  }
1365  }
1366  } else {
1367  if (this->m_rhs_inner_dim_contiguous) {
1368  if (this->m_rhs_inner_dim_reordered) {
1369  evalTyped<false, true, true, Unaligned>(buffer);
1370  } else {
1371  evalTyped<false, true, false, Unaligned>(buffer);
1372  }
1373  } else {
1374  if (this->m_rhs_inner_dim_reordered) {
1375  evalTyped<false, false, true, Unaligned>(buffer);
1376  } else {
1377  evalTyped<false, false, false, Unaligned>(buffer);
1378  }
1379  }
1380  }
1381  }
1382 
1383  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
1384  void evalTyped(typename Base::EvaluatorPointerType buffer) const {
1385  const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
1388  PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakePointer>
1389  LhsMapper;
1390 
1393  PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
1394  rhs_inner_dim_reordered, Unaligned, MakePointer>
1395  RhsMapper;
1396 
1397  // initialize data mappers
1398  LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
1399  this->m_left_contracting_strides, this->m_k_strides);
1400 
1401  RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
1402  this->m_right_contracting_strides, this->m_k_strides);
1403 
1404 #ifndef EIGEN_SYCL_DISABLE_SCALAR
1405  if (triple_dim.M == 1 && triple_dim.N == 1) {
1406  launchSC(buffer, lhs, rhs, triple_dim.K);
1407  } else
1408 #endif
1409 #ifndef EIGEN_SYCL_DISABLE_GEMV
1410  if (triple_dim.M != 1 && triple_dim.N == 1) {
1411  LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K);
1412  } else if (triple_dim.M == 1 && triple_dim.N != 1) {
1413  LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K);
1414  } else // This is equivalent to if (m != 1 && n != 1)
1415 #endif
1416  {
1417  typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>
1418  inpt_mapper_properties;
1419 #ifndef EIGEN_SYCL_DISABLE_SKINNY
1420  bool skinny = false;
1421  auto platform_name = this->device().getPlatformName();
1422  // This is based on empirical calculation for AMD r9-nano and Fiji
1423  if (platform_name.find("AMD") == 0) {
1424  skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) &&
1425  ((triple_dim.M < 1024 && triple_dim.N < 1024) ||
1426  (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K)));
1427  } else {
1428  skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) ||
1429  ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) ||
1430  ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100));
1431  }
1432  if (skinny)
1433  adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
1434  else
1435 #endif // EIGEN_SYCL_DISABLE_SKINNY
1436  adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
1437  }
1438  }
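// Worked example of the skinny heuristic above (hypothetical sizes, non-AMD platform): for M == 32, N == 32,
// K == 8192, max(K, N) / min(K, N) == 8192 / 32 == 256 > 100, so the contraction is dispatched with
// skinny == true, allowing the K dimension to be split across work-groups (groupSizeK > 1 in launchTT below).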
1439 
1440  template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper>
1441  void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
1442  const TripleDim &triple_dim) const {
1443 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1444  if (device().has_local_memory()) {
1446  launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>(
1447  buffer, lhs, rhs, triple_dim);
1448  }
1449 #endif
1450 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF
1451  if (!(device().has_local_memory())) {
1453  launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>(
1454  buffer, lhs, rhs, triple_dim);
1455  }
1456 #endif
1457  }
1458 
1459  template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties,
1460  typename Properties, typename LhsMapper, typename RhsMapper>
1461  void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
1462  const TripleDim &triple_dim) const {
1463  const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM);
1464  const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN);
1465  const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM;
1466  const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN;
1467 
1468  const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK);
1469  StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK;
1470  StorageIndex groupSizeK =
1471  skinny
1472  ? std::max(std::min(totalTilesK,
1473  (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) /
1474  (groupSizeM * groupSizeN)),
1475  StorageIndex(1))
1476  : StorageIndex(1);
1477 
1478  const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK;
1479 
1480  const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK;
1481 
1482  const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
1483  const StorageIndex globalRange = totalGroupSize * localRange;
1484 
1486  ? ((Properties::DoubleBuffer + 1) *
1487  (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) +
1488  ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) *
1489  (Properties::TileSizeDimN + Properties::BC))
1490  : StorageIndex(1);
1491 
1492  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
1493  if (groupSizeK == 1) {
1495  LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
1496  PacketAccess, input_mapper_properties, true, ct>
1497  ContractKernelName;
1498  device()
1499  .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1500  lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim)
1501  .wait();
1502  } else {
1504  LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
1505  PacketAccess, input_mapper_properties, false, ct>
1506  ContractKernelName;
1507  CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
1508  device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType)));
1509  EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
1510 
1511  device()
1512  .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1513  lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
1514  triple_dim)
1515  .wait();
1516 
1518  auto op = Op();
1521  ReductionKernel;
1522 
1523  device()
1524  .template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
1525  tmp_global_accessor, buffer,
1526  cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
1527  Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
1528  cl::sycl::range<1>(localRange)),
1529  StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK)
1530  .wait();
1531  device().deallocate_temp(temp_pointer);
1532  }
1533  }
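// Worked example of the launch arithmetic in launchTT (hypothetical panel properties with
// TileSizeDimM == TileSizeDimN == 64, TileSizeDimK == 16, LocalThreadSizeM == LocalThreadSizeN == 16, and a
// 300 x 200 x 100 contraction):
//   roundUpM = 320, groupSizeM = 5;  roundUpN = 256, groupSizeN = 4;  roundUpK = 112, totalTilesK = 7.
// With groupSizeK == 1 (the non-skinny path) this gives 5 * 4 = 20 work-groups of 16 * 16 = 256 work-items,
// i.e. a global range of 5120.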
1534 
1535 #ifndef EIGEN_SYCL_DISABLE_GEMV
1536  template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex>
1537  void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
1538  StorageIndex NC, StorageIndex C) const {
1539  const StorageIndex nonContractDim = NC;
1540  EIGEN_CONSTEXPR StorageIndex NCFactor = 1;
1541  EIGEN_CONSTEXPR StorageIndex CFactor = 1;
1542  EIGEN_CONSTEXPR StorageIndex NCWindow = 16;
1544  Properties;
1545  const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
1546  const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC);
1547  const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC);
1548  const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC);
1549  const StorageIndex globalRange =
1550  (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC));
1551  const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC;
1552  const StorageIndex scratchSize =
1553  (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
1554  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
1555  if (cNumGroups > 1) {
1557  TensorMapper, StorageIndex, Properties, CFactor, false,
1558  is_lhs_vec, false>
1559  ContractKernelName;
1560  CoeffReturnType *temp_pointer =
1561  static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
1562  EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
1563 
1564  device()
1565  .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1566  vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C)
1567  .wait();
1568 
1572  ReductionKernel;
1573 
1574  device()
1575  .template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
1576  tmp_global_accessor, buffer,
1577  cl::sycl::nd_range<1>(
1578  cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
1579  cl::sycl::range<1>(localRange)),
1580  StorageIndex(1), Op(), nonContractDim, cNumGroups)
1581  .wait();
1582  device().deallocate_temp(temp_pointer);
1583  } else {
1584  typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
1585  TensorMapper, StorageIndex, Properties, CFactor, false,
1586  is_lhs_vec, true>
1587  ContractKernelName;
1588  device()
1589  .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1590  vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C)
1591  .wait();
1592  }
1593  }
1594 #endif
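 // A usage sketch (not taken from this file) of a matrix-vector contraction that is expected to
 // be routed to LaunchVT() above when EIGEN_SYCL_DISABLE_GEMV is not defined. Device setup and
 // buffer management are as in the tensor-tensor sketch after launchTT(); the pointer names here
 // (d_mat, d_vec, d_res) are illustrative.
 //
 //   Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_mat(d_mat, 1024, 4096);
 //   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_vec(d_vec, 4096);
 //   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_res(d_res, 1024);
 //
 //   // Non-contracting dimension NC = 1024, contracting dimension C = 4096.
 //   Eigen::array<Eigen::IndexPair<long>, 1> dims = {{Eigen::IndexPair<long>(1, 0)}};
 //   gpu_res.device(sycl_device) = gpu_mat.contract(gpu_vec, dims);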
1595 
1596 #ifndef EIGEN_SYCL_DISABLE_SCALAR
1597  template <typename LhsMapper, typename RhsMapper>
1598  EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
1599  StorageIndex K) const {
1600  EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
1601  (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
1602  "The local thread size must be a power of 2 for the reduction "
1603  "operation");
1604  EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
1605 
1606  // Here we force the code not to be more than a 2-step reduction: our empirical research shows that if each
1607  // thread reduces at least 512 elements individually, we get better performance.
1608  const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
1609  const StorageIndex global_range = num_work_group * local_range;
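 // For example, assuming the default 16x16 local thread dimensions (local_range = 256, so
 // 512 * local_range = 131072): any K <= 131072 keeps num_work_group = 1 and the reduction
 // finishes in a single kernel, while a larger K uses local_range work-groups whose partial
 // results are combined by the second reduction kernel launched below.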
1610 
1611  typedef Eigen::TensorSycl::internal::GeneralScalarContraction<
1612  CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false>
1613  ContractKernelName;
1614  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
1615  if (num_work_group > 1) {
1616  CoeffReturnType *temp_pointer =
1617  static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
1618  EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
1619  device()
1620  .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
1621  thread_range, local_range, K)
1622  .wait();
1623  typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
1624  typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
1625  EvaluatorPointerType, StorageIndex, local_range>
1626  GenericRKernel;
1627  device()
1628  .template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
1629  tmp_global_accessor, buffer,
1630  cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range,
1631  Op())
1632  .wait();
1633  device().deallocate_temp(temp_pointer);
1634  } else {
1635  device()
1636  .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
1637  local_range, K)
1638  .wait();
1639  }
1640  }
1641 #endif
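 // A usage sketch (not taken from this file) of a contraction that reduces over every dimension,
 // i.e. a dot product, which is expected to reach launchSC() above when EIGEN_SYCL_DISABLE_SCALAR
 // is not defined. Device setup follows the earlier sketches; d_a, d_b and d_dot are illustrative
 // device pointers.
 //
 //   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_a(d_a, 1 << 20);
 //   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_b(d_b, 1 << 20);
 //   Eigen::TensorMap<Eigen::Tensor<float, 0>> gpu_dot(d_dot);
 //
 //   // K = 2^20 contracted elements produce a single scalar output.
 //   Eigen::array<Eigen::IndexPair<long>, 1> dims = {{Eigen::IndexPair<long>(0, 0)}};
 //   gpu_dot.device(sycl_device) = gpu_a.contract(gpu_b, dims);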
1642 
1643  EIGEN_STRONG_INLINE void cleanup() {
1644  this->m_leftImpl.cleanup();
1645  this->m_rightImpl.cleanup();
1646 
1647  if (this->m_result) {
1648  this->m_device.deallocate_temp(this->m_result);
1649  this->m_result = NULL;
1650  }
1651  }
1652 };
1653 } // namespace Eigen
1654 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H