TensorContractionSycl.h
1 // This file is part of Eigen, a lightweight C++ template library for linear algebra.
2 //
3 // Mehdi Goli Codeplay Software Ltd.
4 // Ralph Potter Codeplay Software Ltd.
5 // Luke Iwanski Codeplay Software Ltd.
6 // Contact: <eigen@codeplay.com>
7 //
8 // This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not
9 // distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 /*****************************************************************
12  * TensorContractionSycl.h
13  *
14  * \brief:
15  * TensorContractionSycl.h provides various tensor contraction kernels for the SYCL backend
16  *
17  *****************************************************************/
18 
19 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
20 #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
21 
22 // IWYU pragma: private
23 #include "./InternalHeaderCheck.h"
24 
25 namespace Eigen {
26 
27 namespace TensorSycl {
28 namespace internal {
29 
30 #ifndef EIGEN_SYCL_DISABLE_GEMV
45 template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
46 struct TVPanelSize {
47  // LocalThreadSizeC: determines the total number of threads per workgroup for the contracting dimension
48  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
49  // LocalThreadSizeNC: determines the total number of threads per workgroup for the non-contracting dimension
50  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
51  // TileSizeDimNC: determines the tile size for the non-contracting dimension
52  static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
53  // TileSizeDimC: determines the tile size for the contracting dimension
55  // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
57  // WorkLoadPerThreadC: determines workload per thread for loading the contracting dimension
59  // BC : determines whether bank-conflict support is required
60  static EIGEN_CONSTEXPR bool BC = false;
61 };
62 #endif
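// Illustrative example (assumes the default EIGEN_SYCL_LOCAL_THREAD_DIM0/1 of 16; the instantiation below is
// hypothetical): TVPanelSize<float, int, /*NCWindow=*/64, /*CFactor=*/4, /*NCFactor=*/4> resolves to
// LocalThreadSizeC == 16, LocalThreadSizeNC == 16 and TileSizeDimNC == NCWindow / NCFactor == 64 / 4 == 16.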
63 
81 template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
82 struct TTPanelSize {
83  // TileSizeDimK: determines the tile size for dimension K. The packet size is assumed to be taken into account
84  static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK;
85  // WorkLoadPerThreadM : determines workload per thread for loading the M dimension. This can be varied based on
86  // the available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_M macro)
87 #ifndef EIGEN_SYCL_REG_M
88  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
89 #else
90  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
91 #endif
92 // WorkLoadPerThreadN : determines workload per thread for loading the N dimension. This can be varied based on
93 // the available registers on a chosen device (can be controlled by the EIGEN_SYCL_REG_N macro)
94 #ifndef EIGEN_SYCL_REG_N
95  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
96 #else
97  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
98 #endif
99  // LocalThreadSizeM: determines the total number of threads per workgroup for the m dimension
100  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
101  // LocalThreadSizeN: determines the total number of threads per workgroup for the n dimension
102  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
103  // TileSizeDimM: determines the tile size for the m dimension
105  // TileSizeDimN: determines the tile size for the n dimension
107  // LoadPerThreadLhs: determines the workload per thread for loading the Lhs tensor. This must be divisible by the packet size
108  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs =
110  // LoadPerThreadRhs: determines the workload per thread for loading the Rhs tensor. This must be divisible by the packet size
111  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs =
113  // BC : determines whether bank-conflict support is required
114  static EIGEN_CONSTEXPR bool BC = true;
115  // DoubleBuffer: determines whether the double buffering technique should be used (this can be disabled by the
116  // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory)
118 #ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
119  false;
120 #else
121  true;
122 #endif
123 };
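// Illustrative example (hypothetical build flags): the register workloads fall back to the template arguments
// unless overridden at compile time. Building with
//   -DEIGEN_SYCL_REG_M=2 -DEIGEN_SYCL_REG_N=8
// makes any TTPanelSize<Scalar, StorageIndex, REG_SIZE_M, REG_SIZE_N, TSDK> report WorkLoadPerThreadM == 2 and
// WorkLoadPerThreadN == 8 regardless of REG_SIZE_M / REG_SIZE_N, while TileSizeDimK stays equal to TSDK.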
124 
125 /* !
126  * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to
127  * specialize the contraction algorithm based on device support for dedicated local memory.
128  */
130 /* !
131  * \brief data_source: an enum class determining the location of the data in a memory hierarchy (global, local, private).
132  */
134 
160 template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper,
161  typename StorageIndex>
162 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<PacketLoad, PacketType> read(
163  const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) {
164  const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex;
165  const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex;
166  return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));
167 }
168 
191 template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex>
192 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!PacketLoad, PacketType> read(
193  const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) {
194  const StorageIndex row = (IsRhs) ? CIndex : NCIndex;
195  const StorageIndex col = (IsRhs) ? NCIndex : CIndex;
196  return tensorMapper(row, col);
197 }
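// Illustrative note (sketch, assuming a coalesced, non-transposed LHS of size M x K with ld == M): the packet
// overload above loads the packet at the linear index NCIndex + CIndex * M, while the scalar overload evaluates
// tensorMapper(NCIndex, CIndex) for the LHS and tensorMapper(CIndex, NCIndex) for the RHS.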
198 
220 template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
221 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<dt != data_source::global_mem, void> write(
222  PacketType &packet_data, DataScalar ptr) {
225  for (int i = 0; i < PacketSize; i++) {
227  ptr += ld;
228  }
229 }
230 
246 template <data_source dt, typename PacketType, typename DataScalar>
249  void>
250  write(PacketType &packet_data, DataScalar *ptr) {
251  ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data);
252 }
253 
267 template <data_source dt, typename PacketType, typename DataScalar>
270  void>
271  write(PacketType &packet_data, DataScalar *ptr) {
272  *ptr = packet_data;
273 }
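// Illustrative note (rough summary): the write() overloads above cover the destinations used by the kernels.
// The first scalarises a packet and stores its PacketSize elements ld apart (local/private tiles), the second
// stores a whole packet contiguously with pstoreu, and the third stores a single scalar; the enable_if
// conditions select the overload from the data_source template parameter and the packet size.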
274 
280 template <bool is_internal>
282  return true;
283 }
284 
290 template <>
292  return cond;
293 }
294 
321 template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
323  static EIGEN_CONSTEXPR bool packet_load = packet_load_;
325  static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
326  typedef std::conditional_t<packet_load, PacketType, OutScalar> OutType;
328  static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
331 };
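// Illustrative note: is_coalesced_layout == !(is_transposed ^ is_rhs), so a non-transposed LHS (false, false)
// and a transposed RHS (true, true) are both treated as coalesced, while a transposed LHS or a non-transposed
// RHS is not.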
332 
372 template <typename StorageIndex>
374  const StorageIndex linearLocalThreadId;
375  const StorageIndex kGroupId;
376  const StorageIndex mGroupOffset;
377  const StorageIndex nGroupOffset;
378  const StorageIndex kGroupOffset;
379  const StorageIndex mLocalOffset;
380  const StorageIndex nLocalOffset;
381  const StorageIndex mGlobalOffset;
382  const StorageIndex nGlobalOffset;
383  StorageIndex kSize;
384  const bool is_internal;
385  // this is used to adjust the last block
387  const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
388  const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
389  const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
390  StorageIndex kSize_, const bool is_internal_)
391  : linearLocalThreadId(linearLocalThreadId_),
392  kGroupId(kGroupId_),
393  mGroupOffset(mGroupOffset_),
394  nGroupOffset(nGroupOffset_),
395  kGroupOffset(kGroupOffset_),
396  mLocalOffset(mLocalOffset_),
397  nLocalOffset(nLocalOffset_),
398  mGlobalOffset(mGlobalOffset_),
399  nGlobalOffset(nGlobalOffset_),
400  kSize(kSize_),
401  is_internal(is_internal_) {}
402 };
403 
454 template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
455  typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable,
456  typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp>
458  public:
459  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
462  Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
467 
468  typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
471 
472  typedef BlockProperties<is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix && Vectorizable,
475 
476  static EIGEN_CONSTEXPR StorageIndex NStride =
477  contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
478 
479  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
480  typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
481  typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
482  typedef std::conditional_t<contraction_tp == contraction_type::local, local_ptr, private_ptr> tile_ptr;
483  static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local
484  ? Properties::TileSizeDimM + Properties::BC
485  : Properties::WorkLoadPerThreadM;
486  static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local
487  ? Properties::TileSizeDimN + Properties::BC
488  : Properties::WorkLoadPerThreadN;
489  static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
490 
503  template <contraction_type, StorageIndex>
504  struct MemHolder {
506  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {}
507  };
511  template <StorageIndex MemSize>
512  struct MemHolder<contraction_type::no_local, MemSize> {
513  OutScalar ptr[MemSize] = {OutScalar{0}};
514  };
537  struct TiledMemory {
542  const std::pair<StorageIndex, StorageIndex> lhs_extract_index;
543  const std::pair<StorageIndex, StorageIndex> rhs_extract_index;
544  template <contraction_type tp = contraction_tp>
546  std::enable_if_t<tp == contraction_type::no_local> * = 0)
551  lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})),
552  rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {}
553 
554  template <contraction_type tp = contraction_tp>
556  local_ptr block_start_ptr,
557  std::enable_if_t<tp == contraction_type::local> * = 0)
558  : lhs_scratch_extract{block_start_ptr},
560  ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)},
561  lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset),
562  rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset),
564  local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)),
566  local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {}
567  };
568 
570  const LhsMapper lhs;
571  const RhsMapper rhs;
572  OutAccessor out_res;
573  const StorageIndex groupSizeM;
574  const StorageIndex groupSizeN;
575  const StorageIndex numTiles;
576  const TripleDim triple_dim;
577 
579  const RhsMapper rhs_, OutAccessor out_res_,
580  const StorageIndex groupSizeM_,
581  const StorageIndex groupSizeN_,
582  const StorageIndex numTiles_,
583  const TripleDim triple_dim_)
584  : scratch(scratch_),
585  lhs(lhs_),
586  rhs(rhs_),
587  out_res(out_res_),
588  groupSizeM(groupSizeM_),
589  groupSizeN(groupSizeN_),
590  numTiles(numTiles_),
591  triple_dim(triple_dim_) {}
592 
594  const RhsMapper rhs_, OutAccessor out_res_,
595  const StorageIndex groupSizeM_,
596  const StorageIndex numTiles_,
597  const TripleDim triple_dim_)
598  : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
599 
600  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
601  const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
602  const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
603  const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
604  const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
605  const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
606  const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
607  const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
608  const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
609  const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
610  const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
611  const StorageIndex nLocalOffset = NStride * nLocalThreadId;
612  const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
613  const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
614 
615  const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
616  StorageIndex kGroupOffset = kGroupId * kSizePerWG;
617  const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
618  triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
619  triple_dim.K - kGroupOffset >= kSizePerWG;
620  // this is used to adjust the last block
621  StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
622  // This is used to find out the last K offset so that kGroupOffset - kSize can compute the k offset for loading the
623  // tile
624  kGroupOffset += kSize;
625 
626  auto thread_properties =
627  ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
628  mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
629 
630  auto out_ptr = out_res + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
631 
632  (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
633  : compute_panel<false>(itemID, thread_properties, out_ptr);
634  }
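// Worked example of the group-id decomposition above (hypothetical sizes): with groupSizeM == 4, groupSizeN == 3
// and IsFinal == false, a work-group with itemID.get_group(0) == 29 yields
//   mGroupId = 29 % 4 = 1,  tmp = 29 / 4 = 7,  nGroupId = 7 % 3 = 1,  kGroupId = 7 / 3 = 2,
// i.e. the second tile in both M and N and the third K slice.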
635  // The compute block computes the contraction for each thread's private block and stores the result in the
636  // privateRes memory of each computation. The compute block function is independent of the local and no-local
637  // concepts, as it only computes the block in each thread's private memory space.
638  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
639  PacketReturnType *privateRes) const {
640  StorageIndex idx = 0;
641  EIGEN_CONSTEXPR StorageIndex lhs_stride =
642  contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
644  for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
645  auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)};
646  StorageIndex lhs_index = 0;
648  for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
649  PacketReturnType lhsPack{};
651  lhs_block_ptr + lhs_index);
652  privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]);
653 
654  lhs_index += lhs_stride;
655  idx++;
656  }
657  }
658  }
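// Illustrative note: compute_block_per_tile is the register-level micro-kernel. For each of the
// WorkLoadPerThreadN rhs values it broadcasts the value into a packet and accumulates
// WorkLoadPerThreadM / PacketSize packet pmadd operations into privateRes, i.e. one call performs
// (WorkLoadPerThreadM * WorkLoadPerThreadN) / PacketSize packet fused multiply-adds per thread for a single k index.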
659  // The store function writes the contraction result computed in each thread's private memory to global
660  // memory. The store function is independent of the local and no-local concepts so that it can be abstracted out in
661  // the base class.
662  template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
663  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
664  StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const {
665  auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
666  return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
667  };
668  // when local memory is not used, M and N are both accessed in a coalesced way. However, when local memory is
669  // available, the K*N block is transposed in local memory to N*K; therefore, each block operates on a
670  // blockId * WorkLoadPerThreadN slice of N
671  EIGEN_CONSTEXPR StorageIndex GlobalNStride =
672  contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
674  for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
675  // output leading dimension
676  StorageIndex outputLD = 0;
677  // When local memory is used the PrivateNStride is always 1, because the coalesced access on N is loaded into local
678  // memory and extracting from local to global is the same as in the non-transposed version. However, when local
679  // memory is not used and RHS is transposed, we packetize the load for RHS.
681  for (StorageIndex nId = 0; nId < PrivateNStride; nId++) {
682  StorageIndex globalRow = mGlobalOffset;
684  for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
685  PacketReturnType privetOut = privateRes[wLPTM];
686  if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) {
687  // Store the final results in C. The C matrix always has M as the first StorageIndex and N as the second
688  // StorageIndex; therefore it always has a coalesced layout
689  write<data_source::global_mem>(privetOut, out_ptr + outputLD + globalRow);
690  } else {
692  for (StorageIndex mId = 0; mId < PacketSize; mId++) {
693  StorageIndex mOffset = globalRow + mId;
694  if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) {
695  out_ptr[mOffset + outputLD] =
697  }
698  }
699  }
700  globalRow += (PacketSize * Properties::LocalThreadSizeM);
701  }
702  outputLD += triple_dim.M;
703  privateRes += Properties::WorkLoadPerThreadM / PacketSize;
704  }
705  out_ptr += (GlobalNStride * outputLD);
706 
707  nGlobalOffset += (PrivateNStride * GlobalNStride);
708  }
709  }
710  // when no local memory is used the following extract_block will be enabled
711  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg,
712  contraction_type contract_tp = contraction_tp>
713  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<contract_tp == contraction_type::no_local> extract_block(
714  const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
715  const StorageIndex &ncOffset, const StorageIndex cOffset) const {
716  EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
717  InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
718  EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
719  InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
720  const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
721 
722  auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
723  return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
724  (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
725  };
726  const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
727  StorageIndex cIndex = cOffset;
728 
730  for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) {
731  StorageIndex ncIndex = ncOffset;
733  for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) {
734  if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) {
735  auto val =
736  read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
737  InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld);
738 
739  write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
741  } else {
743  for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
744  const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
745  const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
746  OutScalar val =
747  (ncInd < NC && cInd < triple_dim.K)
748  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
749  inpt, ncInd, cInd, ld)
750  : OutScalar(0);
751  write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
753  val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) +
754  ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC));
755  }
756  }
757 
758  // if it is the lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So
759  // even if M is not accessed in a coalesced mode, we have to load packet_size elements of m per thread.
760  ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1)
761  ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
762  : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
763  private_ptr += InputBlockProperties::nc_stride;
764  }
765  // the previous for loop (private_ptr += (ncId * nc_stride)) has already advanced the pointer by one WorkLoadPerThreadNC
766  private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
767  cIndex += InputBlockProperties::c_stride;
768  }
769  }
770  template <typename InputBlockProperties, StorageIndex TileSizeDimNC>
771  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract(
772  const StorageIndex &linearLocalThreadId) {
773  const StorageIndex localThreadNC =
774  (InputBlockProperties::is_coalesced_layout)
775  ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
776  : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
777  const StorageIndex localThreadC =
778  (InputBlockProperties::is_coalesced_layout)
779  ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
780  : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
781  return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC);
782  }
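// Worked example of local_id_extract (hypothetical sizes): for a coalesced layout with TileSizeDimNC == 64 and
// nc_stride == 4 (16 threads along the non-contracting dimension), linearLocalThreadId == 37 maps to
// localThreadNC = 37 % 16 = 5 and localThreadC = 37 / 16 = 2.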
783 
784  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
785  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<db && ctp == contraction_type::local> sync_mem(
786  const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
787  db_offset = !db_offset;
788  }
789 
790  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
791  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!db && ctp == contraction_type::local> sync_mem(
792  const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
793  itemID.barrier(cl::sycl::access::fence_space::local_space);
794  }
795 
796  template <contraction_type ctp = contraction_tp>
797  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<ctp == contraction_type::no_local> sync_mem(
798  const cl::sycl::nd_item<1> &, bool &) noexcept {
799  return;
800  }
801 
802  template <bool need_sync, contraction_type ctp = contraction_tp>
803  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<need_sync && ctp == contraction_type::no_local>
804  sync_thread(const cl::sycl::nd_item<1> &
805 #ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
806  itemID
807 #endif
808  ) noexcept {
809 #ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
810  itemID.barrier(cl::sycl::access::fence_space::local_space);
811 #else
812  return;
813 #endif
814  }
815  template <bool need_sync, contraction_type ctp = contraction_tp>
816  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<need_sync && ctp == contraction_type::local>
817  sync_thread(const cl::sycl::nd_item<1> &itemID) {
818  itemID.barrier(cl::sycl::access::fence_space::local_space);
819  }
820  template <bool need_sync>
821  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!need_sync> sync_thread(const cl::sycl::nd_item<1> &) {
822  return;
823  }
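// Illustrative summary of the synchronisation helpers above:
//   sync_mem   : with local memory and double buffering it only flips db_offset; with local memory and no
//                double buffering it issues a local barrier; without local memory it is a no-op.
//   sync_thread: a local barrier when local memory is used (for the no-local path only when
//                EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION is defined); otherwise a no-op.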
824 
825  template <bool is_internal_block>
826  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
827  ThreadProperties<StorageIndex> &thread_properties,
828  TiledMemory &tiled_input_block,
829  PacketReturnType *privateRes,
830  bool &db_offset) const {
831  // Tiling the Rhs block from global to local memory
832  extract_block<RHSBlockProperties, is_internal_block>(
833  rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
834  tiled_input_block.rhs_extract_index,
835  contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
836  thread_properties.kGroupOffset - thread_properties.kSize);
837 
838  sync_thread<contraction_tp == contraction_type::no_local>(itemID);
839 
840  // Tiling the Lhs block from global to local memory
841  extract_block<LHSBlockProperties, is_internal_block>(
842  lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
843  tiled_input_block.lhs_extract_index,
844  contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset,
845  thread_properties.kGroupOffset - thread_properties.kSize);
846 
847  // itemID.barrier(cl::sycl::access::fence_space::local_space);
848  sync_thread<contraction_tp == contraction_type::local>(itemID);
849  // switch to compute mode
850  StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK);
851  StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR);
852  // Loop over the values of a single tile
853  for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) {
854  compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset,
855  tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes);
856  lhs_offset += LSDL;
857  rhs_offset += LSDR;
858  }
859  // computing the K index for the next tile
860  thread_properties.kSize -= Properties::TileSizeDimK;
861  sync_mem(itemID, db_offset);
862  }
863 
864  // when local memory is available the following compute_panel will be enabled
865  template <bool is_internal_block, typename OutPtr>
866  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
867  ThreadProperties<StorageIndex> &thread_properties,
868  OutPtr out_ptr) const {
869  auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
870  // Allocate register space
871  PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
872  PacketReturnType{0}};
873  bool db_offset = 0;
874 
875  while (thread_properties.kSize >= Properties::TileSizeDimK) {
876  compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
877  }
878  if (thread_properties.kSize > 0) {
879  compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
880  }
881 
882  // Storing the final results in the output
883  store<is_internal_block,
884  contraction_tp == contraction_type::local ? static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>(
885  out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset,
886  thread_properties.nGlobalOffset);
887  }
888  // When local memory is available the following extract_block will be enabled
889  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local,
890  contraction_type contract_tp = contraction_tp>
891  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<contract_tp == contraction_type::local> extract_block(
892  const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex> &local_index,
893  const StorageIndex &ncOffset, const StorageIndex cOffset) const {
894  EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
895  InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
896  EIGEN_CONSTEXPR StorageIndex LoadPerThread =
897  InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
898  EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
899  static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
900  (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
901  " LocalOffset must be divisible by stride");
902  const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
903  StorageIndex localThreadNC = local_index.first;
904  StorageIndex localThreadC = local_index.second;
905  auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
906  return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
907  (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
908  };
910  for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) {
911  const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC);
912  const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC);
913  const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
914  if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) {
915  auto val =
916  read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
917  InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld);
918  write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
919  val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
920  (InputBlockProperties::c_stride * localThreadC * LSD));
921  } else {
923  for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
924  const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
925  const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
926  OutScalar val =
927  (nCInd < NC && cInd < triple_dim.K)
928  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
929  inpt, nCInd, cInd, ld)
930  : OutScalar(0);
931 
932  write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
933  val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
934  (InputBlockProperties::is_coalesced_layout ? i : 0) +
935  ((InputBlockProperties::c_stride * localThreadC +
936  (InputBlockProperties::is_coalesced_layout ? 0 : i)) *
937  LSD));
938  }
939  }
940  localThreadNC += (InputBlockProperties::is_coalesced_layout)
941  ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride)
942  : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
943  localThreadC += (InputBlockProperties::is_coalesced_layout)
944  ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride)
945  : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
946  }
947  }
948 };
949 
950 #ifndef EIGEN_SYCL_DISABLE_GEMV
951 
993 template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex,
994  typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal>
996  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
999  Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
1000  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
1001 
1002  static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
1003  KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
1004 
1005  // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make
1006  // sure that the !^ is true. When RHS is a vector, we pass true and true to make sure that the !^ is true.
1009 
1011  const VectorMapper vec;
1012  const TensorMapper mat;
1013  OutAccessor out_res;
1014  const StorageIndex nonContractGroupSize;
1015  const StorageIndex nonContractDim;
1016  const StorageIndex contractDim;
1017 
1019  const TensorMapper mat_, OutAccessor out_res_,
1020  const StorageIndex nonContractGroupSize_,
1021  const StorageIndex nonContractDim_,
1022  const StorageIndex contractDim_)
1023  : scratch(scratch_),
1024  vec(vec_),
1025  mat(mat_),
1026  out_res(out_res_),
1027  nonContractGroupSize(nonContractGroupSize_),
1028  nonContractDim(nonContractDim_),
1029  contractDim(contractDim_) {}
1030 
1031  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
1032  auto scratch_ptr = scratch.get_pointer();
1033  const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
1034  StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
1035  : linearLocalThreadId % Properties::LocalThreadSizeNC;
1036  StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC
1037  : linearLocalThreadId / Properties::LocalThreadSizeNC;
1038  const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize;
1039  const StorageIndex nonContractGroupId =
1040  is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
1041  const StorageIndex contractGroupId =
1042  is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
1043  auto out_ptr = out_res + (IsFinal ? 0 : contractGroupId * nonContractDim);
1044 
1045  const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
1046  const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
1047  auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
1048  const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId;
1049  const StorageIndex globalContractDimOffset = contractGroupOffset + contractId;
1050  auto local_output = scratch_ptr + OutScratchOffset;
1051  const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC &&
1052  contractDim - contractGroupOffset >= Properties::TileSizeDimC;
1053  is_internal
1054  ? compute_panel<true>(itemID, vec, mat, local_output, out_ptr,
1056  scratch_ptr, contractGroupOffset,
1057 #endif
1058  nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
1059  nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex)
1060  : compute_panel<false>(itemID, vec, mat, local_output, out_ptr,
1062  scratch_ptr, contractGroupOffset,
1063 #endif
1064  nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
1065  nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex);
1066  }
1067  template <bool is_internal_block, typename OutPtr>
1069  const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output,
1070  OutPtr out_ptr,
1072  OutScalar *scratch_ptr, const StorageIndex contractGroupOffset,
1073 #endif
1074  const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim,
1075  StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId,
1076  StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) {
1077  OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)};
1078  // Reading the vector
1079 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1080  const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId;
1081  extract_block<VecBlockProperties, is_internal_block, KFactor,
1082  Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId,
1083  vectorOffset, contractDim);
1084 
1085  itemID.barrier(cl::sycl::access::fence_space::local_space);
1086  auto in_scratch_ptr = scratch_ptr + contractId;
1087 #endif
1088 
1089  StorageIndex privateOffsetC = 0;
1091  for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) {
1092  StorageIndex privateOffsetNC = 0;
1093  bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim);
1094 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1095  auto vecScalar = *in_scratch_ptr;
1096 #else
1097  auto vecScalar = (check_boundary<is_internal_block>(contract_conds))
1098  ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC,
1099  is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0))
1100  : OutScalar(0);
1101 #endif
1103  for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1104  auto matScalar = (check_boundary<is_internal_block>(
1105  contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim)))
1106  ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC
1107  : globalNonContractDimOffset + privateOffsetNC,
1108  is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC
1109  : globalContractDimOffset + privateOffsetC)
1110  : OutScalar(0);
1111 
1112  outScalar[j] = ::Eigen::internal::pmadd(matScalar, vecScalar, outScalar[j]);
1113  privateOffsetNC += Properties::LocalThreadSizeNC;
1114  }
1115  privateOffsetC += Properties::LocalThreadSizeC;
1116 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1117  in_scratch_ptr += Properties::LocalThreadSizeC;
1118 #endif
1119  }
1120 
1121  auto out_scratch_ptr = local_output + outScratchIndex;
1122  // Each block of 16*16 elements in shared memory should reduce to 16*1
1124  for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1125  *out_scratch_ptr = outScalar[j];
1126 
1127  out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1128  }
1129  if (is_lhs_vec) {
1130  nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
1131  contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
1132  outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
1133  }
1134 
1135  out_scratch_ptr = local_output + outScratchIndex;
1137  for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1139  for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
1140  itemID.barrier(cl::sycl::access::fence_space::local_space);
1141  if (contractId < offset) {
1142  StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset);
1143  *out_scratch_ptr += out_scratch_ptr[myNeigbourId];
1144  }
1145  }
1146  // moving to the next 16 by 16 block
1147  out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1148  }
1149 
1150  if (contractId == 0) {
1151  out_scratch_ptr = local_output + nonContractId;
1152  StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
1153  out_ptr += global_final_offset;
1155  for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
1156  if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) {
1157  auto res = *out_scratch_ptr;
1158 
1159  *out_ptr = res;
1160  out_ptr += Properties::LocalThreadSizeNC;
1161  }
1162  // moving to the next 16 by 16 block to get the next 16 reduced elements
1163  out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1164  if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
1165  }
1166  }
1167  }
1168 
1169  template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input,
1170  typename Local>
1171  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr,
1172  const StorageIndex &linearLocalThreadId,
1173  const StorageIndex &cOffset, const StorageIndex &C) {
1174  local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
1175  StorageIndex cIndex = cOffset;
1176  for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
1177  if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) {
1178  auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
1179  InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
1180  cIndex, StorageIndex(1));
1181  write<StorageIndex, 1, data_source::local_mem>(val, local_ptr);
1182  } else {
1184  for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
1185  OutScalar val =
1186  (cIndex + i < C)
1187  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
1188  inpt, StorageIndex(0), cIndex + i, StorageIndex(1))
1189  : OutScalar(0);
1190  write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i);
1191  }
1192  }
1193  local_ptr += InputBlockProperties::c_stride * GroupSize;
1194  cIndex += InputBlockProperties::c_stride * GroupSize;
1195  }
1196  }
1197 };
1198 #endif
1199 
1200 #ifndef EIGEN_SYCL_DISABLE_SCALAR
1201 
1233 template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
1234  typename RhsMapper, typename StorageIndex, bool Vectorizable>
1236  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
1238  const LhsMapper lhs;
1239  const RhsMapper rhs;
1240  OutAccessor out_res;
1241  const StorageIndex rng;
1242 
1243  EIGEN_DEVICE_FUNC GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_,
1244  OutAccessor out_res_, const StorageIndex rng_)
1245  : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
1246 
1247  EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const {
1248  auto out_ptr = out_res;
1249  OutScalar *scratch_ptr = scratch.get_pointer();
1250 
1251  StorageIndex globalid = itemID.get_global_id(0);
1252  StorageIndex localid = itemID.get_local_id(0);
1253  OutScalar accumulator = OutScalar(0);
1254  for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
1255  accumulator = Eigen::internal::pmadd(lhs(0, i), rhs(i, 0), accumulator);
1256  }
1257  auto out_scratch_ptr = scratch_ptr + localid;
1258  *out_scratch_ptr = accumulator;
1259  for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
1260  itemID.barrier(cl::sycl::access::fence_space::local_space);
1261  if (localid < offset) {
1262  *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
1263  }
1264  }
1265  if (localid == 0) {
1266  out_ptr[itemID.get_group(0)] = accumulator;
1267  }
1268  }
1269 };
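// Illustrative note: the loop above is a power-of-two tree reduction in local memory. With a hypothetical
// work-group of 8 threads holding partial sums s0..s7, the offsets 4, 2, 1 produce s0+s4, s1+s5, ... and
// finally thread 0 holds s0+...+s7, which it writes to out_ptr[itemID.get_group(0)]. When more than one
// work-group is launched, the per-group results are reduced again by a second kernel (see launchSC below).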
1270 #endif
1271 
1272 } // namespace internal
1273 } // namespace TensorSycl
1274 
1275 template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
1277  Eigen::SyclDevice>
1278  : public TensorContractionEvaluatorBase<TensorEvaluator<
1279  const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> {
1281  "SYCL tensor contraction does not support output kernels.");
1282 
1283  typedef Eigen::SyclDevice Device;
1284 
1288  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
1289  typedef typename XprType::Index StorageIndex;
1292  typedef typename Base::Storage Storage;
1294  struct TripleDim {
1298  TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {}
1299  };
1300  enum {
1302  BlockAccess = false,
1303  };
1304 
1306  static constexpr int LDims = Base::LDims;
1307  static constexpr int RDims = Base::RDims;
1308  static constexpr int ContractDims = Base::ContractDims;
1309 
1312 
1314  typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t;
1315  typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t;
1316 
1317  static constexpr int NumDims = LDims + RDims - 2 * ContractDims;
1318 
1320 
1323  typedef std::remove_const_t<typename LeftEvaluator::CoeffReturnType> LhsScalar;
1324  typedef std::remove_const_t<typename RightEvaluator::CoeffReturnType> RhsScalar;
1325 
1328 
1329  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
1330  struct input_mapper_propertis {
1331  static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
1332  static EIGEN_CONSTEXPR bool is_rhs_matrix =
1333  (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
1334  };
1335 
1336  TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {}
1337 
1338  // We need to redefine this method to make nvcc happy
1340  this->m_leftImpl.evalSubExprsIfNeeded(NULL);
1341  this->m_rightImpl.evalSubExprsIfNeeded(NULL);
1342  if (!data) {
1343  this->m_result = this->m_device.get(
1344  static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar))));
1345  data = this->m_result;
1346  }
1347  evalToSycl(data);
1348  return (this->m_result != NULL);
1349  }
1350  const Eigen::SyclDevice &device() const { return this->m_device; }
1351  void evalToSycl(typename Base::EvaluatorPointerType buffer) const {
1352  if (this->m_lhs_inner_dim_contiguous) {
1353  if (this->m_rhs_inner_dim_contiguous) {
1354  if (this->m_rhs_inner_dim_reordered) {
1355  evalTyped<true, true, true, Unaligned>(buffer);
1356  } else {
1357  evalTyped<true, true, false, Unaligned>(buffer);
1358  }
1359  } else {
1360  if (this->m_rhs_inner_dim_reordered) {
1361  evalTyped<true, false, true, Unaligned>(buffer);
1362  } else {
1363  evalTyped<true, false, false, Unaligned>(buffer);
1364  }
1365  }
1366  } else {
1367  if (this->m_rhs_inner_dim_contiguous) {
1368  if (this->m_rhs_inner_dim_reordered) {
1369  evalTyped<false, true, true, Unaligned>(buffer);
1370  } else {
1371  evalTyped<false, true, false, Unaligned>(buffer);
1372  }
1373  } else {
1374  if (this->m_rhs_inner_dim_reordered) {
1375  evalTyped<false, false, true, Unaligned>(buffer);
1376  } else {
1377  evalTyped<false, false, false, Unaligned>(buffer);
1378  }
1379  }
1380  }
1381  }
1382 
1383  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
1384  void evalTyped(typename Base::EvaluatorPointerType buffer) const {
1385  const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
1388  PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakePointer>
1389  LhsMapper;
1390 
1393  PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
1394  rhs_inner_dim_reordered, Unaligned, MakePointer>
1395  RhsMapper;
1396 
1397  // initialize data mappers
1398  LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
1399  this->m_left_contracting_strides, this->m_k_strides);
1400 
1401  RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
1402  this->m_right_contracting_strides, this->m_k_strides);
1403 
1404 #ifndef EIGEN_SYCL_DISABLE_SCALAR
1405  if (triple_dim.M == 1 && triple_dim.N == 1) {
1406  launchSC(buffer, lhs, rhs, triple_dim.K);
1407  } else
1408 #endif
1409 #ifndef EIGEN_SYCL_DISABLE_GEMV
1410  if (triple_dim.M != 1 && triple_dim.N == 1) {
1411  LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K);
1412  } else if (triple_dim.M == 1 && triple_dim.N != 1) {
1413  LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K);
1414  } else // This is equivalent to if (m != 1 && n != 1)
1415 #endif
1416  {
1417  typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>
1418  inpt_mapper_properties;
1419 #ifndef EIGEN_SYCL_DISABLE_SKINNY
1420  bool skinny = false;
1421  auto platform_name = this->device().getPlatformName();
1422  // This is based on empirical calculation for AMD r9-nano and Fiji
1423  if (platform_name.find("AMD") == 0) {
1424  skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) &&
1425  ((triple_dim.M < 1024 && triple_dim.N < 1024) ||
1426  (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K)));
1427  } else {
1428  skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) ||
1429  ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) ||
1430  ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100));
1431  }
1432  if (skinny)
1433  adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
1434  else
1435 #endif // EIGEN_SYCL_DISABLE_SKINNY
1436  adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
1437  }
1438  }
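// Worked example of the skinny heuristic above (hypothetical sizes, non-AMD platform): for M == 32, N == 32,
// K == 8192, max(K, N) / min(K, N) == 8192 / 32 == 256 > 100, so the contraction is dispatched with
// skinny == true, allowing the K dimension to be split across work-groups (groupSizeK > 1 in launchTT below).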
1439 
1440  template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper>
1441  void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
1442  const TripleDim &triple_dim) const {
1443 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1444  if (device().has_local_memory()) {
1446  launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>(
1447  buffer, lhs, rhs, triple_dim);
1448  }
1449 #endif
1450 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF
1451  if (!(device().has_local_memory())) {
1453  launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>(
1454  buffer, lhs, rhs, triple_dim);
1455  }
1456 #endif
1457  }
1458 
1459  template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties,
1460  typename Properties, typename LhsMapper, typename RhsMapper>
1461  void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
1462  const TripleDim &triple_dim) const {
1463  const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM);
1464  const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN);
1465  const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM;
1466  const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN;
1467 
1468  const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK);
1469  StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK;
1470  StorageIndex groupSizeK =
1471  skinny
1472  ? std::max(std::min(totalTilesK,
1473  (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) /
1474  (groupSizeM * groupSizeN)),
1475  StorageIndex(1))
1476  : StorageIndex(1);
1477 
1478  const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK;
1479 
1480  const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK;
1481 
1482  const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
1483  const StorageIndex globalRange = totalGroupSize * localRange;
1484 
1486  ? ((Properties::DoubleBuffer + 1) *
1487  (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) +
1488  ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) *
1489  (Properties::TileSizeDimN + Properties::BC))
1490  : StorageIndex(1);
1491 
1492  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
1493  if (groupSizeK == 1) {
1495  LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
1496  PacketAccess, input_mapper_properties, true, ct>
1497  ContractKernelName;
1498  device()
1499  .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1500  lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim)
1501  .wait();
1502  } else {
1504  LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
1505  PacketAccess, input_mapper_properties, false, ct>
1506  ContractKernelName;
1507  CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
1508  device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType)));
1509  EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
1510 
1511  device()
1512  .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1513  lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
1514  triple_dim)
1515  .wait();
1516 
1518  auto op = Op();
1521  ReductionKernel;
1522 
1523  device()
1524  .template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
1525  tmp_global_accessor, buffer,
1526  cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
1527  Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
1528  cl::sycl::range<1>(localRange)),
1529  StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK)
1530  .wait();
1531  device().deallocate_temp(temp_pointer);
1532  }
1533  }
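// Worked example of the launch arithmetic in launchTT (hypothetical panel properties with
// TileSizeDimM == TileSizeDimN == 64, TileSizeDimK == 16, LocalThreadSizeM == LocalThreadSizeN == 16, and a
// 300 x 200 x 100 contraction):
//   roundUpM = 320, groupSizeM = 5;  roundUpN = 256, groupSizeN = 4;  roundUpK = 112, totalTilesK = 7.
// With groupSizeK == 1 (the non-skinny path) this gives 5 * 4 = 20 work-groups of 16 * 16 = 256 work-items,
// i.e. a global range of 5120.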
1534 
1535 #ifndef EIGEN_SYCL_DISABLE_GEMV
1536  template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex>
1537  void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
1538  StorageIndex NC, StorageIndex C) const {
1539  const StorageIndex nonContractDim = NC;
1540  EIGEN_CONSTEXPR StorageIndex NCFactor = 1;
1541  EIGEN_CONSTEXPR StorageIndex CFactor = 1;
1542  EIGEN_CONSTEXPR StorageIndex NCWindow = 16;
1544  Properties;
1545  const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
1546  const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC);
1547  const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC);
1548  const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC);
1549  const StorageIndex globalRange =
1550  (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC));
1551  const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC;
1552  const StorageIndex scratchSize =
1553  (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
1554  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
1555  if (cNumGroups > 1) {
1557  TensorMapper, StorageIndex, Properties, CFactor, false,
1558  is_lhs_vec, false>
1559  ContractKernelName;
1560  CoeffReturnType *temp_pointer =
1561  static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
1562  EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
1563 
1564  device()
1565  .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1566  vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C)
1567  .wait();
1568 
1572  ReductionKernel;
1573 
1574  device()
1575  .template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
1576  tmp_global_accessor, buffer,
1577  cl::sycl::nd_range<1>(
1578  cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
1579  cl::sycl::range<1>(localRange)),
1580  StorageIndex(1), Op(), nonContractDim, cNumGroups)
1581  .wait();
1582  device().deallocate_temp(temp_pointer);
1583  } else {
1584  typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
1585  TensorMapper, StorageIndex, Properties, CFactor, false,
1586  is_lhs_vec, true>
1587  ContractKernelName;
1588  device()
1589  .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1590  vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C)
1591  .wait();
1592  }
1593  }
1594 #endif
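 // A usage sketch (not taken from this file) of a matrix-vector contraction that is expected to
 // be routed to LaunchVT() above when EIGEN_SYCL_DISABLE_GEMV is not defined. Device setup and
 // buffer management are as in the tensor-tensor sketch after launchTT(); the pointer names here
 // (d_mat, d_vec, d_res) are illustrative.
 //
 //   Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_mat(d_mat, 1024, 4096);
 //   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_vec(d_vec, 4096);
 //   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_res(d_res, 1024);
 //
 //   // Non-contracting dimension NC = 1024, contracting dimension C = 4096.
 //   Eigen::array<Eigen::IndexPair<long>, 1> dims = {{Eigen::IndexPair<long>(1, 0)}};
 //   gpu_res.device(sycl_device) = gpu_mat.contract(gpu_vec, dims);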
1595 
1596 #ifndef EIGEN_SYCL_DISABLE_SCALAR
1597  template <typename LhsMapper, typename RhsMapper>
1598  EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
1599  StorageIndex K) const {
1600  EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
1601  (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
1602  "The local thread size must be a power of 2 for the reduction "
1603  "operation");
1604  EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
1605 
1606  // Here we force the code not to be more than a 2-step reduction: our empirical research shows that if each
1607  // thread reduces at least 512 elements individually, we get better performance.
1608  const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
1609  const StorageIndex global_range = num_work_group * local_range;
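 // For example, assuming the default 16x16 local thread dimensions (local_range = 256, so
 // 512 * local_range = 131072): any K <= 131072 keeps num_work_group = 1 and the reduction
 // finishes in a single kernel, while a larger K uses local_range work-groups whose partial
 // results are combined by the second reduction kernel launched below.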
1610 
1611  typedef Eigen::TensorSycl::internal::GeneralScalarContraction<
1612  CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false>
1613  ContractKernelName;
1614  auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
1615  if (num_work_group > 1) {
1616  CoeffReturnType *temp_pointer =
1617  static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
1618  EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
1619  device()
1620  .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
1621  thread_range, local_range, K)
1622  .wait();
1623  typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
1624  typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
1625  EvaluatorPointerType, StorageIndex, local_range>
1626  GenericRKernel;
1627  device()
1628  .template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
1629  tmp_global_accessor, buffer,
1630  cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range,
1631  Op())
1632  .wait();
1633  device().deallocate_temp(temp_pointer);
1634  } else {
1635  device()
1636  .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
1637  local_range, K)
1638  .wait();
1639  }
1640  }
1641 #endif
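 // A usage sketch (not taken from this file) of a contraction that reduces over every dimension,
 // i.e. a dot product, which is expected to reach launchSC() above when EIGEN_SYCL_DISABLE_SCALAR
 // is not defined. Device setup follows the earlier sketches; d_a, d_b and d_dot are illustrative
 // device pointers.
 //
 //   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_a(d_a, 1 << 20);
 //   Eigen::TensorMap<Eigen::Tensor<float, 1>> gpu_b(d_b, 1 << 20);
 //   Eigen::TensorMap<Eigen::Tensor<float, 0>> gpu_dot(d_dot);
 //
 //   // K = 2^20 contracted elements produce a single scalar output.
 //   Eigen::array<Eigen::IndexPair<long>, 1> dims = {{Eigen::IndexPair<long>(0, 0)}};
 //   gpu_dot.device(sycl_device) = gpu_a.contract(gpu_b, dims);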
1642 
1643  EIGEN_STRONG_INLINE void cleanup() {
1644  this->m_leftImpl.cleanup();
1645  this->m_rightImpl.cleanup();
1646 
1647  if (this->m_result) {
1648  this->m_device.deallocate_temp(this->m_result);
1649  this->m_result = NULL;
1650  }
1651  }
1652 };
1653 } // namespace Eigen
1654 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H