19 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
20 #define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
27 namespace TensorSycl {
30 #ifndef EIGEN_SYCL_DISABLE_GEMV
45 template <
typename Scalar,
typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
81 template <
typename Scalar,
typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
87 #ifndef EIGEN_SYCL_REG_M
94 #ifndef EIGEN_SYCL_REG_N
118 #ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
160 template <
bool PacketLoad,
bool is_coalesced_layout,
bool,
typename PacketType,
typename TensorMapper,
161 typename StorageIndex>
163 const TensorMapper &tensorMapper,
const StorageIndex &NCIndex,
const StorageIndex &CIndex,
const StorageIndex &ld) {
164 const StorageIndex
row = (is_coalesced_layout) ? NCIndex : CIndex;
165 const StorageIndex
col = (is_coalesced_layout) ? CIndex : NCIndex;
166 return tensorMapper.get_tensor().template packet<Unaligned>(
row + (
col * ld));
191 template <
bool PacketLoad,
bool,
bool IsRhs,
typename PacketType,
typename TensorMapper,
typename StorageIndex>
193 const TensorMapper &tensorMapper,
const StorageIndex &NCIndex,
const StorageIndex &CIndex,
const StorageIndex &) {
194 const StorageIndex
row = (IsRhs) ? CIndex : NCIndex;
195 const StorageIndex
col = (IsRhs) ? NCIndex : CIndex;
196 return tensorMapper(
row,
col);
220 template <
typename StorageIndex, StorageIndex ld, data_source dt,
typename PacketType,
typename DataScalar>
225 for (
int i = 0;
i < PacketSize;
i++) {
246 template <data_source dt,
typename PacketType,
typename DataScalar>
251 ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data);
267 template <data_source dt,
typename PacketType,
typename DataScalar>
280 template <
bool is_
internal>
321 template <
bool is_transposed,
bool is_rhs_,
bool packet_load_,
typename PacketType>
326 typedef std::conditional_t<packet_load, PacketType, OutScalar>
OutType;
372 template <
typename StorageIndex>
387 const StorageIndex linearLocalThreadId_,
const StorageIndex kGroupId_,
const StorageIndex mGroupOffset_,
388 const StorageIndex nGroupOffset_,
const StorageIndex kGroupOffset_,
const StorageIndex mLocalOffset_,
389 const StorageIndex nLocalOffset_,
const StorageIndex mGlobalOffset_,
const StorageIndex nGlobalOffset_,
390 StorageIndex kSize_,
const bool is_internal_)
454 template <
typename OutScalar,
typename LhsScalar,
typename RhsScalar,
typename OutAccessor,
typename LhsMapper,
455 typename RhsMapper,
typename StorageIndex,
typename Properties,
typename TripleDim,
bool Vectorizable,
456 typename input_mapper_properties,
bool IsFinal,
contraction_type contraction_tp>
459 typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
462 Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
479 typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
Scratch;
480 typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space>
local_ptr;
482 typedef std::conditional_t<contraction_tp == contraction_type::local, local_ptr, private_ptr>
tile_ptr;
484 ? Properties::TileSizeDimM + Properties::BC
485 : Properties::WorkLoadPerThreadM;
487 ? Properties::TileSizeDimN + Properties::BC
488 : Properties::WorkLoadPerThreadN;
503 template <contraction_type, StorageIndex>
511 template <StorageIndex MemSize>
513 OutScalar
ptr[MemSize] = {OutScalar{0}};
544 template <contraction_type tp = contraction_tp>
546 std::enable_if_t<tp == contraction_type::no_local> * = 0)
551 lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})),
552 rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {}
554 template <contraction_type tp = contraction_tp>
557 std::enable_if_t<tp == contraction_type::local> * = 0)
560 ((Properties::DoubleBuffer + 1) *
LSDL * Properties::TileSizeDimK)},
579 const RhsMapper rhs_, OutAccessor out_res_,
580 const StorageIndex groupSizeM_,
581 const StorageIndex groupSizeN_,
582 const StorageIndex numTiles_,
583 const TripleDim triple_dim_)
594 const RhsMapper rhs_, OutAccessor out_res_,
595 const StorageIndex groupSizeM_,
596 const StorageIndex numTiles_,
597 const TripleDim triple_dim_)
601 const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
602 const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
603 const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
604 const StorageIndex mGroupId = itemID.get_group(0) %
groupSizeM;
607 const StorageIndex kGroupId = IsFinal ? 0 :
tmp /
groupSizeN;
608 const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
609 const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
610 const StorageIndex mLocalOffset =
PacketSize * mLocalThreadId;
611 const StorageIndex nLocalOffset =
NStride * nLocalThreadId;
612 const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
613 const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
615 const StorageIndex kSizePerWG = IsFinal ?
triple_dim.K :
numTiles * Properties::TileSizeDimK;
616 StorageIndex kGroupOffset = kGroupId * kSizePerWG;
617 const bool is_internal =
triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
618 triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
624 kGroupOffset += kSize;
626 auto thread_properties =
628 mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
632 (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
633 : compute_panel<false>(itemID, thread_properties, out_ptr);
640 StorageIndex idx = 0;
644 for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
646 StorageIndex lhs_index = 0;
648 for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM /
PacketSize; wLPTM++) {
651 lhs_block_ptr + lhs_index);
654 lhs_index += lhs_stride;
662 template <
bool is_
internal_block, StorageIndex PrivateNStr
ide,
typename OutPtr>
664 StorageIndex mGlobalOffset, StorageIndex nGlobalOffset)
const {
665 auto chk_bound = [&](
const StorageIndex &mIndex,
const StorageIndex &nIndex)
EIGEN_DEVICE_FUNC {
674 for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
676 StorageIndex outputLD = 0;
681 for (StorageIndex nId = 0; nId < PrivateNStride; nId++) {
682 StorageIndex globalRow = mGlobalOffset;
684 for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM /
PacketSize; wLPTM++) {
686 if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) {
689 write<data_source::global_mem>(privetOut, out_ptr + outputLD + globalRow);
692 for (StorageIndex mId = 0; mId <
PacketSize; mId++) {
693 StorageIndex mOffset = globalRow + mId;
695 out_ptr[mOffset + outputLD] =
700 globalRow += (
PacketSize * Properties::LocalThreadSizeM);
703 privateRes += Properties::WorkLoadPerThreadM /
PacketSize;
705 out_ptr += (GlobalNStride * outputLD);
707 nGlobalOffset += (PrivateNStride * GlobalNStride);
711 template <
typename InputBlockProperties,
bool is_internal_block,
typename Input,
typename PrivateReg,
714 const Input &inpt, PrivateReg
private_ptr,
const std::pair<StorageIndex, StorageIndex> &,
715 const StorageIndex &ncOffset,
const StorageIndex cOffset)
const {
717 InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
719 InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
722 auto chk_bound = [&](
const StorageIndex &CIndex,
const StorageIndex &NCIndex)
EIGEN_DEVICE_FUNC {
723 return ((CIndex + InputBlockProperties::c_stride - 1 <
triple_dim.K) &&
724 (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
726 const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC :
triple_dim.K;
727 StorageIndex cIndex = cOffset;
730 for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) {
731 StorageIndex ncIndex = ncOffset;
733 for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) {
734 if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) {
736 read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
737 InputBlockProperties::is_rhs,
typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld);
739 write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
743 for (StorageIndex
i = 0;
i < InputBlockProperties::elements_per_access;
i++) {
744 const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ?
i : 0);
745 const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 :
i);
748 ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
749 inpt, ncInd, cInd, ld)
751 write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
753 val,
private_ptr + (InputBlockProperties::is_coalesced_layout ?
i : 0) +
754 ((InputBlockProperties::is_coalesced_layout ? 0 :
i) * WorkLoadPerThreadNC));
760 ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 &&
PacketSize != 1)
762 : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
766 private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
767 cIndex += InputBlockProperties::c_stride;
770 template <
typename InputBlockProperties, StorageIndex TileSizeDimNC>
772 const StorageIndex &linearLocalThreadId) {
773 const StorageIndex localThreadNC =
774 (InputBlockProperties::is_coalesced_layout)
775 ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
776 : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
777 const StorageIndex localThreadC =
778 (InputBlockProperties::is_coalesced_layout)
779 ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
780 : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
781 return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC);
784 template <
bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
786 const cl::sycl::nd_item<1> &,
bool &db_offset) noexcept {
787 db_offset = !db_offset;
790 template <
bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
792 const cl::sycl::nd_item<1> &itemID,
bool &) noexcept {
793 itemID.barrier(cl::sycl::access::fence_space::local_space);
796 template <contraction_type ctp = contraction_tp>
798 const cl::sycl::nd_item<1> &,
bool &) noexcept {
802 template <
bool need_sync, contraction_type ctp = contraction_tp>
805 #ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
809 #ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
810 itemID.barrier(cl::sycl::access::fence_spacce::local_space);
815 template <
bool need_sync, contraction_type ctp = contraction_tp>
818 itemID.barrier(cl::sycl::access::fence_space::local_space);
820 template <
bool need_sync>
825 template <
bool is_
internal_block>
830 bool &db_offset)
const {
832 extract_block<RHSBlockProperties, is_internal_block>(
838 sync_thread<contraction_tp == contraction_type::no_local>(itemID);
841 extract_block<LHSBlockProperties, is_internal_block>(
848 sync_thread<contraction_tp == contraction_type::local>(itemID);
850 StorageIndex lhs_offset = (db_offset *
LSDL * Properties::TileSizeDimK);
851 StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK *
LSDR);
853 for (StorageIndex
k = 0;
k < Properties::TileSizeDimK;
k++) {
860 thread_properties.
kSize -= Properties::TileSizeDimK;
865 template <
bool is_
internal_block,
typename OutPtr>
868 OutPtr out_ptr)
const {
875 while (thread_properties.
kSize >= Properties::TileSizeDimK) {
876 compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
878 if (thread_properties.
kSize > 0) {
879 compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
883 store<is_internal_block,
889 template <
typename InputBlockProperties,
bool is_internal_block,
typename Input,
typename Local,
892 const Input &inpt, Local
local_ptr,
const std::pair<StorageIndex, StorageIndex> &local_index,
893 const StorageIndex &ncOffset,
const StorageIndex cOffset)
const {
895 InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
897 InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
899 static_assert(((
LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
900 (
LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
901 " LocalOffset must be divisible by stride");
903 StorageIndex localThreadNC = local_index.first;
904 StorageIndex localThreadC = local_index.second;
905 auto chk_bound = [&](
const StorageIndex &CIndex,
const StorageIndex &NCIndex)
EIGEN_DEVICE_FUNC {
906 return ((CIndex + InputBlockProperties::c_stride - 1 <
triple_dim.K) &&
907 (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
910 for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) {
911 const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC);
912 const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC);
913 const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC :
triple_dim.K;
914 if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) {
916 read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
917 InputBlockProperties::is_rhs,
typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld);
918 write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
919 val,
local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
920 (InputBlockProperties::c_stride * localThreadC * LSD));
923 for (StorageIndex
i = 0;
i < InputBlockProperties::elements_per_access;
i++) {
924 const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ?
i : 0);
925 const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 :
i);
928 ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
929 inpt, nCInd, cInd, ld)
932 write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
933 val,
local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
934 (InputBlockProperties::is_coalesced_layout ?
i : 0) +
935 ((InputBlockProperties::c_stride * localThreadC +
936 (InputBlockProperties::is_coalesced_layout ? 0 :
i)) *
940 localThreadNC += (InputBlockProperties::is_coalesced_layout)
941 ?
LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride)
942 :
LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
943 localThreadC += (InputBlockProperties::is_coalesced_layout)
944 ?
LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride)
945 :
LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
950 #ifndef EIGEN_SYCL_DISABLE_GEMV
993 template <
typename OutScalar,
typename OutAccessor,
typename VectorMapper,
typename TensorMapper,
typename StorageIndex,
994 typename Properties, StorageIndex KFactor,
bool Vectorizable,
bool is_lhs_vec,
bool IsFinal>
996 typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
999 Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
1000 typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
Scratch;
1003 KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
1019 const TensorMapper mat_, OutAccessor out_res_,
1020 const StorageIndex nonContractGroupSize_,
1021 const StorageIndex nonContractDim_,
1022 const StorageIndex contractDim_)
1032 auto scratch_ptr =
scratch.get_pointer();
1033 const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
1034 StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
1035 : linearLocalThreadId % Properties::LocalThreadSizeNC;
1036 StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC
1037 : linearLocalThreadId / Properties::LocalThreadSizeNC;
1039 const StorageIndex nonContractGroupId =
1041 const StorageIndex contractGroupId =
1045 const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
1046 const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
1047 auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
1048 const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId;
1049 const StorageIndex globalContractDimOffset = contractGroupOffset + contractId;
1051 const bool is_internal =
nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC &&
1052 contractDim - contractGroupOffset >= Properties::TileSizeDimC;
1054 ? compute_panel<true>(itemID,
vec,
mat, local_output, out_ptr,
1056 scratch_ptr, contractGroupOffset,
1059 nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex)
1060 : compute_panel<false>(itemID,
vec,
mat, local_output, out_ptr,
1062 scratch_ptr, contractGroupOffset,
1065 nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex);
1067 template <
bool is_
internal_block,
typename OutPtr>
1069 const cl::sycl::nd_item<1> &itemID,
const VectorMapper &
vec,
const TensorMapper &
mat, OutScalar *local_output,
1072 OutScalar *scratch_ptr,
const StorageIndex contractGroupOffset,
1074 const StorageIndex nonContractGroupOffset,
const StorageIndex linearLocalThreadId, StorageIndex
contractDim,
1075 StorageIndex
nonContractDim, StorageIndex contractId, StorageIndex nonContractId,
1076 StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) {
1077 OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)};
1079 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1080 const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId;
1082 Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(
vec, scratch_ptr, linearLocalThreadId,
1085 itemID.barrier(cl::sycl::access::fence_space::local_space);
1086 auto in_scratch_ptr = scratch_ptr + contractId;
1089 StorageIndex privateOffsetC = 0;
1091 for (StorageIndex
i = 0;
i < Properties::WorkLoadPerThreadC;
i++) {
1092 StorageIndex privateOffsetNC = 0;
1093 bool contract_conds = ((globalContractDimOffset + privateOffsetC) <
contractDim);
1094 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1095 auto vecScalar = *in_scratch_ptr;
1097 auto vecScalar = (check_boundary<is_internal_block>(contract_conds))
1098 ?
vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC,
1099 is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0))
1103 for (StorageIndex
j = 0;
j < Properties::WorkLoadPerThreadNC;
j++) {
1104 auto matScalar = (check_boundary<is_internal_block>(
1105 contract_conds && ((globalNonContractDimOffset + privateOffsetNC) <
nonContractDim)))
1106 ?
mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC
1107 : globalNonContractDimOffset + privateOffsetNC,
1108 is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC
1109 : globalContractDimOffset + privateOffsetC)
1113 privateOffsetNC += Properties::LocalThreadSizeNC;
1115 privateOffsetC += Properties::LocalThreadSizeC;
1116 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1117 in_scratch_ptr += Properties::LocalThreadSizeC;
1121 auto out_scratch_ptr = local_output + outScratchIndex;
1124 for (StorageIndex
j = 0;
j < Properties::WorkLoadPerThreadNC;
j++) {
1125 *out_scratch_ptr = outScalar[
j];
1127 out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1130 nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
1131 contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
1132 outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
1135 out_scratch_ptr = local_output + outScratchIndex;
1137 for (StorageIndex
j = 0;
j < Properties::WorkLoadPerThreadNC;
j++) {
// Work-group tree reduction along the contracting-thread dimension: `offset`
// halves every iteration, and a local-memory barrier precedes each step so the
// partial sums written in the previous step are visible to all threads.
1139 for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
1140 itemID.barrier(cl::sycl::access::fence_space::local_space);
// Only the lower half of the contracting threads stays active; each one folds
// in the partial sum of its neighbour `offset` contract-rows away in scratch.
1141 if (contractId < offset) {
// NOTE(review): `myNeigbourId` misspells "Neighbour" — local variable only, no
// behavioural impact. The LocalThreadSizeNC stride matches the scratch layout
// (index = nonContractId + contractId * LocalThreadSizeNC) set up earlier.
1142 StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset);
1143 *out_scratch_ptr += out_scratch_ptr[myNeigbourId];
// Move to this thread's scratch slot for the next per-thread output element.
1147 out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1150 if (contractId == 0) {
1151 out_scratch_ptr = local_output + nonContractId;
1152 StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
1153 out_ptr += global_final_offset;
1155 for (StorageIndex
j = 0;
j < Properties::WorkLoadPerThreadNC;
j++) {
1156 if (check_boundary<is_internal_block>(global_final_offset <
nonContractDim)) {
1157 auto res = *out_scratch_ptr;
1160 out_ptr += Properties::LocalThreadSizeNC;
1163 out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
1164 if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
1169 template <
typename InputBlockProperties,
bool is_internal_block,
int CFactor,
int GroupSize,
typename Input,
1172 const StorageIndex &linearLocalThreadId,
1173 const StorageIndex &cOffset,
const StorageIndex &
C) {
1174 local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
1175 StorageIndex cIndex = cOffset;
1176 for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
1177 if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 <
C)) {
1178 auto val =
read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
1179 InputBlockProperties::is_rhs,
typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
1180 cIndex, StorageIndex(1));
1181 write<StorageIndex, 1, data_source::local_mem>(
val, local_ptr);
1184 for (StorageIndex
i = 0;
i < InputBlockProperties::elements_per_access;
i++) {
1187 ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
1188 inpt, StorageIndex(0), cIndex +
i, StorageIndex(1))
1190 write<StorageIndex, 1, data_source::local_mem>(
val, local_ptr +
i);
1193 local_ptr += InputBlockProperties::c_stride * GroupSize;
1194 cIndex += InputBlockProperties::c_stride * GroupSize;
1200 #ifndef EIGEN_SYCL_DISABLE_SCALAR
1233 template <
typename OutScalar,
typename LhsScalar,
typename RhsScalar,
typename OutAccessor,
typename LhsMapper,
1234 typename RhsMapper,
typename StorageIndex,
bool Vectorizable>
1236 typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
Scratch;
1244 OutAccessor out_res_,
const StorageIndex rng_)
1249 OutScalar *scratch_ptr =
scratch.get_pointer();
1251 StorageIndex globalid = itemID.get_global_id(0);
1252 StorageIndex localid = itemID.get_local_id(0);
1253 OutScalar accumulator = OutScalar(0);
1254 for (StorageIndex
i = globalid;
i <
rng;
i += itemID.get_global_range(0)) {
// Each work-item publishes its private accumulator into local scratch at its
// own local id, then the work-group performs a halving tree reduction.
1257 auto out_scratch_ptr = scratch_ptr + localid;
1258 *out_scratch_ptr = accumulator;
1259 for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
// Barrier before each step so the previous step's writes are visible.
1260 itemID.barrier(cl::sycl::access::fence_space::local_space);
1261 if (localid < offset) {
// Active (lower-half) threads fold in the neighbour's partial sum; keeping the
// running total in `accumulator` too leaves thread 0 holding the final value.
1262 *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
// One partial result per work-group is written to global output, indexed by
// group id. NOTE(review): the guard restricting this store to a single thread
// (presumably localid == 0) lies in lines elided from this view — confirm.
1266 out_ptr[itemID.get_group(0)] = accumulator;
1275 template <
typename Indices,
typename LeftArgType,
typename RightArgType,
typename OutputKernelType>
1279 const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> {
1281 "SYCL tensor contraction does not support output kernels.");
1288 typedef std::remove_const_t<typename XprType::Scalar>
Scalar;
1302 BlockAccess =
false,
1306 static constexpr
int LDims = Base::LDims;
1307 static constexpr
int RDims = Base::RDims;
1308 static constexpr
int ContractDims = Base::ContractDims;
1317 static constexpr
int NumDims = LDims + RDims - 2 * ContractDims;
1323 typedef std::remove_const_t<typename LeftEvaluator::CoeffReturnType>
LhsScalar;
1324 typedef std::remove_const_t<typename RightEvaluator::CoeffReturnType>
RhsScalar;
// Compile-time classification of the two contraction operands. An operand is
// "matrix-like" when the contraction is a plain rank-2 case
// (Dims == 2 && ContractDims == 1) or when its inner dimension is contiguous
// (and, for the RHS, additionally not reordered) — the cases that allow the
// coalesced/packet load paths in the kernels above.
// NOTE(review): `input_mapper_propertis` misspells "properties"; the name is
// used consistently at its instantiation site, so renaming would require a
// coordinated change — deliberately left as-is here.
1329 template <
bool lhs_inner_dim_contiguous,
bool rhs_inner_dim_contiguous,
bool rhs_inner_dim_reordered>
1330 struct input_mapper_propertis {
1331 static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
1333 (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
1340 this->m_leftImpl.evalSubExprsIfNeeded(NULL);
1341 this->m_rightImpl.evalSubExprsIfNeeded(NULL);
1343 this->m_result = this->
m_device.get(
1344 static_cast<Scalar *
>(this->
m_device.allocate_temp(this->dimensions().TotalSize() *
sizeof(
Scalar))));
1345 data = this->m_result;
1348 return (this->m_result != NULL);
1352 if (this->m_lhs_inner_dim_contiguous) {
1353 if (this->m_rhs_inner_dim_contiguous) {
1354 if (this->m_rhs_inner_dim_reordered) {
1355 evalTyped<true, true, true, Unaligned>(buffer);
1357 evalTyped<true, true, false, Unaligned>(buffer);
1360 if (this->m_rhs_inner_dim_reordered) {
1361 evalTyped<true, false, true, Unaligned>(buffer);
1363 evalTyped<true, false, false, Unaligned>(buffer);
1367 if (this->m_rhs_inner_dim_contiguous) {
1368 if (this->m_rhs_inner_dim_reordered) {
1369 evalTyped<false, true, true, Unaligned>(buffer);
1371 evalTyped<false, true, false, Unaligned>(buffer);
1374 if (this->m_rhs_inner_dim_reordered) {
1375 evalTyped<false, false, true, Unaligned>(buffer);
1377 evalTyped<false, false, false, Unaligned>(buffer);
1383 template <
bool lhs_inner_dim_contiguous,
bool rhs_inner_dim_contiguous,
bool rhs_inner_dim_reordered,
int Alignment>
1385 const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
1398 LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
1399 this->m_left_contracting_strides, this->m_k_strides);
1401 RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
1402 this->m_right_contracting_strides, this->m_k_strides);
1404 #ifndef EIGEN_SYCL_DISABLE_SCALAR
1405 if (triple_dim.M == 1 && triple_dim.N == 1) {
1406 launchSC(buffer, lhs, rhs, triple_dim.K);
1409 #ifndef EIGEN_SYCL_DISABLE_GEMV
1410 if (triple_dim.M != 1 && triple_dim.N == 1) {
1411 LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K);
1412 }
else if (triple_dim.M == 1 && triple_dim.N != 1) {
1413 LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K);
1417 typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>
1418 inpt_mapper_properties;
// Heuristic classifying the contraction as "skinny" (one dimension much larger
// than another), which selects a different tile/panel configuration below.
1419 #ifndef EIGEN_SYCL_DISABLE_SKINNY
1420 bool skinny =
false,
1421 auto platform_name = this->device().getPlatformName();
// AMD platforms (platform name starts with "AMD") use absolute size
// thresholds combined with K-dominance checks ...
1423 if (platform_name.find(
"AMD") == 0) {
1424 skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) &&
1425 ((triple_dim.M < 1024 && triple_dim.N < 1024) ||
// ... other platforms treat the contraction as skinny when the ratio between
// any two of M/N/K exceeds 100 (integer division of max by min).
1428 skinny = (((
std::max(triple_dim.K, triple_dim.N) /
std::min(triple_dim.K, triple_dim.N)) > 100) ||
1429 ((
std::max(triple_dim.K, triple_dim.M) /
std::min(triple_dim.K, triple_dim.M)) > 100) ||
1430 ((
std::max(triple_dim.N, triple_dim.M) /
std::min(triple_dim.N, triple_dim.M)) > 100));
// Dispatch to the tensor-tensor path with skinny-specialised (true) or
// general (false) panel parameters.
1433 adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
1436 adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
1440 template <
bool skinny,
typename input_mapper_properties,
typename LhsMapper,
typename RhsMapper>
1442 const TripleDim &triple_dim)
const {
1443 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
1444 if (device().has_local_memory()) {
1446 launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>(
1447 buffer, lhs, rhs, triple_dim);
1450 #ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF
1451 if (!(device().has_local_memory())) {
1453 launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>(
1454 buffer, lhs, rhs, triple_dim);
1460 typename Properties,
typename LhsMapper,
typename RhsMapper>
1462 const TripleDim &triple_dim)
const {
1463 const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM);
1464 const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN);
1465 const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM;
1466 const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN;
1468 const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK);
1469 StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK;
1473 (
StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(),
true) * 4) /
1474 (groupSizeM * groupSizeN)),
1478 const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK;
1480 const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK;
1482 const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
1483 const StorageIndex globalRange = totalGroupSize * localRange;
1486 ? ((Properties::DoubleBuffer + 1) *
1487 (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) +
1488 ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) *
1489 (Properties::TileSizeDimN + Properties::BC))
1492 auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
1493 if (groupSizeK == 1) {
1495 LhsMapper, RhsMapper,
StorageIndex, Properties, TripleDim,
1499 .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1500 lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim)
1504 LhsMapper, RhsMapper,
StorageIndex, Properties, TripleDim,
1508 device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK *
sizeof(
CoeffReturnType)));
1512 .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1513 lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
1524 .template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
1525 tmp_global_accessor, buffer,
1527 Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
1528 cl::sycl::range<1>(localRange)),
1531 device().deallocate_temp(temp_pointer);
1535 #ifndef EIGEN_SYCL_DISABLE_GEMV
1536 template <
bool is_lhs_vec,
typename VectorMapper,
typename TensorMapper,
typename StorageIndex>
1545 const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(
C, Properties::TileSizeDimC);
1546 const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC);
1547 const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC);
1548 const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC);
1550 (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC));
1551 const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC;
1553 (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
1554 auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
1555 if (cNumGroups > 1) {
1557 TensorMapper,
StorageIndex, Properties, CFactor,
false,
1565 .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1566 vec,
mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim,
C)
1575 .template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
1576 tmp_global_accessor, buffer,
1577 cl::sycl::nd_range<1>(
1578 cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
1579 cl::sycl::range<1>(localRange)),
1582 device().deallocate_temp(temp_pointer);
1585 TensorMapper,
StorageIndex, Properties, CFactor,
false,
1589 .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
1590 vec,
mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim,
C)
1596 #ifndef EIGEN_SYCL_DISABLE_SCALAR
1597 template <
typename LhsMapper,
typename RhsMapper>
1601 (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
1602 "The Local thread size must be a power of 2 for the reduction "
1608 const StorageIndex num_work_group = ((
K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
1609 const StorageIndex global_range = num_work_group * local_range;
1614 auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
1615 if (num_work_group > 1) {
1620 .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
1621 thread_range, local_range,
K)
1628 .template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
1629 tmp_global_accessor, buffer,
1630 cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range,
1633 device().deallocate_temp(temp_pointer);
1636 .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
1644 this->m_leftImpl.cleanup();
1645 this->m_rightImpl.cleanup();
1647 if (this->m_result) {
1648 this->
m_device.deallocate_temp(this->m_result);
1649 this->m_result = NULL;
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_ALWAYS_INLINE
Definition: Macros.h:845
#define EIGEN_UNROLL_LOOP
Definition: Macros.h:1298
#define EIGEN_CONSTEXPR
Definition: Macros.h:758
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
Definition: PartialRedux_count.cpp:3
#define EIGEN_STATIC_ASSERT(X, MSG)
Definition: StaticAssert.h:26
#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
Definition: TensorMacros.h:51
Scalar Scalar int size
Definition: benchVecAdd.cpp:17
Matrix< Scalar, Dynamic, Dynamic > C
Definition: bench_gemm.cpp:49
The matrix class, also used for vectors and row-vectors.
Definition: Eigen/Eigen/src/Core/Matrix.h:186
Definition: TensorContraction.h:307
Eigen::internal::traits< TensorContractionOp >::Index Index
Definition: TensorContraction.h:314
internal::gebp_traits< typename LhsXprType::CoeffReturnType, typename RhsXprType::CoeffReturnType >::ResScalar CoeffReturnType
Definition: TensorContraction.h:311
TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation.
Definition: TensorContractionSycl.h:457
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t< db &&ctp==contraction_type::local > sync_mem(const cl::sycl::nd_item< 1 > &, bool &db_offset) noexcept
Definition: TensorContractionSycl.h:785
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_, const StorageIndex groupSizeM_, const StorageIndex numTiles_, const TripleDim triple_dim_)
Definition: TensorContractionSycl.h:593
static EIGEN_CONSTEXPR bool is_lhs_transposed
Definition: TensorContractionSycl.h:463
const StorageIndex groupSizeM
Definition: TensorContractionSycl.h:573
BlockProperties< is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix &&Vectorizable, PacketReturnType > RHSBlockProperties
Definition: TensorContractionSycl.h:474
static EIGEN_CONSTEXPR StorageIndex LSDR
Definition: TensorContractionSycl.h:486
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t< contract_tp==contraction_type::local > extract_block(const Input &inpt, Local local_ptr, const std::pair< StorageIndex, StorageIndex > &local_index, const StorageIndex &ncOffset, const StorageIndex cOffset) const
Definition: TensorContractionSycl.h:891
static EIGEN_CONSTEXPR StorageIndex LocalOffset
Definition: TensorContractionSycl.h:489
BlockProperties< is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix &&Vectorizable, PacketReturnType > LHSBlockProperties
Definition: TensorContractionSycl.h:470
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_, const StorageIndex groupSizeM_, const StorageIndex groupSizeN_, const StorageIndex numTiles_, const TripleDim triple_dim_)
Definition: TensorContractionSycl.h:578
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t< contract_tp==contraction_type::no_local > extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair< StorageIndex, StorageIndex > &, const StorageIndex &ncOffset, const StorageIndex cOffset) const
Definition: TensorContractionSycl.h:713
std::conditional_t< contraction_tp==contraction_type::local, local_ptr, private_ptr > tile_ptr
Definition: TensorContractionSycl.h:482
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item< 1 > &itemID, ThreadProperties< StorageIndex > &thread_properties, TiledMemory &tiled_input_block, PacketReturnType *privateRes, bool &db_offset) const
Definition: TensorContractionSycl.h:826
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const
Definition: TensorContractionSycl.h:663
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!need_sync > sync_thread(const cl::sycl::nd_item< 1 > &)
Definition: TensorContractionSycl.h:821
const TripleDim triple_dim
Definition: TensorContractionSycl.h:576
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item< 1 > &itemID, ThreadProperties< StorageIndex > &thread_properties, OutPtr out_ptr) const
Definition: TensorContractionSycl.h:866
static EIGEN_CONSTEXPR StorageIndex LSDL
Definition: TensorContractionSycl.h:483
static EIGEN_CONSTEXPR StorageIndex NStride
Definition: TensorContractionSycl.h:476
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t< need_sync &&ctp==contraction_type::local > sync_thread(const cl::sycl::nd_item< 1 > &itemID)
Definition: TensorContractionSycl.h:817
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID) const
Definition: TensorContractionSycl.h:600
const RhsMapper rhs
Definition: TensorContractionSycl.h:571
const StorageIndex numTiles
Definition: TensorContractionSycl.h:575
OutScalar * private_ptr
Definition: TensorContractionSycl.h:481
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair< StorageIndex, StorageIndex > local_id_extract(const StorageIndex &linearLocalThreadId)
Definition: TensorContractionSycl.h:771
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t< ctp==contraction_type::no_local > sync_mem(const cl::sycl::nd_item< 1 > &, bool &) noexcept
Definition: TensorContractionSycl.h:797
Scratch scratch
Definition: TensorContractionSycl.h:569
cl::sycl::accessor< OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > Scratch
Definition: TensorContractionSycl.h:479
cl::sycl::multi_ptr< OutScalar, cl::sycl::access::address_space::local_space > local_ptr
Definition: TensorContractionSycl.h:480
const LhsMapper lhs
Definition: TensorContractionSycl.h:570
Eigen::TensorSycl::internal::Vectorise< OutScalar, Eigen::SyclDevice, Vectorizable >::PacketReturnType PacketReturnType
Definition: TensorContractionSycl.h:460
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t< need_sync &&ctp==contraction_type::no_local > sync_thread(const cl::sycl::nd_item< 1 > &) noexcept
Definition: TensorContractionSycl.h:804
static EIGEN_CONSTEXPR int PacketSize
Definition: TensorContractionSycl.h:461
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr, PacketReturnType *privateRes) const
Definition: TensorContractionSycl.h:638
OutAccessor out_res
Definition: TensorContractionSycl.h:572
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!db &&ctp==contraction_type::local > sync_mem(const cl::sycl::nd_item< 1 > &itemID, bool &) noexcept
Definition: TensorContractionSycl.h:791
const StorageIndex groupSizeN
Definition: TensorContractionSycl.h:574
static EIGEN_CONSTEXPR bool is_rhs_transposed
Definition: TensorContractionSycl.h:465
Definition: matrices.h:74
@ N
Definition: constructor.cpp:22
#define min(a, b)
Definition: datatypes.h:22
#define max(a, b)
Definition: datatypes.h:23
@ Unaligned
Definition: Constants.h:235
char char char int int * k
Definition: level2_impl.h:374
char char * op
Definition: level2_impl.h:374
Eigen::Matrix< Scalar, Dynamic, Dynamic, ColMajor > tmp
Definition: level3_impl.h:365
data_source
Definition: TensorContractionSycl.h:133
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t< dt !=data_source::global_mem, void > write(PacketType &packet_data, DataScalar ptr)
write, a template function used for storing the data to local memory. This function is used to guaran...
Definition: TensorContractionSycl.h:221
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary< false >(bool cond)
check_boundary: specialization of the check_boundary for non-internal blocks.
Definition: TensorContractionSycl.h:291
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t< PacketLoad, PacketType > read(const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld)
read, a template function used for loading the data from global memory. This function is used to guar...
Definition: TensorContractionSycl.h:162
contraction_type
Definition: TensorContractionSycl.h:129
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool)
check_boundary: is used to check the edge condition for non-internal blocks.
Definition: TensorContractionSycl.h:281
@ Lhs
Definition: TensorContractionMapper.h:20
@ Rhs
Definition: TensorContractionMapper.h:20
EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f &a, const Packet4f &b, const Packet4f &c)
Definition: AltiVec/PacketMath.h:1218
std::uint64_t uint64_t
Definition: Meta.h:42
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
std::array< T, N > array
Definition: EmulateArray.h:231
squared absolute value
Definition: GlobalFunctions.h:87
double K
Wave number.
Definition: sphere_scattering.cc:115
val
Definition: calibrate.py:119
Definition: Eigen_Colamd.h:49
Definition: TensorDimensions.h:161
Definition: TensorForwardDeclarations.h:25
Definition: TensorMeta.h:47
Definition: TensorForwardDeclarations.h:42
Definition: TensorContraction.h:342
internal::traits< TensorEvaluator< const TensorContractionOp< Indices, LeftArgType, RightArgType, OutputKernelType >, Eigen::SyclDevice > >::LeftArgType LeftArgType
Definition: TensorContraction.h:344
internal::traits< TensorEvaluator< const TensorContractionOp< Indices, LeftArgType, RightArgType, OutputKernelType >, Eigen::SyclDevice > >::RightArgType RightArgType
Definition: TensorContraction.h:345
internal::traits< TensorEvaluator< const TensorContractionOp< Indices, LeftArgType, RightArgType, OutputKernelType >, Eigen::SyclDevice > >::OutputKernelType OutputKernelType
Definition: TensorContraction.h:346
internal::traits< TensorEvaluator< const TensorContractionOp< Indices, LeftArgType, RightArgType, OutputKernelType >, Eigen::SyclDevice > >::Indices Indices
Definition: TensorContraction.h:343
TensorContractionOp< Indices, LeftArgType, RightArgType, OutputKernelType > XprType
Definition: TensorContractionSycl.h:1287
TensorContractionEvaluatorBase< Self > Base
Definition: TensorContractionSycl.h:1286
void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, const TripleDim &triple_dim) const
Definition: TensorContractionSycl.h:1461
void evalTyped(typename Base::EvaluatorPointerType buffer) const
Definition: TensorContractionSycl.h:1384
PacketType< CoeffReturnType, Device >::type PacketReturnType
Definition: TensorContractionSycl.h:1291
array< StorageIndex, RDims > right_dim_mapper_t
Definition: TensorContractionSycl.h:1311
EIGEN_STRONG_INLINE void cleanup()
Definition: TensorContractionSycl.h:1643
Base::Storage Storage
Definition: TensorContractionSycl.h:1292
array< StorageIndex, LDims - ContractDims > left_nocontract_t
Definition: TensorContractionSycl.h:1314
array< StorageIndex, RDims - ContractDims > right_nocontract_t
Definition: TensorContractionSycl.h:1315
DSizes< StorageIndex, NumDims > Dimensions
Definition: TensorContractionSycl.h:1319
void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, const TripleDim &triple_dim) const
Definition: TensorContractionSycl.h:1441
void evalToSycl(typename Base::EvaluatorPointerType buffer) const
Definition: TensorContractionSycl.h:1351
array< StorageIndex, LDims > left_dim_mapper_t
Definition: TensorContractionSycl.h:1310
XprType::CoeffReturnType CoeffReturnType
Definition: TensorContractionSycl.h:1290
TensorEvaluator(const XprType &op, const Device &device)
Definition: TensorContractionSycl.h:1336
TensorEvaluator< const TensorContractionOp< Indices, LeftArgType, RightArgType, OutputKernelType >, Device > Self
Definition: TensorContractionSycl.h:1285
Base::EvaluatorPointerType EvaluatorPointerType
Definition: TensorContractionSycl.h:1293
TensorEvaluator< typename Base::EvalRightArgType, Device > RightEvaluator
Definition: TensorContractionSycl.h:1322
std::remove_const_t< typename RightEvaluator::CoeffReturnType > RhsScalar
Definition: TensorContractionSycl.h:1324
const Eigen::SyclDevice & device() const
Definition: TensorContractionSycl.h:1350
XprType::Index StorageIndex
Definition: TensorContractionSycl.h:1289
void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat, StorageIndex NC, StorageIndex C) const
Definition: TensorContractionSycl.h:1537
array< StorageIndex, ContractDims > contract_t
Definition: TensorContractionSycl.h:1313
LeftEvaluator::Dimensions LeftDimensions
Definition: TensorContractionSycl.h:1326
TensorEvaluator< typename Base::EvalLeftArgType, Device > LeftEvaluator
Definition: TensorContractionSycl.h:1321
RightEvaluator::Dimensions RightDimensions
Definition: TensorContractionSycl.h:1327
std::remove_const_t< typename XprType::Scalar > Scalar
Definition: TensorContractionSycl.h:1288
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data)
Definition: TensorContractionSycl.h:1339
Eigen::SyclDevice Device
Definition: TensorContractionSycl.h:1281
EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, StorageIndex K) const
Definition: TensorContractionSycl.h:1598
std::remove_const_t< typename LeftEvaluator::CoeffReturnType > LhsScalar
Definition: TensorContractionSycl.h:1323
const StorageIndex N
Definition: TensorContractionSycl.h:1296
const StorageIndex K
Definition: TensorContractionSycl.h:1297
TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_)
Definition: TensorContractionSycl.h:1298
const StorageIndex M
Definition: TensorContractionSycl.h:1295
A cost model used to limit the number of threads used for evaluating tensor expression.
Definition: TensorEvaluator.h:31
static constexpr int Layout
Definition: TensorEvaluator.h:46
const Device EIGEN_DEVICE_REF m_device
Definition: TensorEvaluator.h:170
Storage::Type EvaluatorPointerType
Definition: TensorEvaluator.h:41
@ PacketAccess
Definition: TensorEvaluator.h:50
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const
Definition: TensorEvaluator.h:165
Derived::Scalar CoeffReturnType
Definition: TensorEvaluator.h:34
Derived::Dimensions Dimensions
Definition: TensorEvaluator.h:36
BlockProperties is a template class that provides different characteristic of a block of each Tensor ...
Definition: TensorContractionSycl.h:322
static EIGEN_CONSTEXPR int c_stride
Definition: TensorContractionSycl.h:330
static EIGEN_CONSTEXPR int elements_per_access
Definition: TensorContractionSycl.h:327
std::conditional_t< packet_load, PacketType, OutScalar > OutType
Definition: TensorContractionSycl.h:326
static EIGEN_CONSTEXPR bool is_coalesced_layout
Definition: TensorContractionSycl.h:328
static EIGEN_CONSTEXPR bool is_rhs
Definition: TensorContractionSycl.h:325
Eigen::internal::unpacket_traits< PacketType >::type OutScalar
Definition: TensorContractionSycl.h:324
static EIGEN_CONSTEXPR bool packet_load
Definition: TensorContractionSycl.h:323
static EIGEN_CONSTEXPR int nc_stride
Definition: TensorContractionSycl.h:329
GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contrac...
Definition: TensorContractionSycl.h:1235
OutAccessor out_res
Definition: TensorContractionSycl.h:1240
Scratch scratch
Definition: TensorContractionSycl.h:1237
cl::sycl::accessor< OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > Scratch
Definition: TensorContractionSycl.h:1236
EIGEN_DEVICE_FUNC GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_, const StorageIndex rng_)
Definition: TensorContractionSycl.h:1243
const RhsMapper rhs
Definition: TensorContractionSycl.h:1239
const StorageIndex rng
Definition: TensorContractionSycl.h:1241
const LhsMapper lhs
Definition: TensorContractionSycl.h:1238
EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item< 1 > itemID) const
Definition: TensorContractionSycl.h:1247
GeneralVectorTensor is a template class that provides Tensor -vector contraction operation,...
Definition: TensorContractionSycl.h:995
Scratch scratch
Definition: TensorContractionSycl.h:1010
cl::sycl::accessor< OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > Scratch
Definition: TensorContractionSycl.h:1000
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item< 1 > &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output, OutPtr out_ptr, const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim, StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId, StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex)
Definition: TensorContractionSycl.h:1068
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_, const TensorMapper mat_, OutAccessor out_res_, const StorageIndex nonContractGroupSize_, const StorageIndex nonContractDim_, const StorageIndex contractDim_)
Definition: TensorContractionSycl.h:1018
const TensorMapper mat
Definition: TensorContractionSycl.h:1012
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr, const StorageIndex &linearLocalThreadId, const StorageIndex &cOffset, const StorageIndex &C)
Definition: TensorContractionSycl.h:1171
const StorageIndex nonContractDim
Definition: TensorContractionSycl.h:1015
const StorageIndex nonContractGroupSize
Definition: TensorContractionSycl.h:1014
const StorageIndex contractDim
Definition: TensorContractionSycl.h:1016
BlockProperties< is_lhs_vec ? false :true, is_lhs_vec ? false :true, Vectorizable, PacketReturnType > VecBlockProperties
Definition: TensorContractionSycl.h:1008
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID) const
Definition: TensorContractionSycl.h:1031
OutAccessor out_res
Definition: TensorContractionSycl.h:1013
static EIGEN_CONSTEXPR int PacketSize
Definition: TensorContractionSycl.h:998
static EIGEN_CONSTEXPR StorageIndex OutScratchOffset
Definition: TensorContractionSycl.h:1002
Eigen::TensorSycl::internal::Vectorise< OutScalar, Eigen::SyclDevice, Vectorizable >::PacketReturnType PacketReturnType
Definition: TensorContractionSycl.h:997
const VectorMapper vec
Definition: TensorContractionSycl.h:1011
static EIGEN_DEVICE_FUNC void set_packet(PacketReturnType, Scalar *)
Definition: InteropHeaders.h:145
static EIGEN_DEVICE_FUNC Scalar scalarize(Index, PacketReturnType &)
Definition: InteropHeaders.h:138
Definition: TensorReductionSycl.h:79
Definition: TensorReductionSycl.h:365
TTPanelSize, a template class used for setting the panel size required for launching General Tensor T...
Definition: TensorContractionSycl.h:82
static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN
Definition: TensorContractionSycl.h:95
static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM
Definition: TensorContractionSycl.h:88
static EIGEN_CONSTEXPR bool DoubleBuffer
Definition: TensorContractionSycl.h:117
static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM
Definition: TensorContractionSycl.h:100
static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN
Definition: TensorContractionSycl.h:102
static EIGEN_CONSTEXPR StorageIndex TileSizeDimM
Definition: TensorContractionSycl.h:104
static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs
Definition: TensorContractionSycl.h:108
static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs
Definition: TensorContractionSycl.h:111
static EIGEN_CONSTEXPR bool BC
Definition: TensorContractionSycl.h:114
static EIGEN_CONSTEXPR StorageIndex TileSizeDimN
Definition: TensorContractionSycl.h:106
static EIGEN_CONSTEXPR StorageIndex TileSizeDimK
Definition: TensorContractionSycl.h:84
TVPanelSize, a template class used for setting the panel size required for launching General TensorVe...
Definition: TensorContractionSycl.h:46
static EIGEN_CONSTEXPR bool BC
Definition: TensorContractionSycl.h:60
static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC
Definition: TensorContractionSycl.h:52
static EIGEN_CONSTEXPR StorageIndex TileSizeDimC
Definition: TensorContractionSycl.h:54
static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC
Definition: TensorContractionSycl.h:58
static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC
Definition: TensorContractionSycl.h:56
static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC
Definition: TensorContractionSycl.h:50
static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC
Definition: TensorContractionSycl.h:48
MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it ...
Definition: TensorContractionSycl.h:504
tile_ptr ptr
Definition: TensorContractionSycl.h:505
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr)
Definition: TensorContractionSycl.h:506
TiledMemory: contains required memory pointer for loading each tile of the TensorContraction panel fr...
Definition: TensorContractionSycl.h:537
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TiledMemory(const ThreadProperties< StorageIndex > &thread_properties, local_ptr block_start_ptr, std::enable_if_t< tp==contraction_type::local > *=0)
Definition: TensorContractionSycl.h:555
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TiledMemory(const ThreadProperties< StorageIndex > &, local_ptr, std::enable_if_t< tp==contraction_type::no_local > *=0)
Definition: TensorContractionSycl.h:545
tile_ptr rhs_scratch_ptr_compute
Definition: TensorContractionSycl.h:541
const std::pair< StorageIndex, StorageIndex > rhs_extract_index
Definition: TensorContractionSycl.h:543
MemHolder< contraction_tp, Properties::WorkLoadPerThreadN *Properties::TileSizeDimK > rhs_scratch_extract
Definition: TensorContractionSycl.h:539
const std::pair< StorageIndex, StorageIndex > lhs_extract_index
Definition: TensorContractionSycl.h:542
tile_ptr lhs_scratch_ptr_compute
Definition: TensorContractionSycl.h:540
MemHolder< contraction_tp, Properties::WorkLoadPerThreadM *Properties::TileSizeDimK > lhs_scratch_extract
Definition: TensorContractionSycl.h:538
ThreadProperties is a template class that provides each thread's properties within a workgroup....
Definition: TensorContractionSycl.h:373
const StorageIndex mGroupOffset
Definition: TensorContractionSycl.h:376
const StorageIndex nGroupOffset
Definition: TensorContractionSycl.h:377
const StorageIndex mLocalOffset
Definition: TensorContractionSycl.h:379
const StorageIndex kGroupId
Definition: TensorContractionSycl.h:375
const StorageIndex linearLocalThreadId
Definition: TensorContractionSycl.h:374
const StorageIndex nLocalOffset
Definition: TensorContractionSycl.h:380
StorageIndex kSize
Definition: TensorContractionSycl.h:383
const StorageIndex kGroupOffset
Definition: TensorContractionSycl.h:378
const StorageIndex mGlobalOffset
Definition: TensorContractionSycl.h:381
const bool is_internal
Definition: TensorContractionSycl.h:384
const StorageIndex nGlobalOffset
Definition: TensorContractionSycl.h:382
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties(const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_, const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_, const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_, StorageIndex kSize_, const bool is_internal_)
Definition: TensorContractionSycl.h:386
Definition: TensorFunctors.h:66
Definition: GenericPacketMath.h:134
std::ptrdiff_t j
Definition: tut_arithmetic_redux_minmax.cpp:2
Definition: ZVector/PacketMath.h:50