#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

#include "../InternalHeaderCheck.h"
template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_ = false, bool ConjRhs_ = false,
#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
#endif

#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
#endif

#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
#endif
#if EIGEN_ARCH_i386_OR_x86_64

#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
      m_cacheSizes.m_l1 = *l1;
      m_cacheSizes.m_l2 = *l2;
      m_cacheSizes.m_l3 = *l3;

      *l1 = m_cacheSizes.m_l1;
      *l2 = m_cacheSizes.m_l2;
      *l3 = m_cacheSizes.m_l3;
template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>

  std::ptrdiff_t l1, l2, l3;
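  // The heuristic below derives the blocking sizes from the cache sizes queried above:
  // k (the depth of the packed panels) is fitted to the L1 cache, while n (the width of the
  // packed rhs) and m (the height of the packed lhs) are bounded using the L2/L3 sizes.
  // A separate branch handles multi-threaded products, where each thread only sees its own
  // share of the n and m dimensions.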
#ifdef EIGEN_VECTORIZE_AVX512

  if (num_threads > 1) {
    typedef typename Traits::ResScalar ResScalar;
      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      ksub = Traits::mr * (Traits::nr * sizeof(ResScalar)),

    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1 - ksub) / kdiv, 320));
      k = k_cache - (k_cache % kr);

    const Index n_cache = (l2 - l1) / (nr * sizeof(RhsScalar) * k);
    if (n_cache <= n_per_thread) {
      n = n_cache - (n_cache % nr);
      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));

      const Index m_cache = (l3 - l2) / (sizeof(LhsScalar) * k * num_threads);
      if (m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
        m = m_cache - (m_cache % mr);
        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS

    typedef typename Traits::ResScalar ResScalar;
      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      k_sub = Traits::mr * (Traits::nr * sizeof(ResScalar))

    const Index max_kc = numext::maxi<Index>(((l1 - k_sub) / k_div) & (~(k_peeling - 1)), 1);
      k = (k % max_kc) == 0 ? max_kc
                            : max_kc - k_peeling * ((max_kc - 1 - (k % max_kc)) / (k_peeling * (k / max_kc + 1)));
      eigen_internal_assert(((old_k / k) == (old_k / max_kc)) && "the number of sweeps has to remain the same");
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    const Index actual_l2 = l3;
#else
    const Index actual_l2 = 1572864;
#endif
      const Index lhs_bytes = m * k * sizeof(LhsScalar);
      const Index remaining_l1 = l1 - k_sub - lhs_bytes;
      if (remaining_l1 >= Index(Traits::nr * sizeof(RhsScalar)) * k) {
        max_nc = remaining_l1 / (k * sizeof(RhsScalar));
        max_nc = (3 * actual_l2) / (2 * 2 * max_kc * sizeof(RhsScalar));
      Index nc = numext::mini<Index>(actual_l2 / (2 * k * sizeof(RhsScalar)), max_nc) & (~(Traits::nr - 1));
        n = (n % nc) == 0 ? nc : (nc - Traits::nr * ((nc - (n % nc)) / (Traits::nr * (n / nc + 1))));
    } else if (old_k == k) {
      Index problem_size = k * n * sizeof(LhsScalar);
      Index actual_lm = actual_l2;
      if (problem_size <= 1024) {
      } else if (l3 != 0 && problem_size <= 32768) {
        max_mc = (numext::mini<Index>)(576, max_mc);
      Index mc = (numext::mini<Index>)(actual_lm / (3 * k * sizeof(LhsScalar)), max_mc);
        mc -= mc % Traits::mr;
        m = (m % mc) == 0 ? mc : (mc - Traits::mr * ((mc - (m % mc)) / (Traits::mr * (m / mc + 1))));
template <typename Index>

#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
  evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);

template <typename LhsScalar, typename RhsScalar, typename Index>
  computeProductBlockingSizes<LhsScalar, RhsScalar, 1, Index>(k, m, n, num_threads);
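// Usage sketch (illustrative, assuming the usual by-reference in/out signature of
// computeProductBlockingSizes):
//   Index kc = depth, mc = rows, nc = cols;
//   internal::computeProductBlockingSizes<float, float>(kc, mc, nc, /*num_threads=*/1);
// Afterwards kc, mc and nc hold the block sizes consumed by the packing routines and gebp_kernel.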
template <typename RhsPacket, typename RhsPacketx4, int registers_taken>

template <typename Packet>

template <int N, typename T1, typename T2, typename T3>

template <typename T1, typename T2, typename T3>

template <typename T1, typename T2, typename T3>
#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size)                                                \
  typedef typename packet_conditional<                                                                      \
      packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half,  \
      typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet##postfix

#define PACKET_DECL_COND(name, packet_size)                                                                 \
  typedef typename packet_conditional<                                                                      \
      packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half,  \
      typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet

#define PACKET_DECL_COND_SCALAR_POSTFIX(postfix, packet_size)                                    \
  typedef typename packet_conditional<                                                          \
      packet_size, typename packet_traits<Scalar>::type, typename packet_traits<Scalar>::half,  \
      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type ScalarPacket##postfix

#define PACKET_DECL_COND_SCALAR(packet_size)                                                     \
  typedef typename packet_conditional<                                                          \
      packet_size, typename packet_traits<Scalar>::type, typename packet_traits<Scalar>::half,  \
      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type ScalarPacket
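// gebp_traits defines, for a given pair of scalar types, the packet types (LhsPacket, RhsPacket,
// ResPacket), the register blocking factors, and the elementary operations (loadLhs/loadRhs,
// madd, acc) that the gebp micro kernels below are written against. Specializations follow for
// the mixed real/complex and complex*complex combinations.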
template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>

  typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
  typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
  typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
  template <typename RhsPacketType>
    dest = pset1<RhsPacketType>(*b);

  template <typename RhsPacketType>

  template <typename LhsPacketType>
    dest = pload<LhsPacketType>(a);

  template <typename LhsPacketType>
    dest = ploadu<LhsPacketType>(a);

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
            const LaneIdType&) const {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD

  template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
            const LaneIdType& lane) const {

  template <typename ResPacketHalf>

template <typename RealScalar, bool ConjLhs_, int Arch, int PacketSize_>
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)

  typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
  typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
  typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
  template <typename RhsPacketType>
    dest = pset1<RhsPacketType>(*b);

  template <typename RhsPacketType>

    loadRhsQuad_impl(b, dest, std::conditional_t<RhsPacketSize == 16, true_type, false_type>());

    dest = ploadquad<RhsPacket>(tmp);

    dest = pset1<RhsPacket>(*b);

  template <typename LhsPacketType>
    dest = ploadu<LhsPacketType>(a);

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
            const LaneIdType&) const {
    madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable, true_type, false_type>());
  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD

  template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
            const LaneIdType& lane) const {

  template <typename ResPacketType, typename AccPacketType>

template <typename Packet>

template <typename Packet>

template <typename Packet>

template <typename Packet>
template <typename Scalar, typename RealPacket>

template <typename Scalar, typename RealPacket>
  dest.first = ploadquad<RealPacket>(r);
  dest.second = ploadquad<RealPacket>(i);
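// For complex*complex products the accumulators are DoublePackets: a pair of real packets,
// with 'first' holding the real parts and 'second' the imaginary parts, loaded above with
// ploadquad from the separate real and imaginary arrays.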
template <typename Packet>

template <typename RealScalar, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, ConjLhs_, ConjRhs_, Arch, PacketSize_> {

  typedef std::conditional_t<Vectorizable, RealPacket, Scalar> LhsPacket;
  typedef std::conditional_t<Vectorizable, DoublePacketType, Scalar> RhsPacket;
  typedef std::conditional_t<Vectorizable, ScalarPacket, Scalar> ResPacket;
  typedef std::conditional_t<Vectorizable, DoublePacketType, Scalar> AccPacket;
  template <typename RealPacketType>

  template <typename RealPacketType>

  template <typename LhsPacketType>

  template <typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType,
            const RhsPacketType& b,
            const LaneIdType&) const {

  template <typename LaneIdType>
            const LaneIdType&) const {
    c = cj.pmadd(a, b, c);

  template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
            const LaneIdType& lane) const {

  template <typename RealPacketType, typename ResPacketType>
            ResPacketType& r) const {
template <typename RealScalar, bool ConjRhs_, int Arch, int PacketSize_>

#undef PACKET_DECL_COND_SCALAR_POSTFIX
#undef PACKET_DECL_COND_POSTFIX
#undef PACKET_DECL_COND_SCALAR
#undef PACKET_DECL_COND
  typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
  typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
  typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;

  template <typename RhsPacketType>
    dest = pset1<RhsPacketType>(*b);

  template <typename RhsPacketType>

  template <typename LhsPacketType>
    dest = ploaddup<LhsPacketType>(a);

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
            const LaneIdType&) const {
    madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable, true_type, false_type>());

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD

  template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
            const LaneIdType& lane) const {

  template <typename ResPacketType, typename AccPacketType>
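// gebp_kernel is the "general block panel" micro kernel: given a block of the lhs and a panel of
// the rhs that have already been packed by gemm_pack_lhs / gemm_pack_rhs, it accumulates
// alpha * blockA * blockB into the result through the DataMapper, working on register blocks of
// mr x nr coefficients.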
template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
          bool ConjugateLhs, bool ConjugateRhs>

template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
          bool ConjugateLhs, bool ConjugateRhs,
          int SwappedLhsProgress =
template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
          bool ConjugateLhs, bool ConjugateRhs>

    SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
    SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);

    if (depth - endk > 0) {
      for (Index kk = endk; kk < depth; kk++) {
        SLhsPacketQuarter a0;
        SRhsPacketQuarter b0;

        straits.madd(a0, b0, c0, b0, fix<0>);
      straits.acc(c0, alphav, R);
    res.scatterPacket(i, j2, R);
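// lhs_process_one_packet handles the row panels that are exactly one LhsProgress wide.
// peeled_kc_onestep performs a single depth step: load one lhs packet, load/update the packed
// rhs panel, and issue one madd per accumulated column (C0..C3).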
template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
          typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
          typename LinearMapper, typename DataMapper>
      LhsPacket* A0, RhsPacketx4* rhs_panel, RhsPacket* T0, AccPacket* C0,
      AccPacket* C1, AccPacket* C2, AccPacket* C3) {
    traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], *A0);
    traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], *rhs_panel);
    traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
    traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
    traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
    traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
    __asm__("" : "+x,m"(*A0));
    Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
    for (Index i = peelStart; i < peelEnd; i += LhsProgress) {
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
      for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
        const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];

        AccPacket C0, C1, C2, C3, C4, C5, C6, C7;

        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
        LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
        LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
        LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
        LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
        r0.prefetch(prefetch_res_offset);
        r1.prefetch(prefetch_res_offset);
        r2.prefetch(prefetch_res_offset);
        r3.prefetch(prefetch_res_offset);
        r4.prefetch(prefetch_res_offset);
        r5.prefetch(prefetch_res_offset);
        r6.prefetch(prefetch_res_offset);
        r7.prefetch(prefetch_res_offset);
        const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];

        for (Index k = 0; k < peeled_kc; k += pk) {
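// One depth step of the 1 packet x 8 columns kernel: a single lhs packet A0 is multiplied
// against 8 packed rhs coefficients, updating the accumulators C0..C7.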
#define EIGEN_GEBGP_ONESTEP(K)                                    \
  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX8");      \
  traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], A0);            \
  traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel);     \
  traits.madd(A0, rhs_panel, C0, T0, fix<0>);                     \
  traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel);   \
  traits.madd(A0, rhs_panel, C1, T0, fix<1>);                     \
  traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel);   \
  traits.madd(A0, rhs_panel, C2, T0, fix<2>);                     \
  traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel);   \
  traits.madd(A0, rhs_panel, C3, T0, fix<3>);                     \
  traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel);     \
  traits.madd(A0, rhs_panel, C4, T0, fix<0>);                     \
  traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel);   \
  traits.madd(A0, rhs_panel, C5, T0, fix<1>);                     \
  traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel);   \
  traits.madd(A0, rhs_panel, C6, T0, fix<2>);                     \
  traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel);   \
  traits.madd(A0, rhs_panel, C7, T0, fix<3>);                     \
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX8");        \
          blB += pk * 8 * RhsProgress;
          blA += pk * (1 * LhsProgress);

        for (Index k = peeled_kc; k < depth; k++) {
          blB += 8 * RhsProgress;
          blA += 1 * LhsProgress;
#undef EIGEN_GEBGP_ONESTEP

        ResPacket alphav = pset1<ResPacket>(alpha);

        R0 = r0.template loadPacket<ResPacket>(0);
        R1 = r1.template loadPacket<ResPacket>(0);
        traits.acc(C0, alphav, R0);
        r0.storePacket(0, R0);
        r1.storePacket(0, R1);

        R0 = r2.template loadPacket<ResPacket>(0);
        R1 = r3.template loadPacket<ResPacket>(0);
        traits.acc(C3, alphav, R1);
        r2.storePacket(0, R0);
        r3.storePacket(0, R1);

        R0 = r4.template loadPacket<ResPacket>(0);
        R1 = r5.template loadPacket<ResPacket>(0);
        traits.acc(C4, alphav, R0);
        traits.acc(C5, alphav, R1);
        r4.storePacket(0, R0);
        r5.storePacket(0, R1);

        R0 = r6.template loadPacket<ResPacket>(0);
        R1 = r7.template loadPacket<ResPacket>(0);
        traits.acc(C6, alphav, R0);
        traits.acc(C7, alphav, R1);
        r6.storePacket(0, R0);
        r7.storePacket(0, R1);
      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
        const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];

        AccPacket C0, C1, C2, C3;
        AccPacket D0, D1, D2, D3;

        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

        r0.prefetch(prefetch_res_offset);
        r1.prefetch(prefetch_res_offset);
        r2.prefetch(prefetch_res_offset);
        r3.prefetch(prefetch_res_offset);

        const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
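        // The peeled depth loop below alternates between two lhs packets (A0, A1) and two
        // accumulator sets (C0..C3 and D0..D3), so that consecutive steps do not update the
        // same registers.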
        for (Index k = 0; k < peeled_kc; k += pk) {
          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);

          peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);

          blB += pk * 4 * RhsProgress;
          blA += pk * LhsProgress;
        for (Index k = peeled_kc; k < depth; k++) {
          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          blB += 4 * RhsProgress;

        ResPacket alphav = pset1<ResPacket>(alpha);

        R0 = r0.template loadPacket<ResPacket>(0);
        R1 = r1.template loadPacket<ResPacket>(0);
        traits.acc(C0, alphav, R0);
        r0.storePacket(0, R0);
        r1.storePacket(0, R1);

        R0 = r2.template loadPacket<ResPacket>(0);
        R1 = r3.template loadPacket<ResPacket>(0);
        traits.acc(C3, alphav, R1);
        r2.storePacket(0, R0);
        r3.storePacket(0, R1);
      for (Index j2 = packet_cols4; j2 < cols; j2++) {
        const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];

        LinearMapper r0 = res.getLinearMapper(i, j2);

        const RhsScalar* blB = &blockB[j2 * strideB + offsetB];

        for (Index k = 0; k < peeled_kc; k += pk) {
#define EIGEN_GEBGP_ONESTEP(K)                                             \
  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1");   \
  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");      \
  traits.loadLhsUnaligned(&blA[(0 + 1 * K) * LhsProgress], A0);            \
  traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                        \
  traits.madd(A0, B_0, C0, B_0, fix<0>);                                   \
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1");     \

          blB += pk * RhsProgress;
          blA += pk * LhsProgress;

        for (Index k = peeled_kc; k < depth; k++) {
#undef EIGEN_GEBGP_ONESTEP

        ResPacket alphav = pset1<ResPacket>(alpha);
        R0 = r0.template loadPacket<ResPacket>(0);
        traits.acc(C0, alphav, R0);
        r0.storePacket(0, R0);
template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
          typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
          typename LinearMapper, typename DataMapper>
    : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
                             RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper> {
      LhsPacket* A0, RhsPacket* B_0, RhsPacket* B1, RhsPacket* B2, RhsPacket* B3,
      AccPacket* C0, AccPacket* C1, AccPacket* C2, AccPacket* C3) {
    traits.loadLhsUnaligned(&blA[(0 + 1 * K) * (LhsProgress)], *A0);
    traits.broadcastRhs(&blB[(0 + 4 * K) * RhsProgress], *B_0, *B1, *B2, *B3);
    traits.madd(*A0, *B_0, *C0, *B_0);
    traits.madd(*A0, *B3, *C3, *B3);
template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
          bool ConjugateLhs, bool ConjugateRhs>
    ConjugateRhs>::operator()(const DataMapper& res, const LhsScalar* blockA,

  if (strideA == -1) strideA = depth;
  if (strideB == -1) strideB = depth;

  Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
  Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
  const Index peeled_mc3 = mr >= 3 * Traits::LhsProgress ? (rows / (3 * LhsProgress)) * (3 * LhsProgress) : 0;
  const Index peeled_mc2 =
      mr >= 2 * Traits::LhsProgress ? peeled_mc3 + ((rows - peeled_mc3) / (2 * LhsProgress)) * (2 * LhsProgress) : 0;
  const Index peeled_mc1 =
      mr >= 1 * Traits::LhsProgress ? peeled_mc2 + ((rows - peeled_mc2) / (1 * LhsProgress)) * (1 * LhsProgress) : 0;
  const Index peeled_mc_half =
      mr >= LhsProgressHalf ? peeled_mc1 + ((rows - peeled_mc1) / (LhsProgressHalf)) * (LhsProgressHalf) : 0;
  const Index peeled_mc_quarter =
      mr >= LhsProgressQuarter
          ? peeled_mc_half + ((rows - peeled_mc_half) / (LhsProgressQuarter)) * (LhsProgressQuarter)

  const Index peeled_kc = depth & ~(pk - 1);
  const int prefetch_res_offset = 32 / sizeof(ResScalar);
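  // peeled_mc3/2/1, peeled_mc_half and peeled_mc_quarter are the row indices up to which the
  // lhs can be processed with 3, 2 and 1 full packets, then with half and quarter packets;
  // any rows left past peeled_mc_quarter are handled by the fallback paths at the end of
  // this operator.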
  if (mr >= 3 * Traits::LhsProgress) {
    const Index actual_panel_rows =
        (3 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
                                                (depth * sizeof(LhsScalar) * 3 * LhsProgress)));
    for (Index i1 = 0; i1 < peeled_mc3; i1 += actual_panel_rows) {
      const Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc3);
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
      for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
        for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
          const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
          AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15, C16, C17, C18, C19, C20,
              C21, C22, C23;
          const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];

          for (Index k = 0; k < peeled_kc; k += pk) {
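// 3 packets x 8 columns register block: three lhs packets (A0, A1, A2) are multiplied against
// 8 rhs columns per depth step, accumulated into C0..C23 (one packet per lhs-packet/column pair).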
#if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
#define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
#else
#define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND
#endif
#define EIGEN_GEBP_ONESTEP(K)                                                                                \
  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX8");                                                 \
  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                                                       \
  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                                                       \
  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                                                       \
  EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND traits.loadRhs(blB + (0 + 8 * K) * Traits::RhsProgress, rhs_panel); \
  traits.madd(A0, rhs_panel, C0, T0, fix<0>);                                                                \
  traits.madd(A1, rhs_panel, C8, T0, fix<0>);                                                                \
  traits.madd(A2, rhs_panel, C16, T0, fix<0>);                                                               \
  traits.updateRhs(blB + (1 + 8 * K) * Traits::RhsProgress, rhs_panel);                                      \
  traits.madd(A0, rhs_panel, C1, T0, fix<1>);                                                                \
  traits.madd(A1, rhs_panel, C9, T0, fix<1>);                                                                \
  traits.madd(A2, rhs_panel, C17, T0, fix<1>);                                                               \
  traits.updateRhs(blB + (2 + 8 * K) * Traits::RhsProgress, rhs_panel);                                      \
  traits.madd(A0, rhs_panel, C2, T0, fix<2>);                                                                \
  traits.madd(A1, rhs_panel, C10, T0, fix<2>);                                                               \
  traits.madd(A2, rhs_panel, C18, T0, fix<2>);                                                               \
  traits.updateRhs(blB + (3 + 8 * K) * Traits::RhsProgress, rhs_panel);                                      \
  traits.madd(A0, rhs_panel, C3, T0, fix<3>);                                                                \
  traits.madd(A1, rhs_panel, C11, T0, fix<3>);                                                               \
  traits.madd(A2, rhs_panel, C19, T0, fix<3>);                                                               \
  traits.loadRhs(blB + (4 + 8 * K) * Traits::RhsProgress, rhs_panel);                                        \
  traits.madd(A0, rhs_panel, C4, T0, fix<0>);                                                                \
  traits.madd(A1, rhs_panel, C12, T0, fix<0>);                                                               \
  traits.madd(A2, rhs_panel, C20, T0, fix<0>);                                                               \
  traits.updateRhs(blB + (5 + 8 * K) * Traits::RhsProgress, rhs_panel);                                      \
  traits.madd(A0, rhs_panel, C5, T0, fix<1>);                                                                \
  traits.madd(A1, rhs_panel, C13, T0, fix<1>);                                                               \
  traits.madd(A2, rhs_panel, C21, T0, fix<1>);                                                               \
  traits.updateRhs(blB + (6 + 8 * K) * Traits::RhsProgress, rhs_panel);                                      \
  traits.madd(A0, rhs_panel, C6, T0, fix<2>);                                                                \
  traits.madd(A1, rhs_panel, C14, T0, fix<2>);                                                               \
  traits.madd(A2, rhs_panel, C22, T0, fix<2>);                                                               \
  traits.updateRhs(blB + (7 + 8 * K) * Traits::RhsProgress, rhs_panel);                                      \
  traits.madd(A0, rhs_panel, C7, T0, fix<3>);                                                                \
  traits.madd(A1, rhs_panel, C15, T0, fix<3>);                                                               \
  traits.madd(A2, rhs_panel, C23, T0, fix<3>);                                                               \
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX8");                                                   \
            blB += pk * 8 * RhsProgress;
            blA += pk * 3 * Traits::LhsProgress;

          for (Index k = peeled_kc; k < depth; k++) {
            blB += 8 * RhsProgress;
            blA += 3 * Traits::LhsProgress;
#undef EIGEN_GEBP_ONESTEP

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C8, alphav, R1);
          traits.acc(C16, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C9, alphav, R1);
          traits.acc(C17, alphav, R2);
          r1.storePacket(0 * Traits::ResPacketSize, R0);
          r1.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C10, alphav, R1);
          traits.acc(C18, alphav, R2);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r2.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C3, alphav, R0);
          traits.acc(C11, alphav, R1);
          traits.acc(C19, alphav, R2);
          r3.storePacket(0 * Traits::ResPacketSize, R0);
          r3.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r4.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C4, alphav, R0);
          traits.acc(C12, alphav, R1);
          traits.acc(C20, alphav, R2);
          r4.storePacket(0 * Traits::ResPacketSize, R0);
          r4.storePacket(1 * Traits::ResPacketSize, R1);
          r4.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r5.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C5, alphav, R0);
          traits.acc(C13, alphav, R1);
          traits.acc(C21, alphav, R2);
          r5.storePacket(0 * Traits::ResPacketSize, R0);
          r5.storePacket(1 * Traits::ResPacketSize, R1);
          r5.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r6.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C6, alphav, R0);
          traits.acc(C14, alphav, R1);
          traits.acc(C22, alphav, R2);
          r6.storePacket(0 * Traits::ResPacketSize, R0);
          r6.storePacket(1 * Traits::ResPacketSize, R1);
          r6.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r7.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C7, alphav, R0);
          traits.acc(C15, alphav, R1);
          traits.acc(C23, alphav, R2);
          r7.storePacket(0 * Traits::ResPacketSize, R0);
          r7.storePacket(1 * Traits::ResPacketSize, R1);
          r7.storePacket(2 * Traits::ResPacketSize, R2);
      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
        for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
          const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];

          AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;

          const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];

          for (Index k = 0; k < peeled_kc; k += pk) {
#if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
#else
#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
#endif
#define EIGEN_GEBP_ONESTEP(K)                                            \
  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4");             \
  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");    \
  internal::prefetch(blA + (3 * K + 16) * LhsProgress);                  \
  if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) {                               \
    internal::prefetch(blB + (4 * K + 16) * RhsProgress);                \
  }                                                                      \
  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                   \
  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                   \
  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                   \
  EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND                              \
  traits.loadRhs(blB + (0 + 4 * K) * Traits::RhsProgress, rhs_panel);    \
  traits.madd(A0, rhs_panel, C0, T0, fix<0>);                            \
  traits.madd(A1, rhs_panel, C4, T0, fix<0>);                            \
  traits.madd(A2, rhs_panel, C8, T0, fix<0>);                            \
  traits.updateRhs(blB + (1 + 4 * K) * Traits::RhsProgress, rhs_panel);  \
  traits.madd(A0, rhs_panel, C1, T0, fix<1>);                            \
  traits.madd(A1, rhs_panel, C5, T0, fix<1>);                            \
  traits.madd(A2, rhs_panel, C9, T0, fix<1>);                            \
  traits.updateRhs(blB + (2 + 4 * K) * Traits::RhsProgress, rhs_panel);  \
  traits.madd(A0, rhs_panel, C2, T0, fix<2>);                            \
  traits.madd(A1, rhs_panel, C6, T0, fix<2>);                            \
  traits.madd(A2, rhs_panel, C10, T0, fix<2>);                           \
  traits.updateRhs(blB + (3 + 4 * K) * Traits::RhsProgress, rhs_panel);  \
  traits.madd(A0, rhs_panel, C3, T0, fix<3>);                            \
  traits.madd(A1, rhs_panel, C7, T0, fix<3>);                            \
  traits.madd(A2, rhs_panel, C11, T0, fix<3>);                           \
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4");               \
            blB += pk * 4 * RhsProgress;
            blA += pk * 3 * Traits::LhsProgress;

          for (Index k = peeled_kc; k < depth; k++) {
            blB += 4 * RhsProgress;
            blA += 3 * Traits::LhsProgress;
#undef EIGEN_GEBP_ONESTEP

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C5, alphav, R1);
          traits.acc(C9, alphav, R2);
          r1.storePacket(0 * Traits::ResPacketSize, R0);
          r1.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C6, alphav, R1);
          traits.acc(C10, alphav, R2);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r2.storePacket(2 * Traits::ResPacketSize, R2);

          R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C3, alphav, R0);
          traits.acc(C7, alphav, R1);
          traits.acc(C11, alphav, R2);
          r3.storePacket(0 * Traits::ResPacketSize, R0);
          r3.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(2 * Traits::ResPacketSize, R2);
      for (Index j2 = packet_cols4; j2 < cols; j2++) {
        for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
          const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * Traits::LhsProgress)];

          const RhsScalar* blB = &blockB[j2 * strideB + offsetB];

          for (Index k = 0; k < peeled_kc; k += pk) {
#define EIGEN_GEBGP_ONESTEP(K)                                          \
  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1");            \
  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");   \
  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                  \
  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                  \
  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                  \
  traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                     \
  traits.madd(A0, B_0, C0, B_0, fix<0>);                                \
  traits.madd(A1, B_0, C4, B_0, fix<0>);                                \
  traits.madd(A2, B_0, C8, B_0, fix<0>);                                \
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");              \

            blB += int(pk) * int(RhsProgress);
            blA += int(pk) * 3 * int(Traits::LhsProgress);

          for (Index k = peeled_kc; k < depth; k++) {
            blA += 3 * Traits::LhsProgress;
#undef EIGEN_GEBGP_ONESTEP

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C8, alphav, R2);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r0.storePacket(2 * Traits::ResPacketSize, R2);
  if (mr >= 2 * Traits::LhsProgress) {
    Index actual_panel_rows =
        (2 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
                                                (depth * sizeof(LhsScalar) * 2 * LhsProgress)));

    for (Index i1 = peeled_mc3; i1 < peeled_mc2; i1 += actual_panel_rows) {
      Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc2);
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
      for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
        for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
          const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];

          AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15;

          r0.prefetch(prefetch_res_offset);
          r1.prefetch(prefetch_res_offset);
          r2.prefetch(prefetch_res_offset);
          r3.prefetch(prefetch_res_offset);
          r4.prefetch(prefetch_res_offset);
          r5.prefetch(prefetch_res_offset);
          r6.prefetch(prefetch_res_offset);
          r7.prefetch(prefetch_res_offset);

          const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];

          for (Index k = 0; k < peeled_kc; k += pk) {
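// 2 packets x 8 columns register block: two lhs packets (A0, A1) are multiplied against
// 8 rhs columns per depth step, accumulated into C0..C15.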
#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE)
#define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
#else
#define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND
#endif
#define EIGEN_GEBGP_ONESTEP(K)                                                           \
  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX8");                             \
  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);                                   \
  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);                                   \
  traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel);                            \
  traits.madd(A0, rhs_panel, C0, T0, fix<0>);                                            \
  traits.madd(A1, rhs_panel, C8, T0, fix<0>);                                            \
  traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel);                          \
  traits.madd(A0, rhs_panel, C1, T0, fix<1>);                                            \
  traits.madd(A1, rhs_panel, C9, T0, fix<1>);                                            \
  traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel);                          \
  traits.madd(A0, rhs_panel, C2, T0, fix<2>);                                            \
  traits.madd(A1, rhs_panel, C10, T0, fix<2>);                                           \
  traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel);                          \
  traits.madd(A0, rhs_panel, C3, T0, fix<3>);                                            \
  traits.madd(A1, rhs_panel, C11, T0, fix<3>);                                           \
  traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel);                            \
  traits.madd(A0, rhs_panel, C4, T0, fix<0>);                                            \
  traits.madd(A1, rhs_panel, C12, T0, fix<0>);                                           \
  traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel);                          \
  traits.madd(A0, rhs_panel, C5, T0, fix<1>);                                            \
  traits.madd(A1, rhs_panel, C13, T0, fix<1>);                                           \
  traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel);                          \
  traits.madd(A0, rhs_panel, C6, T0, fix<2>);                                            \
  traits.madd(A1, rhs_panel, C14, T0, fix<2>);                                           \
  traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel);                          \
  traits.madd(A0, rhs_panel, C7, T0, fix<3>);                                            \
  traits.madd(A1, rhs_panel, C15, T0, fix<3>);                                           \
  EIGEN_GEBP_2Px8_SPILLING_WORKAROUND EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX8"); \
            blB += pk * 8 * RhsProgress;
            blA += pk * (2 * Traits::LhsProgress);

          for (Index k = peeled_kc; k < depth; k++) {
            blB += 8 * RhsProgress;
            blA += 2 * Traits::LhsProgress;
#undef EIGEN_GEBGP_ONESTEP

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C8, alphav, R1);
          traits.acc(C9, alphav, R3);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(0 * Traits::ResPacketSize, R2);
          r1.storePacket(1 * Traits::ResPacketSize, R3);

          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C10, alphav, R1);
          traits.acc(C3, alphav, R2);
          traits.acc(C11, alphav, R3);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(0 * Traits::ResPacketSize, R2);
          r3.storePacket(1 * Traits::ResPacketSize, R3);

          R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R3 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C4, alphav, R0);
          traits.acc(C12, alphav, R1);
          traits.acc(C5, alphav, R2);
          traits.acc(C13, alphav, R3);
          r4.storePacket(0 * Traits::ResPacketSize, R0);
          r4.storePacket(1 * Traits::ResPacketSize, R1);
          r5.storePacket(0 * Traits::ResPacketSize, R2);
          r5.storePacket(1 * Traits::ResPacketSize, R3);

          R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R3 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C6, alphav, R0);
          traits.acc(C14, alphav, R1);
          traits.acc(C7, alphav, R2);
          traits.acc(C15, alphav, R3);
          r6.storePacket(0 * Traits::ResPacketSize, R0);
          r6.storePacket(1 * Traits::ResPacketSize, R1);
          r7.storePacket(0 * Traits::ResPacketSize, R2);
          r7.storePacket(1 * Traits::ResPacketSize, R3);
      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
        for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
          const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];

          r0.prefetch(prefetch_res_offset);
          r1.prefetch(prefetch_res_offset);
          r2.prefetch(prefetch_res_offset);
          r3.prefetch(prefetch_res_offset);

          const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];

          for (Index k = 0; k < peeled_kc; k += pk) {
#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
#else
#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
#endif
#define EIGEN_GEBGP_ONESTEP(K)                                   \
  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");     \
  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);           \
  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);           \
  traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel);    \
  traits.madd(A0, rhs_panel, C0, T0, fix<0>);                    \
  traits.madd(A1, rhs_panel, C4, T0, fix<0>);                    \
  traits.madd(A0, rhs_panel, C1, T0, fix<1>);                    \
  traits.madd(A1, rhs_panel, C5, T0, fix<1>);                    \
  traits.madd(A0, rhs_panel, C2, T0, fix<2>);                    \
  traits.madd(A1, rhs_panel, C6, T0, fix<2>);                    \
  traits.madd(A0, rhs_panel, C3, T0, fix<3>);                    \
  traits.madd(A1, rhs_panel, C7, T0, fix<3>);                    \
  EIGEN_GEBP_2PX4_SPILLING_WORKAROUND                            \
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");       \
            blB += pk * 4 * RhsProgress;
            blA += pk * (2 * Traits::LhsProgress);

          for (Index k = peeled_kc; k < depth; k++) {
            blB += 4 * RhsProgress;
            blA += 2 * Traits::LhsProgress;
#undef EIGEN_GEBGP_ONESTEP

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          traits.acc(C5, alphav, R3);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
          r1.storePacket(0 * Traits::ResPacketSize, R2);
          r1.storePacket(1 * Traits::ResPacketSize, R3);

          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C6, alphav, R1);
          traits.acc(C3, alphav, R2);
          traits.acc(C7, alphav, R3);
          r2.storePacket(0 * Traits::ResPacketSize, R0);
          r2.storePacket(1 * Traits::ResPacketSize, R1);
          r3.storePacket(0 * Traits::ResPacketSize, R2);
          r3.storePacket(1 * Traits::ResPacketSize, R3);
      for (Index j2 = packet_cols4; j2 < cols; j2++) {
        for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
          const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];

          r0.prefetch(prefetch_res_offset);

          const RhsScalar* blB = &blockB[j2 * strideB + offsetB];

          for (Index k = 0; k < peeled_kc; k += pk) {
#define EIGEN_GEBGP_ONESTEP(K)                                          \
  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");            \
  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");   \
  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);                  \
  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);                  \
  traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                     \
  traits.madd(A0, B_0, C0, B1, fix<0>);                                 \
  traits.madd(A1, B_0, C4, B_0, fix<0>);                                \
  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");              \
            blB += int(pk) * int(RhsProgress);
            blA += int(pk) * 2 * int(Traits::LhsProgress);

          for (Index k = peeled_kc; k < depth; k++) {
            blA += 2 * Traits::LhsProgress;
#undef EIGEN_GEBGP_ONESTEP

          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
          traits.acc(C0, alphav, R0);
          traits.acc(C4, alphav, R1);
          r0.storePacket(0 * Traits::ResPacketSize, R0);
          r0.storePacket(1 * Traits::ResPacketSize, R1);
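  // Row panels that are one full packet, half a packet or a quarter packet wide are delegated
  // to the lhs_process_one_packet helpers defined above, instantiated with the corresponding
  // packet width.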
  if (mr >= 1 * Traits::LhsProgress) {
    p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
      peeled_kc, pk, cols, depth, packet_cols4);

  if ((LhsProgressHalf < LhsProgress) && mr >= LhsProgressHalf) {
    p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
      peeled_kc, pk, cols, depth, packet_cols4);

  if ((LhsProgressQuarter < LhsProgressHalf) && mr >= LhsProgressQuarter) {
    p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB,
      prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);

  if (peeled_mc_quarter < rows) {
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
    for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
      const LhsScalar* blA = &blockA[i * strideA + offsetA];

      ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0);
      const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
      for (Index k = 0; k < depth; k++) {
        LhsScalar A0 = blA[k];
        C0 = cj.pmadd(A0, B_0, C0);
        C3 = cj.pmadd(A0, B_0, C3);
        C4 = cj.pmadd(A0, B_0, C4);
        C5 = cj.pmadd(A0, B_0, C5);
        C6 = cj.pmadd(A0, B_0, C6);
        C7 = cj.pmadd(A0, B_0, C7);
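    // For the remaining rows the kernel is applied with swapped traits (straits): the roles of
    // lhs and rhs are exchanged so that the nr coefficients of one result row can still be
    // computed with packet madds, and the result is read and written through
    // gatherPacket / scatterPacket on the DataMapper.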
    for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
      const LhsScalar* blA = &blockA[i * strideA + offsetA];
      const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];

      const int SResPacketQuarterSize =
      constexpr bool kCanLoadSRhsQuad =
      if (kCanLoadSRhsQuad && (SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress <= 16) &&
          (SwappedTraits::LhsProgress != 8 || SResPacketHalfSize == nr) &&
          (SwappedTraits::LhsProgress != 16 || SResPacketQuarterSize == nr)) {
        const Index spk = (std::max)(1, SwappedTraits::LhsProgress / 4);
        const Index endk = (depth / spk) * spk;
        const Index endk4 = (depth / (spk * 4)) * (spk * 4);
        for (; k < endk4; k += 4 * spk) {
          straits.madd(A0, B_0, C0, B_0, fix<0>);
          straits.madd(A1, B_1, C1, B_1, fix<0>);
          straits.madd(A0, B_0, C2, B_0, fix<0>);
          straits.madd(A1, B_1, C3, B_1, fix<0>);

          blB += 4 * SwappedTraits::LhsProgress;

        for (; k < endk; k += spk) {
          straits.madd(A0, B_0, C0, B_0, fix<0>);
          blB += SwappedTraits::LhsProgress;
        if (SwappedTraits::LhsProgress == 8) {
          SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
          SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);

          if (depth - endk > 0) {
            straits.madd(a0, b0, c0, b0, fix<0>);
            straits.acc(c0, alphav, R);
          res.scatterPacket(i, j2, R);
        } else if (SwappedTraits::LhsProgress == 16) {
          p(res, straits, blA, blB, depth, endk, i, j2, alpha, C0);

          straits.acc(C0, alphav, R);
          res.scatterPacket(i, j2, R);

        for (Index k = 0; k < depth; k++) {
          C0 = cj.pmadd(A0, B_0, C0);
          C3 = cj.pmadd(A0, B_1, C3);
    for (Index j2 = packet_cols4; j2 < cols; j2++) {
      const LhsScalar* blA = &blockA[i * strideA + offsetA];

      const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
      for (Index k = 0; k < depth; k++) {
        LhsScalar A0 = blA[k];
        RhsScalar B_0 = blB[k];
        C0 = cj.pmadd(A0, B_0, C0);
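// gemm_pack_lhs: copies (and, if requested, conjugates) a block of the lhs into the contiguous
// buffer blockA, laid out as panels of Pack1 rows so that the gebp kernel can stream it with
// packet loads. In PanelMode, 'offset' and 'stride' skip space before and after each panel.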
                PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,

    HasHalf = (int)HalfPacketSize < (int)PacketSize,
    HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize

  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
  eigen_assert(((Pack1 % PacketSize) == 0 && Pack1 <= 4 * PacketSize) || (Pack1 <= 4));

  const Index peeled_mc3 = Pack1 >= 3 * PacketSize ? (rows / (3 * PacketSize)) * (3 * PacketSize) : 0;
  const Index peeled_mc2 =
      Pack1 >= 2 * PacketSize ? peeled_mc3 + ((rows - peeled_mc3) / (2 * PacketSize)) * (2 * PacketSize) : 0;
  const Index peeled_mc1 =
      Pack1 >= 1 * PacketSize ? peeled_mc2 + ((rows - peeled_mc2) / (1 * PacketSize)) * (1 * PacketSize) : 0;
  const Index peeled_mc_half =
      Pack1 >= HalfPacketSize ? peeled_mc1 + ((rows - peeled_mc1) / (HalfPacketSize)) * (HalfPacketSize) : 0;
  const Index peeled_mc_quarter = Pack1 >= QuarterPacketSize ? (rows / (QuarterPacketSize)) * (QuarterPacketSize) : 0;
  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
  const Index peeled_mc0 = Pack2 >= PacketSize             ? peeled_mc_quarter
                           : Pack2 > 1 && last_lhs_progress ? (rows / last_lhs_progress) * last_lhs_progress
  if (Pack1 >= 3 * PacketSize) {
    for (; i < peeled_mc3; i += 3 * PacketSize) {
      if (PanelMode) count += (3 * PacketSize) * offset;
      for (Index k = 0; k < depth; k++) {
        A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
        B = lhs.template loadPacket<Packet>(i + 1 * PacketSize, k);
        C = lhs.template loadPacket<Packet>(i + 2 * PacketSize, k);
        pstore(blockA + count, cj.pconj(A));
        count += PacketSize;
        pstore(blockA + count, cj.pconj(B));
        count += PacketSize;
        pstore(blockA + count, cj.pconj(C));
        count += PacketSize;
      if (PanelMode) count += (3 * PacketSize) * (stride - offset - depth);
  if (Pack1 >= 2 * PacketSize) {
    for (; i < peeled_mc2; i += 2 * PacketSize) {
      if (PanelMode) count += (2 * PacketSize) * offset;
      for (Index k = 0; k < depth; k++) {
        A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
        B = lhs.template loadPacket<Packet>(i + 1 * PacketSize, k);
        pstore(blockA + count, cj.pconj(A));
        count += PacketSize;
        pstore(blockA + count, cj.pconj(B));
        count += PacketSize;
      if (PanelMode) count += (2 * PacketSize) * (stride - offset - depth);
  if (Pack1 >= 1 * PacketSize) {
    for (; i < peeled_mc1; i += 1 * PacketSize) {
      if (PanelMode) count += (1 * PacketSize) * offset;
      for (Index k = 0; k < depth; k++) {
        A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
        pstore(blockA + count, cj.pconj(A));
        count += PacketSize;
      if (PanelMode) count += (1 * PacketSize) * (stride - offset - depth);
  if (HasHalf && Pack1 >= HalfPacketSize) {
    for (; i < peeled_mc_half; i += HalfPacketSize) {
      if (PanelMode) count += (HalfPacketSize)*offset;
      for (Index k = 0; k < depth; k++) {
        A = lhs.template loadPacket<HalfPacket>(i + 0 * (HalfPacketSize), k);
        pstoreu(blockA + count, cj.pconj(A));
        count += HalfPacketSize;
      if (PanelMode) count += (HalfPacketSize) * (stride - offset - depth);
  if (HasQuarter && Pack1 >= QuarterPacketSize) {
    for (; i < peeled_mc_quarter; i += QuarterPacketSize) {
      if (PanelMode) count += (QuarterPacketSize)*offset;
      for (Index k = 0; k < depth; k++) {
        A = lhs.template loadPacket<QuarterPacket>(i + 0 * (QuarterPacketSize), k);
        pstoreu(blockA + count, cj.pconj(A));
        count += QuarterPacketSize;
      if (PanelMode) count += (QuarterPacketSize) * (stride - offset - depth);
  if (Pack2 < PacketSize && Pack2 > 1) {
    for (; i < peeled_mc0; i += last_lhs_progress) {
      if (PanelMode) count += last_lhs_progress * offset;

        for (Index w = 0; w < last_lhs_progress; w++) blockA[count++] = cj(lhs(i + w, k));

      if (PanelMode) count += last_lhs_progress * (stride - offset - depth);

    if (PanelMode) count += offset;
    for (Index k = 0; k < depth; k++) blockA[count++] = cj(lhs(i, k));
    if (PanelMode) count += (stride - offset - depth);
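// Second pack_lhs variant: rows are packed with progressively narrower packet widths (full,
// then half, then quarter packets), with gone_half/gone_quarter/gone_last tracking which widths
// have already been used for the trailing rows.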
                PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,

    HasHalf = (int)HalfPacketSize < (int)PacketSize,
    HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize

  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));

  bool gone_half = false, gone_quarter = false, gone_last = false;

  Index psize = PacketSize;
    Index peeled_mc = gone_last ? Pack2 > 1 ? (rows / pack) * pack : 0 : i + (remaining_rows / pack) * pack;
    for (; i < peeled_mc; i += pack) {
      if (PanelMode) count += pack * offset;

      if (pack >= psize && psize >= QuarterPacketSize) {
        const Index peeled_k = (depth / psize) * psize;
        for (; k < peeled_k; k += psize) {
          for (Index m = 0; m < pack; m += psize) {
            if (psize == PacketSize) {
              for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i + p + m, k);
            } else if (HasHalf && psize == HalfPacketSize) {
                kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i + p + m, k);
            } else if (HasQuarter && psize == QuarterPacketSize) {
              gone_quarter = true;
                kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i + p + m, k);
                pstore(blockA + count + m + (pack)*p, cj.pconj(kernel_quarter.packet[p]));
          count += psize * pack;
      for (; k < depth; k++) {
        for (; w < pack - 3; w += 4) {
          Scalar a(cj(lhs(i + w + 0, k))), b(cj(lhs(i + w + 1, k))), c(cj(lhs(i + w + 2, k))), d(cj(lhs(i + w + 3, k)));
          blockA[count++] = a;
          blockA[count++] = b;
          blockA[count++] = c;
          blockA[count++] = d;
        for (; w < pack; ++w) blockA[count++] = cj(lhs(i + w, k));

      if (PanelMode) count += pack * (stride - offset - depth);

    if (!gone_last && (starting_pos == i || left >= psize / 2 || left >= psize / 4) &&
        ((psize / 2 == HalfPacketSize && HasHalf && !gone_half) ||
         (psize / 2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {

    if (Pack2 < PacketSize && !gone_last) {
      psize = pack = left & ~1;

    if (PanelMode) count += offset;
    for (Index k = 0; k < depth; k++) blockA[count++] = cj(lhs(i, k));
    if (PanelMode) count += (stride - offset - depth);
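// gemm_pack_rhs: interleaves nr (8 or 4) columns of the rhs into blockB so that each depth step
// of the gebp kernel reads nr consecutive coefficients. The PacketSize == 2/4/8 branches below
// gather PacketSize rows of each column into PacketBlock kernels and store them back interleaved.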
template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>

  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));

  Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
  Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
  const Index peeled_k = (depth / PacketSize) * PacketSize;
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
  for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
    if (PanelMode) count += 8 * offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
    const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
    const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
    const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
    const LinearMapper dm4 = rhs.getLinearMapper(0, j2 + 4);
    const LinearMapper dm5 = rhs.getLinearMapper(0, j2 + 5);
    const LinearMapper dm6 = rhs.getLinearMapper(0, j2 + 6);
    const LinearMapper dm7 = rhs.getLinearMapper(0, j2 + 7);

    if (PacketSize % 2 == 0 && PacketSize <= 8)
      for (; k < peeled_k; k += PacketSize) {
        if (PacketSize == 2) {
2874 kernel0.
packet[0 % PacketSize] = dm0.template loadPacket<Packet>(
k);
2875 kernel0.
packet[1 % PacketSize] = dm1.template loadPacket<Packet>(
k);
2876 kernel1.
packet[0 % PacketSize] = dm2.template loadPacket<Packet>(
k);
2877 kernel1.
packet[1 % PacketSize] = dm3.template loadPacket<Packet>(
k);
2878 kernel2.
packet[0 % PacketSize] = dm4.template loadPacket<Packet>(
k);
2879 kernel2.
packet[1 % PacketSize] = dm5.template loadPacket<Packet>(
k);
2880 kernel3.
packet[0 % PacketSize] = dm6.template loadPacket<Packet>(
k);
2881 kernel3.
packet[1 % PacketSize] = dm7.template loadPacket<Packet>(
k);
2887 pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.
packet[0 % PacketSize]));
2888 pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.
packet[0 % PacketSize]));
2889 pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel2.
packet[0 % PacketSize]));
2890 pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel3.
packet[0 % PacketSize]));
2892 pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.
packet[1 % PacketSize]));
2893 pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.
packet[1 % PacketSize]));
2894 pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel2.
packet[1 % PacketSize]));
2895 pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel3.
packet[1 % PacketSize]));
2896 count += 8 * PacketSize;
2897 }
        else if (PacketSize == 4) {
          kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
          kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
          kernel0.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
          kernel0.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
          kernel1.packet[0 % PacketSize] = dm4.template loadPacket<Packet>(k);
          kernel1.packet[1 % PacketSize] = dm5.template loadPacket<Packet>(k);
          kernel1.packet[2 % PacketSize] = dm6.template loadPacket<Packet>(k);
          kernel1.packet[3 % PacketSize] = dm7.template loadPacket<Packet>(k);
          pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize]));
          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize]));
          pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[2 % PacketSize]));
          pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[2 % PacketSize]));
          pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel0.packet[3 % PacketSize]));
          pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel1.packet[3 % PacketSize]));
          count += 8 * PacketSize;
        }
        else if (PacketSize == 8) {
          kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
          kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
          kernel0.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
          kernel0.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
          kernel0.packet[4 % PacketSize] = dm4.template loadPacket<Packet>(k);
          kernel0.packet[5 % PacketSize] = dm5.template loadPacket<Packet>(k);
          kernel0.packet[6 % PacketSize] = dm6.template loadPacket<Packet>(k);
          kernel0.packet[7 % PacketSize] = dm7.template loadPacket<Packet>(k);
          pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel0.packet[2 % PacketSize]));
          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel0.packet[3 % PacketSize]));
          pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[4 % PacketSize]));
          pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel0.packet[5 % PacketSize]));
          pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel0.packet[6 % PacketSize]));
          pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel0.packet[7 % PacketSize]));
          count += 8 * PacketSize;
        }
      }
    }
    // depth remainder: copy the 8 columns one coefficient at a time
    for (; k < depth; k++) {
      blockB[count + 0] = cj(dm0(k));
      blockB[count + 1] = cj(dm1(k));
      blockB[count + 2] = cj(dm2(k));
      blockB[count + 3] = cj(dm3(k));
      blockB[count + 4] = cj(dm4(k));
      blockB[count + 5] = cj(dm5(k));
      blockB[count + 6] = cj(dm6(k));
      blockB[count + 7] = cj(dm7(k));
      count += 8;
    }
    if (PanelMode) count += 8 * (stride - offset - depth);
  }
#endif
  // pack 4 columns at once
  for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
    if (PanelMode) count += 4 * offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
    const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
    const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
    const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
    Index k = 0;
    if ((PacketSize % 4) == 0) {
      for (; k < peeled_k; k += PacketSize) {
        kernel.packet[0] = dm0.template loadPacket<Packet>(k);
        kernel.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
        kernel.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
        kernel.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
        pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
        pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
        pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
        pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
        count += 4 * PacketSize;
      }
    }
    // depth remainder: one coefficient per column
    for (; k < depth; k++) {
      blockB[count + 0] = cj(dm0(k));
      blockB[count + 1] = cj(dm1(k));
      blockB[count + 2] = cj(dm2(k));
      blockB[count + 3] = cj(dm3(k));
      count += 4;
    }
    if (PanelMode) count += 4 * (stride - offset - depth);
  }
  // remaining columns one at a time
  for (Index j2 = packet_cols4; j2 < cols; ++j2) {
    if (PanelMode) count += offset;
    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
    for (Index k = 0; k < depth; k++) {
      blockB[count] = cj(dm0(k));
      count += 1;
    }
    if (PanelMode) count += (stride - offset - depth);
  }
}
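// Illustrative scalar reference (not the kernel above) of the layout produced by the
// column-major rhs packing for one 4-column panel: for every depth index k the four
// columns j2..j2+3 are stored contiguously in blockB, which is the order the GEBP
// micro-kernel reads them back. RhsView is a hypothetical stand-in for the
// DataMapper; conjugation and PanelMode padding are omitted for brevity.
template <typename Scalar, typename RhsView, typename Index>
void pack_rhs_panel4_reference_sketch(Scalar* blockB, const RhsView& rhs, Index depth, Index j2, Index& count) {
  for (Index k = 0; k < depth; ++k)
    for (Index c = 0; c < 4; ++c) blockB[count++] = rhs(k, j2 + c);  // interleave the 4 columns along k
}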
template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
  const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
  const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;

  Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
  Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
  // ARM64 / LoongArch64: pack 8 columns at a time
  for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
    if (PanelMode) count += 8 * offset;
    for (Index k = 0; k < depth; k++) {
      if (PacketSize == 8) {
        Packet A = rhs.template loadPacket<Packet>(k, j2);
        pstoreu(blockB + count, cj.pconj(A));
        count += PacketSize;
      } else if (PacketSize == 4) {
        Packet A = rhs.template loadPacket<Packet>(k, j2);
        Packet B = rhs.template loadPacket<Packet>(k, j2 + 4);
        pstoreu(blockB + count, cj.pconj(A));
        pstoreu(blockB + count + PacketSize, cj.pconj(B));
        count += 2 * PacketSize;
      } else {
        const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
        blockB[count + 0] = cj(dm0(0));
        blockB[count + 1] = cj(dm0(1));
        blockB[count + 2] = cj(dm0(2));
        blockB[count + 3] = cj(dm0(3));
        blockB[count + 4] = cj(dm0(4));
        blockB[count + 5] = cj(dm0(5));
        blockB[count + 6] = cj(dm0(6));
        blockB[count + 7] = cj(dm0(7));
        count += 8;
      }
    }
    if (PanelMode) count += 8 * (stride - offset - depth);
  }
#endif
  // pack 4 columns at once
  for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
    if (PanelMode) count += 4 * offset;
    for (Index k = 0; k < depth; k++) {
      if (PacketSize == 4) {
        Packet A = rhs.template loadPacket<Packet>(k, j2);
        pstoreu(blockB + count, cj.pconj(A));
        count += PacketSize;
      } else if (HasHalf && HalfPacketSize == 4) {
        HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
        pstoreu(blockB + count, cj.pconj(A));
        count += HalfPacketSize;
      } else if (HasQuarter && QuarterPacketSize == 4) {
        QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
        pstoreu(blockB + count, cj.pconj(A));
        count += QuarterPacketSize;
      } else {
        const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
        blockB[count + 0] = cj(dm0(0));
        blockB[count + 1] = cj(dm0(1));
        blockB[count + 2] = cj(dm0(2));
        blockB[count + 3] = cj(dm0(3));
        count += 4;
      }
    }
    if (PanelMode) count += 4 * (stride - offset - depth);
  }
  // remaining columns one at a time
  for (Index j2 = packet_cols4; j2 < cols; ++j2) {
    if (PanelMode) count += offset;
    for (Index k = 0; k < depth; k++) {
      blockB[count] = cj(rhs(k, j2));
      count += 1;
    }
    if (PanelMode) count += stride - offset - depth;
  }
}
  // local declarations inside l1CacheSize(), l2CacheSize() and l3CacheSize(),
  // which query manage_caching_sizes(GetAction, &l1, &l2, &l3) and return the
  // requested cache level:
  std::ptrdiff_t l1, l2, l3;
  std::ptrdiff_t l1, l2, l3;
  std::ptrdiff_t l1, l2, l3;
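// Usage sketch (not part of this header): the public helpers l1CacheSize(),
// l2CacheSize(), l3CacheSize() and setCpuCacheSizes() let client code inspect or
// override, in bytes, the cache sizes that the GEMM blocking heuristic relies on.
// A minimal example, assuming an ordinary Eigen build:
//
//   #include <Eigen/Core>
//   #include <iostream>
//
//   int main() {
//     std::cout << "L1: " << Eigen::l1CacheSize() << " bytes\n"
//               << "L2: " << Eigen::l2CacheSize() << " bytes\n"
//               << "L3: " << Eigen::l3CacheSize() << " bytes\n";
//     // Override the detected values, e.g. when tuning blocking for a known target.
//     Eigen::setCpuCacheSizes(32 * 1024, 1024 * 1024, 16 * 1024 * 1024);
//     return 0;
//   }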