1116 Index packet_cols8 = nr >= 8 ? (
cols / 8) * 8 : 0;
1119 for (
Index i = peelStart;
i < peelEnd;
i += LhsProgress) {
1120 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
1122 for (
Index j2 = 0; j2 < packet_cols8; j2 += 8) {
1123 const LhsScalar* blA = &blockA[
i * strideA + offsetA * (LhsProgress)];
1127 AccPacket C0,
C1,
C2, C3, C4, C5, C6, C7;
1137 LinearMapper r0 =
res.getLinearMapper(
i, j2 + 0);
1138 LinearMapper r1 =
res.getLinearMapper(
i, j2 + 1);
1139 LinearMapper r2 =
res.getLinearMapper(
i, j2 + 2);
1140 LinearMapper r3 =
res.getLinearMapper(
i, j2 + 3);
1141 LinearMapper r4 =
res.getLinearMapper(
i, j2 + 4);
1142 LinearMapper r5 =
res.getLinearMapper(
i, j2 + 5);
1143 LinearMapper r6 =
res.getLinearMapper(
i, j2 + 6);
1144 LinearMapper r7 =
res.getLinearMapper(
i, j2 + 7);
1145 r0.prefetch(prefetch_res_offset);
1146 r1.prefetch(prefetch_res_offset);
1147 r2.prefetch(prefetch_res_offset);
1148 r3.prefetch(prefetch_res_offset);
1149 r4.prefetch(prefetch_res_offset);
1150 r5.prefetch(prefetch_res_offset);
1151 r6.prefetch(prefetch_res_offset);
1152 r7.prefetch(prefetch_res_offset);
1153 const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
1157 for (
Index k = 0;
k < peeled_kc;
k += pk) {
1160 #define EIGEN_GEBGP_ONESTEP(K) \
1162 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX8"); \
1163 traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], A0); \
1164 traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
1165 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1166 traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
1167 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1168 traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
1169 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1170 traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
1171 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1172 traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
1173 traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
1174 traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
1175 traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
1176 traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
1177 traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
1178 traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
1179 traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
1180 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX8"); \
1194 blB += pk * 8 * RhsProgress;
1195 blA += pk * (1 * LhsProgress);
1200 for (
Index k = peeled_kc;
k < depth;
k++) {
1204 blB += 8 * RhsProgress;
1205 blA += 1 * LhsProgress;
1208 #undef EIGEN_GEBGP_ONESTEP
1211 ResPacket alphav = pset1<ResPacket>(
alpha);
1213 R0 = r0.template loadPacket<ResPacket>(0);
1214 R1 = r1.template loadPacket<ResPacket>(0);
1215 traits.acc(C0, alphav, R0);
1216 traits.acc(
C1, alphav, R1);
1217 r0.storePacket(0, R0);
1218 r1.storePacket(0, R1);
1220 R0 = r2.template loadPacket<ResPacket>(0);
1221 R1 = r3.template loadPacket<ResPacket>(0);
1222 traits.acc(
C2, alphav, R0);
1223 traits.acc(C3, alphav, R1);
1224 r2.storePacket(0, R0);
1225 r3.storePacket(0, R1);
1227 R0 = r4.template loadPacket<ResPacket>(0);
1228 R1 = r5.template loadPacket<ResPacket>(0);
1229 traits.acc(C4, alphav, R0);
1230 traits.acc(C5, alphav, R1);
1231 r4.storePacket(0, R0);
1232 r5.storePacket(0, R1);
1234 R0 = r6.template loadPacket<ResPacket>(0);
1235 R1 = r7.template loadPacket<ResPacket>(0);
1236 traits.acc(C6, alphav, R0);
1237 traits.acc(C7, alphav, R1);
1238 r6.storePacket(0, R0);
1239 r7.storePacket(0, R1);
1245 for (
Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
1249 const LhsScalar* blA = &blockA[
i * strideA + offsetA * (LhsProgress)];
1253 AccPacket C0,
C1,
C2, C3;
1263 AccPacket D0, D1, D2, D3;
1269 LinearMapper r0 =
res.getLinearMapper(
i, j2 + 0);
1270 LinearMapper r1 =
res.getLinearMapper(
i, j2 + 1);
1271 LinearMapper r2 =
res.getLinearMapper(
i, j2 + 2);
1272 LinearMapper r3 =
res.getLinearMapper(
i, j2 + 3);
1274 r0.prefetch(prefetch_res_offset);
1275 r1.prefetch(prefetch_res_offset);
1276 r2.prefetch(prefetch_res_offset);
1277 r3.prefetch(prefetch_res_offset);
1280 const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
1284 for (
Index k = 0;
k < peeled_kc;
k += pk) {
1290 peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &
C1, &
C2, &C3);
1291 peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1292 peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &
C1, &
C2, &C3);
1293 peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1295 peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &
C1, &
C2, &C3);
1296 peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1297 peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &
C1, &
C2, &C3);
1298 peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1300 blB += pk * 4 * RhsProgress;
1301 blA += pk * LhsProgress;
1311 for (
Index k = peeled_kc;
k < depth;
k++) {
1314 peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &
C1, &
C2, &C3);
1315 blB += 4 * RhsProgress;
1320 ResPacket alphav = pset1<ResPacket>(
alpha);
1322 R0 = r0.template loadPacket<ResPacket>(0);
1323 R1 = r1.template loadPacket<ResPacket>(0);
1324 traits.acc(C0, alphav, R0);
1325 traits.acc(
C1, alphav, R1);
1326 r0.storePacket(0, R0);
1327 r1.storePacket(0, R1);
1329 R0 = r2.template loadPacket<ResPacket>(0);
1330 R1 = r3.template loadPacket<ResPacket>(0);
1331 traits.acc(
C2, alphav, R0);
1332 traits.acc(C3, alphav, R1);
1333 r2.storePacket(0, R0);
1334 r3.storePacket(0, R1);
1338 for (
Index j2 = packet_cols4; j2 <
cols; j2++) {
1340 const LhsScalar* blA = &blockA[
i * strideA + offsetA * (LhsProgress)];
1347 LinearMapper r0 =
res.getLinearMapper(
i, j2);
1350 const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
1353 for (
Index k = 0;
k < peeled_kc;
k += pk) {
1357 #define EIGEN_GEBGP_ONESTEP(K) \
1359 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
1360 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1362 traits.loadLhsUnaligned(&blA[(0 + 1 * K) * LhsProgress], A0); \
1363 traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1364 traits.madd(A0, B_0, C0, B_0, fix<0>); \
1365 EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
1377 blB += pk * RhsProgress;
1378 blA += pk * LhsProgress;
1384 for (
Index k = peeled_kc;
k < depth;
k++) {
1390 #undef EIGEN_GEBGP_ONESTEP
1392 ResPacket alphav = pset1<ResPacket>(
alpha);
1393 R0 = r0.template loadPacket<ResPacket>(0);
1394 traits.acc(C0, alphav, R0);
1395 r0.storePacket(0, R0);
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_ASM_COMMENT(X)
Definition: Macros.h:972
#define EIGEN_IF_CONSTEXPR(X)
Definition: Macros.h:1306
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
Definition: PartialRedux_count.cpp:3
int cols
Definition: Tutorial_commainit_02.cpp:1
RealScalar alpha
Definition: level1_cplx_impl.h:151
char char char int int * k
Definition: level2_impl.h:374
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:318
EIGEN_DEVICE_FUNC void prefetch(const Scalar *addr)
Definition: GenericPacketMath.h:967
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
double C1
"Mooney Rivlin" coefficient for generalised Mooney Rivlin law
Definition: TwenteMeshGluing.cpp:74
double C2
"Mooney Rivlin" coefficient for generalised Mooney Rivlin law
Definition: mpi/distribution/airy_cantilever/airy_cantilever2.cc:156
#define EIGEN_GEBGP_ONESTEP(K)
GEBPTraits::RhsPacketx4 RhsPacketx4
Definition: products/GeneralBlockPanelKernel.h:1092
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar *blA, const RhsScalar *blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
Definition: products/GeneralBlockPanelKernel.h:1094