Eigen::internal::lhs_process_one_packet< nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper > Struct Template Reference

#include <GeneralBlockPanelKernel.h>

+ Inheritance diagram for Eigen::internal::lhs_process_one_packet< nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper >:

Public Types

typedef GEBPTraits::RhsPacketx4 RhsPacketx4
 

Public Member Functions

EIGEN_STRONG_INLINE void peeled_kc_onestep (Index K, const LhsScalar *blA, const RhsScalar *blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
 
EIGEN_STRONG_INLINE void operator() (const DataMapper &res, const LhsScalar *blockA, const RhsScalar *blockB, ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
 

Member Typedef Documentation

◆ RhsPacketx4

template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar , typename RhsScalar , typename ResScalar , typename AccPacket , typename LhsPacket , typename RhsPacket , typename ResPacket , typename GEBPTraits , typename LinearMapper , typename DataMapper >
typedef GEBPTraits::RhsPacketx4 Eigen::internal::lhs_process_one_packet< nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper >::RhsPacketx4

Member Function Documentation

◆ operator()()

template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar , typename RhsScalar , typename ResScalar , typename AccPacket , typename LhsPacket , typename RhsPacket , typename ResPacket , typename GEBPTraits , typename LinearMapper , typename DataMapper >
EIGEN_STRONG_INLINE void Eigen::internal::lhs_process_one_packet< nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper >::operator() ( const DataMapper &  res,
const LhsScalar *  blockA,
const RhsScalar *  blockB,
ResScalar  alpha,
Index  peelStart,
Index  peelEnd,
Index  strideA,
Index  strideB,
Index  offsetA,
Index  offsetB,
int  prefetch_res_offset,
Index  peeled_kc,
Index  pk,
Index  cols,
Index  depth,
Index  packet_cols4 
)
inline
// Body of operator(): for every LHS packet row i in [peelStart, peelEnd), stepping by
// LhsProgress, the result columns are walked in up to three passes:
//   1. 8 columns at a time (compiled only for ARM64 / LoongArch64, and only when nr >= 8),
//   2. 4 columns at a time, from packet_cols8 up to packet_cols4,
//   3. one column at a time, from packet_cols4 up to cols.
// Each pass accumulates over the depth dimension into AccPacket registers, then loads the
// matching result packets from res, combines them with the accumulators and alpha through
// traits.acc, and stores them back.
// NOTE(review): this is a rendered listing; the leading numbers are the line numbers of
// GeneralBlockPanelKernel.h, and some of those lines are absent here (flagged below).
1114  {
1115  GEBPTraits traits;
// Column count covered by the 8-wide pass: cols rounded down to a multiple of 8, or 0 when nr < 8.
// NOTE(review): the 4-wide pass below starts at packet_cols8 even on targets where the 8-wide
// pass is compiled out; this appears to rely on nr < 8 on those targets — confirm.
1116  Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
1117  // loops on each largest micro horizontal panel of lhs
1118  // (LhsProgress x depth)
1119  for (Index i = peelStart; i < peelEnd; i += LhsProgress) {
1120 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
1121  EIGEN_IF_CONSTEXPR(nr >= 8) {
// ---- Pass 1: 8 result columns per iteration, one accumulator (C0..C7) per column. ----
1122  for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
1123  const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
1124  prefetch(&blA[0]);
1125 
1126  // gets res block as register
1127  AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
1128  traits.initAcc(C0);
1129  traits.initAcc(C1);
1130  traits.initAcc(C2);
1131  traits.initAcc(C3);
1132  traits.initAcc(C4);
1133  traits.initAcc(C5);
1134  traits.initAcc(C6);
1135  traits.initAcc(C7);
1136 
// One linear mapper per result column j2+0 .. j2+7, all anchored at row i.
1137  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1138  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1139  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1140  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1141  LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
1142  LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
1143  LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
1144  LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
1145  r0.prefetch(prefetch_res_offset);
1146  r1.prefetch(prefetch_res_offset);
1147  r2.prefetch(prefetch_res_offset);
1148  r3.prefetch(prefetch_res_offset);
1149  r4.prefetch(prefetch_res_offset);
1150  r5.prefetch(prefetch_res_offset);
1151  r6.prefetch(prefetch_res_offset);
1152  r7.prefetch(prefetch_res_offset);
// Start of the packed RHS data for this group of 8 columns.
1153  const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
1154  prefetch(&blB[0]);
1155 
1156  LhsPacket A0;
// Main depth loop: advances pk depth steps per iteration (see the pointer bumps below).
1157  for (Index k = 0; k < peeled_kc; k += pk) {
1158  RhsPacketx4 rhs_panel;
1159  RhsPacket T0;
1160 #define EIGEN_GEBGP_ONESTEP(K) \
1161  do { \
1162  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX8"); \
1163  traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], A0); \
1164  traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
1165  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1166  traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
1167  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1168  traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
1169  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1170  traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
1171  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1172  traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
1173  traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
1174  traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
1175  traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
1176  traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
1177  traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
1178  traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
1179  traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
1180  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX8"); \
1181  } while (false)
1182 
1183  EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX8");
1184 
// NOTE(review): listing lines 1185-1192 are absent from this rendering. Presumably they are
// the EIGEN_GEBGP_ONESTEP(0) .. EIGEN_GEBGP_ONESTEP(7) invocations — confirm against the header.
1193 
1194  blB += pk * 8 * RhsProgress;
1195  blA += pk * (1 * LhsProgress);
1196 
1197  EIGEN_ASM_COMMENT("end gebp micro kernel 1pX8");
1198  }
1199  // process remaining peeled loop
// Tail of the depth loop: one depth step per iteration for k in [peeled_kc, depth).
1200  for (Index k = peeled_kc; k < depth; k++) {
1201  RhsPacketx4 rhs_panel;
1202  RhsPacket T0;
// NOTE(review): listing line 1203 is absent from this rendering; presumably a single
// EIGEN_GEBGP_ONESTEP(0) invocation — confirm against the header.
1204  blB += 8 * RhsProgress;
1205  blA += 1 * LhsProgress;
1206  }
1207 
1208 #undef EIGEN_GEBGP_ONESTEP
1209 
// Write-back, two columns at a time: load the result packets at offset 0, fold in the
// accumulators and alphav via traits.acc, and store the packets back.
1210  ResPacket R0, R1;
1211  ResPacket alphav = pset1<ResPacket>(alpha);
1212 
1213  R0 = r0.template loadPacket<ResPacket>(0);
1214  R1 = r1.template loadPacket<ResPacket>(0);
1215  traits.acc(C0, alphav, R0);
1216  traits.acc(C1, alphav, R1);
1217  r0.storePacket(0, R0);
1218  r1.storePacket(0, R1);
1219 
1220  R0 = r2.template loadPacket<ResPacket>(0);
1221  R1 = r3.template loadPacket<ResPacket>(0);
1222  traits.acc(C2, alphav, R0);
1223  traits.acc(C3, alphav, R1);
1224  r2.storePacket(0, R0);
1225  r3.storePacket(0, R1);
1226 
1227  R0 = r4.template loadPacket<ResPacket>(0);
1228  R1 = r5.template loadPacket<ResPacket>(0);
1229  traits.acc(C4, alphav, R0);
1230  traits.acc(C5, alphav, R1);
1231  r4.storePacket(0, R0);
1232  r5.storePacket(0, R1);
1233 
1234  R0 = r6.template loadPacket<ResPacket>(0);
1235  R1 = r7.template loadPacket<ResPacket>(0);
1236  traits.acc(C6, alphav, R0);
1237  traits.acc(C7, alphav, R1);
1238  r6.storePacket(0, R0);
1239  r7.storePacket(0, R1);
1240  }
1241  }
1242 #endif
1243 
// ---- Pass 2: 4 result columns per iteration, from packet_cols8 up to packet_cols4. ----
1244  // loops on each largest micro vertical panel of rhs (depth * nr)
1245  for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
1246  // We select a LhsProgress x nr micro block of res
1247  // which is entirely stored into 1 x nr registers.
1248 
1249  const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
1250  prefetch(&blA[0]);
1251 
1252  // gets res block as register
1253  AccPacket C0, C1, C2, C3;
1254  traits.initAcc(C0);
1255  traits.initAcc(C1);
1256  traits.initAcc(C2);
1257  traits.initAcc(C3);
1258  // To improve instruction pipelining, let's double the accumulation registers:
1259  // even k will accumulate in C*, while odd k will accumulate in D*.
1260  // This trick is crucial to get good performance with FMA, otherwise it is
1261  // actually faster to perform separated MUL+ADD because of a naturally
1262  // better instruction-level parallelism.
1263  AccPacket D0, D1, D2, D3;
1264  traits.initAcc(D0);
1265  traits.initAcc(D1);
1266  traits.initAcc(D2);
1267  traits.initAcc(D3);
1268 
1269  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1270  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1271  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1272  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1273 
1274  r0.prefetch(prefetch_res_offset);
1275  r1.prefetch(prefetch_res_offset);
1276  r2.prefetch(prefetch_res_offset);
1277  r3.prefetch(prefetch_res_offset);
1278 
1279  // performs "inner" products
1280  const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
1281  prefetch(&blB[0]);
1282  LhsPacket A0, A1;
1283 
// Main depth loop: eight peeled_kc_onestep calls per iteration. Even steps use A0 and
// accumulate into C0..C3; odd steps use A1 and accumulate into D0..D3.
1284  for (Index k = 0; k < peeled_kc; k += pk) {
1285  EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
1286  RhsPacketx4 rhs_panel;
1287  RhsPacket T0;
1288 
1289  internal::prefetch(blB + (48 + 0));
1290  peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1291  peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1292  peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1293  peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1294  internal::prefetch(blB + (48 + 16));
1295  peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1296  peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1297  peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1298  peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
1299 
1300  blB += pk * 4 * RhsProgress;
1301  blA += pk * LhsProgress;
1302 
1303  EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
1304  }
// Fold the odd-step accumulators D* into C* before running the tail loop.
1305  C0 = padd(C0, D0);
1306  C1 = padd(C1, D1);
1307  C2 = padd(C2, D2);
1308  C3 = padd(C3, D3);
1309 
1310  // process remaining peeled loop
1311  for (Index k = peeled_kc; k < depth; k++) {
1312  RhsPacketx4 rhs_panel;
1313  RhsPacket T0;
1314  peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
1315  blB += 4 * RhsProgress;
1316  blA += LhsProgress;
1317  }
1318 
// Write-back for the four columns, two at a time (same load / acc / store sequence as above).
1319  ResPacket R0, R1;
1320  ResPacket alphav = pset1<ResPacket>(alpha);
1321 
1322  R0 = r0.template loadPacket<ResPacket>(0);
1323  R1 = r1.template loadPacket<ResPacket>(0);
1324  traits.acc(C0, alphav, R0);
1325  traits.acc(C1, alphav, R1);
1326  r0.storePacket(0, R0);
1327  r1.storePacket(0, R1);
1328 
1329  R0 = r2.template loadPacket<ResPacket>(0);
1330  R1 = r3.template loadPacket<ResPacket>(0);
1331  traits.acc(C2, alphav, R0);
1332  traits.acc(C3, alphav, R1);
1333  r2.storePacket(0, R0);
1334  r3.storePacket(0, R1);
1335  }
1336 
// ---- Pass 3: the columns left over after the 4-wide pass, one at a time. ----
1337  // Deal with remaining columns of the rhs
1338  for (Index j2 = packet_cols4; j2 < cols; j2++) {
1339  // One column at a time
1340  const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
1341  prefetch(&blA[0]);
1342 
1343  // gets res block as register
1344  AccPacket C0;
1345  traits.initAcc(C0);
1346 
1347  LinearMapper r0 = res.getLinearMapper(i, j2);
1348 
1349  // performs "inner" products
1350  const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
1351  LhsPacket A0;
1352 
1353  for (Index k = 0; k < peeled_kc; k += pk) {
1354  EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
1355  RhsPacket B_0;
1356 
1357 #define EIGEN_GEBGP_ONESTEP(K) \
1358  do { \
1359  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
1360  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1361  /* FIXME: why unaligned???? */ \
1362  traits.loadLhsUnaligned(&blA[(0 + 1 * K) * LhsProgress], A0); \
1363  traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1364  traits.madd(A0, B_0, C0, B_0, fix<0>); \
1365  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
1366  } while (false);
1367 
// NOTE(review): listing lines 1368-1375 are absent from this rendering. Presumably they are
// the EIGEN_GEBGP_ONESTEP(0) .. EIGEN_GEBGP_ONESTEP(7) invocations — confirm against the header.
1376 
1377  blB += pk * RhsProgress;
1378  blA += pk * LhsProgress;
1379 
1380  EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
1381  }
1382 
1383  // process remaining peeled loop
1384  for (Index k = peeled_kc; k < depth; k++) {
1385  RhsPacket B_0;
// NOTE(review): listing line 1386 is absent from this rendering; presumably a single
// EIGEN_GEBGP_ONESTEP(0) invocation — confirm against the header.
1387  blB += RhsProgress;
1388  blA += LhsProgress;
1389  }
1390 #undef EIGEN_GEBGP_ONESTEP
// Write-back for the single column.
1391  ResPacket R0;
1392  ResPacket alphav = pset1<ResPacket>(alpha);
1393  R0 = r0.template loadPacket<ResPacket>(0);
1394  traits.acc(C0, alphav, R0);
1395  r0.storePacket(0, R0);
1396  }
1397  }
1398  }
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_ASM_COMMENT(X)
Definition: Macros.h:972
#define EIGEN_IF_CONSTEXPR(X)
Definition: Macros.h:1306
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
Definition: PartialRedux_count.cpp:3
int cols
Definition: Tutorial_commainit_02.cpp:1
RealScalar alpha
Definition: level1_cplx_impl.h:151
char char char int int * k
Definition: level2_impl.h:374
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:318
EIGEN_DEVICE_FUNC void prefetch(const Scalar *addr)
Definition: GenericPacketMath.h:967
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
double C1
"Mooney Rivlin" coefficient for generalised Mooney Rivlin law
Definition: TwenteMeshGluing.cpp:74
double C2
"Mooney Rivlin" coefficient for generalised Mooney Rivlin law
Definition: mpi/distribution/airy_cantilever/airy_cantilever2.cc:156
#define EIGEN_GEBGP_ONESTEP(K)
GEBPTraits::RhsPacketx4 RhsPacketx4
Definition: products/GeneralBlockPanelKernel.h:1092
EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar *blA, const RhsScalar *blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
Definition: products/GeneralBlockPanelKernel.h:1094

References alpha, Global_Physical_Variables::C1, Global_Physical_Variables::C2, cols, EIGEN_ASM_COMMENT, EIGEN_GEBGP_ONESTEP, EIGEN_IF_CONSTEXPR, i, k, Eigen::internal::padd(), Eigen::internal::lhs_process_one_packet< nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper >::peeled_kc_onestep(), Eigen::internal::prefetch(), and res.

◆ peeled_kc_onestep()

template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar , typename RhsScalar , typename ResScalar , typename AccPacket , typename LhsPacket , typename RhsPacket , typename ResPacket , typename GEBPTraits , typename LinearMapper , typename DataMapper >
EIGEN_STRONG_INLINE void Eigen::internal::lhs_process_one_packet< nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper >::peeled_kc_onestep ( Index  K,
const LhsScalar *  blA,
const RhsScalar *  blB,
GEBPTraits  traits,
LhsPacket *  A0,
RhsPacketx4 *  rhs_panel,
RhsPacket *  T0,
AccPacket *  C0,
AccPacket *  C1,
AccPacket *  C2,
AccPacket *  C3 
)
inline
// Body of peeled_kc_onestep(): performs depth step K of the 4-column micro kernel.
// It loads one LHS packet into *A0 and one RHS panel into *rhs_panel, then issues four
// traits.madd calls, one per accumulator *C0..*C3, selected by fix<0>..fix<3>.
// *T0 is passed to every madd call as a scratch RhsPacket.
1096  {
1097  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
1098  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
// Step K reads the LHS at element offset K * LhsProgress and the RHS at 4 * K * RhsProgress;
// the caller advances blA / blB between groups of steps.
1099  traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], *A0);
1100  traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], *rhs_panel);
1101  traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
1102  traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
1103  traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
1104  traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
1105 #if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
// Empty asm statement that lists *A0 as a read-write operand ("+x,m"); it emits no instructions.
// NOTE(review): presumably a compiler code-generation workaround for the guarded
// compiler/ISA combination — confirm against the header's history.
1106  __asm__("" : "+x,m"(*A0));
1107 #endif
1108  EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
1109  }
double K
Wave number.
Definition: sphere_scattering.cc:115

References Global_Physical_Variables::C1, Global_Physical_Variables::C2, EIGEN_ASM_COMMENT, and PlanarWave::K.

Referenced by Eigen::internal::lhs_process_one_packet< nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper >::operator()().


The documentation for this struct was generated from the following file: