Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs > Struct Template Reference

#include <GeneralBlockPanelKernel.h>

Public Types

enum  {
  Vectorizable = Traits::Vectorizable , LhsProgress = Traits::LhsProgress , LhsProgressHalf = HalfTraits::LhsProgress , LhsProgressQuarter = QuarterTraits::LhsProgress ,
  RhsProgress = Traits::RhsProgress , RhsProgressHalf = HalfTraits::RhsProgress , RhsProgressQuarter = QuarterTraits::RhsProgress , ResPacketSize = Traits::ResPacketSize
}
 
typedef gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target > Traits
 
typedef gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketHalf > HalfTraits
 
typedef gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketQuarter > QuarterTraits
 
typedef Traits::ResScalar ResScalar
 
typedef Traits::LhsPacket LhsPacket
 
typedef Traits::RhsPacket RhsPacket
 
typedef Traits::ResPacket ResPacket
 
typedef Traits::AccPacket AccPacket
 
typedef Traits::RhsPacketx4 RhsPacketx4
 
typedef RhsPanelHelper< RhsPacket, RhsPacketx4, 15 >::type RhsPanel15
 
typedef RhsPanelHelper< RhsPacket, RhsPacketx4, 27 >::type RhsPanel27
 
typedef gebp_traits< RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target > SwappedTraits
 
typedef SwappedTraits::ResScalar SResScalar
 
typedef SwappedTraits::LhsPacket SLhsPacket
 
typedef SwappedTraits::RhsPacket SRhsPacket
 
typedef SwappedTraits::ResPacket SResPacket
 
typedef SwappedTraits::AccPacket SAccPacket
 
typedef HalfTraits::LhsPacket LhsPacketHalf
 
typedef HalfTraits::RhsPacket RhsPacketHalf
 
typedef HalfTraits::ResPacket ResPacketHalf
 
typedef HalfTraits::AccPacket AccPacketHalf
 
typedef QuarterTraits::LhsPacket LhsPacketQuarter
 
typedef QuarterTraits::RhsPacket RhsPacketQuarter
 
typedef QuarterTraits::ResPacket ResPacketQuarter
 
typedef QuarterTraits::AccPacket AccPacketQuarter
 
typedef DataMapper::LinearMapper LinearMapper
 

Public Member Functions

EIGEN_DONT_INLINE void operator() (const DataMapper &res, const LhsScalar *blockA, const RhsScalar *blockB, Index rows, Index depth, Index cols, ResScalar alpha, Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0)
 

Member Typedef Documentation

◆ AccPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef Traits::AccPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::AccPacket

◆ AccPacketHalf

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef HalfTraits::AccPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::AccPacketHalf

◆ AccPacketQuarter

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef QuarterTraits::AccPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::AccPacketQuarter

◆ HalfTraits

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketHalf> Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::HalfTraits

◆ LhsPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef Traits::LhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::LhsPacket

◆ LhsPacketHalf

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef HalfTraits::LhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::LhsPacketHalf

◆ LhsPacketQuarter

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef QuarterTraits::LhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::LhsPacketQuarter

◆ LinearMapper

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef DataMapper::LinearMapper Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::LinearMapper

◆ QuarterTraits

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketQuarter> Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::QuarterTraits

◆ ResPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef Traits::ResPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::ResPacket

◆ ResPacketHalf

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef HalfTraits::ResPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::ResPacketHalf

◆ ResPacketQuarter

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef QuarterTraits::ResPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::ResPacketQuarter

◆ ResScalar

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef Traits::ResScalar Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::ResScalar

◆ RhsPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef Traits::RhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::RhsPacket

◆ RhsPacketHalf

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef HalfTraits::RhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::RhsPacketHalf

◆ RhsPacketQuarter

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef QuarterTraits::RhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::RhsPacketQuarter

◆ RhsPacketx4

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef Traits::RhsPacketx4 Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::RhsPacketx4

◆ RhsPanel15

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::RhsPanel15

◆ RhsPanel27

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef RhsPanelHelper<RhsPacket, RhsPacketx4, 27>::type Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::RhsPanel27

◆ SAccPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef SwappedTraits::AccPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SAccPacket

◆ SLhsPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef SwappedTraits::LhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SLhsPacket

◆ SResPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef SwappedTraits::ResPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SResPacket

◆ SResScalar

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef SwappedTraits::ResScalar Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SResScalar

◆ SRhsPacket

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef SwappedTraits::RhsPacket Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SRhsPacket

◆ SwappedTraits

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::SwappedTraits

◆ Traits

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::Traits

Member Enumeration Documentation

◆ anonymous enum

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
anonymous enum
Enumerator
Vectorizable 
LhsProgress 
LhsProgressHalf 
LhsProgressQuarter 
RhsProgress 
RhsProgressHalf 
RhsProgressQuarter 
ResPacketSize 
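997  {
998  Vectorizable = Traits::Vectorizable,
999  LhsProgress = Traits::LhsProgress,
1000  LhsProgressHalf = HalfTraits::LhsProgress,
1001  LhsProgressQuarter = QuarterTraits::LhsProgress,
1002  RhsProgress = Traits::RhsProgress,
1003  RhsProgressHalf = HalfTraits::RhsProgress,
1004  RhsProgressQuarter = QuarterTraits::RhsProgress,
1005  ResPacketSize = Traits::ResPacketSize
1006  };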
@ ResPacketSize
Definition: products/GeneralBlockPanelKernel.h:413
@ Vectorizable
Definition: products/GeneralBlockPanelKernel.h:410
@ RhsProgress
Definition: products/GeneralBlockPanelKernel.h:434
@ LhsProgress
Definition: products/GeneralBlockPanelKernel.h:433
@ ResPacketSize
Definition: products/GeneralBlockPanelKernel.h:1005
@ RhsProgressQuarter
Definition: products/GeneralBlockPanelKernel.h:1004
@ Vectorizable
Definition: products/GeneralBlockPanelKernel.h:998
@ LhsProgressQuarter
Definition: products/GeneralBlockPanelKernel.h:1001
@ RhsProgress
Definition: products/GeneralBlockPanelKernel.h:1002
@ LhsProgressHalf
Definition: products/GeneralBlockPanelKernel.h:1000
@ RhsProgressHalf
Definition: products/GeneralBlockPanelKernel.h:1003
@ LhsProgress
Definition: products/GeneralBlockPanelKernel.h:999

Member Function Documentation

◆ operator()()

template<typename LhsScalar , typename RhsScalar , typename Index , typename DataMapper , int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE void Eigen::internal::gebp_kernel< LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs >::operator() ( const DataMapper &  res,
const LhsScalar *  blockA,
const RhsScalar *  blockB,
Index  rows,
Index  depth,
Index  cols,
ResScalar  alpha,
Index  strideA = -1,
Index  strideB = -1,
Index  offsetA = 0,
Index  offsetB = 0 
)
1428  {
1429  Traits traits;
1430  SwappedTraits straits;
1431 
1432  if (strideA == -1) strideA = depth;
1433  if (strideB == -1) strideB = depth;
1434  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
1435  Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
1436  Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
1437  const Index peeled_mc3 = mr >= 3 * Traits::LhsProgress ? (rows / (3 * LhsProgress)) * (3 * LhsProgress) : 0;
1438  const Index peeled_mc2 =
1439  mr >= 2 * Traits::LhsProgress ? peeled_mc3 + ((rows - peeled_mc3) / (2 * LhsProgress)) * (2 * LhsProgress) : 0;
1440  const Index peeled_mc1 =
1441  mr >= 1 * Traits::LhsProgress ? peeled_mc2 + ((rows - peeled_mc2) / (1 * LhsProgress)) * (1 * LhsProgress) : 0;
1442  const Index peeled_mc_half =
1443  mr >= LhsProgressHalf ? peeled_mc1 + ((rows - peeled_mc1) / (LhsProgressHalf)) * (LhsProgressHalf) : 0;
1444  const Index peeled_mc_quarter =
1445  mr >= LhsProgressQuarter
1446  ? peeled_mc_half + ((rows - peeled_mc_half) / (LhsProgressQuarter)) * (LhsProgressQuarter)
1447  : 0;
1448  enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
1449  const Index peeled_kc = depth & ~(pk - 1);
1450  const int prefetch_res_offset = 32 / sizeof(ResScalar);
1451  // const Index depth2 = depth & ~1;
1452 
1453  //---------- Process 3 * LhsProgress rows at once ----------
1454  // This corresponds to 3*LhsProgress x nr register blocks.
1455  // Usually makes sense only with FMA
1456  if (mr >= 3 * Traits::LhsProgress) {
1457  // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x
1458  // depth) and on each largest micro vertical panel of the rhs (depth * nr). Blocking sizes, i.e., 'depth' has been
1459  // computed so that the micro horizontal panel of the lhs fit in L1. However, if depth is too small, we can extend
1460  // the number of rows of these horizontal panels. This actual number of rows is computed as follows:
1461  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1462  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1463  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
1464  // guess), or because we are testing specific blocking sizes.
1465  const Index actual_panel_rows =
1466  (3 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
1467  (depth * sizeof(LhsScalar) * 3 * LhsProgress)));
1468  for (Index i1 = 0; i1 < peeled_mc3; i1 += actual_panel_rows) {
1469  const Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc3);
1470 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
1471  EIGEN_IF_CONSTEXPR(nr >= 8) {
1472  for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
1473  for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
1474  const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
1475  prefetch(&blA[0]);
1476  // gets res block as register
1477  AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15, C16, C17, C18, C19, C20,
1478  C21, C22, C23;
1479  traits.initAcc(C0);
1480  traits.initAcc(C1);
1481  traits.initAcc(C2);
1482  traits.initAcc(C3);
1483  traits.initAcc(C4);
1484  traits.initAcc(C5);
1485  traits.initAcc(C6);
1486  traits.initAcc(C7);
1487  traits.initAcc(C8);
1488  traits.initAcc(C9);
1489  traits.initAcc(C10);
1490  traits.initAcc(C11);
1491  traits.initAcc(C12);
1492  traits.initAcc(C13);
1493  traits.initAcc(C14);
1494  traits.initAcc(C15);
1495  traits.initAcc(C16);
1496  traits.initAcc(C17);
1497  traits.initAcc(C18);
1498  traits.initAcc(C19);
1499  traits.initAcc(C20);
1500  traits.initAcc(C21);
1501  traits.initAcc(C22);
1502  traits.initAcc(C23);
1503 
1504  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1505  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1506  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1507  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1508  LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
1509  LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
1510  LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
1511  LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
1512 
1513  r0.prefetch(0);
1514  r1.prefetch(0);
1515  r2.prefetch(0);
1516  r3.prefetch(0);
1517  r4.prefetch(0);
1518  r5.prefetch(0);
1519  r6.prefetch(0);
1520  r7.prefetch(0);
1521 
1522  // performs "inner" products
1523  const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
1524  prefetch(&blB[0]);
1525  LhsPacket A0, A1;
1526  for (Index k = 0; k < peeled_kc; k += pk) {
1527  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX8");
1528  // 27 registers are taken (24 for acc, 3 for lhs).
1529  RhsPanel27 rhs_panel;
1530  RhsPacket T0;
1531  LhsPacket A2;
1532 #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
1533 // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1534 // without this workaround A0, A1, and A2 are loaded in the same register,
1535 // which is not good for pipelining
1536 #define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
1537 #else
1538 #define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND
1539 #endif
1540 
1541 #define EIGEN_GEBP_ONESTEP(K) \
1542  do { \
1543  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX8"); \
1544  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1545  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1546  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1547  EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND traits.loadRhs(blB + (0 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1548  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1549  traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
1550  traits.madd(A2, rhs_panel, C16, T0, fix<0>); \
1551  traits.updateRhs(blB + (1 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1552  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1553  traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
1554  traits.madd(A2, rhs_panel, C17, T0, fix<1>); \
1555  traits.updateRhs(blB + (2 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1556  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1557  traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
1558  traits.madd(A2, rhs_panel, C18, T0, fix<2>); \
1559  traits.updateRhs(blB + (3 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1560  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1561  traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
1562  traits.madd(A2, rhs_panel, C19, T0, fix<3>); \
1563  traits.loadRhs(blB + (4 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1564  traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
1565  traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
1566  traits.madd(A2, rhs_panel, C20, T0, fix<0>); \
1567  traits.updateRhs(blB + (5 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1568  traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
1569  traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
1570  traits.madd(A2, rhs_panel, C21, T0, fix<1>); \
1571  traits.updateRhs(blB + (6 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1572  traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
1573  traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
1574  traits.madd(A2, rhs_panel, C22, T0, fix<2>); \
1575  traits.updateRhs(blB + (7 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1576  traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
1577  traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
1578  traits.madd(A2, rhs_panel, C23, T0, fix<3>); \
1579  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX8"); \
1580  } while (false)
1581 
1582  EIGEN_GEBP_ONESTEP(0);
1583  EIGEN_GEBP_ONESTEP(1);
1584  EIGEN_GEBP_ONESTEP(2);
1585  EIGEN_GEBP_ONESTEP(3);
1586  EIGEN_GEBP_ONESTEP(4);
1587  EIGEN_GEBP_ONESTEP(5);
1588  EIGEN_GEBP_ONESTEP(6);
1589  EIGEN_GEBP_ONESTEP(7);
1590 
1591  blB += pk * 8 * RhsProgress;
1592  blA += pk * 3 * Traits::LhsProgress;
1593  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX8");
1594  }
1595 
1596  // process remaining peeled loop
1597  for (Index k = peeled_kc; k < depth; k++) {
1598  RhsPanel27 rhs_panel;
1599  RhsPacket T0;
1600  LhsPacket A2;
1601  EIGEN_GEBP_ONESTEP(0);
1602  blB += 8 * RhsProgress;
1603  blA += 3 * Traits::LhsProgress;
1604  }
1605 
1606 #undef EIGEN_GEBP_ONESTEP
1607 
1608  ResPacket R0, R1, R2;
1609  ResPacket alphav = pset1<ResPacket>(alpha);
1610 
1611  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1612  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1613  R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1614  traits.acc(C0, alphav, R0);
1615  traits.acc(C8, alphav, R1);
1616  traits.acc(C16, alphav, R2);
1617  r0.storePacket(0 * Traits::ResPacketSize, R0);
1618  r0.storePacket(1 * Traits::ResPacketSize, R1);
1619  r0.storePacket(2 * Traits::ResPacketSize, R2);
1620 
1621  R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1622  R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1623  R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1624  traits.acc(C1, alphav, R0);
1625  traits.acc(C9, alphav, R1);
1626  traits.acc(C17, alphav, R2);
1627  r1.storePacket(0 * Traits::ResPacketSize, R0);
1628  r1.storePacket(1 * Traits::ResPacketSize, R1);
1629  r1.storePacket(2 * Traits::ResPacketSize, R2);
1630 
1631  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1632  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1633  R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1634  traits.acc(C2, alphav, R0);
1635  traits.acc(C10, alphav, R1);
1636  traits.acc(C18, alphav, R2);
1637  r2.storePacket(0 * Traits::ResPacketSize, R0);
1638  r2.storePacket(1 * Traits::ResPacketSize, R1);
1639  r2.storePacket(2 * Traits::ResPacketSize, R2);
1640 
1641  R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1642  R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1643  R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1644  traits.acc(C3, alphav, R0);
1645  traits.acc(C11, alphav, R1);
1646  traits.acc(C19, alphav, R2);
1647  r3.storePacket(0 * Traits::ResPacketSize, R0);
1648  r3.storePacket(1 * Traits::ResPacketSize, R1);
1649  r3.storePacket(2 * Traits::ResPacketSize, R2);
1650 
1651  R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1652  R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1653  R2 = r4.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1654  traits.acc(C4, alphav, R0);
1655  traits.acc(C12, alphav, R1);
1656  traits.acc(C20, alphav, R2);
1657  r4.storePacket(0 * Traits::ResPacketSize, R0);
1658  r4.storePacket(1 * Traits::ResPacketSize, R1);
1659  r4.storePacket(2 * Traits::ResPacketSize, R2);
1660 
1661  R0 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1662  R1 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1663  R2 = r5.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1664  traits.acc(C5, alphav, R0);
1665  traits.acc(C13, alphav, R1);
1666  traits.acc(C21, alphav, R2);
1667  r5.storePacket(0 * Traits::ResPacketSize, R0);
1668  r5.storePacket(1 * Traits::ResPacketSize, R1);
1669  r5.storePacket(2 * Traits::ResPacketSize, R2);
1670 
1671  R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1672  R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1673  R2 = r6.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1674  traits.acc(C6, alphav, R0);
1675  traits.acc(C14, alphav, R1);
1676  traits.acc(C22, alphav, R2);
1677  r6.storePacket(0 * Traits::ResPacketSize, R0);
1678  r6.storePacket(1 * Traits::ResPacketSize, R1);
1679  r6.storePacket(2 * Traits::ResPacketSize, R2);
1680 
1681  R0 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1682  R1 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1683  R2 = r7.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1684  traits.acc(C7, alphav, R0);
1685  traits.acc(C15, alphav, R1);
1686  traits.acc(C23, alphav, R2);
1687  r7.storePacket(0 * Traits::ResPacketSize, R0);
1688  r7.storePacket(1 * Traits::ResPacketSize, R1);
1689  r7.storePacket(2 * Traits::ResPacketSize, R2);
1690  }
1691  }
1692  }
1693 #endif
1694  for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
1695  for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
1696  // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
1697  // stored into 3 x nr registers.
1698 
1699  const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
1700  prefetch(&blA[0]);
1701 
1702  // gets res block as register
1703  AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;
1704  traits.initAcc(C0);
1705  traits.initAcc(C1);
1706  traits.initAcc(C2);
1707  traits.initAcc(C3);
1708  traits.initAcc(C4);
1709  traits.initAcc(C5);
1710  traits.initAcc(C6);
1711  traits.initAcc(C7);
1712  traits.initAcc(C8);
1713  traits.initAcc(C9);
1714  traits.initAcc(C10);
1715  traits.initAcc(C11);
1716 
1717  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1718  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1719  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1720  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1721 
1722  r0.prefetch(0);
1723  r1.prefetch(0);
1724  r2.prefetch(0);
1725  r3.prefetch(0);
1726 
1727  // performs "inner" products
1728  const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
1729  prefetch(&blB[0]);
1730  LhsPacket A0, A1;
1731 
1732  for (Index k = 0; k < peeled_kc; k += pk) {
1733  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
1734  // 15 registers are taken (12 for acc, 3 for lhs).
1735  RhsPanel15 rhs_panel;
1736  RhsPacket T0;
1737  LhsPacket A2;
1738 #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
1739 // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1740 // without this workaround A0, A1, and A2 are loaded in the same register,
1741 // which is not good for pipelining
1742 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
1743 #else
1744 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
1745 #endif
1746 #define EIGEN_GEBP_ONESTEP(K) \
1747  do { \
1748  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1749  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1750  internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1751  if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1752  internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1753  } /* Bug 953 */ \
1754  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1755  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1756  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1757  EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1758  traits.loadRhs(blB + (0 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1759  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1760  traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1761  traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1762  traits.updateRhs(blB + (1 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1763  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1764  traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1765  traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1766  traits.updateRhs(blB + (2 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1767  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1768  traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1769  traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1770  traits.updateRhs(blB + (3 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1771  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1772  traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1773  traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1774  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1775  } while (false)
1776 
1777  internal::prefetch(blB);
1778  EIGEN_GEBP_ONESTEP(0);
1779  EIGEN_GEBP_ONESTEP(1);
1780  EIGEN_GEBP_ONESTEP(2);
1781  EIGEN_GEBP_ONESTEP(3);
1782  EIGEN_GEBP_ONESTEP(4);
1783  EIGEN_GEBP_ONESTEP(5);
1784  EIGEN_GEBP_ONESTEP(6);
1785  EIGEN_GEBP_ONESTEP(7);
1786 
1787  blB += pk * 4 * RhsProgress;
1788  blA += pk * 3 * Traits::LhsProgress;
1789 
1790  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
1791  }
1792  // process remaining peeled loop
1793  for (Index k = peeled_kc; k < depth; k++) {
1794  RhsPanel15 rhs_panel;
1795  RhsPacket T0;
1796  LhsPacket A2;
1797  EIGEN_GEBP_ONESTEP(0);
1798  blB += 4 * RhsProgress;
1799  blA += 3 * Traits::LhsProgress;
1800  }
1801 
1802 #undef EIGEN_GEBP_ONESTEP
1803 
1804  ResPacket R0, R1, R2;
1805  ResPacket alphav = pset1<ResPacket>(alpha);
1806 
1807  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1808  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1809  R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1810  traits.acc(C0, alphav, R0);
1811  traits.acc(C4, alphav, R1);
1812  traits.acc(C8, alphav, R2);
1813  r0.storePacket(0 * Traits::ResPacketSize, R0);
1814  r0.storePacket(1 * Traits::ResPacketSize, R1);
1815  r0.storePacket(2 * Traits::ResPacketSize, R2);
1816 
1817  R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1818  R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1819  R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1820  traits.acc(C1, alphav, R0);
1821  traits.acc(C5, alphav, R1);
1822  traits.acc(C9, alphav, R2);
1823  r1.storePacket(0 * Traits::ResPacketSize, R0);
1824  r1.storePacket(1 * Traits::ResPacketSize, R1);
1825  r1.storePacket(2 * Traits::ResPacketSize, R2);
1826 
1827  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1828  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1829  R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1830  traits.acc(C2, alphav, R0);
1831  traits.acc(C6, alphav, R1);
1832  traits.acc(C10, alphav, R2);
1833  r2.storePacket(0 * Traits::ResPacketSize, R0);
1834  r2.storePacket(1 * Traits::ResPacketSize, R1);
1835  r2.storePacket(2 * Traits::ResPacketSize, R2);
1836 
1837  R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1838  R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1839  R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1840  traits.acc(C3, alphav, R0);
1841  traits.acc(C7, alphav, R1);
1842  traits.acc(C11, alphav, R2);
1843  r3.storePacket(0 * Traits::ResPacketSize, R0);
1844  r3.storePacket(1 * Traits::ResPacketSize, R1);
1845  r3.storePacket(2 * Traits::ResPacketSize, R2);
1846  }
1847  }
1848 
1849  // Deal with remaining columns of the rhs
1850  for (Index j2 = packet_cols4; j2 < cols; j2++) {
1851  for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
1852  // One column at a time
1853  const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * Traits::LhsProgress)];
1854  prefetch(&blA[0]);
1855 
1856  // gets res block as register
1857  AccPacket C0, C4, C8;
1858  traits.initAcc(C0);
1859  traits.initAcc(C4);
1860  traits.initAcc(C8);
1861 
1862  LinearMapper r0 = res.getLinearMapper(i, j2);
1863  r0.prefetch(0);
1864 
1865  // performs "inner" products
1866  const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
1867  LhsPacket A0, A1, A2;
1868 
1869  for (Index k = 0; k < peeled_kc; k += pk) {
1870  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1871  RhsPacket B_0;
1872 #define EIGEN_GEBGP_ONESTEP(K) \
1873  do { \
1874  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1875  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1876  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1877  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1878  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1879  traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1880  traits.madd(A0, B_0, C0, B_0, fix<0>); \
1881  traits.madd(A1, B_0, C4, B_0, fix<0>); \
1882  traits.madd(A2, B_0, C8, B_0, fix<0>); \
1883  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1884  } while (false)
1885 
1894 
1895  blB += int(pk) * int(RhsProgress);
1896  blA += int(pk) * 3 * int(Traits::LhsProgress);
1897 
1898  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1899  }
1900 
1901  // process the remaining depth iterations not covered by the peeled loop
1902  for (Index k = peeled_kc; k < depth; k++) {
1903  RhsPacket B_0;
1905  blB += RhsProgress;
1906  blA += 3 * Traits::LhsProgress;
1907  }
1908 #undef EIGEN_GEBGP_ONESTEP
1909  ResPacket R0, R1, R2;
1910  ResPacket alphav = pset1<ResPacket>(alpha);
1911 
1912  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1913  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1914  R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1915  traits.acc(C0, alphav, R0);
1916  traits.acc(C4, alphav, R1);
1917  traits.acc(C8, alphav, R2);
1918  r0.storePacket(0 * Traits::ResPacketSize, R0);
1919  r0.storePacket(1 * Traits::ResPacketSize, R1);
1920  r0.storePacket(2 * Traits::ResPacketSize, R2);
1921  }
1922  }
1923  }
1924  }
1925 
1926  //---------- Process 2 * LhsProgress rows at once ----------
1927  if (mr >= 2 * Traits::LhsProgress) {
1928  const Index l1 = defaultL1CacheSize; // in Bytes, TODO, l1 should be passed to this function.
1929  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1930  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
1931  // guess), or because we are testing specific blocking sizes.
1932  Index actual_panel_rows =
1933  (2 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
1934  (depth * sizeof(LhsScalar) * 2 * LhsProgress)));
1935 
1936  for (Index i1 = peeled_mc3; i1 < peeled_mc2; i1 += actual_panel_rows) {
1937  Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc2);
1938 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
1939  EIGEN_IF_CONSTEXPR(nr >= 8) {
1940  for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
1941  for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
1942  const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
1943  prefetch(&blA[0]);
1944 
1945  AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15;
1946  traits.initAcc(C0);
1947  traits.initAcc(C1);
1948  traits.initAcc(C2);
1949  traits.initAcc(C3);
1950  traits.initAcc(C4);
1951  traits.initAcc(C5);
1952  traits.initAcc(C6);
1953  traits.initAcc(C7);
1954  traits.initAcc(C8);
1955  traits.initAcc(C9);
1956  traits.initAcc(C10);
1957  traits.initAcc(C11);
1958  traits.initAcc(C12);
1959  traits.initAcc(C13);
1960  traits.initAcc(C14);
1961  traits.initAcc(C15);
1962 
1963  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1964  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1965  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1966  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1967  LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
1968  LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
1969  LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
1970  LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
1971  r0.prefetch(prefetch_res_offset);
1972  r1.prefetch(prefetch_res_offset);
1973  r2.prefetch(prefetch_res_offset);
1974  r3.prefetch(prefetch_res_offset);
1975  r4.prefetch(prefetch_res_offset);
1976  r5.prefetch(prefetch_res_offset);
1977  r6.prefetch(prefetch_res_offset);
1978  r7.prefetch(prefetch_res_offset);
1979 
1980  const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
1981  prefetch(&blB[0]);
1982  LhsPacket A0, A1;
1983  for (Index k = 0; k < peeled_kc; k += pk) {
1984  RhsPacketx4 rhs_panel;
1985  RhsPacket T0;
1986 // NOTE: the begin/end asm comments below work around bug 935!
1987 // but they are not enough for gcc>=6 without FMA (bug 1637)
1988 #if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE)
1989 #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
1990 #else
1991 #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND
1992 #endif
1993 #define EIGEN_GEBGP_ONESTEP(K) \
1994  do { \
1995  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX8"); \
1996  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1997  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1998  traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
1999  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
2000  traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
2001  traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
2002  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
2003  traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
2004  traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
2005  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
2006  traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
2007  traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
2008  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
2009  traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
2010  traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
2011  traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
2012  traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
2013  traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
2014  traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
2015  traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
2016  traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
2017  traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
2018  traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
2019  traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
2020  traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
2021  traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
2022  EIGEN_GEBP_2Px8_SPILLING_WORKAROUND EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX8"); \
2023  } while (false)
2024 
2025  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX8");
2026 
2035 
2036  blB += pk * 8 * RhsProgress;
2037  blA += pk * (2 * Traits::LhsProgress);
2038 
2039  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX8");
2040  }
2041  // process the remaining depth iterations not covered by the peeled loop
2042  for (Index k = peeled_kc; k < depth; k++) {
2043  RhsPacketx4 rhs_panel;
2044  RhsPacket T0;
2046  blB += 8 * RhsProgress;
2047  blA += 2 * Traits::LhsProgress;
2048  }
2049 
2050 #undef EIGEN_GEBGP_ONESTEP
2051 
2052  ResPacket R0, R1, R2, R3;
2053  ResPacket alphav = pset1<ResPacket>(alpha);
2054 
2055  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2056  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2057  R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2058  R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2059  traits.acc(C0, alphav, R0);
2060  traits.acc(C8, alphav, R1);
2061  traits.acc(C1, alphav, R2);
2062  traits.acc(C9, alphav, R3);
2063  r0.storePacket(0 * Traits::ResPacketSize, R0);
2064  r0.storePacket(1 * Traits::ResPacketSize, R1);
2065  r1.storePacket(0 * Traits::ResPacketSize, R2);
2066  r1.storePacket(1 * Traits::ResPacketSize, R3);
2067 
2068  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2069  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2070  R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2071  R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2072  traits.acc(C2, alphav, R0);
2073  traits.acc(C10, alphav, R1);
2074  traits.acc(C3, alphav, R2);
2075  traits.acc(C11, alphav, R3);
2076  r2.storePacket(0 * Traits::ResPacketSize, R0);
2077  r2.storePacket(1 * Traits::ResPacketSize, R1);
2078  r3.storePacket(0 * Traits::ResPacketSize, R2);
2079  r3.storePacket(1 * Traits::ResPacketSize, R3);
2080 
2081  R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2082  R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2083  R2 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2084  R3 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2085  traits.acc(C4, alphav, R0);
2086  traits.acc(C12, alphav, R1);
2087  traits.acc(C5, alphav, R2);
2088  traits.acc(C13, alphav, R3);
2089  r4.storePacket(0 * Traits::ResPacketSize, R0);
2090  r4.storePacket(1 * Traits::ResPacketSize, R1);
2091  r5.storePacket(0 * Traits::ResPacketSize, R2);
2092  r5.storePacket(1 * Traits::ResPacketSize, R3);
2093 
2094  R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2095  R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2096  R2 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2097  R3 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2098  traits.acc(C6, alphav, R0);
2099  traits.acc(C14, alphav, R1);
2100  traits.acc(C7, alphav, R2);
2101  traits.acc(C15, alphav, R3);
2102  r6.storePacket(0 * Traits::ResPacketSize, R0);
2103  r6.storePacket(1 * Traits::ResPacketSize, R1);
2104  r7.storePacket(0 * Traits::ResPacketSize, R2);
2105  r7.storePacket(1 * Traits::ResPacketSize, R3);
2106  }
2107  }
2108  }
2109 #endif
2110  for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
2111  for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
2112  // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
2113  // stored into 2 x nr registers.
2114 
2115  const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
2116  prefetch(&blA[0]);
2117 
2118  // gets res block as register
2119  AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
2120  traits.initAcc(C0);
2121  traits.initAcc(C1);
2122  traits.initAcc(C2);
2123  traits.initAcc(C3);
2124  traits.initAcc(C4);
2125  traits.initAcc(C5);
2126  traits.initAcc(C6);
2127  traits.initAcc(C7);
2128 
2129  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
2130  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
2131  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
2132  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
2133 
2134  r0.prefetch(prefetch_res_offset);
2135  r1.prefetch(prefetch_res_offset);
2136  r2.prefetch(prefetch_res_offset);
2137  r3.prefetch(prefetch_res_offset);
2138 
2139  // performs "inner" products
2140  const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
2141  prefetch(&blB[0]);
2142  LhsPacket A0, A1;
2143 
2144  for (Index k = 0; k < peeled_kc; k += pk) {
2145  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
2146  RhsPacketx4 rhs_panel;
2147  RhsPacket T0;
2148 
2149 // NOTE: the begin/end asm comments below work around bug 935!
2150 // but they are not enough for gcc>=6 without FMA (bug 1637)
2151 #if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
2152 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
2153 #else
2154 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
2155 #endif
2156 #define EIGEN_GEBGP_ONESTEP(K) \
2157  do { \
2158  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
2159  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
2160  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
2161  traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
2162  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
2163  traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
2164  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
2165  traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
2166  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
2167  traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
2168  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
2169  traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
2170  EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
2171  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
2172  } while (false)
2173 
2174  internal::prefetch(blB + (48 + 0));
2179  internal::prefetch(blB + (48 + 16));
2184 
2185  blB += pk * 4 * RhsProgress;
2186  blA += pk * (2 * Traits::LhsProgress);
2187 
2188  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
2189  }
2190  // process the remaining depth iterations not covered by the peeled loop
2191  for (Index k = peeled_kc; k < depth; k++) {
2192  RhsPacketx4 rhs_panel;
2193  RhsPacket T0;
2195  blB += 4 * RhsProgress;
2196  blA += 2 * Traits::LhsProgress;
2197  }
2198 #undef EIGEN_GEBGP_ONESTEP
2199 
2200  ResPacket R0, R1, R2, R3;
2201  ResPacket alphav = pset1<ResPacket>(alpha);
2202 
2203  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2204  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2205  R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2206  R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2207  traits.acc(C0, alphav, R0);
2208  traits.acc(C4, alphav, R1);
2209  traits.acc(C1, alphav, R2);
2210  traits.acc(C5, alphav, R3);
2211  r0.storePacket(0 * Traits::ResPacketSize, R0);
2212  r0.storePacket(1 * Traits::ResPacketSize, R1);
2213  r1.storePacket(0 * Traits::ResPacketSize, R2);
2214  r1.storePacket(1 * Traits::ResPacketSize, R3);
2215 
2216  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2217  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2218  R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2219  R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2220  traits.acc(C2, alphav, R0);
2221  traits.acc(C6, alphav, R1);
2222  traits.acc(C3, alphav, R2);
2223  traits.acc(C7, alphav, R3);
2224  r2.storePacket(0 * Traits::ResPacketSize, R0);
2225  r2.storePacket(1 * Traits::ResPacketSize, R1);
2226  r3.storePacket(0 * Traits::ResPacketSize, R2);
2227  r3.storePacket(1 * Traits::ResPacketSize, R3);
2228  }
2229  }
2230 
2231  // Deal with remaining columns of the rhs
2232  for (Index j2 = packet_cols4; j2 < cols; j2++) {
2233  for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
2234  // One column at a time
2235  const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
2236  prefetch(&blA[0]);
2237 
2238  // gets res block as register
2239  AccPacket C0, C4;
2240  traits.initAcc(C0);
2241  traits.initAcc(C4);
2242 
2243  LinearMapper r0 = res.getLinearMapper(i, j2);
2244  r0.prefetch(prefetch_res_offset);
2245 
2246  // performs "inner" products
2247  const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
2248  LhsPacket A0, A1;
2249 
2250  for (Index k = 0; k < peeled_kc; k += pk) {
2251  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
2252  RhsPacket B_0, B1;
2253 
2254 #define EIGEN_GEBGP_ONESTEP(K) \
2255  do { \
2256  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
2257  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
2258  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
2259  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
2260  traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
2261  traits.madd(A0, B_0, C0, B1, fix<0>); \
2262  traits.madd(A1, B_0, C4, B_0, fix<0>); \
2263  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
2264  } while (false)
2265 
2274 
2275  blB += int(pk) * int(RhsProgress);
2276  blA += int(pk) * 2 * int(Traits::LhsProgress);
2277 
2278  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
2279  }
2280 
2281  // process the remaining depth iterations not covered by the peeled loop
2282  for (Index k = peeled_kc; k < depth; k++) {
2283  RhsPacket B_0, B1;
2285  blB += RhsProgress;
2286  blA += 2 * Traits::LhsProgress;
2287  }
2288 #undef EIGEN_GEBGP_ONESTEP
2289  ResPacket R0, R1;
2290  ResPacket alphav = pset1<ResPacket>(alpha);
2291 
2292  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2293  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2294  traits.acc(C0, alphav, R0);
2295  traits.acc(C4, alphav, R1);
2296  r0.storePacket(0 * Traits::ResPacketSize, R0);
2297  r0.storePacket(1 * Traits::ResPacketSize, R1);
2298  }
2299  }
2300  }
2301  }
2302  //---------- Process 1 * LhsProgress rows at once ----------
2303  if (mr >= 1 * Traits::LhsProgress) {
2304  lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
2305  RhsPacket, ResPacket, Traits, LinearMapper, DataMapper>
2306  p;
2307  p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
2308  peeled_kc, pk, cols, depth, packet_cols4);
2309  }
2310  //---------- Process LhsProgressHalf rows at once ----------
2311  if ((LhsProgressHalf < LhsProgress) && mr >= LhsProgressHalf) {
2312  lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf,
2314  p;
2315  p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
2316  peeled_kc, pk, cols, depth, packet_cols4);
2317  }
2318  //---------- Process LhsProgressQuarter rows at once ----------
2320  lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar,
2322  QuarterTraits, LinearMapper, DataMapper>
2323  p;
2324  p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB,
2325  prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
2326  }
2327  //---------- Process remaining rows, 1 at once ----------
2328  if (peeled_mc_quarter < rows) {
2329 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
2330  EIGEN_IF_CONSTEXPR(nr >= 8) {
2331  // loop on each panel of the rhs
2332  for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
2333  // loop on each row of the lhs (1*LhsProgress x depth)
2334  for (Index i = peeled_mc_quarter; i < rows; i += 1) {
2335  const LhsScalar* blA = &blockA[i * strideA + offsetA];
2336  prefetch(&blA[0]);
2337  // gets a 1 x 8 res block as registers
2338  ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0);
2339  const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
2340  for (Index k = 0; k < depth; k++) {
2341  LhsScalar A0 = blA[k];
2342  RhsScalar B_0;
2343 
2344  B_0 = blB[0];
2345  C0 = cj.pmadd(A0, B_0, C0);
2346 
2347  B_0 = blB[1];
2348  C1 = cj.pmadd(A0, B_0, C1);
2349 
2350  B_0 = blB[2];
2351  C2 = cj.pmadd(A0, B_0, C2);
2352 
2353  B_0 = blB[3];
2354  C3 = cj.pmadd(A0, B_0, C3);
2355 
2356  B_0 = blB[4];
2357  C4 = cj.pmadd(A0, B_0, C4);
2358 
2359  B_0 = blB[5];
2360  C5 = cj.pmadd(A0, B_0, C5);
2361 
2362  B_0 = blB[6];
2363  C6 = cj.pmadd(A0, B_0, C6);
2364 
2365  B_0 = blB[7];
2366  C7 = cj.pmadd(A0, B_0, C7);
2367 
2368  blB += 8;
2369  }
2370  res(i, j2 + 0) += alpha * C0;
2371  res(i, j2 + 1) += alpha * C1;
2372  res(i, j2 + 2) += alpha * C2;
2373  res(i, j2 + 3) += alpha * C3;
2374  res(i, j2 + 4) += alpha * C4;
2375  res(i, j2 + 5) += alpha * C5;
2376  res(i, j2 + 6) += alpha * C6;
2377  res(i, j2 + 7) += alpha * C7;
2378  }
2379  }
2380  }
2381 #endif
2382 
2383  for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
2384  // loop on each row of the lhs (1*LhsProgress x depth)
2385  for (Index i = peeled_mc_quarter; i < rows; i += 1) {
2386  const LhsScalar* blA = &blockA[i * strideA + offsetA];
2387  prefetch(&blA[0]);
2388  const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
2389 
2390  // If LhsProgress is 8 or 16, it assumes that there is a
2391  // half or quarter packet, respectively, of the same size as
2392  // nr (which is currently 4) for the return type.
2394  const int SResPacketQuarterSize =
2396  // The following code assumes we can load SRhsPacket in such a way that
2397  // it multiplies blocks of 4 elements in SLhsPacket. This is not the
2398  // case for some customized kernels (i.e. NEON fp16). If the assumption
2399  // fails, drop down to the scalar path.
2400  constexpr bool kCanLoadSRhsQuad =
2402  (unpacket_traits<SRhsPacket>::size % ((std::max<int>)(unpacket_traits<SLhsPacket>::size, 4) / 4)) == 0;
2403  if (kCanLoadSRhsQuad && (SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress <= 16) &&
2404  (SwappedTraits::LhsProgress != 8 || SResPacketHalfSize == nr) &&
2405  (SwappedTraits::LhsProgress != 16 || SResPacketQuarterSize == nr)) {
2406  SAccPacket C0, C1, C2, C3;
2407  straits.initAcc(C0);
2408  straits.initAcc(C1);
2409  straits.initAcc(C2);
2410  straits.initAcc(C3);
2411 
2412  const Index spk = (std::max)(1, SwappedTraits::LhsProgress / 4);
2413  const Index endk = (depth / spk) * spk;
2414  const Index endk4 = (depth / (spk * 4)) * (spk * 4);
2415 
2416  Index k = 0;
2417  for (; k < endk4; k += 4 * spk) {
2418  SLhsPacket A0, A1;
2419  SRhsPacket B_0, B_1;
2420 
2421  straits.loadLhsUnaligned(blB + 0 * SwappedTraits::LhsProgress, A0);
2422  straits.loadLhsUnaligned(blB + 1 * SwappedTraits::LhsProgress, A1);
2423 
2424  straits.loadRhsQuad(blA + 0 * spk, B_0);
2425  straits.loadRhsQuad(blA + 1 * spk, B_1);
2426  straits.madd(A0, B_0, C0, B_0, fix<0>);
2427  straits.madd(A1, B_1, C1, B_1, fix<0>);
2428 
2429  straits.loadLhsUnaligned(blB + 2 * SwappedTraits::LhsProgress, A0);
2430  straits.loadLhsUnaligned(blB + 3 * SwappedTraits::LhsProgress, A1);
2431  straits.loadRhsQuad(blA + 2 * spk, B_0);
2432  straits.loadRhsQuad(blA + 3 * spk, B_1);
2433  straits.madd(A0, B_0, C2, B_0, fix<0>);
2434  straits.madd(A1, B_1, C3, B_1, fix<0>);
2435 
2436  blB += 4 * SwappedTraits::LhsProgress;
2437  blA += 4 * spk;
2438  }
2439  C0 = padd(padd(C0, C1), padd(C2, C3));
2440  for (; k < endk; k += spk) {
2441  SLhsPacket A0;
2442  SRhsPacket B_0;
2443 
2444  straits.loadLhsUnaligned(blB, A0);
2445  straits.loadRhsQuad(blA, B_0);
2446  straits.madd(A0, B_0, C0, B_0, fix<0>);
2447 
2449  blA += spk;
2450  }
2451  if (SwappedTraits::LhsProgress == 8) {
2452  // Special case where we have to first reduce the accumulation register C0
2453  typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SResPacket>::half,
2454  SResPacket>
2455  SResPacketHalf;
2456  typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SLhsPacket>::half,
2457  SLhsPacket>
2458  SLhsPacketHalf;
2459  typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SRhsPacket>::half,
2460  SRhsPacket>
2461  SRhsPacketHalf;
2462  typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SAccPacket>::half,
2463  SAccPacket>
2464  SAccPacketHalf;
2465 
2466  SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
2467  SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
2468 
2469  if (depth - endk > 0) {
2470  // We have to handle the last row of the rhs which corresponds to a half-packet
2471  SLhsPacketHalf a0;
2472  SRhsPacketHalf b0;
2473  straits.loadLhsUnaligned(blB, a0);
2474  straits.loadRhs(blA, b0);
2475  SAccPacketHalf c0 = predux_half_dowto4(C0);
2476  straits.madd(a0, b0, c0, b0, fix<0>);
2477  straits.acc(c0, alphav, R);
2478  } else {
2479  straits.acc(predux_half_dowto4(C0), alphav, R);
2480  }
2481  res.scatterPacket(i, j2, R);
2482  } else if (SwappedTraits::LhsProgress == 16) {
2483  // Special case where we have to first reduce the
2484  // accumulation register C0. We specialize the block in
2485  // template form, so that LhsProgress < 16 paths don't
2486  // fail to compile
2487  last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
2488  p(res, straits, blA, blB, depth, endk, i, j2, alpha, C0);
2489  } else {
2490  SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
2491  SResPacket alphav = pset1<SResPacket>(alpha);
2492  straits.acc(C0, alphav, R);
2493  res.scatterPacket(i, j2, R);
2494  }
2495  } else // scalar path
2496  {
2497  // get a 1 x 4 res block as registers
2498  ResScalar C0(0), C1(0), C2(0), C3(0);
2499 
2500  for (Index k = 0; k < depth; k++) {
2501  LhsScalar A0;
2502  RhsScalar B_0, B_1;
2503 
2504  A0 = blA[k];
2505 
2506  B_0 = blB[0];
2507  B_1 = blB[1];
2508  C0 = cj.pmadd(A0, B_0, C0);
2509  C1 = cj.pmadd(A0, B_1, C1);
2510 
2511  B_0 = blB[2];
2512  B_1 = blB[3];
2513  C2 = cj.pmadd(A0, B_0, C2);
2514  C3 = cj.pmadd(A0, B_1, C3);
2515 
2516  blB += 4;
2517  }
2518  res(i, j2 + 0) += alpha * C0;
2519  res(i, j2 + 1) += alpha * C1;
2520  res(i, j2 + 2) += alpha * C2;
2521  res(i, j2 + 3) += alpha * C3;
2522  }
2523  }
2524  }
2525  // remaining columns
2526  for (Index j2 = packet_cols4; j2 < cols; j2++) {
2527  // loop on each row of the lhs (1*LhsProgress x depth)
2528  for (Index i = peeled_mc_quarter; i < rows; i += 1) {
2529  const LhsScalar* blA = &blockA[i * strideA + offsetA];
2530  prefetch(&blA[0]);
2531  // gets a 1 x 1 res block as registers
2532  ResScalar C0(0);
2533  const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
2534  for (Index k = 0; k < depth; k++) {
2535  LhsScalar A0 = blA[k];
2536  RhsScalar B_0 = blB[k];
2537  C0 = cj.pmadd(A0, B_0, C0);
2538  }
2539  res(i, j2) += alpha * C0;
2540  }
2541  }
2542  }
2543 }
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_ASM_COMMENT(X)
Definition: Macros.h:972
#define EIGEN_IF_CONSTEXPR(X)
Definition: Macros.h:1306
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
Definition: PartialRedux_count.cpp:3
@ R
Definition: StatisticsVector.h:21
float * p
Definition: Tutorial_Map_using.cpp:9
int rows
Definition: Tutorial_commainit_02.cpp:1
int cols
Definition: Tutorial_commainit_02.cpp:1
Scalar Scalar int size
Definition: benchVecAdd.cpp:17
#define min(a, b)
Definition: datatypes.h:22
#define max(a, b)
Definition: datatypes.h:23
return int(ret)+1
RealScalar alpha
Definition: level1_cplx_impl.h:151
char char char int int * k
Definition: level2_impl.h:374
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:318
EIGEN_DEVICE_FUNC void prefetch(const Scalar *addr)
Definition: GenericPacketMath.h:967
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c &a)
Definition: NEON/PacketMath.h:3635
const std::ptrdiff_t defaultL1CacheSize
Definition: products/GeneralBlockPanelKernel.h:61
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
double C1
"Mooney Rivlin" coefficient for generalised Mooney Rivlin law
Definition: TwenteMeshGluing.cpp:74
double C2
"Mooney Rivlin" coefficient for generalised Mooney Rivlin law
Definition: mpi/distribution/airy_cantilever/airy_cantilever2.cc:156
#define EIGEN_GEBGP_ONESTEP(K)
#define EIGEN_GEBP_ONESTEP(K)
DataMapper::LinearMapper LinearMapper
Definition: products/GeneralBlockPanelKernel.h:995
Traits::RhsPacket RhsPacket
Definition: products/GeneralBlockPanelKernel.h:969
SwappedTraits::ResPacket SResPacket
Definition: products/GeneralBlockPanelKernel.h:982
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketHalf > HalfTraits
Definition: products/GeneralBlockPanelKernel.h:963
QuarterTraits::ResPacket ResPacketQuarter
Definition: products/GeneralBlockPanelKernel.h:992
SwappedTraits::AccPacket SAccPacket
Definition: products/GeneralBlockPanelKernel.h:983
QuarterTraits::RhsPacket RhsPacketQuarter
Definition: products/GeneralBlockPanelKernel.h:991
HalfTraits::ResPacket ResPacketHalf
Definition: products/GeneralBlockPanelKernel.h:987
HalfTraits::RhsPacket RhsPacketHalf
Definition: products/GeneralBlockPanelKernel.h:986
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target > Traits
Definition: products/GeneralBlockPanelKernel.h:961
Traits::RhsPacketx4 RhsPacketx4
Definition: products/GeneralBlockPanelKernel.h:972
SwappedTraits::RhsPacket SRhsPacket
Definition: products/GeneralBlockPanelKernel.h:981
QuarterTraits::LhsPacket LhsPacketQuarter
Definition: products/GeneralBlockPanelKernel.h:990
HalfTraits::LhsPacket LhsPacketHalf
Definition: products/GeneralBlockPanelKernel.h:985
QuarterTraits::AccPacket AccPacketQuarter
Definition: products/GeneralBlockPanelKernel.h:993
gebp_traits< RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target > SwappedTraits
Definition: products/GeneralBlockPanelKernel.h:977
RhsPanelHelper< RhsPacket, RhsPacketx4, 27 >::type RhsPanel27
Definition: products/GeneralBlockPanelKernel.h:975
RhsPanelHelper< RhsPacket, RhsPacketx4, 15 >::type RhsPanel15
Definition: products/GeneralBlockPanelKernel.h:974
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketQuarter > QuarterTraits
Definition: products/GeneralBlockPanelKernel.h:965
Traits::LhsPacket LhsPacket
Definition: products/GeneralBlockPanelKernel.h:968
Traits::ResScalar ResScalar
Definition: products/GeneralBlockPanelKernel.h:967
SwappedTraits::LhsPacket SLhsPacket
Definition: products/GeneralBlockPanelKernel.h:980
Traits::ResPacket ResPacket
Definition: products/GeneralBlockPanelKernel.h:970
Traits::AccPacket AccPacket
Definition: products/GeneralBlockPanelKernel.h:971
HalfTraits::AccPacket AccPacketHalf
Definition: products/GeneralBlockPanelKernel.h:988
T half
Definition: GenericPacketMath.h:136
@ size
Definition: GenericPacketMath.h:139

References Eigen::internal::gebp_traits< LhsScalar_, RhsScalar_, ConjLhs_, ConjRhs_, Arch, PacketSize_ >::acc(), alpha, Global_Physical_Variables::C1, Global_Physical_Variables::C2, cols, Eigen::internal::defaultL1CacheSize, EIGEN_ASM_COMMENT, EIGEN_GEBGP_ONESTEP, EIGEN_GEBP_ONESTEP, EIGEN_IF_CONSTEXPR, i, Eigen::internal::gebp_traits< LhsScalar_, RhsScalar_, ConjLhs_, ConjRhs_, Arch, PacketSize_ >::initAcc(), int(), k, Eigen::internal::gebp_traits< LhsScalar_, RhsScalar_, ConjLhs_, ConjRhs_, Arch, PacketSize_ >::loadLhsUnaligned(), Eigen::internal::gebp_traits< LhsScalar_, RhsScalar_, ConjLhs_, ConjRhs_, Arch, PacketSize_ >::loadRhs(), Eigen::internal::gebp_traits< LhsScalar_, RhsScalar_, ConjLhs_, ConjRhs_, Arch, PacketSize_ >::loadRhsQuad(), Eigen::internal::gebp_traits< LhsScalar_, RhsScalar_, ConjLhs_, ConjRhs_, Arch, PacketSize_ >::madd(), max, min, p, Eigen::internal::padd(), Eigen::internal::conj_helper< LhsType, RhsType, ConjLhs, ConjRhs >::pmadd(), Eigen::internal::predux_half_dowto4(), Eigen::internal::prefetch(), R, res, rows, and size.


The documentation for this struct was generated from the following file: