1432 if (strideA == -1) strideA = depth;
1433 if (strideB == -1) strideB = depth;
1434 conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
1435 Index packet_cols4 = nr >= 4 ? (
cols / 4) * 4 : 0;
1436 Index packet_cols8 = nr >= 8 ? (
cols / 8) * 8 : 0;
1438 const Index peeled_mc2 =
1440 const Index peeled_mc1 =
1442 const Index peeled_mc_half =
1444 const Index peeled_mc_quarter =
1449 const Index peeled_kc = depth & ~(pk - 1);
1450 const int prefetch_res_offset = 32 /
sizeof(
ResScalar);
1465 const Index actual_panel_rows =
1466 (3 *
LhsProgress) * std::max<Index>(1, ((l1 -
sizeof(
ResScalar) * mr * nr - depth * nr *
sizeof(RhsScalar)) /
1468 for (
Index i1 = 0; i1 < peeled_mc3; i1 += actual_panel_rows) {
1469 const Index actual_panel_end = (
std::min)(i1 + actual_panel_rows, peeled_mc3);
1470 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
1472 for (
Index j2 = 0; j2 < packet_cols8; j2 += 8) {
1474 const LhsScalar* blA = &blockA[
i * strideA + offsetA * (3 *
LhsProgress)];
1477 AccPacket C0,
C1,
C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15, C16, C17, C18, C19, C20,
1489 traits.initAcc(C10);
1490 traits.initAcc(C11);
1491 traits.initAcc(C12);
1492 traits.initAcc(C13);
1493 traits.initAcc(C14);
1494 traits.initAcc(C15);
1495 traits.initAcc(C16);
1496 traits.initAcc(C17);
1497 traits.initAcc(C18);
1498 traits.initAcc(C19);
1499 traits.initAcc(C20);
1500 traits.initAcc(C21);
1501 traits.initAcc(C22);
1502 traits.initAcc(C23);
1523 const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
1526 for (
Index k = 0;
k < peeled_kc;
k += pk) {
1532 #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
1536 #define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
1538 #define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND
// EIGEN_GEBP_ONESTEP(K): depth step K of the "3pX8" micro kernel (three LHS packets, eight RHS entries).
//  - Loads A0, A1, A2 from blA at offsets (0 + 3*K), (1 + 3*K), (2 + 3*K) times LhsProgress.
//  - Walks the RHS entries c = 0..7 located at blB + (c + 8*K) * Traits::RhsProgress. rhs_panel is
//    filled with loadRhs for c = 0 and c = 4 and with updateRhs for every other c.
//  - For entry c the three madd calls accumulate into C<c> (with A0), C<8+c> (with A1) and
//    C<16+c> (with A2); the last madd argument is fix<c % 4>.
//  - EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND is expanded right after the three LHS loads.
// NOTE(review): A0..A2, rhs_panel, T0 and C0..C23 are not declared by the macro itself; presumably
// they are locals at the expansion site -- confirm against the full file.
1541 #define EIGEN_GEBP_ONESTEP(K) \
1543 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX8"); \
1544 traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1545 traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1546 traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1547 EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND traits.loadRhs(blB + (0 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1548 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1549 traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
1550 traits.madd(A2, rhs_panel, C16, T0, fix<0>); \
1551 traits.updateRhs(blB + (1 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1552 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1553 traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
1554 traits.madd(A2, rhs_panel, C17, T0, fix<1>); \
1555 traits.updateRhs(blB + (2 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1556 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1557 traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
1558 traits.madd(A2, rhs_panel, C18, T0, fix<2>); \
1559 traits.updateRhs(blB + (3 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1560 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1561 traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
1562 traits.madd(A2, rhs_panel, C19, T0, fix<3>); \
1563 traits.loadRhs(blB + (4 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1564 traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
1565 traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
1566 traits.madd(A2, rhs_panel, C20, T0, fix<0>); \
1567 traits.updateRhs(blB + (5 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1568 traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
1569 traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
1570 traits.madd(A2, rhs_panel, C21, T0, fix<1>); \
1571 traits.updateRhs(blB + (6 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1572 traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
1573 traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
1574 traits.madd(A2, rhs_panel, C22, T0, fix<2>); \
1575 traits.updateRhs(blB + (7 + 8 * K) * Traits::RhsProgress, rhs_panel); \
1576 traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
1577 traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
1578 traits.madd(A2, rhs_panel, C23, T0, fix<3>); \
1579 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX8"); \
1597 for (
Index k = peeled_kc;
k < depth;
k++) {
1606 #undef EIGEN_GEBP_ONESTEP
1614 traits.acc(C0, alphav, R0);
1615 traits.acc(C8, alphav, R1);
1616 traits.acc(C16, alphav, R2);
1624 traits.acc(
C1, alphav, R0);
1625 traits.acc(C9, alphav, R1);
1626 traits.acc(C17, alphav, R2);
1634 traits.acc(
C2, alphav, R0);
1635 traits.acc(C10, alphav, R1);
1636 traits.acc(C18, alphav, R2);
1644 traits.acc(C3, alphav, R0);
1645 traits.acc(C11, alphav, R1);
1646 traits.acc(C19, alphav, R2);
1654 traits.acc(C4, alphav, R0);
1655 traits.acc(C12, alphav, R1);
1656 traits.acc(C20, alphav, R2);
1664 traits.acc(C5, alphav, R0);
1665 traits.acc(C13, alphav, R1);
1666 traits.acc(C21, alphav, R2);
1674 traits.acc(C6, alphav, R0);
1675 traits.acc(C14, alphav, R1);
1676 traits.acc(C22, alphav, R2);
1684 traits.acc(C7, alphav, R0);
1685 traits.acc(C15, alphav, R1);
1686 traits.acc(C23, alphav, R2);
1694 for (
Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
1699 const LhsScalar* blA = &blockA[
i * strideA + offsetA * (3 *
LhsProgress)];
1703 AccPacket C0,
C1,
C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;
1714 traits.initAcc(C10);
1715 traits.initAcc(C11);
1728 const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
1732 for (
Index k = 0;
k < peeled_kc;
k += pk) {
1738 #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
1742 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
1744 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
// EIGEN_GEBP_ONESTEP(K): depth step K of the "3pX4" micro kernel (three LHS packets, four RHS entries).
//  - Prefetches blA + (3*K + 16) * LhsProgress; when EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS is true it
//    also prefetches blB + (4*K + 16) * RhsProgress.
//  - Loads A0, A1, A2 from blA at offsets (0 + 3*K), (1 + 3*K), (2 + 3*K) times LhsProgress.
//  - Walks the RHS entries c = 0..3 located at blB + (c + 4*K) * Traits::RhsProgress: loadRhs for
//    c = 0, updateRhs for c = 1..3.
//  - For entry c the three madd calls accumulate into C<c> (with A0), C<4+c> (with A1) and
//    C<8+c> (with A2); the last madd argument is fix<c>.
//  - EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND is expanded right after the three LHS loads.
// NOTE(review): A0..A2, rhs_panel, T0 and C0..C11 are not declared by the macro itself; presumably
// they are locals at the expansion site -- confirm against the full file.
1746 #define EIGEN_GEBP_ONESTEP(K) \
1748 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1749 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1750 internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1751 if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1752 internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1754 traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1755 traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1756 traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1757 EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1758 traits.loadRhs(blB + (0 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1759 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1760 traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1761 traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1762 traits.updateRhs(blB + (1 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1763 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1764 traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1765 traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1766 traits.updateRhs(blB + (2 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1767 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1768 traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1769 traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1770 traits.updateRhs(blB + (3 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1771 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1772 traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1773 traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1774 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1793 for (
Index k = peeled_kc;
k < depth;
k++) {
1802 #undef EIGEN_GEBP_ONESTEP
1810 traits.acc(C0, alphav, R0);
1811 traits.acc(C4, alphav, R1);
1812 traits.acc(C8, alphav, R2);
1820 traits.acc(
C1, alphav, R0);
1821 traits.acc(C5, alphav, R1);
1822 traits.acc(C9, alphav, R2);
1830 traits.acc(
C2, alphav, R0);
1831 traits.acc(C6, alphav, R1);
1832 traits.acc(C10, alphav, R2);
1840 traits.acc(C3, alphav, R0);
1841 traits.acc(C7, alphav, R1);
1842 traits.acc(C11, alphav, R2);
1850 for (
Index j2 = packet_cols4; j2 <
cols; j2++) {
1866 const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
1869 for (
Index k = 0;
k < peeled_kc;
k += pk) {
// EIGEN_GEBGP_ONESTEP(K): depth step K of the "3pX1" micro kernel (three LHS packets, one RHS entry).
//  - Loads A0, A1, A2 from blA at offsets (0 + 3*K), (1 + 3*K), (2 + 3*K) times LhsProgress.
//  - Loads the single RHS value B_0 from blB[(0 + K) * RhsProgress].
//  - Accumulates into C0 (with A0), C4 (with A1) and C8 (with A2). B_0 is passed both as the RHS
//    operand and as the fourth madd argument; every call uses fix<0>.
// NOTE(review): A0..A2, B_0, C0, C4 and C8 are not declared by the macro itself; presumably they
// are locals at the expansion site -- confirm against the full file.
1872 #define EIGEN_GEBGP_ONESTEP(K) \
1874 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1875 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1876 traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1877 traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1878 traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1879 traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1880 traits.madd(A0, B_0, C0, B_0, fix<0>); \
1881 traits.madd(A1, B_0, C4, B_0, fix<0>); \
1882 traits.madd(A2, B_0, C8, B_0, fix<0>); \
1883 EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1902 for (
Index k = peeled_kc;
k < depth;
k++) {
1908 #undef EIGEN_GEBGP_ONESTEP
1915 traits.acc(C0, alphav, R0);
1916 traits.acc(C4, alphav, R1);
1917 traits.acc(C8, alphav, R2);
1932 Index actual_panel_rows =
1933 (2 *
LhsProgress) * std::max<Index>(1, ((l1 -
sizeof(
ResScalar) * mr * nr - depth * nr *
sizeof(RhsScalar)) /
1936 for (
Index i1 = peeled_mc3; i1 < peeled_mc2; i1 += actual_panel_rows) {
1937 Index actual_panel_end = (
std::min)(i1 + actual_panel_rows, peeled_mc2);
1938 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
1940 for (
Index j2 = 0; j2 < packet_cols8; j2 += 8) {
1945 AccPacket C0,
C1,
C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15;
1956 traits.initAcc(C10);
1957 traits.initAcc(C11);
1958 traits.initAcc(C12);
1959 traits.initAcc(C13);
1960 traits.initAcc(C14);
1961 traits.initAcc(C15);
1971 r0.prefetch(prefetch_res_offset);
1972 r1.prefetch(prefetch_res_offset);
1973 r2.prefetch(prefetch_res_offset);
1974 r3.prefetch(prefetch_res_offset);
1975 r4.prefetch(prefetch_res_offset);
1976 r5.prefetch(prefetch_res_offset);
1977 r6.prefetch(prefetch_res_offset);
1978 r7.prefetch(prefetch_res_offset);
1980 const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
1983 for (
Index k = 0;
k < peeled_kc;
k += pk) {
1988 #if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE)
1989 #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
1991 #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND
// EIGEN_GEBGP_ONESTEP(K): depth step K of the "2pX8" micro kernel (two LHS packets, eight RHS entries).
//  - Loads A0 and A1 from blA at offsets (0 + 2*K) and (1 + 2*K) times LhsProgress.
//  - Walks the RHS entries c = 0..7 located at &blB[(c + 8*K) * RhsProgress]. rhs_panel is filled
//    with loadRhs for c = 0 and c = 4 and with updateRhs for every other c.
//  - For entry c the two madd calls accumulate into C<c> (with A0) and C<8+c> (with A1); the last
//    madd argument is fix<c % 4>.
//  - EIGEN_GEBP_2Px8_SPILLING_WORKAROUND is expanded after the last madd, before the closing
//    asm comment.
// NOTE(review): A0, A1, rhs_panel, T0 and C0..C15 are not declared by the macro itself; presumably
// they are locals at the expansion site -- confirm against the full file.
1993 #define EIGEN_GEBGP_ONESTEP(K) \
1995 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX8"); \
1996 traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1997 traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1998 traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
1999 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
2000 traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
2001 traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
2002 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
2003 traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
2004 traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
2005 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
2006 traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
2007 traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
2008 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
2009 traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
2010 traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
2011 traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
2012 traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
2013 traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
2014 traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
2015 traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
2016 traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
2017 traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
2018 traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
2019 traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
2020 traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
2021 traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
2022 EIGEN_GEBP_2Px8_SPILLING_WORKAROUND EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX8"); \
2042 for (
Index k = peeled_kc;
k < depth;
k++) {
2050 #undef EIGEN_GEBGP_ONESTEP
2059 traits.acc(C0, alphav, R0);
2060 traits.acc(C8, alphav, R1);
2061 traits.acc(
C1, alphav, R2);
2062 traits.acc(C9, alphav, R3);
2072 traits.acc(
C2, alphav, R0);
2073 traits.acc(C10, alphav, R1);
2074 traits.acc(C3, alphav, R2);
2075 traits.acc(C11, alphav, R3);
2085 traits.acc(C4, alphav, R0);
2086 traits.acc(C12, alphav, R1);
2087 traits.acc(C5, alphav, R2);
2088 traits.acc(C13, alphav, R3);
2098 traits.acc(C6, alphav, R0);
2099 traits.acc(C14, alphav, R1);
2100 traits.acc(C7, alphav, R2);
2101 traits.acc(C15, alphav, R3);
2110 for (
Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
2134 r0.prefetch(prefetch_res_offset);
2135 r1.prefetch(prefetch_res_offset);
2136 r2.prefetch(prefetch_res_offset);
2137 r3.prefetch(prefetch_res_offset);
2140 const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
2144 for (
Index k = 0;
k < peeled_kc;
k += pk) {
2151 #if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
2152 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
2154 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
// EIGEN_GEBGP_ONESTEP(K): depth step K of the "2pX4" micro kernel (two LHS packets, four RHS entries).
//  - Loads A0 and A1 from blA at offsets (0 + 2*K) and (1 + 2*K) times LhsProgress.
//  - Fills rhs_panel with one loadRhs from &blB[(0 + 4*K) * RhsProgress]; no updateRhs call
//    appears in this variant.
//  - For c = 0..3 the two madd calls accumulate into C<c> (with A0) and C<4+c> (with A1), passing
//    fix<c> as the last argument.
//  - EIGEN_GEBP_2PX4_SPILLING_WORKAROUND is expanded after the last madd, before the closing
//    asm comment.
// NOTE(review): A0, A1, rhs_panel, T0 and C0..C7 are not declared by the macro itself; presumably
// they are locals at the expansion site -- confirm against the full file.
2156 #define EIGEN_GEBGP_ONESTEP(K) \
2158 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
2159 traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
2160 traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
2161 traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
2162 traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
2163 traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
2164 traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
2165 traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
2166 traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
2167 traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
2168 traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
2169 traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
2170 EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
2171 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
2191 for (
Index k = peeled_kc;
k < depth;
k++) {
2198 #undef EIGEN_GEBGP_ONESTEP
2207 traits.acc(C0, alphav, R0);
2208 traits.acc(C4, alphav, R1);
2209 traits.acc(
C1, alphav, R2);
2210 traits.acc(C5, alphav, R3);
2220 traits.acc(
C2, alphav, R0);
2221 traits.acc(C6, alphav, R1);
2222 traits.acc(C3, alphav, R2);
2223 traits.acc(C7, alphav, R3);
2232 for (
Index j2 = packet_cols4; j2 <
cols; j2++) {
2244 r0.prefetch(prefetch_res_offset);
2247 const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
2250 for (
Index k = 0;
k < peeled_kc;
k += pk) {
// EIGEN_GEBGP_ONESTEP(K): depth step K of the "2pX1" micro kernel (two LHS packets, one RHS entry).
//  - Loads A0 and A1 from blA at offsets (0 + 2*K) and (1 + 2*K) times LhsProgress.
//  - Loads the single RHS value B_0 from blB[(0 + K) * RhsProgress].
//  - Accumulates into C0 (with A0) and C4 (with A1); both calls use fix<0>.
// NOTE(review): the first madd passes B1 as its fourth argument while the second passes B_0.
// The sibling 3pX1 step passes B_0 in every call -- confirm the asymmetry here is intentional.
// NOTE(review): A0, A1, B_0, B1, C0 and C4 are not declared by the macro itself; presumably they
// are locals at the expansion site -- confirm against the full file.
2254 #define EIGEN_GEBGP_ONESTEP(K) \
2256 EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
2257 EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
2258 traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
2259 traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
2260 traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
2261 traits.madd(A0, B_0, C0, B1, fix<0>); \
2262 traits.madd(A1, B_0, C4, B_0, fix<0>); \
2263 EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
2282 for (
Index k = peeled_kc;
k < depth;
k++) {
2288 #undef EIGEN_GEBGP_ONESTEP
2294 traits.acc(C0, alphav, R0);
2295 traits.acc(C4, alphav, R1);
2307 p(
res, blockA, blockB,
alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
2308 peeled_kc, pk,
cols, depth, packet_cols4);
2315 p(
res, blockA, blockB,
alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
2316 peeled_kc, pk,
cols, depth, packet_cols4);
2324 p(
res, blockA, blockB,
alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB,
2325 prefetch_res_offset, peeled_kc, pk,
cols, depth, packet_cols4);
2328 if (peeled_mc_quarter <
rows) {
2329 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
2332 for (
Index j2 = 0; j2 < packet_cols8; j2 += 8) {
2335 const LhsScalar* blA = &blockA[
i * strideA + offsetA];
2338 ResScalar C0(0),
C1(0),
C2(0), C3(0), C4(0), C5(0), C6(0), C7(0);
2339 const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
2340 for (
Index k = 0;
k < depth;
k++) {
2341 LhsScalar A0 = blA[
k];
2345 C0 = cj.pmadd(A0, B_0, C0);
2348 C1 = cj.pmadd(A0, B_0,
C1);
2351 C2 = cj.pmadd(A0, B_0,
C2);
2354 C3 = cj.pmadd(A0, B_0, C3);
2357 C4 = cj.pmadd(A0, B_0, C4);
2360 C5 = cj.pmadd(A0, B_0, C5);
2363 C6 = cj.pmadd(A0, B_0, C6);
2366 C7 = cj.pmadd(A0, B_0, C7);
2383 for (
Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
2386 const LhsScalar* blA = &blockA[
i * strideA + offsetA];
2388 const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
2394 const int SResPacketQuarterSize =
2400 constexpr
bool kCanLoadSRhsQuad =
2407 straits.initAcc(C0);
2408 straits.initAcc(
C1);
2409 straits.initAcc(
C2);
2410 straits.initAcc(C3);
2413 const Index endk = (depth / spk) * spk;
2414 const Index endk4 = (depth / (spk * 4)) * (spk * 4);
2417 for (;
k < endk4;
k += 4 * spk) {
2424 straits.loadRhsQuad(blA + 0 * spk, B_0);
2425 straits.loadRhsQuad(blA + 1 * spk, B_1);
2426 straits.madd(A0, B_0, C0, B_0, fix<0>);
2427 straits.madd(A1, B_1,
C1, B_1, fix<0>);
2431 straits.loadRhsQuad(blA + 2 * spk, B_0);
2432 straits.loadRhsQuad(blA + 3 * spk, B_1);
2433 straits.madd(A0, B_0,
C2, B_0, fix<0>);
2434 straits.madd(A1, B_1, C3, B_1, fix<0>);
2440 for (;
k < endk;
k += spk) {
2444 straits.loadLhsUnaligned(blB, A0);
2445 straits.loadRhsQuad(blA, B_0);
2446 straits.madd(A0, B_0, C0, B_0, fix<0>);
2466 SResPacketHalf
R =
res.template gatherPacket<SResPacketHalf>(
i, j2);
2467 SResPacketHalf alphav = pset1<SResPacketHalf>(
alpha);
2469 if (depth - endk > 0) {
2473 straits.loadLhsUnaligned(blB, a0);
2474 straits.loadRhs(blA, b0);
2476 straits.madd(a0, b0, c0, b0, fix<0>);
2477 straits.acc(c0, alphav,
R);
2481 res.scatterPacket(
i, j2,
R);
2487 last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>
p;
2488 p(
res, straits, blA, blB, depth, endk,
i, j2,
alpha, C0);
2492 straits.acc(C0, alphav,
R);
2493 res.scatterPacket(
i, j2,
R);
2500 for (
Index k = 0;
k < depth;
k++) {
2508 C0 = cj.pmadd(A0, B_0, C0);
2509 C1 = cj.pmadd(A0, B_1,
C1);
2513 C2 = cj.pmadd(A0, B_0,
C2);
2514 C3 = cj.pmadd(A0, B_1, C3);
2526 for (
Index j2 = packet_cols4; j2 <
cols; j2++) {
2529 const LhsScalar* blA = &blockA[
i * strideA + offsetA];
2533 const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
2534 for (
Index k = 0;
k < depth;
k++) {
2535 LhsScalar A0 = blA[
k];
2536 RhsScalar B_0 = blB[
k];
2537 C0 = cj.pmadd(A0, B_0, C0);
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_ASM_COMMENT(X)
Definition: Macros.h:972
#define EIGEN_IF_CONSTEXPR(X)
Definition: Macros.h:1306
cout<< "Here is the matrix m:"<< endl<< m<< endl;Matrix< ptrdiff_t, 3, 1 > res
Definition: PartialRedux_count.cpp:3
@ R
Definition: StatisticsVector.h:21
float * p
Definition: Tutorial_Map_using.cpp:9
int rows
Definition: Tutorial_commainit_02.cpp:1
int cols
Definition: Tutorial_commainit_02.cpp:1
Scalar Scalar int size
Definition: benchVecAdd.cpp:17
#define min(a, b)
Definition: datatypes.h:22
#define max(a, b)
Definition: datatypes.h:23
RealScalar alpha
Definition: level1_cplx_impl.h:151
char char char int int * k
Definition: level2_impl.h:374
EIGEN_DEVICE_FUNC Packet padd(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:318
EIGEN_DEVICE_FUNC void prefetch(const Scalar *addr)
Definition: GenericPacketMath.h:967
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c &a)
Definition: NEON/PacketMath.h:3635
const std::ptrdiff_t defaultL1CacheSize
Definition: products/GeneralBlockPanelKernel.h:61
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
double C1
"Mooney Rivlin" coefficient for generalised Mooney Rivlin law
Definition: TwenteMeshGluing.cpp:74
double C2
"Mooney Rivlin" coefficient for generalised Mooney Rivlin law
Definition: mpi/distribution/airy_cantilever/airy_cantilever2.cc:156
#define EIGEN_GEBGP_ONESTEP(K)
#define EIGEN_GEBP_ONESTEP(K)
DataMapper::LinearMapper LinearMapper
Definition: products/GeneralBlockPanelKernel.h:995
Traits::RhsPacket RhsPacket
Definition: products/GeneralBlockPanelKernel.h:969
SwappedTraits::ResPacket SResPacket
Definition: products/GeneralBlockPanelKernel.h:982
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketHalf > HalfTraits
Definition: products/GeneralBlockPanelKernel.h:963
QuarterTraits::ResPacket ResPacketQuarter
Definition: products/GeneralBlockPanelKernel.h:992
SwappedTraits::AccPacket SAccPacket
Definition: products/GeneralBlockPanelKernel.h:983
QuarterTraits::RhsPacket RhsPacketQuarter
Definition: products/GeneralBlockPanelKernel.h:991
HalfTraits::ResPacket ResPacketHalf
Definition: products/GeneralBlockPanelKernel.h:987
HalfTraits::RhsPacket RhsPacketHalf
Definition: products/GeneralBlockPanelKernel.h:986
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target > Traits
Definition: products/GeneralBlockPanelKernel.h:961
Traits::RhsPacketx4 RhsPacketx4
Definition: products/GeneralBlockPanelKernel.h:972
SwappedTraits::RhsPacket SRhsPacket
Definition: products/GeneralBlockPanelKernel.h:981
QuarterTraits::LhsPacket LhsPacketQuarter
Definition: products/GeneralBlockPanelKernel.h:990
HalfTraits::LhsPacket LhsPacketHalf
Definition: products/GeneralBlockPanelKernel.h:985
QuarterTraits::AccPacket AccPacketQuarter
Definition: products/GeneralBlockPanelKernel.h:993
gebp_traits< RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target > SwappedTraits
Definition: products/GeneralBlockPanelKernel.h:977
RhsPanelHelper< RhsPacket, RhsPacketx4, 27 >::type RhsPanel27
Definition: products/GeneralBlockPanelKernel.h:975
RhsPanelHelper< RhsPacket, RhsPacketx4, 15 >::type RhsPanel15
Definition: products/GeneralBlockPanelKernel.h:974
gebp_traits< LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketQuarter > QuarterTraits
Definition: products/GeneralBlockPanelKernel.h:965
Traits::LhsPacket LhsPacket
Definition: products/GeneralBlockPanelKernel.h:968
Traits::ResScalar ResScalar
Definition: products/GeneralBlockPanelKernel.h:967
SwappedTraits::LhsPacket SLhsPacket
Definition: products/GeneralBlockPanelKernel.h:980
Traits::ResPacket ResPacket
Definition: products/GeneralBlockPanelKernel.h:970
Traits::AccPacket AccPacket
Definition: products/GeneralBlockPanelKernel.h:971
HalfTraits::AccPacket AccPacketHalf
Definition: products/GeneralBlockPanelKernel.h:988
T half
Definition: GenericPacketMath.h:136
@ size
Definition: GenericPacketMath.h:139