products/GeneralBlockPanelKernel.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_BLOCK_PANEL_H
#define EIGEN_GENERAL_BLOCK_PANEL_H

// IWYU pragma: private
#include "../InternalHeaderCheck.h"

namespace Eigen {

namespace internal {

enum GEBPPacketSizeType { GEBPPacketFull = 0, GEBPPacketHalf, GEBPPacketQuarter };

template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_ = false, bool ConjRhs_ = false,
          int Arch = Architecture::Target, int PacketSize_ = GEBPPacketFull>
class gebp_traits;

/** \internal \returns b if a<=0, and returns a otherwise. */
inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b) { return a <= 0 ? b : a; }

#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
#endif  // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)

#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
#endif  // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)

#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
#else
#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
#endif  // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)

#if EIGEN_ARCH_i386_OR_x86_64
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32 * 1024);
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256 * 1024);
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2 * 1024 * 1024);
#elif EIGEN_ARCH_PPC
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64 * 1024);
#ifdef _ARCH_PWR10
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(2 * 1024 * 1024);
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8 * 1024 * 1024);
#else
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
#endif
#else
const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16 * 1024);
const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512 * 1024);
#endif

#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
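
// Usage sketch (an assumption based on the #if defined(...) guards above, not a
// documented API): since the defaults are only used when the corresponding macro
// is undefined, they can be overridden by defining the macro before any Eigen
// header is included, e.g.
//
//   #define EIGEN_DEFAULT_L1_CACHE_SIZE (48 * 1024)
//   #define EIGEN_DEFAULT_L2_CACHE_SIZE (1024 * 1024)
//   #include <Eigen/Dense>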

/** \internal */
struct CacheSizes {
  CacheSizes() : m_l1(-1), m_l2(-1), m_l3(-1) {
    int l1CacheSize, l2CacheSize, l3CacheSize;
    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
    m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
    m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
    m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
  }

  std::ptrdiff_t m_l1;
  std::ptrdiff_t m_l2;
  std::ptrdiff_t m_l3;
};

/** \internal */
inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) {
  static CacheSizes m_cacheSizes;

  if (action == SetAction) {
    // set the cpu cache sizes and cache all block sizes from a global cache size in bytes
    eigen_internal_assert(l1 != 0 && l2 != 0);
    m_cacheSizes.m_l1 = *l1;
    m_cacheSizes.m_l2 = *l2;
    m_cacheSizes.m_l3 = *l3;
  } else if (action == GetAction) {
    eigen_internal_assert(l1 != 0 && l2 != 0);
    *l1 = m_cacheSizes.m_l1;
    *l2 = m_cacheSizes.m_l2;
    *l3 = m_cacheSizes.m_l3;
  } else {
    eigen_internal_assert(false);
  }
}
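
// Usage sketch (assumption: the public wrappers Eigen::setCpuCacheSizes and
// Eigen::l1CacheSize()/l2CacheSize()/l3CacheSize(), declared elsewhere in Eigen
// and referenced by the \sa in the comment below, forward to manage_caching_sizes):
//
//   #include <Eigen/Dense>
//   #include <cstdio>
//   int main() {
//     // Query the cache sizes currently used for blocking decisions.
//     std::printf("l1=%td l2=%td l3=%td\n",
//                 Eigen::l1CacheSize(), Eigen::l2CacheSize(), Eigen::l3CacheSize());
//     // Override them, e.g. to match a known target CPU.
//     Eigen::setCpuCacheSizes(48 * 1024, 1024 * 1024, 16 * 1024 * 1024);
//     return 0;
//   }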

/* Helper for computeProductBlockingSizes.
 *
 * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
 * this function computes the blocking size parameters along the respective dimensions
 * for matrix products and related algorithms. The blocking sizes depend on various
 * parameters:
 * - the L1 and L2 cache sizes,
 * - the register level blocking sizes defined by gebp_traits,
 * - the number of scalars that fit into a packet (when vectorization is enabled).
 *
 * \sa setCpuCacheSizes */

template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) {
  typedef gebp_traits<LhsScalar, RhsScalar> Traits;

  // Explanations:
  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
  // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
  // at the register level. This small horizontal panel has to stay within L1 cache.
  std::ptrdiff_t l1, l2, l3;
  manage_caching_sizes(GetAction, &l1, &l2, &l3);
#ifdef EIGEN_VECTORIZE_AVX512
  // We need to find a rationale for this, but without the adjustment,
  // performance with AVX512 is pretty bad, like -20% slower.
  // One reason is that with an increasing packet size, the blocking size k
  // has to become pretty small if we want one lhs panel to fit within L1.
  // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels is
  // k*(3*64 + 4*8) bytes; with l1=32kB and k%8=0, we get k=144.
  // This is quite small for a good reuse of the accumulation registers.
  l1 *= 4;
#endif

  if (num_threads > 1) {
    typedef typename Traits::ResScalar ResScalar;
    enum {
      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      ksub = Traits::mr * (Traits::nr * sizeof(ResScalar)),
      kr = 8,
      mr = Traits::mr,
      nr = Traits::nr
    };
    // Increasing k gives us more time to prefetch the content of the "C"
    // registers. However once the latency is hidden there is no point in
    // increasing the value of k, so we'll cap it at 320 (value determined
    // experimentally).
    // To prevent k from vanishing, we make k_cache at least as big as kr.
    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1 - ksub) / kdiv, 320));
    if (k_cache < k) {
      k = k_cache - (k_cache % kr);
      eigen_internal_assert(k > 0);
    }

    const Index n_cache = (l2 - l1) / (nr * sizeof(RhsScalar) * k);
    const Index n_per_thread = numext::div_ceil(n, num_threads);
    if (n_cache <= n_per_thread) {
      // Don't exceed the capacity of the l2 cache.
      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
      n = n_cache - (n_cache % nr);
      eigen_internal_assert(n > 0);
    } else {
      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
    }

    if (l3 > l2) {
      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
      const Index m_cache = (l3 - l2) / (sizeof(LhsScalar) * k * num_threads);
      const Index m_per_thread = numext::div_ceil(m, num_threads);
      if (m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
        m = m_cache - (m_cache % mr);
        eigen_internal_assert(m > 0);
      } else {
        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
      }
    }
  } else {
    // In unit tests we do not want to use extra large matrices,
    // so we reduce the cache size to check that the blocking strategy is not flawed.
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    l1 = 9 * 1024;
    l2 = 32 * 1024;
    l3 = 512 * 1024;
#endif

    // Early return for small problems because the computations below are time consuming.
    // Perhaps it would make more sense to consider k*n*m??
    // Note that for very tiny problems, this function should be bypassed anyway
    // because we use the coefficient-based implementation for them.
    if ((numext::maxi)(k, (numext::maxi)(m, n)) < 48) return;

    typedef typename Traits::ResScalar ResScalar;
    enum {
      k_peeling = 8,
      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
      k_sub = Traits::mr * (Traits::nr * sizeof(ResScalar))
    };

    // ---- 1st level of blocking on L1, yields kc ----

    // Blocking on the third dimension (i.e., k) is chosen so that a horizontal panel
    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fit within L1 cache.
    // We also include a register-level block of the result (mr x nr).
    // (In an ideal world only the lhs panel would stay in L1.)
    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size
    // of:
    const Index max_kc = numext::maxi<Index>(((l1 - k_sub) / k_div) & (~(k_peeling - 1)), 1);
    const Index old_k = k;
    if (k > max_kc) {
      // We are really blocking on the third dimension:
      // -> reduce blocking size to make sure the last block is as large as possible
      // while keeping the same number of sweeps over the result.
      k = (k % max_kc) == 0 ? max_kc
                            : max_kc - k_peeling * ((max_kc - 1 - (k % max_kc)) / (k_peeling * (k / max_kc + 1)));

      eigen_internal_assert(((old_k / k) == (old_k / max_kc)) && "the number of sweeps has to remain the same");
    }
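
    // Worked example of the reduction above (numbers chosen for illustration only):
    // with k = 300, max_kc = 144 and k_peeling = 8, we have k % max_kc = 12 and
    // k / max_kc + 1 = 3, hence k = 144 - 8 * ((144 - 1 - 12) / 24) = 144 - 8 * 5 = 104.
    // Both kc = 144 and kc = 104 need ceil(300 / kc) = 3 sweeps over the result,
    // but the last block grows from 300 - 2 * 144 = 12 to 300 - 2 * 104 = 92 rows.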

    // ---- 2nd level of blocking on max(L2,L3), yields nc ----

    // TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
    // actual_l2 = max(l2, l3/nb_core_sharing_l3)
    // The number below is quite conservative: it is better to underestimate the cache size than to overestimate it.
    // For instance, it corresponds to 6MB of L3 shared among 4 cores.
#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
    const Index actual_l2 = l3;
#else
    const Index actual_l2 = 1572864;  // == 1.5 MB
#endif

    // Here, nc is chosen such that a block of kc x nc of the rhs fits within half of L2.
    // The second half is implicitly reserved to access the result and lhs coefficients.
    // When k<max_kc, then nc can grow arbitrarily. In practice, it seems to be fruitful
    // to limit this growth: we bound the growth of nc by a factor of 1.5.
    // However, if the entire lhs block fits within L1, then we are not going to block on the rows at all,
    // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
    Index max_nc;
    const Index lhs_bytes = m * k * sizeof(LhsScalar);
    const Index remaining_l1 = l1 - k_sub - lhs_bytes;
    if (remaining_l1 >= Index(Traits::nr * sizeof(RhsScalar)) * k) {
      // L1 blocking
      max_nc = remaining_l1 / (k * sizeof(RhsScalar));
    } else {
      // L2 blocking
      max_nc = (3 * actual_l2) / (2 * 2 * max_kc * sizeof(RhsScalar));
    }
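
    // Worked example of the L2 branch (illustrative numbers only): with double rhs
    // coefficients, actual_l2 = 1572864 and max_kc = 256, the bound is
    // max_nc = (3 * 1572864) / (4 * 256 * 8) = 576; the min() below with
    // actual_l2 / (2 * k * 8) = 384 (for k = 256) then keeps nc = 384, which is
    // already a multiple of nr = 4.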
    // WARNING Below, we assume that Traits::nr is a power of two.
    Index nc = numext::mini<Index>(actual_l2 / (2 * k * sizeof(RhsScalar)), max_nc) & (~(Traits::nr - 1));
    if (n > nc) {
      // We are really blocking over the columns:
      // -> reduce blocking size to make sure the last block is as large as possible
      // while keeping the same number of sweeps over the packed lhs.
      // Here we allow one more sweep if this gives us a perfect match, thus the commented "-1".
      n = (n % nc) == 0 ? nc : (nc - Traits::nr * ((nc /*-1*/ - (n % nc)) / (Traits::nr * (n / nc + 1))));
    } else if (old_k == k) {
      // So far, no blocking at all, i.e., kc==k, and nc==n.
      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2.
      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic
      // here should be obsolete.
      Index problem_size = k * n * sizeof(LhsScalar);
      Index actual_lm = actual_l2;
      Index max_mc = m;
      if (problem_size <= 1024) {
        // problem is small enough to keep in L1
        // Let's choose m such that the lhs's block fits in 1/3 of L1
        actual_lm = l1;
      } else if (l3 != 0 && problem_size <= 32768) {
        // we have both L2 and L3, and problem is small enough to be kept in L2
        // Let's choose m such that the lhs's block fits in 1/3 of L2
        actual_lm = l2;
        max_mc = (numext::mini<Index>)(576, max_mc);
      }
      Index mc = (numext::mini<Index>)(actual_lm / (3 * k * sizeof(LhsScalar)), max_mc);
      if (mc > Traits::mr)
        mc -= mc % Traits::mr;
      else if (mc == 0)
        return;
      m = (m % mc) == 0 ? mc : (mc - Traits::mr * ((mc /*-1*/ - (m % mc)) / (Traits::mr * (m / mc + 1))));
    }
  }
}

template <typename Index>
inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) {
#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
    return true;
  }
#else
  EIGEN_UNUSED_VARIABLE(k)
  EIGEN_UNUSED_VARIABLE(m)
  EIGEN_UNUSED_VARIABLE(n)
#endif
  return false;
}

/** \brief Computes the blocking parameters for a m x k times k x n matrix product
 *
 * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
 * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same
 * dimension.
 * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same
 * dimension.
 *
 * The blocking sizes are computed either by the cache-size based heuristic above, or from fixed
 * prescribed values (for testing the blocking strategy, see useSpecificBlockingSizes).
 *
 * \sa setCpuCacheSizes */

template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) {
  if (!useSpecificBlockingSizes(k, m, n)) {
    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
  }
}

template <typename LhsScalar, typename RhsScalar, typename Index>
inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) {
  computeProductBlockingSizes<LhsScalar, RhsScalar, 1, Index>(k, m, n, num_threads);
}
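
// Usage sketch (hypothetical driver, not part of this file): inspect the blocking
// sizes the heuristic picks for a square double-precision GEMM.
//
//   #include <Eigen/Dense>
//   #include <iostream>
//   int main() {
//     Eigen::Index kc = 1024, mc = 1024, nc = 1024;  // in: problem sizes, out: block sizes
//     Eigen::internal::computeProductBlockingSizes<double, double>(kc, mc, nc, /*num_threads=*/1);
//     std::cout << "kc=" << kc << " mc=" << mc << " nc=" << nc << "\n";
//   }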

template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
struct RhsPanelHelper {
 private:
  static constexpr int remaining_registers =
      (std::max)(int(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) - registers_taken, 0);

 public:
  typedef std::conditional_t<remaining_registers >= 4, RhsPacketx4, RhsPacket> type;
};
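
// Worked example (illustrative; the actual value of EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
// is architecture dependent): with 16 registers and registers_taken == 15, as used for
// RhsPanel15 below, only one register remains, so `type` degrades to a single RhsPacket;
// with registers_taken <= 12, four or more registers remain and the wider RhsPacketx4 is kept.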

template <typename Packet>
struct QuadPacket {
  Packet B_0, B1, B2, B3;
  const Packet& get(const FixedInt<0>&) const { return B_0; }
  const Packet& get(const FixedInt<1>&) const { return B1; }
  const Packet& get(const FixedInt<2>&) const { return B2; }
  const Packet& get(const FixedInt<3>&) const { return B3; }
};

template <int N, typename T1, typename T2, typename T3>
struct packet_conditional {
  typedef T3 type;
};

template <typename T1, typename T2, typename T3>
struct packet_conditional<GEBPPacketFull, T1, T2, T3> {
  typedef T1 type;
};

template <typename T1, typename T2, typename T3>
struct packet_conditional<GEBPPacketHalf, T1, T2, T3> {
  typedef T2 type;
};

#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size)                                                \
  typedef typename packet_conditional<                                                                      \
      packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half,  \
      typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet##postfix

#define PACKET_DECL_COND(name, packet_size)                                                                 \
  typedef typename packet_conditional<                                                                      \
      packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half,  \
      typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet

#define PACKET_DECL_COND_SCALAR_POSTFIX(postfix, packet_size)                                   \
  typedef typename packet_conditional<                                                          \
      packet_size, typename packet_traits<Scalar>::type, typename packet_traits<Scalar>::half,  \
      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type ScalarPacket##postfix

#define PACKET_DECL_COND_SCALAR(packet_size)                                                    \
  typedef typename packet_conditional<                                                          \
      packet_size, typename packet_traits<Scalar>::type, typename packet_traits<Scalar>::half,  \
      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type ScalarPacket

/* Vectorization logic
 * real*real: unpack rhs to constant packets, ...
 *
 * cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
 *         storing each res packet into two packets (2x2),
 *         at the end combine them: swap the second and addsub them
 * cf*cf : same but with 2x4 blocks
 * cplx*real : unpack rhs to constant packets, ...
 * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
 */
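// Spelled out (a restatement of the cd*cd/cf*cf scheme above, not extra logic):
// for a = (a_r, a_i) and b = (b_r, b_i), the kernel accumulates
//   c.first  = a * pset1(b_r) = (a_r*b_r, a_i*b_r)
//   c.second = a * pset1(b_i) = (a_r*b_i, a_i*b_i)
// and the final combine (see the acc() overload for DoublePacket below) computes
//   pcplxflip(pconj(c.second)) = pcplxflip((a_r*b_i, -a_i*b_i)) = (-a_i*b_i, a_r*b_i)
// so that c.first + that flipped packet = (a_r*b_r - a_i*b_i, a_i*b_r + a_r*b_i) = a*b.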
template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
class gebp_traits {
 public:
  typedef LhsScalar_ LhsScalar;
  typedef RhsScalar_ RhsScalar;
  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;

  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);

  enum {
    ConjLhs = ConjLhs_,
    ConjRhs = ConjRhs_,
    Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,
    LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
    RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
    ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,

    // register block size along the N direction must be 1 or 4
    nr = 4,

    // register block size along the M direction (currently, this one cannot be modified)
    default_mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * LhsPacketSize,
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && \
    !defined(EIGEN_VECTORIZE_VSX) && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC >= 1914))
    // we assume 16 registers or more
    // See bug 992: if the scalar type is not vectorizable but EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
    // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
    // Bug 1515: MSVC prior to v19.14 yields to register spilling.
    mr = Vectorizable ? 3 * LhsPacketSize : default_mr,
#else
    mr = default_mr,
#endif

    LhsProgress = LhsPacketSize,
    RhsProgress = 1
  };

  typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
  typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
  typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
  typedef LhsPacket LhsPacket4Packing;

  typedef QuadPacket<RhsPacket> RhsPacketx4;
  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }

  template <typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
    dest = pset1<RhsPacketType>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
  }

  template <typename RhsPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }

  template <typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const {
    dest = pload<LhsPacketType>(a);
  }

  template <typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
    dest = ploadu<LhsPacketType>(a);
  }

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
                                const LaneIdType&) const {
    conj_helper<LhsPacketType, RhsPacketType, ConjLhs, ConjRhs> cj;
    // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
    // let gcc allocate the register in which to store the result of the pmul
    // (in the case where there is no FMA) gcc fails to figure out how to avoid
    // spilling registers.
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c = cj.pmadd(a, b, c);
#else
    tmp = b;
    tmp = cj.pmul(a, tmp);
    c = padd(c, tmp);
#endif
  }

  template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
                                const LaneIdType& lane) const {
    madd(a, b.get(lane), c, tmp, lane);
  }

  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const {
    r = pmadd(c, alpha, r);
  }

  template <typename ResPacketHalf>
  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const {
    r = pmadd(c, alpha, r);
  }
};
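
// Sketch (not part of the original file): how the traits protocol above is meant to
// be driven for real scalars. This hypothetical helper accumulates one
// LhsProgress-wide strip of C += alpha * A * B for a single rhs column, assuming A
// points to packed, packet-aligned lhs data.
//
// template <typename Traits>
// EIGEN_DONT_INLINE void gebp_traits_demo(const typename Traits::LhsScalar* A,
//                                         const typename Traits::RhsScalar* B,
//                                         typename Traits::ResScalar* C, Index depth,
//                                         typename Traits::ResScalar alpha) {
//   Traits traits;
//   typename Traits::AccPacket acc;
//   traits.initAcc(acc);  // acc <- 0
//   for (Index k = 0; k < depth; ++k) {
//     typename Traits::LhsPacket a;
//     typename Traits::RhsPacket b, tmp;
//     traits.loadLhs(A + k * Traits::LhsProgress, a);  // one packet of the packed lhs
//     traits.loadRhs(B + k, b);                        // broadcast one rhs coefficient
//     traits.madd(a, b, acc, tmp, fix<0>);             // acc += a * b (lane 0)
//   }
//   typename Traits::ResPacket r = ploadu<typename Traits::ResPacket>(C);
//   typename Traits::ResPacket alphav = pset1<typename Traits::ResPacket>(alpha);
//   traits.acc(acc, alphav, r);  // r += alpha * acc
//   pstoreu(C, r);
// }
//
// e.g. gebp_traits_demo<gebp_traits<float, float> >(packedA, packedB, C, kc, 1.0f);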

template <typename RealScalar, bool ConjLhs_, int Arch, int PacketSize_>
class gebp_traits<std::complex<RealScalar>, RealScalar, ConjLhs_, false, Arch, PacketSize_> {
 public:
  typedef std::complex<RealScalar> LhsScalar;
  typedef RealScalar RhsScalar;
  typedef std::complex<RealScalar> ResScalar;

  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);

  enum {
    ConjLhs = ConjLhs_,
    ConjRhs = false,
    Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,
    LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
    RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
    ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    nr = 4,
#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
    // we assume 16 registers
    mr = 3 * LhsPacketSize,
#else
    mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * LhsPacketSize,
#endif

    LhsProgress = LhsPacketSize,
    RhsProgress = 1
  };

  typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
  typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
  typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
  typedef LhsPacket LhsPacket4Packing;

  typedef QuadPacket<RhsPacket> RhsPacketx4;

  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }

  template <typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
    dest = pset1<RhsPacketType>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
  }

  template <typename RhsPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const {
    loadRhsQuad_impl(b, dest, std::conditional_t<RhsPacketSize == 16, true_type, false_type>());
  }

  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const {
    // FIXME we can do better!
    // what we want here is a ploadheight
    RhsScalar tmp[4] = {b[0], b[0], b[1], b[1]};
    dest = ploadquad<RhsPacket>(tmp);
  }

  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const {
    eigen_internal_assert(RhsPacketSize <= 8);
    dest = pset1<RhsPacket>(*b);
  }

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload<LhsPacket>(a); }

  template <typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
    dest = ploadu<LhsPacketType>(a);
  }

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
                                const LaneIdType&) const {
    madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable, true_type, false_type>());
  }

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
                                     RhsPacketType& tmp, const true_type&) const {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a.v, b, c.v);
#else
    tmp = b;
    tmp = pmul(a.v, tmp);
    c.v = padd(c.v, tmp);
#endif
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/,
                                     const false_type&) const {
    c += a * b;
  }

  template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
                                const LaneIdType& lane) const {
    madd(a, b.get(lane), c, tmp, lane);
  }

  template <typename ResPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const {
    conj_helper<ResPacketType, ResPacketType, ConjLhs, false> cj;
    r = cj.pmadd(c, alpha, r);
  }

 protected:
};

template <typename Packet>
struct DoublePacket {
  Packet first;
  Packet second;
};

template <typename Packet>
DoublePacket<Packet> padd(const DoublePacket<Packet>& a, const DoublePacket<Packet>& b) {
  DoublePacket<Packet> res;
  res.first = padd(a.first, b.first);
  res.second = padd(a.second, b.second);
  return res;
}

// note that for DoublePacket<RealPacket> the "4" in "downto4"
// corresponds to the number of complexes, so it means "8"
// in terms of real coefficients.

template <typename Packet>
const DoublePacket<Packet>& predux_half_dowto4(const DoublePacket<Packet>& a,
                                               std::enable_if_t<unpacket_traits<Packet>::size <= 8>* = 0) {
  return a;
}

template <typename Packet>
DoublePacket<typename unpacket_traits<Packet>::half> predux_half_dowto4(
    const DoublePacket<Packet>& a, std::enable_if_t<unpacket_traits<Packet>::size == 16>* = 0) {
  // yes, that's pretty hackish :(
  DoublePacket<typename unpacket_traits<Packet>::half> res;
  typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
  typedef typename packet_traits<Cplx>::type CplxPacket;
  res.first = predux_half_dowto4(CplxPacket(a.first)).v;
  res.second = predux_half_dowto4(CplxPacket(a.second)).v;
  return res;
}

// same here, "quad" actually means "8" in terms of real coefficients
template <typename Scalar, typename RealPacket>
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
                            std::enable_if_t<unpacket_traits<RealPacket>::size <= 8>* = 0) {
  dest.first = pset1<RealPacket>(numext::real(*b));
  dest.second = pset1<RealPacket>(numext::imag(*b));
}

template <typename Scalar, typename RealPacket>
void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
                            std::enable_if_t<unpacket_traits<RealPacket>::size == 16>* = 0) {
  // yes, that's pretty hackish too :(
  typedef typename NumTraits<Scalar>::Real RealScalar;
  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
  dest.first = ploadquad<RealPacket>(r);
  dest.second = ploadquad<RealPacket>(i);
}

template <typename Packet>
struct unpacket_traits<DoublePacket<Packet> > {
  typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
};
// template<typename Packet>
// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
// {
//   DoublePacket<Packet> res;
//   res.first = padd(a.first, b.first);
//   res.second = padd(a.second, b.second);
//   return res;
// }

template <typename RealScalar, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, ConjLhs_, ConjRhs_, Arch, PacketSize_> {
 public:
  typedef std::complex<RealScalar> Scalar;
  typedef std::complex<RealScalar> LhsScalar;
  typedef std::complex<RealScalar> RhsScalar;
  typedef std::complex<RealScalar> ResScalar;

  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
  PACKET_DECL_COND(Real, PacketSize_);
  PACKET_DECL_COND_SCALAR(PacketSize_);

  enum {
    ConjLhs = ConjLhs_,
    ConjRhs = ConjRhs_,
    Vectorizable = unpacket_traits<RealPacket>::vectorizable && unpacket_traits<ScalarPacket>::vectorizable,
    ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
    LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
    RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
    RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,

    nr = 4,
    mr = ResPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = 1
  };

  typedef DoublePacket<RealPacket> DoublePacketType;

  typedef std::conditional_t<Vectorizable, ScalarPacket, Scalar> LhsPacket4Packing;
  typedef std::conditional_t<Vectorizable, RealPacket, Scalar> LhsPacket;
  typedef std::conditional_t<Vectorizable, DoublePacketType, Scalar> RhsPacket;
  typedef std::conditional_t<Vectorizable, ScalarPacket, Scalar> ResPacket;
  typedef std::conditional_t<Vectorizable, DoublePacketType, Scalar> AccPacket;

  // this actually holds 8 packets!
  typedef QuadPacket<RhsPacket> RhsPacketx4;

  EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }

  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) {
    p.first = pset1<RealPacket>(RealScalar(0));
    p.second = pset1<RealPacket>(RealScalar(0));
  }

  // Scalar path
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { dest = pset1<ScalarPacket>(*b); }

  // Vectorized path
  template <typename RealPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const {
    dest.first = pset1<RealPacketType>(numext::real(*b));
    dest.second = pset1<RealPacketType>(numext::imag(*b));
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
    loadRhs(b, dest.B_0);
    loadRhs(b + 1, dest.B1);
    loadRhs(b + 2, dest.B2);
    loadRhs(b + 3, dest.B3);
  }

  // Scalar path
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { loadRhs(b, dest); }

  // Vectorized path
  template <typename RealPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { loadRhs(b, dest); }
  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const {
    loadQuadToDoublePacket(b, dest);
  }

  // nothing special here
  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const {
    dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
  }

  template <typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
    dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
  }

  template <typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType,
            typename LaneIdType>
  EIGEN_STRONG_INLINE std::enable_if_t<!is_same<RhsPacketType, RhsPacketx4>::value> madd(const LhsPacketType& a,
                                                                                         const RhsPacketType& b,
                                                                                         DoublePacket<ResPacketType>& c,
                                                                                         TmpType& /*tmp*/,
                                                                                         const LaneIdType&) const {
    c.first = pmadd(a, b.first, c.first);
    c.second = pmadd(a, b.second, c.second);
  }

  template <typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/,
                                const LaneIdType&) const {
    c = cj.pmadd(a, b, c);
  }

  template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
                                const LaneIdType& lane) const {
    madd(a, b.get(lane), c, tmp, lane);
  }

  EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }

  template <typename RealPacketType, typename ResPacketType>
  EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha,
                               ResPacketType& r) const {
    // assemble c
    ResPacketType tmp;
    if ((!ConjLhs) && (!ConjRhs)) {
      tmp = pcplxflip(pconj(ResPacketType(c.second)));
      tmp = padd(ResPacketType(c.first), tmp);
    } else if ((!ConjLhs) && (ConjRhs)) {
      tmp = pconj(pcplxflip(ResPacketType(c.second)));
      tmp = padd(ResPacketType(c.first), tmp);
    } else if ((ConjLhs) && (!ConjRhs)) {
      tmp = pcplxflip(ResPacketType(c.second));
      tmp = padd(pconj(ResPacketType(c.first)), tmp);
    } else if ((ConjLhs) && (ConjRhs)) {
      tmp = pcplxflip(ResPacketType(c.second));
      tmp = psub(pconj(ResPacketType(c.first)), tmp);
    }

    r = pmadd(tmp, alpha, r);
  }

 protected:
  conj_helper<LhsScalar, RhsScalar, ConjLhs, ConjRhs> cj;
};

template <typename RealScalar, bool ConjRhs_, int Arch, int PacketSize_>
class gebp_traits<RealScalar, std::complex<RealScalar>, false, ConjRhs_, Arch, PacketSize_> {
 public:
  typedef std::complex<RealScalar> Scalar;
  typedef RealScalar LhsScalar;
  typedef Scalar RhsScalar;
  typedef Scalar ResScalar;

  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
  PACKET_DECL_COND_POSTFIX(_, Real, PacketSize_);
  PACKET_DECL_COND_SCALAR_POSTFIX(_, PacketSize_);

#undef PACKET_DECL_COND_SCALAR_POSTFIX
#undef PACKET_DECL_COND_POSTFIX
#undef PACKET_DECL_COND_SCALAR
#undef PACKET_DECL_COND

  enum {
    ConjLhs = false,
    ConjRhs = ConjRhs_,
    Vectorizable = unpacket_traits<RealPacket_>::vectorizable && unpacket_traits<ScalarPacket_>::vectorizable,
    LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
    RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
    ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,

    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
    // FIXME: should depend on NumberOfRegisters
    nr = 4,
    mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize,

    LhsProgress = ResPacketSize,
    RhsProgress = 1
  };

  typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
  typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
  typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
  typedef LhsPacket LhsPacket4Packing;
  typedef QuadPacket<RhsPacket> RhsPacketx4;
  typedef ResPacket AccPacket;

  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }

  template <typename RhsPacketType>
  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
    dest = pset1<RhsPacketType>(*b);
  }

  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
  }

  template <typename RhsPacketType>
  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
    loadRhs(b, dest);
  }

  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}

  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup<LhsPacket>(a); }

  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }

  template <typename LhsPacketType>
  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
    dest = ploaddup<LhsPacketType>(a);
  }

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
                                const LaneIdType&) const {
    madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable, true_type, false_type>());
  }

  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
                                     RhsPacketType& tmp, const true_type&) const {
#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
    EIGEN_UNUSED_VARIABLE(tmp);
    c.v = pmadd(a, b.v, c.v);
#else
    tmp = b;
    tmp.v = pmul(a, tmp.v);
    c = padd(c, tmp);
#endif
  }

  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/,
                                     const false_type&) const {
    c += a * b;
  }

  template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
                                const LaneIdType& lane) const {
    madd(a, b.get(lane), c, tmp, lane);
  }

  template <typename ResPacketType, typename AccPacketType>
  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const {
    conj_helper<ResPacketType, ResPacketType, false, ConjRhs> cj;
    r = cj.pmadd(alpha, c, r);
  }

 protected:
};

/* optimized General packed Block * packed Panel product kernel
 *
 * Mixing type logic: C += A * B
 *  |  A  |  B  | comments
 *  |real |cplx | no vectorization yet, would require packing A with duplication
 *  |cplx |real | easy vectorization
 */
template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
          bool ConjugateLhs, bool ConjugateRhs>
struct gebp_kernel {
  typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
  typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketHalf>
      HalfTraits;
  typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketQuarter>
      QuarterTraits;

  typedef typename Traits::ResScalar ResScalar;
  typedef typename Traits::LhsPacket LhsPacket;
  typedef typename Traits::RhsPacket RhsPacket;
  typedef typename Traits::ResPacket ResPacket;
  typedef typename Traits::AccPacket AccPacket;
  typedef typename Traits::RhsPacketx4 RhsPacketx4;

  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 27>::type RhsPanel27;

  typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;

  typedef typename SwappedTraits::ResScalar SResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;

  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
  typedef typename HalfTraits::ResPacket ResPacketHalf;
  typedef typename HalfTraits::AccPacket AccPacketHalf;

  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
  typedef typename QuarterTraits::AccPacket AccPacketQuarter;

  typedef typename DataMapper::LinearMapper LinearMapper;

  enum {
    Vectorizable = Traits::Vectorizable,
    LhsProgress = Traits::LhsProgress,
    LhsProgressHalf = HalfTraits::LhsProgress,
    LhsProgressQuarter = QuarterTraits::LhsProgress,
    RhsProgress = Traits::RhsProgress,
    RhsProgressHalf = HalfTraits::RhsProgress,
    RhsProgressQuarter = QuarterTraits::RhsProgress,
    ResPacketSize = Traits::ResPacketSize
  };

  EIGEN_DONT_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, Index rows,
                                    Index depth, Index cols, ResScalar alpha, Index strideA = -1, Index strideB = -1,
                                    Index offsetA = 0, Index offsetB = 0);
};

template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
          bool ConjugateLhs, bool ConjugateRhs,
          int SwappedLhsProgress =
              gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target>::LhsProgress>
struct last_row_process_16_packets {
  typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
  typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;

  typedef typename Traits::ResScalar ResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;

  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits& straits, const LhsScalar* blA,
                                      const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
                                      ResScalar alpha, SAccPacket& C0) {
    EIGEN_UNUSED_VARIABLE(res);
    EIGEN_UNUSED_VARIABLE(straits);
    EIGEN_UNUSED_VARIABLE(blA);
    EIGEN_UNUSED_VARIABLE(blB);
    EIGEN_UNUSED_VARIABLE(depth);
    EIGEN_UNUSED_VARIABLE(endk);
    EIGEN_UNUSED_VARIABLE(i);
    EIGEN_UNUSED_VARIABLE(j2);
    EIGEN_UNUSED_VARIABLE(alpha);
    EIGEN_UNUSED_VARIABLE(C0);
  }
};

template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
          bool ConjugateLhs, bool ConjugateRhs>
struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
  typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
  typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;

  typedef typename Traits::ResScalar ResScalar;
  typedef typename SwappedTraits::LhsPacket SLhsPacket;
  typedef typename SwappedTraits::RhsPacket SRhsPacket;
  typedef typename SwappedTraits::ResPacket SResPacket;
  typedef typename SwappedTraits::AccPacket SAccPacket;

  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits& straits, const LhsScalar* blA,
                                      const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
                                      ResScalar alpha, SAccPacket& C0) {
    typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
    typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
    typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
    typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;

    SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
    SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);

    if (depth - endk > 0) {
      // We have to handle the last row(s) of the rhs, which
      // correspond to a half-packet
      SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));

      for (Index kk = endk; kk < depth; kk++) {
        SLhsPacketQuarter a0;
        SRhsPacketQuarter b0;
        straits.loadLhsUnaligned(blB, a0);
        straits.loadRhs(blA, b0);
        straits.madd(a0, b0, c0, b0, fix<0>);
        blB += SwappedTraits::LhsProgress / 4;
        blA += 1;
      }
      straits.acc(c0, alphav, R);
    } else {
      straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
    }
    res.scatterPacket(i, j2, R);
  }
};

template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
          typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
          typename LinearMapper, typename DataMapper>
struct lhs_process_one_packet {
  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;

  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
                                             LhsPacket* A0, RhsPacketx4* rhs_panel, RhsPacket* T0, AccPacket* C0,
                                             AccPacket* C1, AccPacket* C2, AccPacket* C3) {
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
    traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], *A0);
    traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], *rhs_panel);
    traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
    traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
    traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
    traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
    __asm__("" : "+x,m"(*A0));
#endif
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
  }

  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
                                      ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB,
                                      Index offsetA, Index offsetB, int prefetch_res_offset, Index peeled_kc, Index pk,
                                      Index cols, Index depth, Index packet_cols4) {
    GEBPTraits traits;
    Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
    // loops on each largest micro horizontal panel of lhs
    // (LhsProgress x depth)
    for (Index i = peelStart; i < peelEnd; i += LhsProgress) {
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
      EIGEN_IF_CONSTEXPR(nr >= 8) {
        for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
          const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
          prefetch(&blA[0]);

          // gets res block as register
          AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
          traits.initAcc(C0);
          traits.initAcc(C1);
          traits.initAcc(C2);
          traits.initAcc(C3);
          traits.initAcc(C4);
          traits.initAcc(C5);
          traits.initAcc(C6);
          traits.initAcc(C7);

          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
          LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
          LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
          LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
          LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
          r0.prefetch(prefetch_res_offset);
          r1.prefetch(prefetch_res_offset);
          r2.prefetch(prefetch_res_offset);
          r3.prefetch(prefetch_res_offset);
          r4.prefetch(prefetch_res_offset);
          r5.prefetch(prefetch_res_offset);
          r6.prefetch(prefetch_res_offset);
          r7.prefetch(prefetch_res_offset);
          const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
          prefetch(&blB[0]);

          LhsPacket A0;
          for (Index k = 0; k < peeled_kc; k += pk) {
            RhsPacketx4 rhs_panel;
            RhsPacket T0;
#define EIGEN_GEBGP_ONESTEP(K)                                     \
  do {                                                             \
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX8");     \
    traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], A0);           \
    traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel);    \
    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                    \
    traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel);  \
    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                    \
    traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel);  \
    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                    \
    traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel);  \
    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                    \
    traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel);    \
    traits.madd(A0, rhs_panel, C4, T0, fix<0>);                    \
    traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel);  \
    traits.madd(A0, rhs_panel, C5, T0, fix<1>);                    \
    traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel);  \
    traits.madd(A0, rhs_panel, C6, T0, fix<2>);                    \
    traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel);  \
    traits.madd(A0, rhs_panel, C7, T0, fix<3>);                    \
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX8");       \
  } while (false)

            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX8");

            EIGEN_GEBGP_ONESTEP(0);
            EIGEN_GEBGP_ONESTEP(1);
            EIGEN_GEBGP_ONESTEP(2);
            EIGEN_GEBGP_ONESTEP(3);
            EIGEN_GEBGP_ONESTEP(4);
            EIGEN_GEBGP_ONESTEP(5);
            EIGEN_GEBGP_ONESTEP(6);
            EIGEN_GEBGP_ONESTEP(7);

            blB += pk * 8 * RhsProgress;
            blA += pk * (1 * LhsProgress);

            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX8");
          }
          // process remaining peeled loop
          for (Index k = peeled_kc; k < depth; k++) {
            RhsPacketx4 rhs_panel;
            RhsPacket T0;
            EIGEN_GEBGP_ONESTEP(0);
            blB += 8 * RhsProgress;
            blA += 1 * LhsProgress;
          }

#undef EIGEN_GEBGP_ONESTEP

          ResPacket R0, R1;
          ResPacket alphav = pset1<ResPacket>(alpha);

          R0 = r0.template loadPacket<ResPacket>(0);
          R1 = r1.template loadPacket<ResPacket>(0);
          traits.acc(C0, alphav, R0);
          traits.acc(C1, alphav, R1);
          r0.storePacket(0, R0);
          r1.storePacket(0, R1);

          R0 = r2.template loadPacket<ResPacket>(0);
          R1 = r3.template loadPacket<ResPacket>(0);
          traits.acc(C2, alphav, R0);
          traits.acc(C3, alphav, R1);
          r2.storePacket(0, R0);
          r3.storePacket(0, R1);

          R0 = r4.template loadPacket<ResPacket>(0);
          R1 = r5.template loadPacket<ResPacket>(0);
          traits.acc(C4, alphav, R0);
          traits.acc(C5, alphav, R1);
          r4.storePacket(0, R0);
          r5.storePacket(0, R1);

          R0 = r6.template loadPacket<ResPacket>(0);
          R1 = r7.template loadPacket<ResPacket>(0);
          traits.acc(C6, alphav, R0);
          traits.acc(C7, alphav, R1);
          r6.storePacket(0, R0);
          r7.storePacket(0, R1);
        }
      }
#endif

      // loops on each largest micro vertical panel of rhs (depth * nr)
      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
        // We select a LhsProgress x nr micro block of res
        // which is entirely stored into 1 x nr registers.

        const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
        prefetch(&blA[0]);

        // gets res block as register
        AccPacket C0, C1, C2, C3;
        traits.initAcc(C0);
        traits.initAcc(C1);
        traits.initAcc(C2);
        traits.initAcc(C3);
        // To improve instruction pipelining, let's double the accumulation registers:
        // even k will accumulate in C*, while odd k will accumulate in D*.
        // This trick is crucial to get good performance with FMA, otherwise it is
        // actually faster to perform separate MUL+ADD because of the naturally
        // better instruction-level parallelism.
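        // The same trick in scalar form (illustrative only, not used here): two
        // independent accumulators break the loop-carried FMA dependency and are
        // reduced once at the end.
        //   double acc0 = 0, acc1 = 0;
        //   for (Index kk = 0; kk + 1 < depth; kk += 2) {
        //     acc0 += lhs[kk] * rhs[kk];          // even k -> acc0
        //     acc1 += lhs[kk + 1] * rhs[kk + 1];  // odd k  -> acc1
        //   }
        //   double dot = acc0 + acc1;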
        AccPacket D0, D1, D2, D3;
        traits.initAcc(D0);
        traits.initAcc(D1);
        traits.initAcc(D2);
        traits.initAcc(D3);

        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);

        r0.prefetch(prefetch_res_offset);
        r1.prefetch(prefetch_res_offset);
        r2.prefetch(prefetch_res_offset);
        r3.prefetch(prefetch_res_offset);

        // performs "inner" products
        const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
        prefetch(&blB[0]);
        LhsPacket A0, A1;

        for (Index k = 0; k < peeled_kc; k += pk) {
          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
          RhsPacketx4 rhs_panel;
          RhsPacket T0;

          internal::prefetch(blB + (48 + 0));
          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          internal::prefetch(blB + (48 + 16));
          peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
          peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);

          blB += pk * 4 * RhsProgress;
          blA += pk * LhsProgress;

          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
        }
        C0 = padd(C0, D0);
        C1 = padd(C1, D1);
        C2 = padd(C2, D2);
        C3 = padd(C3, D3);

        // process remaining peeled loop
        for (Index k = peeled_kc; k < depth; k++) {
          RhsPacketx4 rhs_panel;
          RhsPacket T0;
          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
          blB += 4 * RhsProgress;
          blA += LhsProgress;
        }

        ResPacket R0, R1;
        ResPacket alphav = pset1<ResPacket>(alpha);

        R0 = r0.template loadPacket<ResPacket>(0);
        R1 = r1.template loadPacket<ResPacket>(0);
        traits.acc(C0, alphav, R0);
        traits.acc(C1, alphav, R1);
        r0.storePacket(0, R0);
        r1.storePacket(0, R1);

        R0 = r2.template loadPacket<ResPacket>(0);
        R1 = r3.template loadPacket<ResPacket>(0);
        traits.acc(C2, alphav, R0);
        traits.acc(C3, alphav, R1);
        r2.storePacket(0, R0);
        r3.storePacket(0, R1);
      }

      // Deal with remaining columns of the rhs
      for (Index j2 = packet_cols4; j2 < cols; j2++) {
        // One column at a time
        const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
        prefetch(&blA[0]);

        // gets res block as register
        AccPacket C0;
        traits.initAcc(C0);

        LinearMapper r0 = res.getLinearMapper(i, j2);

        // performs "inner" products
        const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
        LhsPacket A0;

        for (Index k = 0; k < peeled_kc; k += pk) {
          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
          RhsPacket B_0;

#define EIGEN_GEBGP_ONESTEP(K)                                              \
  do {                                                                      \
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1");  \
    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");     \
    /* FIXME: why unaligned???? */                                          \
    traits.loadLhsUnaligned(&blA[(0 + 1 * K) * LhsProgress], A0);           \
    traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                       \
    traits.madd(A0, B_0, C0, B_0, fix<0>);                                  \
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1");    \
  } while (false);

          EIGEN_GEBGP_ONESTEP(0);
          EIGEN_GEBGP_ONESTEP(1);
          EIGEN_GEBGP_ONESTEP(2);
          EIGEN_GEBGP_ONESTEP(3);
          EIGEN_GEBGP_ONESTEP(4);
          EIGEN_GEBGP_ONESTEP(5);
          EIGEN_GEBGP_ONESTEP(6);
          EIGEN_GEBGP_ONESTEP(7);

          blB += pk * RhsProgress;
          blA += pk * LhsProgress;

          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
        }

        // process remaining peeled loop
        for (Index k = peeled_kc; k < depth; k++) {
          RhsPacket B_0;
          EIGEN_GEBGP_ONESTEP(0);
          blB += RhsProgress;
          blA += LhsProgress;
        }
#undef EIGEN_GEBGP_ONESTEP
        ResPacket R0;
        ResPacket alphav = pset1<ResPacket>(alpha);
        R0 = r0.template loadPacket<ResPacket>(0);
        traits.acc(C0, alphav, R0);
        r0.storePacket(0, R0);
      }
    }
  }
};

template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
          typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
          typename LinearMapper, typename DataMapper>
struct lhs_process_fraction_of_packet
    : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
                             RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper> {
  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
                                             LhsPacket* A0, RhsPacket* B_0, RhsPacket* B1, RhsPacket* B2, RhsPacket* B3,
                                             AccPacket* C0, AccPacket* C1, AccPacket* C2, AccPacket* C3) {
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
    traits.loadLhsUnaligned(&blA[(0 + 1 * K) * (LhsProgress)], *A0);
    traits.broadcastRhs(&blB[(0 + 4 * K) * RhsProgress], *B_0, *B1, *B2, *B3);
    traits.madd(*A0, *B_0, *C0, *B_0);
    traits.madd(*A0, *B1, *C1, *B1);
    traits.madd(*A0, *B2, *C2, *B2);
    traits.madd(*A0, *B3, *C3, *B3);
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
  }
};

template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
          bool ConjugateLhs, bool ConjugateRhs>
EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs,
                                   ConjugateRhs>::operator()(const DataMapper& res, const LhsScalar* blockA,
                                                             const RhsScalar* blockB, Index rows, Index depth,
                                                             Index cols, ResScalar alpha, Index strideA, Index strideB,
                                                             Index offsetA, Index offsetB) {
  Traits traits;
  SwappedTraits straits;

  if (strideA == -1) strideA = depth;
  if (strideB == -1) strideB = depth;
  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
  Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
  Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
  const Index peeled_mc3 = mr >= 3 * Traits::LhsProgress ? (rows / (3 * LhsProgress)) * (3 * LhsProgress) : 0;
  const Index peeled_mc2 =
      mr >= 2 * Traits::LhsProgress ? peeled_mc3 + ((rows - peeled_mc3) / (2 * LhsProgress)) * (2 * LhsProgress) : 0;
  const Index peeled_mc1 =
      mr >= 1 * Traits::LhsProgress ? peeled_mc2 + ((rows - peeled_mc2) / (1 * LhsProgress)) * (1 * LhsProgress) : 0;
  const Index peeled_mc_half =
      mr >= LhsProgressHalf ? peeled_mc1 + ((rows - peeled_mc1) / (LhsProgressHalf)) * (LhsProgressHalf) : 0;
  const Index peeled_mc_quarter =
      mr >= LhsProgressQuarter
          ? peeled_mc_half + ((rows - peeled_mc_half) / (LhsProgressQuarter)) * (LhsProgressQuarter)
          : 0;
  enum { pk = 8 };  // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
  const Index peeled_kc = depth & ~(pk - 1);
  const int prefetch_res_offset = 32 / sizeof(ResScalar);
  // const Index depth2 = depth & ~1;

  //---------- Process 3 * LhsProgress rows at once ----------
  // This corresponds to 3*LhsProgress x nr register blocks.
  // Usually, this makes sense only with FMA.
  if (mr >= 3 * Traits::LhsProgress) {
    // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x
    // depth) and on each largest micro vertical panel of the rhs (depth * nr). Blocking sizes, i.e., 'depth', have
    // been computed so that the micro horizontal panel of the lhs fits in L1. However, if depth is too small, we can
    // extend the number of rows of these horizontal panels. This actual number of rows is computed as follows:
    const Index l1 = defaultL1CacheSize;  // in bytes; TODO: l1 should be passed to this function.
    // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
    // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
    // guess), or because we are testing specific blocking sizes.
    const Index actual_panel_rows =
        (3 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
                                                (depth * sizeof(LhsScalar) * 3 * LhsProgress)));
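    // Worked example (illustrative numbers only): for double with 4-wide packets
    // (LhsProgress = 4, hence mr = 12 and nr = 4), l1 = 32768 and depth = 144:
    // 32768 - 8*12*4 - 144*4*8 = 32768 - 384 - 4608 = 27776, and
    // 27776 / (144 * 8 * 12) = 27776 / 13824 = 2, so panels of 2 * 12 = 24 rows
    // are processed per sweep over the packed rhs.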
    for (Index i1 = 0; i1 < peeled_mc3; i1 += actual_panel_rows) {
      const Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc3);
#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
      EIGEN_IF_CONSTEXPR(nr >= 8) {
        for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
          for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
            const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
            prefetch(&blA[0]);
            // gets res block as register
            AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15, C16, C17, C18, C19, C20,
                C21, C22, C23;
            traits.initAcc(C0);
            traits.initAcc(C1);
            traits.initAcc(C2);
            traits.initAcc(C3);
            traits.initAcc(C4);
            traits.initAcc(C5);
            traits.initAcc(C6);
            traits.initAcc(C7);
            traits.initAcc(C8);
            traits.initAcc(C9);
            traits.initAcc(C10);
            traits.initAcc(C11);
            traits.initAcc(C12);
            traits.initAcc(C13);
            traits.initAcc(C14);
            traits.initAcc(C15);
            traits.initAcc(C16);
            traits.initAcc(C17);
            traits.initAcc(C18);
            traits.initAcc(C19);
            traits.initAcc(C20);
            traits.initAcc(C21);
            traits.initAcc(C22);
            traits.initAcc(C23);

            LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
            LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
            LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
            LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
            LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
            LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
            LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
            LinearMapper r7 = res.getLinearMapper(i, j2 + 7);

            r0.prefetch(0);
            r1.prefetch(0);
            r2.prefetch(0);
            r3.prefetch(0);
            r4.prefetch(0);
            r5.prefetch(0);
            r6.prefetch(0);
            r7.prefetch(0);

            // performs "inner" products
            const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
            prefetch(&blB[0]);
            LhsPacket A0, A1;
            for (Index k = 0; k < peeled_kc; k += pk) {
              EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX8");
              // 27 registers are taken (24 for acc, 3 for lhs).
              RhsPanel27 rhs_panel;
              RhsPacket T0;
              LhsPacket A2;
#if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
// see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
// without this workaround A0, A1, and A2 are loaded in the same register,
// which is not good for pipelining
#define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
#else
#define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND
#endif

#define EIGEN_GEBP_ONESTEP(K)                                                                                         \
  do {                                                                                                                \
    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX8");                                                        \
    traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                                                              \
    traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                                                              \
    traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                                                              \
    EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND traits.loadRhs(blB + (0 + 8 * K) * Traits::RhsProgress, rhs_panel);     \
    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                                                                       \
    traits.madd(A1, rhs_panel, C8, T0, fix<0>);                                                                       \
    traits.madd(A2, rhs_panel, C16, T0, fix<0>);                                                                      \
    traits.updateRhs(blB + (1 + 8 * K) * Traits::RhsProgress, rhs_panel);                                             \
    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                                                                       \
    traits.madd(A1, rhs_panel, C9, T0, fix<1>);                                                                       \
    traits.madd(A2, rhs_panel, C17, T0, fix<1>);                                                                      \
    traits.updateRhs(blB + (2 + 8 * K) * Traits::RhsProgress, rhs_panel);                                             \
    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                                                                       \
    traits.madd(A1, rhs_panel, C10, T0, fix<2>);                                                                      \
    traits.madd(A2, rhs_panel, C18, T0, fix<2>);                                                                      \
    traits.updateRhs(blB + (3 + 8 * K) * Traits::RhsProgress, rhs_panel);                                             \
    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                                                                       \
    traits.madd(A1, rhs_panel, C11, T0, fix<3>);                                                                      \
    traits.madd(A2, rhs_panel, C19, T0, fix<3>);                                                                      \
    traits.loadRhs(blB + (4 + 8 * K) * Traits::RhsProgress, rhs_panel);                                               \
    traits.madd(A0, rhs_panel, C4, T0, fix<0>);                                                                       \
    traits.madd(A1, rhs_panel, C12, T0, fix<0>);                                                                      \
    traits.madd(A2, rhs_panel, C20, T0, fix<0>);                                                                      \
    traits.updateRhs(blB + (5 + 8 * K) * Traits::RhsProgress, rhs_panel);                                             \
    traits.madd(A0, rhs_panel, C5, T0, fix<1>);                                                                       \
    traits.madd(A1, rhs_panel, C13, T0, fix<1>);                                                                      \
    traits.madd(A2, rhs_panel, C21, T0, fix<1>);                                                                      \
    traits.updateRhs(blB + (6 + 8 * K) * Traits::RhsProgress, rhs_panel);                                             \
    traits.madd(A0, rhs_panel, C6, T0, fix<2>);                                                                       \
    traits.madd(A1, rhs_panel, C14, T0, fix<2>);                                                                      \
    traits.madd(A2, rhs_panel, C22, T0, fix<2>);                                                                      \
    traits.updateRhs(blB + (7 + 8 * K) * Traits::RhsProgress, rhs_panel);                                             \
    traits.madd(A0, rhs_panel, C7, T0, fix<3>);                                                                       \
    traits.madd(A1, rhs_panel, C15, T0, fix<3>);                                                                      \
    traits.madd(A2, rhs_panel, C23, T0, fix<3>);                                                                      \
    EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX8");                                                          \
  } while (false)

              EIGEN_GEBP_ONESTEP(0);
              EIGEN_GEBP_ONESTEP(1);
              EIGEN_GEBP_ONESTEP(2);
              EIGEN_GEBP_ONESTEP(3);
              EIGEN_GEBP_ONESTEP(4);
              EIGEN_GEBP_ONESTEP(5);
              EIGEN_GEBP_ONESTEP(6);
              EIGEN_GEBP_ONESTEP(7);

              blB += pk * 8 * RhsProgress;
              blA += pk * 3 * Traits::LhsProgress;
              EIGEN_ASM_COMMENT("end gebp micro kernel 3pX8");
            }

            // process remaining peeled loop
            for (Index k = peeled_kc; k < depth; k++) {
              RhsPanel27 rhs_panel;
              RhsPacket T0;
              LhsPacket A2;
              EIGEN_GEBP_ONESTEP(0);
              blB += 8 * RhsProgress;
              blA += 3 * Traits::LhsProgress;
            }

#undef EIGEN_GEBP_ONESTEP

            ResPacket R0, R1, R2;
            ResPacket alphav = pset1<ResPacket>(alpha);

            R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
            R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
            R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
            traits.acc(C0, alphav, R0);
            traits.acc(C8, alphav, R1);
            traits.acc(C16, alphav, R2);
            r0.storePacket(0 * Traits::ResPacketSize, R0);
            r0.storePacket(1 * Traits::ResPacketSize, R1);
            r0.storePacket(2 * Traits::ResPacketSize, R2);

            R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
            R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
            R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
            traits.acc(C1, alphav, R0);
            traits.acc(C9, alphav, R1);
            traits.acc(C17, alphav, R2);
            r1.storePacket(0 * Traits::ResPacketSize, R0);
            r1.storePacket(1 * Traits::ResPacketSize, R1);
            r1.storePacket(2 * Traits::ResPacketSize, R2);

            R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
            R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
            R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
            traits.acc(C2, alphav, R0);
            traits.acc(C10, alphav, R1);
            traits.acc(C18, alphav, R2);
            r2.storePacket(0 * Traits::ResPacketSize, R0);
            r2.storePacket(1 * Traits::ResPacketSize, R1);
            r2.storePacket(2 * Traits::ResPacketSize, R2);

            R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
            R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
            R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1644  traits.acc(C3, alphav, R0);
1645  traits.acc(C11, alphav, R1);
1646  traits.acc(C19, alphav, R2);
1647  r3.storePacket(0 * Traits::ResPacketSize, R0);
1648  r3.storePacket(1 * Traits::ResPacketSize, R1);
1649  r3.storePacket(2 * Traits::ResPacketSize, R2);
1650 
1651  R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1652  R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1653  R2 = r4.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1654  traits.acc(C4, alphav, R0);
1655  traits.acc(C12, alphav, R1);
1656  traits.acc(C20, alphav, R2);
1657  r4.storePacket(0 * Traits::ResPacketSize, R0);
1658  r4.storePacket(1 * Traits::ResPacketSize, R1);
1659  r4.storePacket(2 * Traits::ResPacketSize, R2);
1660 
1661  R0 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1662  R1 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1663  R2 = r5.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1664  traits.acc(C5, alphav, R0);
1665  traits.acc(C13, alphav, R1);
1666  traits.acc(C21, alphav, R2);
1667  r5.storePacket(0 * Traits::ResPacketSize, R0);
1668  r5.storePacket(1 * Traits::ResPacketSize, R1);
1669  r5.storePacket(2 * Traits::ResPacketSize, R2);
1670 
1671  R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1672  R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1673  R2 = r6.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1674  traits.acc(C6, alphav, R0);
1675  traits.acc(C14, alphav, R1);
1676  traits.acc(C22, alphav, R2);
1677  r6.storePacket(0 * Traits::ResPacketSize, R0);
1678  r6.storePacket(1 * Traits::ResPacketSize, R1);
1679  r6.storePacket(2 * Traits::ResPacketSize, R2);
1680 
1681  R0 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1682  R1 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1683  R2 = r7.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1684  traits.acc(C7, alphav, R0);
1685  traits.acc(C15, alphav, R1);
1686  traits.acc(C23, alphav, R2);
1687  r7.storePacket(0 * Traits::ResPacketSize, R0);
1688  r7.storePacket(1 * Traits::ResPacketSize, R1);
1689  r7.storePacket(2 * Traits::ResPacketSize, R2);
1690  }
1691  }
1692  }
1693 #endif
1694  for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
1695  for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
1696  // We selected a 3*Traits::LhsProgress x nr micro block of res that is
1697  // stored entirely in 3 x nr registers.
1698 
1699  const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
1700  prefetch(&blA[0]);
1701 
1702  // gets the res block as registers
1703  AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;
1704  traits.initAcc(C0);
1705  traits.initAcc(C1);
1706  traits.initAcc(C2);
1707  traits.initAcc(C3);
1708  traits.initAcc(C4);
1709  traits.initAcc(C5);
1710  traits.initAcc(C6);
1711  traits.initAcc(C7);
1712  traits.initAcc(C8);
1713  traits.initAcc(C9);
1714  traits.initAcc(C10);
1715  traits.initAcc(C11);
1716 
1717  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1718  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1719  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1720  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1721 
1722  r0.prefetch(0);
1723  r1.prefetch(0);
1724  r2.prefetch(0);
1725  r3.prefetch(0);
1726 
1727  // performs "inner" products
1728  const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
1729  prefetch(&blB[0]);
1730  LhsPacket A0, A1;
1731 
1732  for (Index k = 0; k < peeled_kc; k += pk) {
1733  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
1734  // 15 registers are taken (12 for acc, 3 for lhs).
1735  RhsPanel15 rhs_panel;
1736  RhsPacket T0;
1737  LhsPacket A2;
1738 #if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
1739 // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
1740 // without this workaround A0, A1, and A2 are loaded in the same register,
1741 // which is not good for pipelining
1742 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
1743 #else
1744 #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
1745 #endif
1746 #define EIGEN_GEBP_ONESTEP(K) \
1747  do { \
1748  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
1749  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1750  internal::prefetch(blA + (3 * K + 16) * LhsProgress); \
1751  if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) { \
1752  internal::prefetch(blB + (4 * K + 16) * RhsProgress); \
1753  } /* Bug 953 */ \
1754  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1755  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1756  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1757  EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
1758  traits.loadRhs(blB + (0 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1759  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
1760  traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
1761  traits.madd(A2, rhs_panel, C8, T0, fix<0>); \
1762  traits.updateRhs(blB + (1 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1763  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
1764  traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
1765  traits.madd(A2, rhs_panel, C9, T0, fix<1>); \
1766  traits.updateRhs(blB + (2 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1767  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
1768  traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
1769  traits.madd(A2, rhs_panel, C10, T0, fix<2>); \
1770  traits.updateRhs(blB + (3 + 4 * K) * Traits::RhsProgress, rhs_panel); \
1771  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
1772  traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
1773  traits.madd(A2, rhs_panel, C11, T0, fix<3>); \
1774  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
1775  } while (false)
1776 
1777  internal::prefetch(blB);
1778  EIGEN_GEBP_ONESTEP(0);
1779  EIGEN_GEBP_ONESTEP(1);
1780  EIGEN_GEBP_ONESTEP(2);
1781  EIGEN_GEBP_ONESTEP(3);
1782  EIGEN_GEBP_ONESTEP(4);
1783  EIGEN_GEBP_ONESTEP(5);
1784  EIGEN_GEBP_ONESTEP(6);
1785  EIGEN_GEBP_ONESTEP(7);
1786 
1787  blB += pk * 4 * RhsProgress;
1788  blA += pk * 3 * Traits::LhsProgress;
1789 
1790  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
1791  }
1792  // process remaining peeled loop
1793  for (Index k = peeled_kc; k < depth; k++) {
1794  RhsPanel15 rhs_panel;
1795  RhsPacket T0;
1796  LhsPacket A2;
1797  EIGEN_GEBP_ONESTEP(0);
1798  blB += 4 * RhsProgress;
1799  blA += 3 * Traits::LhsProgress;
1800  }
1801 
1802 #undef EIGEN_GEBP_ONESTEP
1803 
1804  ResPacket R0, R1, R2;
1805  ResPacket alphav = pset1<ResPacket>(alpha);
1806 
1807  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1808  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1809  R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1810  traits.acc(C0, alphav, R0);
1811  traits.acc(C4, alphav, R1);
1812  traits.acc(C8, alphav, R2);
1813  r0.storePacket(0 * Traits::ResPacketSize, R0);
1814  r0.storePacket(1 * Traits::ResPacketSize, R1);
1815  r0.storePacket(2 * Traits::ResPacketSize, R2);
1816 
1817  R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1818  R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1819  R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1820  traits.acc(C1, alphav, R0);
1821  traits.acc(C5, alphav, R1);
1822  traits.acc(C9, alphav, R2);
1823  r1.storePacket(0 * Traits::ResPacketSize, R0);
1824  r1.storePacket(1 * Traits::ResPacketSize, R1);
1825  r1.storePacket(2 * Traits::ResPacketSize, R2);
1826 
1827  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1828  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1829  R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1830  traits.acc(C2, alphav, R0);
1831  traits.acc(C6, alphav, R1);
1832  traits.acc(C10, alphav, R2);
1833  r2.storePacket(0 * Traits::ResPacketSize, R0);
1834  r2.storePacket(1 * Traits::ResPacketSize, R1);
1835  r2.storePacket(2 * Traits::ResPacketSize, R2);
1836 
1837  R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1838  R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1839  R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1840  traits.acc(C3, alphav, R0);
1841  traits.acc(C7, alphav, R1);
1842  traits.acc(C11, alphav, R2);
1843  r3.storePacket(0 * Traits::ResPacketSize, R0);
1844  r3.storePacket(1 * Traits::ResPacketSize, R1);
1845  r3.storePacket(2 * Traits::ResPacketSize, R2);
1846  }
1847  }
1848 
1849  // Deal with remaining columns of the rhs
1850  for (Index j2 = packet_cols4; j2 < cols; j2++) {
1851  for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
1852  // One column at a time
1853  const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * Traits::LhsProgress)];
1854  prefetch(&blA[0]);
1855 
1856  // gets the res block as registers
1857  AccPacket C0, C4, C8;
1858  traits.initAcc(C0);
1859  traits.initAcc(C4);
1860  traits.initAcc(C8);
1861 
1862  LinearMapper r0 = res.getLinearMapper(i, j2);
1863  r0.prefetch(0);
1864 
1865  // performs "inner" products
1866  const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
1867  LhsPacket A0, A1, A2;
1868 
1869  for (Index k = 0; k < peeled_kc; k += pk) {
1870  EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
1871  RhsPacket B_0;
1872 #define EIGEN_GEBGP_ONESTEP(K) \
1873  do { \
1874  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
1875  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
1876  traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0); \
1877  traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1); \
1878  traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2); \
1879  traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
1880  traits.madd(A0, B_0, C0, B_0, fix<0>); \
1881  traits.madd(A1, B_0, C4, B_0, fix<0>); \
1882  traits.madd(A2, B_0, C8, B_0, fix<0>); \
1883  EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
1884  } while (false)
1885 
1885 
1886  EIGEN_GEBGP_ONESTEP(0);
1887  EIGEN_GEBGP_ONESTEP(1);
1888  EIGEN_GEBGP_ONESTEP(2);
1889  EIGEN_GEBGP_ONESTEP(3);
1890  EIGEN_GEBGP_ONESTEP(4);
1891  EIGEN_GEBGP_ONESTEP(5);
1892  EIGEN_GEBGP_ONESTEP(6);
1893  EIGEN_GEBGP_ONESTEP(7);
1894 
1895  blB += int(pk) * int(RhsProgress);
1896  blA += int(pk) * 3 * int(Traits::LhsProgress);
1897 
1898  EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
1899  }
1900 
1901  // process remaining peeled loop
1902  for (Index k = peeled_kc; k < depth; k++) {
1903  RhsPacket B_0;
1904  EIGEN_GEBGP_ONESTEP(0);
1905  blB += RhsProgress;
1906  blA += 3 * Traits::LhsProgress;
1907  }
1908 #undef EIGEN_GEBGP_ONESTEP
1909  ResPacket R0, R1, R2;
1910  ResPacket alphav = pset1<ResPacket>(alpha);
1911 
1912  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
1913  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
1914  R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
1915  traits.acc(C0, alphav, R0);
1916  traits.acc(C4, alphav, R1);
1917  traits.acc(C8, alphav, R2);
1918  r0.storePacket(0 * Traits::ResPacketSize, R0);
1919  r0.storePacket(1 * Traits::ResPacketSize, R1);
1920  r0.storePacket(2 * Traits::ResPacketSize, R2);
1921  }
1922  }
1923  }
1924  }
1925 
1926  //---------- Process 2 * LhsProgress rows at once ----------
1927  if (mr >= 2 * Traits::LhsProgress) {
1928  const Index l1 = defaultL1CacheSize; // in bytes; TODO: l1 should be passed to this function.
1929  // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
1930  // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
1931  // guess), or because we are testing specific blocking sizes.
1932  Index actual_panel_rows =
1933  (2 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
1934  (depth * sizeof(LhsScalar) * 2 * LhsProgress)));
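// Illustrative arithmetic, not from the source: with l1 = 32 KiB, 4-byte
// scalars, LhsProgress = 4, mr = 8, nr = 4 and depth = 256, the bytes left
// for lhs panels are 32768 - 4*8*4 - 256*4*4 = 28544, giving
// actual_panel_rows = 8 * max(1, 28544 / (256*4*2*4)) = 8 * 3 = 24 rows.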
1935 
1936  for (Index i1 = peeled_mc3; i1 < peeled_mc2; i1 += actual_panel_rows) {
1937  Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc2);
1938 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
1939  EIGEN_IF_CONSTEXPR(nr >= 8) {
1940  for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
1941  for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
1942  const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
1943  prefetch(&blA[0]);
1944 
1945  AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15;
1946  traits.initAcc(C0);
1947  traits.initAcc(C1);
1948  traits.initAcc(C2);
1949  traits.initAcc(C3);
1950  traits.initAcc(C4);
1951  traits.initAcc(C5);
1952  traits.initAcc(C6);
1953  traits.initAcc(C7);
1954  traits.initAcc(C8);
1955  traits.initAcc(C9);
1956  traits.initAcc(C10);
1957  traits.initAcc(C11);
1958  traits.initAcc(C12);
1959  traits.initAcc(C13);
1960  traits.initAcc(C14);
1961  traits.initAcc(C15);
1962 
1963  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
1964  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
1965  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
1966  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
1967  LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
1968  LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
1969  LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
1970  LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
1971  r0.prefetch(prefetch_res_offset);
1972  r1.prefetch(prefetch_res_offset);
1973  r2.prefetch(prefetch_res_offset);
1974  r3.prefetch(prefetch_res_offset);
1975  r4.prefetch(prefetch_res_offset);
1976  r5.prefetch(prefetch_res_offset);
1977  r6.prefetch(prefetch_res_offset);
1978  r7.prefetch(prefetch_res_offset);
1979 
1980  const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
1981  prefetch(&blB[0]);
1982  LhsPacket A0, A1;
1983  for (Index k = 0; k < peeled_kc; k += pk) {
1984  RhsPacketx4 rhs_panel;
1985  RhsPacket T0;
1986 // NOTE: the begin/end asm comments below work around bug 935!
1987 // but they are not enough for gcc>=6 without FMA (bug 1637)
1988 #if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE)
1989 #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
1990 #else
1991 #define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND
1992 #endif
1993 #define EIGEN_GEBGP_ONESTEP(K) \
1994  do { \
1995  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX8"); \
1996  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
1997  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
1998  traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel); \
1999  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
2000  traits.madd(A1, rhs_panel, C8, T0, fix<0>); \
2001  traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
2002  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
2003  traits.madd(A1, rhs_panel, C9, T0, fix<1>); \
2004  traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
2005  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
2006  traits.madd(A1, rhs_panel, C10, T0, fix<2>); \
2007  traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
2008  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
2009  traits.madd(A1, rhs_panel, C11, T0, fix<3>); \
2010  traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel); \
2011  traits.madd(A0, rhs_panel, C4, T0, fix<0>); \
2012  traits.madd(A1, rhs_panel, C12, T0, fix<0>); \
2013  traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
2014  traits.madd(A0, rhs_panel, C5, T0, fix<1>); \
2015  traits.madd(A1, rhs_panel, C13, T0, fix<1>); \
2016  traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
2017  traits.madd(A0, rhs_panel, C6, T0, fix<2>); \
2018  traits.madd(A1, rhs_panel, C14, T0, fix<2>); \
2019  traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
2020  traits.madd(A0, rhs_panel, C7, T0, fix<3>); \
2021  traits.madd(A1, rhs_panel, C15, T0, fix<3>); \
2022  EIGEN_GEBP_2Px8_SPILLING_WORKAROUND EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX8"); \
2023  } while (false)
2024 
2025  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX8");
2026 
2026 
2027  EIGEN_GEBGP_ONESTEP(0);
2028  EIGEN_GEBGP_ONESTEP(1);
2029  EIGEN_GEBGP_ONESTEP(2);
2030  EIGEN_GEBGP_ONESTEP(3);
2031  EIGEN_GEBGP_ONESTEP(4);
2032  EIGEN_GEBGP_ONESTEP(5);
2033  EIGEN_GEBGP_ONESTEP(6);
2034  EIGEN_GEBGP_ONESTEP(7);
2035 
2036  blB += pk * 8 * RhsProgress;
2037  blA += pk * (2 * Traits::LhsProgress);
2038 
2039  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX8");
2040  }
2041  // process remaining peeled loop
2042  for (Index k = peeled_kc; k < depth; k++) {
2043  RhsPacketx4 rhs_panel;
2044  RhsPacket T0;
2045  EIGEN_GEBGP_ONESTEP(0);
2046  blB += 8 * RhsProgress;
2047  blA += 2 * Traits::LhsProgress;
2048  }
2049 
2050 #undef EIGEN_GEBGP_ONESTEP
2051 
2052  ResPacket R0, R1, R2, R3;
2053  ResPacket alphav = pset1<ResPacket>(alpha);
2054 
2055  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2056  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2057  R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2058  R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2059  traits.acc(C0, alphav, R0);
2060  traits.acc(C8, alphav, R1);
2061  traits.acc(C1, alphav, R2);
2062  traits.acc(C9, alphav, R3);
2063  r0.storePacket(0 * Traits::ResPacketSize, R0);
2064  r0.storePacket(1 * Traits::ResPacketSize, R1);
2065  r1.storePacket(0 * Traits::ResPacketSize, R2);
2066  r1.storePacket(1 * Traits::ResPacketSize, R3);
2067 
2068  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2069  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2070  R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2071  R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2072  traits.acc(C2, alphav, R0);
2073  traits.acc(C10, alphav, R1);
2074  traits.acc(C3, alphav, R2);
2075  traits.acc(C11, alphav, R3);
2076  r2.storePacket(0 * Traits::ResPacketSize, R0);
2077  r2.storePacket(1 * Traits::ResPacketSize, R1);
2078  r3.storePacket(0 * Traits::ResPacketSize, R2);
2079  r3.storePacket(1 * Traits::ResPacketSize, R3);
2080 
2081  R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2082  R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2083  R2 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2084  R3 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2085  traits.acc(C4, alphav, R0);
2086  traits.acc(C12, alphav, R1);
2087  traits.acc(C5, alphav, R2);
2088  traits.acc(C13, alphav, R3);
2089  r4.storePacket(0 * Traits::ResPacketSize, R0);
2090  r4.storePacket(1 * Traits::ResPacketSize, R1);
2091  r5.storePacket(0 * Traits::ResPacketSize, R2);
2092  r5.storePacket(1 * Traits::ResPacketSize, R3);
2093 
2094  R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2095  R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2096  R2 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2097  R3 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2098  traits.acc(C6, alphav, R0);
2099  traits.acc(C14, alphav, R1);
2100  traits.acc(C7, alphav, R2);
2101  traits.acc(C15, alphav, R3);
2102  r6.storePacket(0 * Traits::ResPacketSize, R0);
2103  r6.storePacket(1 * Traits::ResPacketSize, R1);
2104  r7.storePacket(0 * Traits::ResPacketSize, R2);
2105  r7.storePacket(1 * Traits::ResPacketSize, R3);
2106  }
2107  }
2108  }
2109 #endif
2110  for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
2111  for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
2112  // We selected a 2*Traits::LhsProgress x nr micro block of res that is
2113  // stored entirely in 2 x nr registers.
2114 
2115  const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
2116  prefetch(&blA[0]);
2117 
2118  // gets the res block as registers
2119  AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
2120  traits.initAcc(C0);
2121  traits.initAcc(C1);
2122  traits.initAcc(C2);
2123  traits.initAcc(C3);
2124  traits.initAcc(C4);
2125  traits.initAcc(C5);
2126  traits.initAcc(C6);
2127  traits.initAcc(C7);
2128 
2129  LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
2130  LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
2131  LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
2132  LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
2133 
2134  r0.prefetch(prefetch_res_offset);
2135  r1.prefetch(prefetch_res_offset);
2136  r2.prefetch(prefetch_res_offset);
2137  r3.prefetch(prefetch_res_offset);
2138 
2139  // performs "inner" products
2140  const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
2141  prefetch(&blB[0]);
2142  LhsPacket A0, A1;
2143 
2144  for (Index k = 0; k < peeled_kc; k += pk) {
2145  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
2146  RhsPacketx4 rhs_panel;
2147  RhsPacket T0;
2148 
2149 // NOTE: the begin/end asm comments below work around bug 935!
2150 // but they are not enough for gcc>=6 without FMA (bug 1637)
2151 #if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
2152 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
2153 #else
2154 #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
2155 #endif
2156 #define EIGEN_GEBGP_ONESTEP(K) \
2157  do { \
2158  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \
2159  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
2160  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
2161  traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
2162  traits.madd(A0, rhs_panel, C0, T0, fix<0>); \
2163  traits.madd(A1, rhs_panel, C4, T0, fix<0>); \
2164  traits.madd(A0, rhs_panel, C1, T0, fix<1>); \
2165  traits.madd(A1, rhs_panel, C5, T0, fix<1>); \
2166  traits.madd(A0, rhs_panel, C2, T0, fix<2>); \
2167  traits.madd(A1, rhs_panel, C6, T0, fix<2>); \
2168  traits.madd(A0, rhs_panel, C3, T0, fix<3>); \
2169  traits.madd(A1, rhs_panel, C7, T0, fix<3>); \
2170  EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \
2171  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \
2172  } while (false)
2173 
2174  internal::prefetch(blB + (48 + 0));
2175  EIGEN_GEBGP_ONESTEP(0);
2176  EIGEN_GEBGP_ONESTEP(1);
2177  EIGEN_GEBGP_ONESTEP(2);
2178  EIGEN_GEBGP_ONESTEP(3);
2179  internal::prefetch(blB + (48 + 16));
2180  EIGEN_GEBGP_ONESTEP(4);
2181  EIGEN_GEBGP_ONESTEP(5);
2182  EIGEN_GEBGP_ONESTEP(6);
2183  EIGEN_GEBGP_ONESTEP(7);
2184 
2185  blB += pk * 4 * RhsProgress;
2186  blA += pk * (2 * Traits::LhsProgress);
2187 
2188  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
2189  }
2190  // process remaining peeled loop
2191  for (Index k = peeled_kc; k < depth; k++) {
2192  RhsPacketx4 rhs_panel;
2193  RhsPacket T0;
2194  EIGEN_GEBGP_ONESTEP(0);
2195  blB += 4 * RhsProgress;
2196  blA += 2 * Traits::LhsProgress;
2197  }
2198 #undef EIGEN_GEBGP_ONESTEP
2199 
2200  ResPacket R0, R1, R2, R3;
2201  ResPacket alphav = pset1<ResPacket>(alpha);
2202 
2203  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2204  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2205  R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2206  R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2207  traits.acc(C0, alphav, R0);
2208  traits.acc(C4, alphav, R1);
2209  traits.acc(C1, alphav, R2);
2210  traits.acc(C5, alphav, R3);
2211  r0.storePacket(0 * Traits::ResPacketSize, R0);
2212  r0.storePacket(1 * Traits::ResPacketSize, R1);
2213  r1.storePacket(0 * Traits::ResPacketSize, R2);
2214  r1.storePacket(1 * Traits::ResPacketSize, R3);
2215 
2216  R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2217  R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2218  R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2219  R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2220  traits.acc(C2, alphav, R0);
2221  traits.acc(C6, alphav, R1);
2222  traits.acc(C3, alphav, R2);
2223  traits.acc(C7, alphav, R3);
2224  r2.storePacket(0 * Traits::ResPacketSize, R0);
2225  r2.storePacket(1 * Traits::ResPacketSize, R1);
2226  r3.storePacket(0 * Traits::ResPacketSize, R2);
2227  r3.storePacket(1 * Traits::ResPacketSize, R3);
2228  }
2229  }
2230 
2231  // Deal with remaining columns of the rhs
2232  for (Index j2 = packet_cols4; j2 < cols; j2++) {
2233  for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
2234  // One column at a time
2235  const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
2236  prefetch(&blA[0]);
2237 
2238  // gets the res block as registers
2239  AccPacket C0, C4;
2240  traits.initAcc(C0);
2241  traits.initAcc(C4);
2242 
2243  LinearMapper r0 = res.getLinearMapper(i, j2);
2244  r0.prefetch(prefetch_res_offset);
2245 
2246  // performs "inner" products
2247  const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
2248  LhsPacket A0, A1;
2249 
2250  for (Index k = 0; k < peeled_kc; k += pk) {
2251  EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
2252  RhsPacket B_0, B1;
2253 
2254 #define EIGEN_GEBGP_ONESTEP(K) \
2255  do { \
2256  EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1"); \
2257  EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
2258  traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0); \
2259  traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1); \
2260  traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0); \
2261  traits.madd(A0, B_0, C0, B1, fix<0>); \
2262  traits.madd(A1, B_0, C4, B_0, fix<0>); \
2263  EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \
2264  } while (false)
2265 
2266  EIGEN_GEBGP_ONESTEP(0);
2267  EIGEN_GEBGP_ONESTEP(1);
2268  EIGEN_GEBGP_ONESTEP(2);
2269  EIGEN_GEBGP_ONESTEP(3);
2270  EIGEN_GEBGP_ONESTEP(4);
2271  EIGEN_GEBGP_ONESTEP(5);
2272  EIGEN_GEBGP_ONESTEP(6);
2273  EIGEN_GEBGP_ONESTEP(7);
2274 
2275  blB += int(pk) * int(RhsProgress);
2276  blA += int(pk) * 2 * int(Traits::LhsProgress);
2277 
2278  EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
2279  }
2280 
2281  // process remaining peeled loop
2282  for (Index k = peeled_kc; k < depth; k++) {
2283  RhsPacket B_0, B1;
2284  EIGEN_GEBGP_ONESTEP(0);
2285  blB += RhsProgress;
2286  blA += 2 * Traits::LhsProgress;
2287  }
2288 #undef EIGEN_GEBGP_ONESTEP
2289  ResPacket R0, R1;
2290  ResPacket alphav = pset1<ResPacket>(alpha);
2291 
2292  R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
2293  R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
2294  traits.acc(C0, alphav, R0);
2295  traits.acc(C4, alphav, R1);
2296  r0.storePacket(0 * Traits::ResPacketSize, R0);
2297  r0.storePacket(1 * Traits::ResPacketSize, R1);
2298  }
2299  }
2300  }
2301  }
2302  //---------- Process 1 * LhsProgress rows at once ----------
2303  if (mr >= 1 * Traits::LhsProgress) {
2304  lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
2305  RhsPacket, ResPacket, Traits, LinearMapper, DataMapper>
2306  p;
2307  p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
2308  peeled_kc, pk, cols, depth, packet_cols4);
2309  }
2310  //---------- Process LhsProgressHalf rows at once ----------
2311  if ((LhsProgressHalf < LhsProgress) && mr >= LhsProgressHalf) {
2312  lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf,
2313  LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper>
2314  p;
2315  p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
2316  peeled_kc, pk, cols, depth, packet_cols4);
2317  }
2318  //---------- Process LhsProgressQuarter rows at once ----------
2319  if ((LhsProgressQuarter < LhsProgressHalf) && mr >= LhsProgressQuarter) {
2320  lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar,
2321  AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter,
2322  QuarterTraits, LinearMapper, DataMapper>
2323  p;
2324  p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB,
2325  prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
2326  }
2327  //---------- Process remaining rows, 1 at once ----------
2328  if (peeled_mc_quarter < rows) {
2329 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
2330  EIGEN_IF_CONSTEXPR(nr >= 8) {
2331  // loop on each panel of the rhs
2332  for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
2333  // loop on each row of the lhs (1*LhsProgress x depth)
2334  for (Index i = peeled_mc_quarter; i < rows; i += 1) {
2335  const LhsScalar* blA = &blockA[i * strideA + offsetA];
2336  prefetch(&blA[0]);
2337  // gets a 1 x 8 res block as registers
2338  ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0);
2339  const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
2340  for (Index k = 0; k < depth; k++) {
2341  LhsScalar A0 = blA[k];
2342  RhsScalar B_0;
2343 
2344  B_0 = blB[0];
2345  C0 = cj.pmadd(A0, B_0, C0);
2346 
2347  B_0 = blB[1];
2348  C1 = cj.pmadd(A0, B_0, C1);
2349 
2350  B_0 = blB[2];
2351  C2 = cj.pmadd(A0, B_0, C2);
2352 
2353  B_0 = blB[3];
2354  C3 = cj.pmadd(A0, B_0, C3);
2355 
2356  B_0 = blB[4];
2357  C4 = cj.pmadd(A0, B_0, C4);
2358 
2359  B_0 = blB[5];
2360  C5 = cj.pmadd(A0, B_0, C5);
2361 
2362  B_0 = blB[6];
2363  C6 = cj.pmadd(A0, B_0, C6);
2364 
2365  B_0 = blB[7];
2366  C7 = cj.pmadd(A0, B_0, C7);
2367 
2368  blB += 8;
2369  }
2370  res(i, j2 + 0) += alpha * C0;
2371  res(i, j2 + 1) += alpha * C1;
2372  res(i, j2 + 2) += alpha * C2;
2373  res(i, j2 + 3) += alpha * C3;
2374  res(i, j2 + 4) += alpha * C4;
2375  res(i, j2 + 5) += alpha * C5;
2376  res(i, j2 + 6) += alpha * C6;
2377  res(i, j2 + 7) += alpha * C7;
2378  }
2379  }
2380  }
2381 #endif
2382 
2383  for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
2384  // loop on each row of the lhs (1*LhsProgress x depth)
2385  for (Index i = peeled_mc_quarter; i < rows; i += 1) {
2386  const LhsScalar* blA = &blockA[i * strideA + offsetA];
2387  prefetch(&blA[0]);
2388  const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
2389 
2390  // If LhsProgress is 8 or 16, the code below assumes there is a
2391  // half or quarter packet, respectively, of the same size as
2392  // nr (which is currently 4) for the return type.
2393  const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
2394  const int SResPacketQuarterSize =
2395  unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
2396  // The following code assumes we can load SRhsPacket in such a way that
2397  // it multiplies blocks of 4 elements in SLhsPacket. This is not the
2398  // case for some customized kernels (i.e. NEON fp16). If the assumption
2399  // fails, drop down to the scalar path.
2400  constexpr bool kCanLoadSRhsQuad =
2401  (unpacket_traits<SLhsPacket>::size < 4) ||
2402  (unpacket_traits<SRhsPacket>::size % ((std::max<int>)(unpacket_traits<SLhsPacket>::size, 4) / 4)) == 0;
2403  if (kCanLoadSRhsQuad && (SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress <= 16) &&
2404  (SwappedTraits::LhsProgress != 8 || SResPacketHalfSize == nr) &&
2405  (SwappedTraits::LhsProgress != 16 || SResPacketQuarterSize == nr)) {
2406  SAccPacket C0, C1, C2, C3;
2407  straits.initAcc(C0);
2408  straits.initAcc(C1);
2409  straits.initAcc(C2);
2410  straits.initAcc(C3);
2411 
2412  const Index spk = (std::max)(1, SwappedTraits::LhsProgress / 4);
2413  const Index endk = (depth / spk) * spk;
2414  const Index endk4 = (depth / (spk * 4)) * (spk * 4);
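// (spk is the depth advance per swapped micro-step; endk4 bounds the 4-way
// unrolled loop below and endk the single-step loop that follows it)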
2415 
2416  Index k = 0;
2417  for (; k < endk4; k += 4 * spk) {
2418  SLhsPacket A0, A1;
2419  SRhsPacket B_0, B_1;
2420 
2421  straits.loadLhsUnaligned(blB + 0 * SwappedTraits::LhsProgress, A0);
2422  straits.loadLhsUnaligned(blB + 1 * SwappedTraits::LhsProgress, A1);
2423 
2424  straits.loadRhsQuad(blA + 0 * spk, B_0);
2425  straits.loadRhsQuad(blA + 1 * spk, B_1);
2426  straits.madd(A0, B_0, C0, B_0, fix<0>);
2427  straits.madd(A1, B_1, C1, B_1, fix<0>);
2428 
2429  straits.loadLhsUnaligned(blB + 2 * SwappedTraits::LhsProgress, A0);
2430  straits.loadLhsUnaligned(blB + 3 * SwappedTraits::LhsProgress, A1);
2431  straits.loadRhsQuad(blA + 2 * spk, B_0);
2432  straits.loadRhsQuad(blA + 3 * spk, B_1);
2433  straits.madd(A0, B_0, C2, B_0, fix<0>);
2434  straits.madd(A1, B_1, C3, B_1, fix<0>);
2435 
2436  blB += 4 * SwappedTraits::LhsProgress;
2437  blA += 4 * spk;
2438  }
2439  C0 = padd(padd(C0, C1), padd(C2, C3));
2440  for (; k < endk; k += spk) {
2441  SLhsPacket A0;
2442  SRhsPacket B_0;
2443 
2444  straits.loadLhsUnaligned(blB, A0);
2445  straits.loadRhsQuad(blA, B_0);
2446  straits.madd(A0, B_0, C0, B_0, fix<0>);
2447 
2448  blB += SwappedTraits::LhsProgress;
2449  blA += spk;
2450  }
2451  if (SwappedTraits::LhsProgress == 8) {
2452  // Special case where we have to first reduce the accumulation register C0
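// (the swapped kernel accumulated with 8-wide packets while only nr == 4
// results are needed, so predux_half_dowto4 folds the two 4-wide halves of
// C0 together before the final scaled accumulation into res)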
2453  typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SResPacket>::half,
2454  SResPacket>
2455  SResPacketHalf;
2456  typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SLhsPacket>::half,
2457  SLhsPacket>
2458  SLhsPacketHalf;
2459  typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SRhsPacket>::half,
2460  SRhsPacket>
2461  SRhsPacketHalf;
2462  typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SAccPacket>::half,
2463  SAccPacket>
2464  SAccPacketHalf;
2465 
2466  SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
2467  SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
2468 
2469  if (depth - endk > 0) {
2470  // We have to handle the last row of the rhs which corresponds to a half-packet
2471  SLhsPacketHalf a0;
2472  SRhsPacketHalf b0;
2473  straits.loadLhsUnaligned(blB, a0);
2474  straits.loadRhs(blA, b0);
2475  SAccPacketHalf c0 = predux_half_dowto4(C0);
2476  straits.madd(a0, b0, c0, b0, fix<0>);
2477  straits.acc(c0, alphav, R);
2478  } else {
2479  straits.acc(predux_half_dowto4(C0), alphav, R);
2480  }
2481  res.scatterPacket(i, j2, R);
2482  } else if (SwappedTraits::LhsProgress == 16) {
2483  // Special case where we have to first reduce the
2484  // accumulation register C0. We specialize the block in
2485  // template form, so that LhsProgress < 16 paths don't
2486  // fail to compile
2487  last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
2488  p(res, straits, blA, blB, depth, endk, i, j2, alpha, C0);
2489  } else {
2490  SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
2491  SResPacket alphav = pset1<SResPacket>(alpha);
2492  straits.acc(C0, alphav, R);
2493  res.scatterPacket(i, j2, R);
2494  }
2495  } else // scalar path
2496  {
2497  // get a 1 x 4 res block as registers
2498  ResScalar C0(0), C1(0), C2(0), C3(0);
2499 
2500  for (Index k = 0; k < depth; k++) {
2501  LhsScalar A0;
2502  RhsScalar B_0, B_1;
2503 
2504  A0 = blA[k];
2505 
2506  B_0 = blB[0];
2507  B_1 = blB[1];
2508  C0 = cj.pmadd(A0, B_0, C0);
2509  C1 = cj.pmadd(A0, B_1, C1);
2510 
2511  B_0 = blB[2];
2512  B_1 = blB[3];
2513  C2 = cj.pmadd(A0, B_0, C2);
2514  C3 = cj.pmadd(A0, B_1, C3);
2515 
2516  blB += 4;
2517  }
2518  res(i, j2 + 0) += alpha * C0;
2519  res(i, j2 + 1) += alpha * C1;
2520  res(i, j2 + 2) += alpha * C2;
2521  res(i, j2 + 3) += alpha * C3;
2522  }
2523  }
2524  }
2525  // remaining columns
2526  for (Index j2 = packet_cols4; j2 < cols; j2++) {
2527  // loop on each row of the lhs (1*LhsProgress x depth)
2528  for (Index i = peeled_mc_quarter; i < rows; i += 1) {
2529  const LhsScalar* blA = &blockA[i * strideA + offsetA];
2530  prefetch(&blA[0]);
2531  // gets a 1 x 1 res block as registers
2532  ResScalar C0(0);
2533  const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
2534  for (Index k = 0; k < depth; k++) {
2535  LhsScalar A0 = blA[k];
2536  RhsScalar B_0 = blB[k];
2537  C0 = cj.pmadd(A0, B_0, C0);
2538  }
2539  res(i, j2) += alpha * C0;
2540  }
2541  }
2542  }
2543 }
2544 
2545 // pack a block of the lhs
2546 // The traversal is as follows (mr==4):
2547 // 0 4 8 12 ...
2548 // 1 5 9 13 ...
2549 // 2 6 10 14 ...
2550 // 3 7 11 15 ...
2551 //
2552 // 16 20 24 28 ...
2553 // 17 21 25 29 ...
2554 // 18 22 26 30 ...
2555 // 19 23 27 31 ...
2556 //
2557 // 32 33 34 35 ...
2558 // 36 37 38 39 ...
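//
// A minimal scalar reference of this traversal (illustrative only, not part
// of Eigen; the function name and the fixed mr == 4 are assumptions):
//
//   void pack_lhs_ref(float* blockA, const float* lhs, int lhsStride, int rows, int depth) {
//     const int mr = 4;                  // panel height, as in the diagram above
//     int count = 0, i = 0;
//     for (; i + mr <= rows; i += mr)    // one mr-row panel at a time
//       for (int k = 0; k < depth; ++k)  // walk the panel column by column
//         for (int w = 0; w < mr; ++w)   // mr consecutive rows -> consecutive slots
//           blockA[count++] = lhs[(i + w) + k * lhsStride];
//     for (; i < rows; ++i)              // leftover rows, packed one at a time
//       for (int k = 0; k < depth; ++k) blockA[count++] = lhs[i + k * lhsStride];
//   }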
2559 template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
2560  bool PanelMode>
2561 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
2562  typedef typename DataMapper::LinearMapper LinearMapper;
2563  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
2564  Index offset = 0);
2565 };
2566 
2567 template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
2568  bool PanelMode>
2569 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate,
2570  PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
2571  Index rows, Index stride, Index offset) {
2572  typedef typename unpacket_traits<Packet>::half HalfPacket;
2573  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2574  enum {
2575  PacketSize = unpacket_traits<Packet>::size,
2576  HalfPacketSize = unpacket_traits<HalfPacket>::size,
2577  QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2578  HasHalf = (int)HalfPacketSize < (int)PacketSize,
2579  HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize
2580  };
2581 
2582  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
2583  EIGEN_UNUSED_VARIABLE(stride);
2584  EIGEN_UNUSED_VARIABLE(offset);
2585  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
2586  eigen_assert(((Pack1 % PacketSize) == 0 && Pack1 <= 4 * PacketSize) || (Pack1 <= 4));
2587  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2588  Index count = 0;
2589 
2590  const Index peeled_mc3 = Pack1 >= 3 * PacketSize ? (rows / (3 * PacketSize)) * (3 * PacketSize) : 0;
2591  const Index peeled_mc2 =
2592  Pack1 >= 2 * PacketSize ? peeled_mc3 + ((rows - peeled_mc3) / (2 * PacketSize)) * (2 * PacketSize) : 0;
2593  const Index peeled_mc1 =
2594  Pack1 >= 1 * PacketSize ? peeled_mc2 + ((rows - peeled_mc2) / (1 * PacketSize)) * (1 * PacketSize) : 0;
2595  const Index peeled_mc_half =
2596  Pack1 >= HalfPacketSize ? peeled_mc1 + ((rows - peeled_mc1) / (HalfPacketSize)) * (HalfPacketSize) : 0;
2597  const Index peeled_mc_quarter = Pack1 >= QuarterPacketSize ? (rows / (QuarterPacketSize)) * (QuarterPacketSize) : 0;
2598  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
2599  const Index peeled_mc0 = Pack2 >= PacketSize ? peeled_mc_quarter
2600  : Pack2 > 1 && last_lhs_progress ? (rows / last_lhs_progress) * last_lhs_progress
2601  : 0;
2602 
2603  Index i = 0;
2604 
2605  // Pack 3 packets
2606  if (Pack1 >= 3 * PacketSize) {
2607  for (; i < peeled_mc3; i += 3 * PacketSize) {
2608  if (PanelMode) count += (3 * PacketSize) * offset;
2609 
2610  for (Index k = 0; k < depth; k++) {
2611  Packet A, B, C;
2612  A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
2613  B = lhs.template loadPacket<Packet>(i + 1 * PacketSize, k);
2614  C = lhs.template loadPacket<Packet>(i + 2 * PacketSize, k);
2615  pstore(blockA + count, cj.pconj(A));
2616  count += PacketSize;
2617  pstore(blockA + count, cj.pconj(B));
2618  count += PacketSize;
2619  pstore(blockA + count, cj.pconj(C));
2620  count += PacketSize;
2621  }
2622  if (PanelMode) count += (3 * PacketSize) * (stride - offset - depth);
2623  }
2624  }
2625  // Pack 2 packets
2626  if (Pack1 >= 2 * PacketSize) {
2627  for (; i < peeled_mc2; i += 2 * PacketSize) {
2628  if (PanelMode) count += (2 * PacketSize) * offset;
2629 
2630  for (Index k = 0; k < depth; k++) {
2631  Packet A, B;
2632  A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
2633  B = lhs.template loadPacket<Packet>(i + 1 * PacketSize, k);
2634  pstore(blockA + count, cj.pconj(A));
2635  count += PacketSize;
2636  pstore(blockA + count, cj.pconj(B));
2637  count += PacketSize;
2638  }
2639  if (PanelMode) count += (2 * PacketSize) * (stride - offset - depth);
2640  }
2641  }
2642  // Pack 1 packets
2643  if (Pack1 >= 1 * PacketSize) {
2644  for (; i < peeled_mc1; i += 1 * PacketSize) {
2645  if (PanelMode) count += (1 * PacketSize) * offset;
2646 
2647  for (Index k = 0; k < depth; k++) {
2648  Packet A;
2649  A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
2650  pstore(blockA + count, cj.pconj(A));
2651  count += PacketSize;
2652  }
2653  if (PanelMode) count += (1 * PacketSize) * (stride - offset - depth);
2654  }
2655  }
2656  // Pack half packets
2657  if (HasHalf && Pack1 >= HalfPacketSize) {
2658  for (; i < peeled_mc_half; i += HalfPacketSize) {
2659  if (PanelMode) count += (HalfPacketSize)*offset;
2660 
2661  for (Index k = 0; k < depth; k++) {
2662  HalfPacket A;
2663  A = lhs.template loadPacket<HalfPacket>(i + 0 * (HalfPacketSize), k);
2664  pstoreu(blockA + count, cj.pconj(A));
2665  count += HalfPacketSize;
2666  }
2667  if (PanelMode) count += (HalfPacketSize) * (stride - offset - depth);
2668  }
2669  }
2670  // Pack quarter packets
2671  if (HasQuarter && Pack1 >= QuarterPacketSize) {
2672  for (; i < peeled_mc_quarter; i += QuarterPacketSize) {
2673  if (PanelMode) count += (QuarterPacketSize)*offset;
2674 
2675  for (Index k = 0; k < depth; k++) {
2676  QuarterPacket A;
2677  A = lhs.template loadPacket<QuarterPacket>(i + 0 * (QuarterPacketSize), k);
2678  pstoreu(blockA + count, cj.pconj(A));
2679  count += QuarterPacketSize;
2680  }
2681  if (PanelMode) count += (QuarterPacketSize) * (stride - offset - depth);
2682  }
2683  }
2684  // Pack2 may be *smaller* than PacketSize -- that happens for
2685  // products like real * complex, where we have to go half the
2686  // progress on the lhs in order to duplicate those operands to
2687  // address both real & imaginary parts on the rhs. This portion packs
2688  // those half-width panels until their count matches the number expected
2689  // by the last peeling loop of the rhs at this point.
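// (e.g. for a real lhs times a complex rhs each lhs value must be duplicated
// against the real and imaginary rhs parts, so the loop below advances by
// last_lhs_progress, an even row count smaller than a full packet)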
2690  if (Pack2 < PacketSize && Pack2 > 1) {
2691  for (; i < peeled_mc0; i += last_lhs_progress) {
2692  if (PanelMode) count += last_lhs_progress * offset;
2693 
2694  for (Index k = 0; k < depth; k++)
2695  for (Index w = 0; w < last_lhs_progress; w++) blockA[count++] = cj(lhs(i + w, k));
2696 
2697  if (PanelMode) count += last_lhs_progress * (stride - offset - depth);
2698  }
2699  }
2700  // Pack scalars
2701  for (; i < rows; i++) {
2702  if (PanelMode) count += offset;
2703  for (Index k = 0; k < depth; k++) blockA[count++] = cj(lhs(i, k));
2704  if (PanelMode) count += (stride - offset - depth);
2705  }
2706 }
2707 
2708 template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
2709  bool PanelMode>
2710 struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
2711  typedef typename DataMapper::LinearMapper LinearMapper;
2712  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
2713  Index offset = 0);
2714 };
2715 
2716 template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
2717  bool PanelMode>
2718 EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate,
2719  PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
2720  Index rows, Index stride, Index offset) {
2721  typedef typename unpacket_traits<Packet>::half HalfPacket;
2722  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
2723  enum {
2724  PacketSize = unpacket_traits<Packet>::size,
2725  HalfPacketSize = unpacket_traits<HalfPacket>::size,
2726  QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
2727  HasHalf = (int)HalfPacketSize < (int)PacketSize,
2728  HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize
2729  };
2730 
2731  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
2732  EIGEN_UNUSED_VARIABLE(stride);
2733  EIGEN_UNUSED_VARIABLE(offset);
2734  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
2735  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2736  Index count = 0;
2737  bool gone_half = false, gone_quarter = false, gone_last = false;
2738 
2739  Index i = 0;
2740  Index pack = Pack1;
2741  Index psize = PacketSize;
2742  while (pack > 0) {
2743  Index remaining_rows = rows - i;
2744  Index peeled_mc = gone_last ? Pack2 > 1 ? (rows / pack) * pack : 0 : i + (remaining_rows / pack) * pack;
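// (regular passes peel full `pack`-row panels starting from the current row i;
// on the final pass (gone_last) the bound is recomputed over all rows and is
// nonzero only when Pack2 > 1)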
2745  Index starting_pos = i;
2746  for (; i < peeled_mc; i += pack) {
2747  if (PanelMode) count += pack * offset;
2748 
2749  Index k = 0;
2750  if (pack >= psize && psize >= QuarterPacketSize) {
2751  const Index peeled_k = (depth / psize) * psize;
2752  for (; k < peeled_k; k += psize) {
2753  for (Index m = 0; m < pack; m += psize) {
2754  if (psize == PacketSize) {
2755  PacketBlock<Packet> kernel;
2756  for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i + p + m, k);
2757  ptranspose(kernel);
2758  for (Index p = 0; p < psize; ++p) pstore(blockA + count + m + (pack)*p, cj.pconj(kernel.packet[p]));
2759  } else if (HasHalf && psize == HalfPacketSize) {
2760  gone_half = true;
2761  PacketBlock<HalfPacket> kernel_half;
2762  for (Index p = 0; p < psize; ++p)
2763  kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i + p + m, k);
2764  ptranspose(kernel_half);
2765  for (Index p = 0; p < psize; ++p) pstore(blockA + count + m + (pack)*p, cj.pconj(kernel_half.packet[p]));
2766  } else if (HasQuarter && psize == QuarterPacketSize) {
2767  gone_quarter = true;
2768  PacketBlock<QuarterPacket> kernel_quarter;
2769  for (Index p = 0; p < psize; ++p)
2770  kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i + p + m, k);
2771  ptranspose(kernel_quarter);
2772  for (Index p = 0; p < psize; ++p)
2773  pstore(blockA + count + m + (pack)*p, cj.pconj(kernel_quarter.packet[p]));
2774  }
2775  }
2776  count += psize * pack;
2777  }
2778  }
2779 
2780  for (; k < depth; k++) {
2781  Index w = 0;
2782  for (; w < pack - 3; w += 4) {
2783  Scalar a(cj(lhs(i + w + 0, k))), b(cj(lhs(i + w + 1, k))), c(cj(lhs(i + w + 2, k))), d(cj(lhs(i + w + 3, k)));
2784  blockA[count++] = a;
2785  blockA[count++] = b;
2786  blockA[count++] = c;
2787  blockA[count++] = d;
2788  }
2789  if (pack % 4)
2790  for (; w < pack; ++w) blockA[count++] = cj(lhs(i + w, k));
2791  }
2792 
2793  if (PanelMode) count += pack * (stride - offset - depth);
2794  }
2795 
2796  pack -= psize;
2797  Index left = rows - i;
2798  if (pack <= 0) {
2799  if (!gone_last && (starting_pos == i || left >= psize / 2 || left >= psize / 4) &&
2800  ((psize / 2 == HalfPacketSize && HasHalf && !gone_half) ||
2801  (psize / 2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
2802  psize /= 2;
2803  pack = psize;
2804  continue;
2805  }
2806  // Pack2 may be *smaller* than PacketSize -- that happens for
2807  // products like real * complex, where we have to go half the
2808  // progress on the lhs in order to duplicate those operands to
2809  // address both real & imaginary parts on the rhs. This portion packs
2810  // those half-width panels until their count matches the number expected
2811  // by the last peeling loop of the rhs at this point.
2812  if (Pack2 < PacketSize && !gone_last) {
2813  gone_last = true;
2814  psize = pack = left & ~1;
2815  }
2816  }
2817  }
2818 
2819  for (; i < rows; i++) {
2820  if (PanelMode) count += offset;
2821  for (Index k = 0; k < depth; k++) blockA[count++] = cj(lhs(i, k));
2822  if (PanelMode) count += (stride - offset - depth);
2823  }
2824 }
2825 
2826 // copy a complete panel of the rhs
2827 // this version is optimized for column major matrices
2828 // The traversal order is as follows (nr==4):
2829 // 0 1 2 3 12 13 14 15 24 27
2830 // 4 5 6 7 16 17 18 19 25 28
2831 // 8 9 10 11 20 21 22 23 26 29
2832 // . . . . . . . . . .
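// Reading the diagram: rows are the depth index k, columns are rhs columns.
// Inside each nr-wide panel the pack order is (k, j..j+nr-1) for k = 0, 1, ...,
// i.e. nr consecutive slots per depth step; trailing columns that do not fill
// a panel (24..29 above) are packed one whole column at a time (the nr==1 loop).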
2833 template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2834 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
2835  typedef typename packet_traits<Scalar>::type Packet;
2836  typedef typename DataMapper::LinearMapper LinearMapper;
2837  enum { PacketSize = packet_traits<Scalar>::size };
2838  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
2839  Index offset = 0);
2840 };
2841 
2842 template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
2843 EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
2844  Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
2845  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
2846  EIGEN_UNUSED_VARIABLE(stride);
2847  EIGEN_UNUSED_VARIABLE(offset);
2848  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
2849  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
2850  Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
2851  Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
2852  Index count = 0;
2853  const Index peeled_k = (depth / PacketSize) * PacketSize;
2854 
2855 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
2856  EIGEN_IF_CONSTEXPR(nr >= 8) {
2857  for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
2858  // skip what we have before
2859  if (PanelMode) count += 8 * offset;
2860  const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2861  const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2862  const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2863  const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2864  const LinearMapper dm4 = rhs.getLinearMapper(0, j2 + 4);
2865  const LinearMapper dm5 = rhs.getLinearMapper(0, j2 + 5);
2866  const LinearMapper dm6 = rhs.getLinearMapper(0, j2 + 6);
2867  const LinearMapper dm7 = rhs.getLinearMapper(0, j2 + 7);
2868  Index k = 0;
2869  if (PacketSize % 2 == 0 && PacketSize <= 8) // 2 4 8
2870  {
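// The 8 columns are interleaved with PacketSize-wide transpose blocks:
// four 2x2 kernels, two 4x4 kernels, or a single 8x8 kernel below.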
2871  for (; k < peeled_k; k += PacketSize) {
2872  if (PacketSize == 2) {
2873  PacketBlock<Packet, PacketSize == 2 ? 2 : PacketSize> kernel0, kernel1, kernel2, kernel3;
2874  kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
2875  kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
2876  kernel1.packet[0 % PacketSize] = dm2.template loadPacket<Packet>(k);
2877  kernel1.packet[1 % PacketSize] = dm3.template loadPacket<Packet>(k);
2878  kernel2.packet[0 % PacketSize] = dm4.template loadPacket<Packet>(k);
2879  kernel2.packet[1 % PacketSize] = dm5.template loadPacket<Packet>(k);
2880  kernel3.packet[0 % PacketSize] = dm6.template loadPacket<Packet>(k);
2881  kernel3.packet[1 % PacketSize] = dm7.template loadPacket<Packet>(k);
2882  ptranspose(kernel0);
2883  ptranspose(kernel1);
2884  ptranspose(kernel2);
2885  ptranspose(kernel3);
2886 
2887  pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
2888  pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize]));
2889  pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel2.packet[0 % PacketSize]));
2890  pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel3.packet[0 % PacketSize]));
2891 
2892  pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
2893  pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize]));
2894  pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel2.packet[1 % PacketSize]));
2895  pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel3.packet[1 % PacketSize]));
2896  count += 8 * PacketSize;
2897  } else if (PacketSize == 4) {
2898  PacketBlock<Packet, PacketSize == 4 ? 4 : PacketSize> kernel0, kernel1;
2899 
2900  kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
2901  kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
2902  kernel0.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
2903  kernel0.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
2904  kernel1.packet[0 % PacketSize] = dm4.template loadPacket<Packet>(k);
2905  kernel1.packet[1 % PacketSize] = dm5.template loadPacket<Packet>(k);
2906  kernel1.packet[2 % PacketSize] = dm6.template loadPacket<Packet>(k);
2907  kernel1.packet[3 % PacketSize] = dm7.template loadPacket<Packet>(k);
2908  ptranspose(kernel0);
2909  ptranspose(kernel1);
2910 
2911  pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
2912  pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize]));
2913  pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
2914  pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize]));
2915  pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[2 % PacketSize]));
2916  pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[2 % PacketSize]));
2917  pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel0.packet[3 % PacketSize]));
2918  pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel1.packet[3 % PacketSize]));
2919  count += 8 * PacketSize;
2920  } else if (PacketSize == 8) {
2921  PacketBlock<Packet, PacketSize == 8 ? 8 : PacketSize> kernel0;
2922 
2923  kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
2924  kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
2925  kernel0.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
2926  kernel0.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
2927  kernel0.packet[4 % PacketSize] = dm4.template loadPacket<Packet>(k);
2928  kernel0.packet[5 % PacketSize] = dm5.template loadPacket<Packet>(k);
2929  kernel0.packet[6 % PacketSize] = dm6.template loadPacket<Packet>(k);
2930  kernel0.packet[7 % PacketSize] = dm7.template loadPacket<Packet>(k);
2931  ptranspose(kernel0);
2932 
2933  pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
2934  pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
2935  pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel0.packet[2 % PacketSize]));
2936  pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel0.packet[3 % PacketSize]));
2937  pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[4 % PacketSize]));
2938  pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel0.packet[5 % PacketSize]));
2939  pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel0.packet[6 % PacketSize]));
2940  pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel0.packet[7 % PacketSize]));
2941  count += 8 * PacketSize;
2942  }
2943  }
2944  }
2945 
2946  for (; k < depth; k++) {
2947  blockB[count + 0] = cj(dm0(k));
2948  blockB[count + 1] = cj(dm1(k));
2949  blockB[count + 2] = cj(dm2(k));
2950  blockB[count + 3] = cj(dm3(k));
2951  blockB[count + 4] = cj(dm4(k));
2952  blockB[count + 5] = cj(dm5(k));
2953  blockB[count + 6] = cj(dm6(k));
2954  blockB[count + 7] = cj(dm7(k));
2955  count += 8;
2956  }
2957  // skip what we have after
2958  if (PanelMode) count += 8 * (stride - offset - depth);
2959  }
2960  }
2961 #endif
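// Note (illustrative, not part of the original source): all three branches
// above emit the same 8-column, k-major layout into blockB; they differ only
// in how the 8 x PacketSize tile is carved into PacketBlock transposes:
//   PacketSize == 2: four 2x2 blocks, stores interleaved across kernel0..kernel3
//   PacketSize == 4: two 4x4 blocks, stores alternating kernel0/kernel1
//   PacketSize == 8: one 8x8 block, stores written out in order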
2962 
2963  EIGEN_IF_CONSTEXPR(nr >= 4) {
2964  for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
2965  // skip what we have before
2966  if (PanelMode) count += 4 * offset;
2967  const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
2968  const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
2969  const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
2970  const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
2971 
2972  Index k = 0;
2973  if ((PacketSize % 4) == 0) // TODO enable vectorized transposition for PacketSize==2 ??
2974  {
2975  for (; k < peeled_k; k += PacketSize) {
2976  PacketBlock<Packet, (PacketSize % 4) == 0 ? 4 : PacketSize> kernel;
2977  kernel.packet[0] = dm0.template loadPacket<Packet>(k);
2978  kernel.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
2979  kernel.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
2980  kernel.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
2981  ptranspose(kernel);
2982  pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
2983  pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
2984  pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
2985  pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
2986  count += 4 * PacketSize;
2987  }
2988  }
2989  for (; k < depth; k++) {
2990  blockB[count + 0] = cj(dm0(k));
2991  blockB[count + 1] = cj(dm1(k));
2992  blockB[count + 2] = cj(dm2(k));
2993  blockB[count + 3] = cj(dm3(k));
2994  count += 4;
2995  }
2996  // skip what we have after
2997  if (PanelMode) count += 4 * (stride - offset - depth);
2998  }
2999  }
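// Note (illustrative, not part of the original source): in the vectorized
// path above, ptranspose turns four column-major loads into k-major stores.
// For PacketSize == 4:
//   before: kernel.packet[c] = { col_c(k), col_c(k+1), col_c(k+2), col_c(k+3) }
//   after:  kernel.packet[r] = { col_0(k+r), col_1(k+r), col_2(k+r), col_3(k+r) }
// so each stored packet is one k-slice across the 4 columns, matching the
// layout produced by the scalar tail loop.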
3000 
3001  // copy the remaining columns one at a time (nr==1)
3002  for (Index j2 = packet_cols4; j2 < cols; ++j2) {
3003  if (PanelMode) count += offset;
3004  const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
3005  for (Index k = 0; k < depth; k++) {
3006  blockB[count] = cj(dm0(k));
3007  count += 1;
3008  }
3009  if (PanelMode) count += (stride - offset - depth);
3010  }
3011 }
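// Note (illustrative, not part of the original source): overall blockB layout
// of one nr==4 panel when PanelMode is set. Each column of the panel reserves
// 'stride' scalars, of which 'offset' are skipped up front:
//
//   [ 4*offset skipped | 4*depth scalars, k-major | 4*(stride-offset-depth) skipped ]
//
// A minimal scalar sketch of the k-major interleaving, assuming a plain
// column-major array 'rhs' of size depth x 4 (hypothetical helper, not Eigen API):
//
//   void pack_rhs4_sketch(float* blockB, const float* rhs, int depth) {
//     int count = 0;
//     for (int k = 0; k < depth; ++k)
//       for (int c = 0; c < 4; ++c)
//         blockB[count++] = rhs[c * depth + k];  // column c, row k
//   }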
3012 
3013 // this version is optimized for row major matrices
3014 template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
3015 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
3016  typedef typename packet_traits<Scalar>::type Packet;
3017  typedef typename unpacket_traits<Packet>::half HalfPacket;
3018  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
3019  typedef typename DataMapper::LinearMapper LinearMapper;
3020  enum {
3021  PacketSize = packet_traits<Scalar>::size,
3022  HalfPacketSize = unpacket_traits<HalfPacket>::size,
3023  QuarterPacketSize = unpacket_traits<QuarterPacket>::size
3024  };
3025  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
3026  Index offset = 0) {
3027  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
3028  EIGEN_UNUSED_VARIABLE(stride);
3029  EIGEN_UNUSED_VARIABLE(offset);
3030  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
3031  const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
3032  const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
3033  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
3034  Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
3035  Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
3036  Index count = 0;
3037 
3038 #if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
3039  EIGEN_IF_CONSTEXPR(nr >= 8) {
3040  for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
3041  // skip what we have before
3042  if (PanelMode) count += 8 * offset;
3043  for (Index k = 0; k < depth; k++) {
3044  if (PacketSize == 8) {
3045  Packet A = rhs.template loadPacket<Packet>(k, j2);
3046  pstoreu(blockB + count, cj.pconj(A));
3047  count += PacketSize;
3048  } else if (PacketSize == 4) {
3049  Packet A = rhs.template loadPacket<Packet>(k, j2);
3050  Packet B = rhs.template loadPacket<Packet>(k, j2 + 4);
3051  pstoreu(blockB + count, cj.pconj(A));
3052  pstoreu(blockB + count + PacketSize, cj.pconj(B));
3053  count += 2 * PacketSize;
3054  } else {
3055  const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
3056  blockB[count + 0] = cj(dm0(0));
3057  blockB[count + 1] = cj(dm0(1));
3058  blockB[count + 2] = cj(dm0(2));
3059  blockB[count + 3] = cj(dm0(3));
3060  blockB[count + 4] = cj(dm0(4));
3061  blockB[count + 5] = cj(dm0(5));
3062  blockB[count + 6] = cj(dm0(6));
3063  blockB[count + 7] = cj(dm0(7));
3064  count += 8;
3065  }
3066  }
3067  // skip what we have after
3068  if (PanelMode) count += 8 * (stride - offset - depth);
3069  }
3070  }
3071 #endif
3072 
3073  if (nr >= 4) {
3074  for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
3075  // skip what we have before
3076  if (PanelMode) count += 4 * offset;
3077  for (Index k = 0; k < depth; k++) {
3078  if (PacketSize == 4) {
3079  Packet A = rhs.template loadPacket<Packet>(k, j2);
3080  pstoreu(blockB + count, cj.pconj(A));
3081  count += PacketSize;
3082  } else if (HasHalf && HalfPacketSize == 4) {
3083  HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
3084  pstoreu(blockB + count, cj.pconj(A));
3085  count += HalfPacketSize;
3086  } else if (HasQuarter && QuarterPacketSize == 4) {
3087  QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
3088  pstoreu(blockB + count, cj.pconj(A));
3089  count += QuarterPacketSize;
3090  } else {
3091  const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
3092  blockB[count + 0] = cj(dm0(0));
3093  blockB[count + 1] = cj(dm0(1));
3094  blockB[count + 2] = cj(dm0(2));
3095  blockB[count + 3] = cj(dm0(3));
3096  count += 4;
3097  }
3098  }
3099  // skip what we have after
3100  if (PanelMode) count += 4 * (stride - offset - depth);
3101  }
3102  }
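// Note (illustrative, not part of the original source): when the full packet
// holds more than 4 scalars (e.g. PacketSize == 16 for floats on AVX512), the
// half/quarter branches above still vectorize the 4-wide copy by picking
// whichever narrower packet type has exactly 4 lanes; only if none of the
// three packet types is 4 lanes wide does the loop fall back to scalar copies.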
3103  // copy the remaining columns one at a time (nr==1)
3104  for (Index j2 = packet_cols4; j2 < cols; ++j2) {
3105  if (PanelMode) count += offset;
3106  for (Index k = 0; k < depth; k++) {
3107  blockB[count] = cj(rhs(k, j2));
3108  count += 1;
3109  }
3110  if (PanelMode) count += stride - offset - depth;
3111  }
3112  }
3113 };
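// Note (illustrative, not part of the original source): with a row-major rhs,
// the entries rhs(k, j2..j2+3) are already contiguous in memory, so the packed
// k-major layout falls out of one unaligned packet load per k -- no transpose
// is needed, unlike the column-major packer above. Scalar-equivalent sketch,
// assuming a plain row-major array (hypothetical helper, not Eigen API):
//
//   void pack_rhs4_rowmajor_sketch(float* blockB, const float* rhs,
//                                  int depth, int cols, int j2) {
//     int count = 0;
//     for (int k = 0; k < depth; ++k)
//       for (int c = 0; c < 4; ++c)
//         blockB[count++] = rhs[k * cols + (j2 + c)];  // contiguous in c
//   }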
3114 
3115 } // end namespace internal
3116 
3117 /** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
3118  * \sa setCpuCacheSizes */
3119 inline std::ptrdiff_t l1CacheSize() {
3120  std::ptrdiff_t l1, l2, l3;
3121  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
3122  return l1;
3123 }
3124 
3125 /** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
3126  * \sa setCpuCacheSizes */
3127 inline std::ptrdiff_t l2CacheSize() {
3128  std::ptrdiff_t l1, l2, l3;
3129  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
3130  return l2;
3131 }
3132 
3133 /** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
3134  * \sa setCpuCacheSizes */
3135 inline std::ptrdiff_t l3CacheSize() {
3136  std::ptrdiff_t l1, l2, l3;
3137  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
3138  return l3;
3139 }
3140 
3141 /** Set the cpu L1, L2, and L3 cache sizes (in bytes).
3142  * These values are used to adjust the size of the blocks
3143  * for the algorithms working per blocks.
3144  *
3145  * \sa computeProductBlockingSizes */
3146 inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3) {
3147  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
3148 }
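// Example (illustrative, not part of the original source): querying and
// overriding the cache sizes used by Eigen's blocking heuristics. All values
// are in bytes.
//
//   std::ptrdiff_t l1 = Eigen::l1CacheSize();
//   std::ptrdiff_t l2 = Eigen::l2CacheSize();
//   std::ptrdiff_t l3 = Eigen::l3CacheSize();
//   // Make the heuristics assume a 48KB L1, 1MB L2 and 8MB L3:
//   Eigen::setCpuCacheSizes(48 * 1024, 1024 * 1024, 8 * 1024 * 1024);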
3149 
3150 } // end namespace Eigen
3151 
3152 #endif // EIGEN_GENERAL_BLOCK_PANEL_H