GeneralMatrixMatrix.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
#define EIGEN_GENERAL_MATRIX_MATRIX_H

// IWYU pragma: private
#include "../InternalHeaderCheck.h"

namespace Eigen {

namespace internal {

template <typename LhsScalar_, typename RhsScalar_>
class level3_blocking;

/* Specialization for a row-major destination matrix => simple transposition of the product */
template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar,
          int RhsStorageOrder, bool ConjugateRhs, int ResInnerStride>
struct general_matrix_matrix_product<Index, LhsScalar, LhsStorageOrder, ConjugateLhs, RhsScalar, RhsStorageOrder,
                                     ConjugateRhs, RowMajor, ResInnerStride> {
  typedef gebp_traits<RhsScalar, LhsScalar> Traits;

  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  static EIGEN_STRONG_INLINE void run(Index rows, Index cols, Index depth, const LhsScalar* lhs, Index lhsStride,
                                      const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr,
                                      Index resStride, ResScalar alpha, level3_blocking<RhsScalar, LhsScalar>& blocking,
                                      GemmParallelInfo<Index>* info = 0) {
    // transpose the product such that the result is column major
    general_matrix_matrix_product<Index, RhsScalar, RhsStorageOrder == RowMajor ? ColMajor : RowMajor, ConjugateRhs,
                                  LhsScalar, LhsStorageOrder == RowMajor ? ColMajor : RowMajor, ConjugateLhs, ColMajor,
                                  ResInnerStride>::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr,
                                                       resStride, alpha, blocking, info);
  }
};

/* Specialization for a col-major destination matrix
 * => Blocking algorithm following Goto's paper */
template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar,
          int RhsStorageOrder, bool ConjugateRhs, int ResInnerStride>
struct general_matrix_matrix_product<Index, LhsScalar, LhsStorageOrder, ConjugateLhs, RhsScalar, RhsStorageOrder,
                                     ConjugateRhs, ColMajor, ResInnerStride> {
  typedef gebp_traits<LhsScalar, RhsScalar> Traits;

  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
  static void run(Index rows, Index cols, Index depth, const LhsScalar* lhs_, Index lhsStride, const RhsScalar* rhs_,
                  Index rhsStride, ResScalar* res_, Index resIncr, Index resStride, ResScalar alpha,
                  level3_blocking<LhsScalar, RhsScalar>& blocking, GemmParallelInfo<Index>* info = 0) {
    typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
    typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
    LhsMapper lhs(lhs_, lhsStride);
    RhsMapper rhs(rhs_, rhsStride);
    ResMapper res(res_, resStride, resIncr);

    Index kc = blocking.kc();                    // cache block size along the K direction
    Index mc = (std::min)(rows, blocking.mc());  // cache block size along the M direction
    Index nc = (std::min)(cols, blocking.nc());  // cache block size along the N direction

    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing,
                  LhsStorageOrder>
        pack_lhs;
    gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;

#if !defined(EIGEN_USE_BLAS) && (defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL))
    if (info) {
      // this is the parallel version!
      int tid = info->logical_thread_id;
      int threads = info->num_threads;

      LhsScalar* blockA = blocking.blockA();
      eigen_internal_assert(blockA != 0);

      std::size_t sizeB = kc * nc;
      ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);

      // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
      for (Index k = 0; k < depth; k += kc) {
        const Index actual_kc = (std::min)(k + kc, depth) - k;  // => rows of B', and cols of A'

        // In order to reduce the chance that a thread has to wait for the others,
        // let's start by packing B'.
        pack_rhs(blockB, rhs.getSubMapper(k, 0), actual_kc, nc);

        // Pack A_k to A' in a parallel fashion:
        // each thread packs the sub block A_k,i to A'_i where i is the thread id.

        // However, before copying to A'_i, we have to make sure that no other thread is still using it,
        // i.e., we test that info->task_info[tid].users equals 0.
        // Then, we set info->task_info[tid].users to the number of threads to mark that all other threads are going to
        // use it.
        while (info->task_info[tid].users != 0) {
          std::this_thread::yield();
        }
        info->task_info[tid].users = threads;

        pack_lhs(blockA + info->task_info[tid].lhs_start * actual_kc,
                 lhs.getSubMapper(info->task_info[tid].lhs_start, k), actual_kc, info->task_info[tid].lhs_length);

        // Notify the other threads that the part A'_i is ready to go.
        info->task_info[tid].sync = k;

        // Computes C_i += A' * B' per A'_i
        for (int shift = 0; shift < threads; ++shift) {
          int i = (tid + shift) % threads;

          // At this point we have to make sure that A'_i has been updated by the thread i,
          // we use testAndSetOrdered to mimic a volatile access.
          // However, no need to wait for the B' part which has been updated by the current thread!
          if (shift > 0) {
            while (info->task_info[i].sync != k) {
              std::this_thread::yield();
            }
          }

          gebp(res.getSubMapper(info->task_info[i].lhs_start, 0), blockA + info->task_info[i].lhs_start * actual_kc,
               blockB, info->task_info[i].lhs_length, actual_kc, nc, alpha);
        }

        // Then keep going as usual with the remaining B'
        for (Index j = nc; j < cols; j += nc) {
          const Index actual_nc = (std::min)(j + nc, cols) - j;

          // pack B_k,j to B'
          pack_rhs(blockB, rhs.getSubMapper(k, j), actual_kc, actual_nc);

          // C_j += A' * B'
          gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
        }

        // Release all the sub blocks A'_i of A' for the current thread,
        // i.e., we simply decrement the number of users by 1
        for (Index i = 0; i < threads; ++i) info->task_info[i].users -= 1;
      }
    } else
#endif  // defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL)
    {
      EIGEN_UNUSED_VARIABLE(info);

      // this is the sequential version!
      std::size_t sizeA = kc * mc;
      std::size_t sizeB = kc * nc;

      ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
      ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());

      const bool pack_rhs_once = mc != rows && kc == depth && nc == cols;

      // For each horizontal panel of the rhs, and corresponding panel of the lhs...
      for (Index i2 = 0; i2 < rows; i2 += mc) {
        const Index actual_mc = (std::min)(i2 + mc, rows) - i2;

        for (Index k2 = 0; k2 < depth; k2 += kc) {
          const Index actual_kc = (std::min)(k2 + kc, depth) - k2;

          // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
          // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
          // Note that this panel will be read as many times as the number of blocks in the rhs's
          // horizontal panel which is, in practice, a very low number.
          pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);

          // For each kc x nc block of the rhs's horizontal panel...
          for (Index j2 = 0; j2 < cols; j2 += nc) {
            const Index actual_nc = (std::min)(j2 + nc, cols) - j2;

            // We pack the rhs's block into a sequential chunk of memory (L2 caching)
            // Note that this block will be read a very high number of times, which is equal to the number of
            // micro horizontal panels of the large rhs's panel (e.g., rows/12 times).
            if ((!pack_rhs_once) || i2 == 0) pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc);

            // Everything is packed, we can now call the panel * block kernel:
            gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
          }
        }
      }
    }
  }
};
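// Summary of the blocking scheme above (illustrative only, not used by the code): with the
// result C stored column major, A the lhs and B the rhs, the sequential path is equivalent
// to the following loop nest over cache-sized blocks, where pack_lhs/pack_rhs copy a block
// into contiguous, kernel-friendly storage and gebp is the optimized panel * block micro-kernel:
//
//   for i2 in [0, rows) step mc:                                // blocks of A's rows / C's rows
//     for k2 in [0, depth) step kc:                             // slices along the depth (K) direction
//       blockA = pack_lhs( A(i2 : i2+mc, k2 : k2+kc) )
//       for j2 in [0, cols) step nc:                            // panels of B's columns / C's columns
//         blockB = pack_rhs( B(k2 : k2+kc, j2 : j2+nc) )
//         C(i2 : i2+mc, j2 : j2+nc) += alpha * blockA * blockB  // gebp kernel
//
// The parallel path instead splits A's rows among the threads (task_info[i].lhs_start and
// task_info[i].lhs_length), packs each A'_i concurrently, and uses the users/sync counters as a
// light-weight barrier so that every thread can safely consume all packed A'_i blocks for the
// current k-slice.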

/*********************************************************************************
 * Specialization of generic_product_impl for "large" GEMM, i.e.,
 * implementation of the high level wrapper to general_matrix_matrix_product
 **********************************************************************************/

template <typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest,
          typename BlockingType>
struct gemm_functor {
  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
      : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) {}

  void initParallelSession(Index num_threads) const {
    m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads);
    m_blocking.allocateA();
  }

  void operator()(Index row, Index rows, Index col = 0, Index cols = -1, GemmParallelInfo<Index>* info = 0) const {
    if (cols == -1) cols = m_rhs.cols();

    Gemm::run(rows, cols, m_lhs.cols(), &m_lhs.coeffRef(row, 0), m_lhs.outerStride(), &m_rhs.coeffRef(0, col),
              m_rhs.outerStride(), (Scalar*)&(m_dest.coeffRef(row, col)), m_dest.innerStride(), m_dest.outerStride(),
              m_actualAlpha, m_blocking, info);
  }

  typedef typename Gemm::Traits Traits;

 protected:
  const Lhs& m_lhs;
  const Rhs& m_rhs;
  Dest& m_dest;
  Scalar m_actualAlpha;
  BlockingType& m_blocking;
};

template <int StorageOrder, typename LhsScalar, typename RhsScalar, int MaxRows, int MaxCols, int MaxDepth,
          int KcFactor = 1, bool FiniteAtCompileTime = MaxRows != Dynamic && MaxCols != Dynamic && MaxDepth != Dynamic>
class gemm_blocking_space;

template <typename LhsScalar_, typename RhsScalar_>
class level3_blocking {
  typedef LhsScalar_ LhsScalar;
  typedef RhsScalar_ RhsScalar;

 protected:
  LhsScalar* m_blockA;
  RhsScalar* m_blockB;

  Index m_mc;
  Index m_nc;
  Index m_kc;

 public:
  level3_blocking() : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0) {}

  inline Index mc() const { return m_mc; }
  inline Index nc() const { return m_nc; }
  inline Index kc() const { return m_kc; }

  inline LhsScalar* blockA() { return m_blockA; }
  inline RhsScalar* blockB() { return m_blockB; }
};

template <int StorageOrder, typename LhsScalar_, typename RhsScalar_, int MaxRows, int MaxCols, int MaxDepth,
          int KcFactor>
class gemm_blocking_space<StorageOrder, LhsScalar_, RhsScalar_, MaxRows, MaxCols, MaxDepth, KcFactor,
                          true /* == FiniteAtCompileTime */>
    : public level3_blocking<std::conditional_t<StorageOrder == RowMajor, RhsScalar_, LhsScalar_>,
                             std::conditional_t<StorageOrder == RowMajor, LhsScalar_, RhsScalar_>> {
  enum {
    Transpose = StorageOrder == RowMajor,
    ActualRows = Transpose ? MaxCols : MaxRows,
    ActualCols = Transpose ? MaxRows : MaxCols
  };
  typedef std::conditional_t<Transpose, RhsScalar_, LhsScalar_> LhsScalar;
  typedef std::conditional_t<Transpose, LhsScalar_, RhsScalar_> RhsScalar;
  enum { SizeA = ActualRows * MaxDepth, SizeB = ActualCols * MaxDepth };

#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
  EIGEN_ALIGN_MAX LhsScalar m_staticA[SizeA];
  EIGEN_ALIGN_MAX RhsScalar m_staticB[SizeB];
#else
  EIGEN_ALIGN_MAX char m_staticA[SizeA * sizeof(LhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES - 1];
  EIGEN_ALIGN_MAX char m_staticB[SizeB * sizeof(RhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES - 1];
#endif

 public:
  gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/,
                      bool /*full_rows = false*/) {
    this->m_mc = ActualRows;
    this->m_nc = ActualCols;
    this->m_kc = MaxDepth;
#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
    this->m_blockA = m_staticA;
    this->m_blockB = m_staticB;
#else
    this->m_blockA = reinterpret_cast<LhsScalar*>((std::uintptr_t(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES - 1)) &
                                                  ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES - 1));
    this->m_blockB = reinterpret_cast<RhsScalar*>((std::uintptr_t(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES - 1)) &
                                                  ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES - 1));
#endif
  }

  void initParallel(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/) {}

  inline void allocateA() {}
  inline void allocateB() {}
  inline void allocateAll() {}
};

template <int StorageOrder, typename LhsScalar_, typename RhsScalar_, int MaxRows, int MaxCols, int MaxDepth,
          int KcFactor>
class gemm_blocking_space<StorageOrder, LhsScalar_, RhsScalar_, MaxRows, MaxCols, MaxDepth, KcFactor, false>
    : public level3_blocking<std::conditional_t<StorageOrder == RowMajor, RhsScalar_, LhsScalar_>,
                             std::conditional_t<StorageOrder == RowMajor, LhsScalar_, RhsScalar_>> {
  enum { Transpose = StorageOrder == RowMajor };
  typedef std::conditional_t<Transpose, RhsScalar_, LhsScalar_> LhsScalar;
  typedef std::conditional_t<Transpose, LhsScalar_, RhsScalar_> RhsScalar;

  Index m_sizeA;
  Index m_sizeB;

 public:
  gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking) {
    this->m_mc = Transpose ? cols : rows;
    this->m_nc = Transpose ? rows : cols;
    this->m_kc = depth;

    if (l3_blocking) {
      computeProductBlockingSizes<LhsScalar, RhsScalar, KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
    } else  // no l3 blocking
    {
      Index n = this->m_nc;
      computeProductBlockingSizes<LhsScalar, RhsScalar, KcFactor>(this->m_kc, this->m_mc, n, num_threads);
    }

    m_sizeA = this->m_mc * this->m_kc;
    m_sizeB = this->m_kc * this->m_nc;
  }

  void initParallel(Index rows, Index cols, Index depth, Index num_threads) {
    this->m_mc = Transpose ? cols : rows;
    this->m_nc = Transpose ? rows : cols;
    this->m_kc = depth;

    eigen_internal_assert(this->m_blockA == 0 && this->m_blockB == 0);
    Index m = this->m_mc;
    computeProductBlockingSizes<LhsScalar, RhsScalar, KcFactor>(this->m_kc, m, this->m_nc, num_threads);
    m_sizeA = this->m_mc * this->m_kc;
    m_sizeB = this->m_kc * this->m_nc;
  }

  void allocateA() {
    if (this->m_blockA == 0) this->m_blockA = aligned_new<LhsScalar>(m_sizeA);
  }

  void allocateB() {
    if (this->m_blockB == 0) this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
  }

  void allocateAll() {
    allocateA();
    allocateB();
  }

  ~gemm_blocking_space() {
    aligned_delete(this->m_blockA, m_sizeA);
    aligned_delete(this->m_blockB, m_sizeB);
  }
};

}  // end namespace internal

namespace internal {

template <typename Lhs, typename Rhs>
struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemmProduct>
    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemmProduct>> {
  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
  typedef typename Lhs::Scalar LhsScalar;
  typedef typename Rhs::Scalar RhsScalar;

  typedef internal::blas_traits<Lhs> LhsBlasTraits;
  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
  typedef internal::remove_all_t<ActualLhsType> ActualLhsTypeCleaned;

  typedef internal::blas_traits<Rhs> RhsBlasTraits;
  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
  typedef internal::remove_all_t<ActualRhsType> ActualRhsTypeCleaned;

  enum { MaxDepthAtCompileTime = min_size_prefer_fixed(Lhs::MaxColsAtCompileTime, Rhs::MaxRowsAtCompileTime) };

  typedef generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, CoeffBasedProductMode> lazyproduct;

  template <typename Dst>
  static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
    // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
    // to determine the following heuristic.
    // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
    // unless it has been specialized by the user or for a given architecture.
    // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
    // I'm not sure it is still required.
    if ((rhs.rows() + dst.rows() + dst.cols()) < EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows() > 0)
      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar, Scalar>());
    else {
      dst.setZero();
      scaleAndAddTo(dst, lhs, rhs, Scalar(1));
    }
  }

  template <typename Dst>
  static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
    if ((rhs.rows() + dst.rows() + dst.cols()) < EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows() > 0)
      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar, Scalar>());
    else
      scaleAndAddTo(dst, lhs, rhs, Scalar(1));
  }

  template <typename Dst>
  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
    if ((rhs.rows() + dst.rows() + dst.cols()) < EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows() > 0)
      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar, Scalar>());
    else
      scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
  }

  template <typename Dest>
  static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
    eigen_assert(dst.rows() == a_lhs.rows() && dst.cols() == a_rhs.cols());
    if (a_lhs.cols() == 0 || a_lhs.rows() == 0 || a_rhs.cols() == 0) return;

    if (dst.cols() == 1) {
      // Fallback to GEMV if either the lhs or rhs is a runtime vector
      typename Dest::ColXpr dst_vec(dst.col(0));
      return internal::generic_product_impl<Lhs, typename Rhs::ConstColXpr, DenseShape, DenseShape,
                                            GemvProduct>::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha);
    } else if (dst.rows() == 1) {
      // Fallback to GEMV if either the lhs or rhs is a runtime vector
      typename Dest::RowXpr dst_vec(dst.row(0));
      return internal::generic_product_impl<typename Lhs::ConstRowXpr, Rhs, DenseShape, DenseShape,
                                            GemvProduct>::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha);
    }

    add_const_on_value_type_t<ActualLhsType> lhs = LhsBlasTraits::extract(a_lhs);
    add_const_on_value_type_t<ActualRhsType> rhs = RhsBlasTraits::extract(a_rhs);

    Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs);

    typedef internal::gemm_blocking_space<(Dest::Flags & RowMajorBit) ? RowMajor : ColMajor, LhsScalar, RhsScalar,
                                          Dest::MaxRowsAtCompileTime, Dest::MaxColsAtCompileTime, MaxDepthAtCompileTime>
        BlockingType;

    typedef internal::gemm_functor<
        Scalar, Index,
        internal::general_matrix_matrix_product<
            Index, LhsScalar, (ActualLhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor,
            bool(LhsBlasTraits::NeedToConjugate), RhsScalar,
            (ActualRhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
            (Dest::Flags & RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime>,
        ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType>
        GemmFunctor;

    BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
    internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime > 32 || Dest::MaxRowsAtCompileTime == Dynamic)>(
        GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(),
        Dest::Flags & RowMajorBit);
  }
};

}  // end namespace internal

}  // end namespace Eigen

#endif  // EIGEN_GENERAL_MATRIX_MATRIX_H
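As a usage sketch (relying only on Eigen's public dense API, nothing from this header directly), a plain dynamic-size matrix product is the kind of expression that reaches the GemmProduct specialization above; products small enough to fall under EIGEN_GEMM_TO_COEFFBASED_THRESHOLD stay on the coefficient-based path instead. Using noalias() lets the product accumulate straight into the destination through scaleAndAddTo(), without an extra temporary:

#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd A = Eigen::MatrixXd::Random(512, 256);
  Eigen::MatrixXd B = Eigen::MatrixXd::Random(256, 128);
  Eigen::MatrixXd C = Eigen::MatrixXd::Zero(512, 128);

  // Evaluates C += alpha * A * B with alpha = 2.0 through the blocked GEMM path.
  C.noalias() += 2.0 * A * B;
  return 0;
}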