bench_gemm.cpp File Reference
#include <iostream>
#include <bench/BenchTimer.h>
#include <Eigen/Core>

Macros

#define SCALAR   float
 
#define SCALARA   SCALAR
 
#define SCALARB   SCALAR
 

Typedefs

typedef SCALAR Scalar
 
typedef NumTraits< Scalar >::Real RealScalar
 
typedef Matrix< SCALARA, Dynamic, Dynamic, opt_AA
 
typedef Matrix< SCALARB, Dynamic, Dynamic, opt_BB
 
typedef Matrix< Scalar, Dynamic, Dynamic > C
 
typedef Matrix< RealScalar, Dynamic, Dynamic > M
 

Functions

void matlab_cplx_cplx (const M &ar, const M &ai, const M &br, const M &bi, M &cr, M &ci)
 
void matlab_real_cplx (const M &a, const M &br, const M &bi, M &cr, M &ci)
 
void matlab_cplx_real (const M &ar, const M &ai, const M &b, M &cr, M &ci)
 
template<typename A , typename B , typename C >
EIGEN_DONT_INLINE void gemm (const A &a, const B &b, C &c)
 
int main (int argc, char **argv)
 

Variables

const int opt_A = ColMajor
 
const int opt_B = ColMajor
 

Macro Definition Documentation

◆ SCALAR

#define SCALAR   float

◆ SCALARA

#define SCALARA   SCALAR

◆ SCALARB

#define SCALARB   SCALAR

Typedef Documentation

◆ A

typedef Matrix<SCALARA, Dynamic, Dynamic, opt_A> A

◆ B

typedef Matrix<SCALARB, Dynamic, Dynamic, opt_B> B

◆ C

typedef Matrix<Scalar, Dynamic, Dynamic> C

◆ M

typedef Matrix<RealScalar, Dynamic, Dynamic> M

◆ RealScalar

◆ Scalar

Function Documentation

◆ gemm()

template<typename A , typename B , typename C >
EIGEN_DONT_INLINE void gemm ( const A a,
const B b,
C c 
)
158  {
159  c.noalias() += a * b;
160 }
Scalar * b
Definition: benchVecAdd.cpp:17
const Scalar * a
Definition: level2_cplx_impl.h:32
int c
Definition: calibrate.py:100

References a, b, and calibrate::c.

Referenced by main().

◆ main()

int main ( int argc  ,
char **  argv 
)
162  {
163  std::ptrdiff_t l1 = internal::queryL1CacheSize();
164  std::ptrdiff_t l2 = internal::queryTopLevelCacheSize();
165  std::cout << "L1 cache size = " << (l1 > 0 ? l1 / 1024 : -1) << " KB\n";
166  std::cout << "L2/L3 cache size = " << (l2 > 0 ? l2 / 1024 : -1) << " KB\n";
167  typedef internal::gebp_traits<Scalar, Scalar> Traits;
168  std::cout << "Register blocking = " << Traits::mr << " x " << Traits::nr << "\n";
169 
170  int rep = 1; // number of repetitions per try
171  int tries = 2; // number of tries, we keep the best
172 
173  int s = 2048;
174  int m = s;
175  int n = s;
176  int p = s;
177  int cache_size1 = -1, cache_size2 = l2, cache_size3 = 0;
178 
179  bool need_help = false;
180  for (int i = 1; i < argc;) {
181  if (argv[i][0] == '-') {
182  if (argv[i][1] == 's') {
183  ++i;
184  s = atoi(argv[i++]);
185  m = n = p = s;
186  if (argv[i][0] != '-') {
187  n = atoi(argv[i++]);
188  p = atoi(argv[i++]);
189  }
190  } else if (argv[i][1] == 'c') {
191  ++i;
192  cache_size1 = atoi(argv[i++]);
193  if (argv[i][0] != '-') {
194  cache_size2 = atoi(argv[i++]);
195  if (argv[i][0] != '-') cache_size3 = atoi(argv[i++]);
196  }
197  } else if (argv[i][1] == 't') {
198  tries = atoi(argv[++i]);
199  ++i;
200  } else if (argv[i][1] == 'p') {
201  ++i;
202  rep = atoi(argv[i++]);
203  }
204  } else {
205  need_help = true;
206  break;
207  }
208  }
209 
210  if (need_help) {
211  std::cout << argv[0] << " -s <matrix sizes> -c <cache sizes> -t <nb tries> -p <nb repeats>\n";
212  std::cout << " <matrix sizes> : size\n";
213  std::cout << " <matrix sizes> : rows columns depth\n";
214  return 1;
215  }
216 
217 #if EIGEN_VERSION_AT_LEAST(3, 2, 90)
218  if (cache_size1 > 0) setCpuCacheSizes(cache_size1, cache_size2, cache_size3);
219 #endif
220 
221  A a(m, p);
222  a.setRandom();
223  B b(p, n);
224  b.setRandom();
225  C c(m, n);
226  c.setOnes();
227  C rc = c;
228 
229  std::cout << "Matrix sizes = " << m << "x" << p << " * " << p << "x" << n << "\n";
230  std::ptrdiff_t mc(m), nc(n), kc(p);
231  internal::computeProductBlockingSizes<Scalar, Scalar>(kc, mc, nc);
232  std::cout << "blocking size (mc x kc) = " << mc << " x " << kc << " x " << nc << "\n";
233 
234  C r = c;
235 
236 // check the parallel product is correct
237 #if defined EIGEN_HAS_OPENMP
239  int procs = omp_get_max_threads();
240  if (procs > 1) {
241 #ifdef HAVE_BLAS
242  blas_gemm(a, b, r);
243 #else
244  omp_set_num_threads(1);
245  r.noalias() += a * b;
246  omp_set_num_threads(procs);
247 #endif
248  c.noalias() += a * b;
249  if (!r.isApprox(c)) std::cerr << "Warning, your parallel product is crap!\n\n";
250  }
251 #elif defined HAVE_BLAS
252  blas_gemm(a, b, r);
253  c.noalias() += a * b;
254  if (!r.isApprox(c)) {
255  std::cout << (r - c).norm() / r.norm() << "\n";
256  std::cerr << "Warning, your product is crap!\n\n";
257  }
258 #else
259  if (1. * m * n * p < 2000. * 2000 * 2000) {
260  gemm(a, b, c);
261  r.noalias() += a.cast<Scalar>().lazyProduct(b.cast<Scalar>());
262  if (!r.isApprox(c)) {
263  std::cout << (r - c).norm() / r.norm() << "\n";
264  std::cerr << "Warning, your product is crap!\n\n";
265  }
266  }
267 #endif
268 
269 #ifdef HAVE_BLAS
270  BenchTimer tblas;
271  c = rc;
272  BENCH(tblas, tries, rep, blas_gemm(a, b, c));
273  std::cout << "blas cpu " << tblas.best(CPU_TIMER) / rep << "s \t"
274  << (double(m) * n * p * rep * 2 / tblas.best(CPU_TIMER)) * 1e-9 << " GFLOPS \t(" << tblas.total(CPU_TIMER)
275  << "s)\n";
276  std::cout << "blas real " << tblas.best(REAL_TIMER) / rep << "s \t"
277  << (double(m) * n * p * rep * 2 / tblas.best(REAL_TIMER)) * 1e-9 << " GFLOPS \t(" << tblas.total(REAL_TIMER)
278  << "s)\n";
279 #endif
280 
281  // warm start
282  if (b.norm() + a.norm() == 123.554) std::cout << "\n";
283 
284  BenchTimer tmt;
285  c = rc;
286  BENCH(tmt, tries, rep, gemm(a, b, c));
287  std::cout << "eigen cpu " << tmt.best(CPU_TIMER) / rep << "s \t"
288  << (double(m) * n * p * rep * 2 / tmt.best(CPU_TIMER)) * 1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER)
289  << "s)\n";
290  std::cout << "eigen real " << tmt.best(REAL_TIMER) / rep << "s \t"
291  << (double(m) * n * p * rep * 2 / tmt.best(REAL_TIMER)) * 1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER)
292  << "s)\n";
293 
294 #ifdef EIGEN_HAS_OPENMP
295  if (procs > 1) {
296  BenchTimer tmono;
297  omp_set_num_threads(1);
299  c = rc;
300  BENCH(tmono, tries, rep, gemm(a, b, c));
301  std::cout << "eigen mono cpu " << tmono.best(CPU_TIMER) / rep << "s \t"
302  << (double(m) * n * p * rep * 2 / tmono.best(CPU_TIMER)) * 1e-9 << " GFLOPS \t(" << tmono.total(CPU_TIMER)
303  << "s)\n";
304  std::cout << "eigen mono real " << tmono.best(REAL_TIMER) / rep << "s \t"
305  << (double(m) * n * p * rep * 2 / tmono.best(REAL_TIMER)) * 1e-9 << " GFLOPS \t("
306  << tmono.total(REAL_TIMER) << "s)\n";
307  std::cout << "mt speed up x" << tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER) << " => "
308  << (100.0 * tmono.best(CPU_TIMER) / tmt.best(REAL_TIMER)) / procs << "%\n";
309  }
310 #endif
311 
312  if (1. * m * n * p < 30 * 30 * 30) {
313  BenchTimer tmt;
314  c = rc;
315  BENCH(tmt, tries, rep, c.noalias() += a.lazyProduct(b));
316  std::cout << "lazy cpu " << tmt.best(CPU_TIMER) / rep << "s \t"
317  << (double(m) * n * p * rep * 2 / tmt.best(CPU_TIMER)) * 1e-9 << " GFLOPS \t(" << tmt.total(CPU_TIMER)
318  << "s)\n";
319  std::cout << "lazy real " << tmt.best(REAL_TIMER) / rep << "s \t"
320  << (double(m) * n * p * rep * 2 / tmt.best(REAL_TIMER)) * 1e-9 << " GFLOPS \t(" << tmt.total(REAL_TIMER)
321  << "s)\n";
322  }
323 
324 #ifdef DECOUPLED
326  M ar(m, p);
327  ar.setRandom();
328  M ai(m, p);
329  ai.setRandom();
330  M br(p, n);
331  br.setRandom();
332  M bi(p, n);
333  bi.setRandom();
334  M cr(m, n);
335  cr.setRandom();
336  M ci(m, n);
337  ci.setRandom();
338 
339  BenchTimer t;
340  BENCH(t, tries, rep, matlab_cplx_cplx(ar, ai, br, bi, cr, ci));
341  std::cout << "\"matlab\" cpu " << t.best(CPU_TIMER) / rep << "s \t"
342  << (double(m) * n * p * rep * 2 / t.best(CPU_TIMER)) * 1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER)
343  << "s)\n";
344  std::cout << "\"matlab\" real " << t.best(REAL_TIMER) / rep << "s \t"
345  << (double(m) * n * p * rep * 2 / t.best(REAL_TIMER)) * 1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER)
346  << "s)\n";
347  }
349  M a(m, p);
350  a.setRandom();
351  M br(p, n);
352  br.setRandom();
353  M bi(p, n);
354  bi.setRandom();
355  M cr(m, n);
356  cr.setRandom();
357  M ci(m, n);
358  ci.setRandom();
359 
360  BenchTimer t;
361  BENCH(t, tries, rep, matlab_real_cplx(a, br, bi, cr, ci));
362  std::cout << "\"matlab\" cpu " << t.best(CPU_TIMER) / rep << "s \t"
363  << (double(m) * n * p * rep * 2 / t.best(CPU_TIMER)) * 1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER)
364  << "s)\n";
365  std::cout << "\"matlab\" real " << t.best(REAL_TIMER) / rep << "s \t"
366  << (double(m) * n * p * rep * 2 / t.best(REAL_TIMER)) * 1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER)
367  << "s)\n";
368  }
370  M ar(m, p);
371  ar.setRandom();
372  M ai(m, p);
373  ai.setRandom();
374  M b(p, n);
375  b.setRandom();
376  M cr(m, n);
377  cr.setRandom();
378  M ci(m, n);
379  ci.setRandom();
380 
381  BenchTimer t;
382  BENCH(t, tries, rep, matlab_cplx_real(ar, ai, b, cr, ci));
383  std::cout << "\"matlab\" cpu " << t.best(CPU_TIMER) / rep << "s \t"
384  << (double(m) * n * p * rep * 2 / t.best(CPU_TIMER)) * 1e-9 << " GFLOPS \t(" << t.total(CPU_TIMER)
385  << "s)\n";
386  std::cout << "\"matlab\" real " << t.best(REAL_TIMER) / rep << "s \t"
387  << (double(m) * n * p * rep * 2 / t.best(REAL_TIMER)) * 1e-9 << " GFLOPS \t(" << t.total(REAL_TIMER)
388  << "s)\n";
389  }
390 #endif
391 
392  return 0;
393 }
#define BENCH(TIMER, TRIES, REP, CODE)
Definition: BenchTimer.h:150
int i
Definition: BiCGSTAB_step_by_step.cpp:9
const unsigned n
Definition: CG3DPackingUnitTest.cpp:11
Array< double, 1, 3 > e(1./3., 0.5, 2.)
float * p
Definition: Tutorial_Map_using.cpp:9
SCALAR Scalar
Definition: bench_gemm.cpp:45
void matlab_real_cplx(const M &a, const M &br, const M &bi, M &cr, M &ci)
Definition: bench_gemm.cpp:147
EIGEN_DONT_INLINE void gemm(const A &a, const B &b, C &c)
Definition: bench_gemm.cpp:158
void matlab_cplx_real(const M &ar, const M &ai, const M &b, M &cr, M &ci)
Definition: bench_gemm.cpp:152
void matlab_cplx_cplx(const M &ar, const M &ai, const M &br, const M &bi, M &cr, M &ci)
Definition: bench_gemm.cpp:139
Definition: BenchTimer.h:55
double best(int TIMER=CPU_TIMER) const
Definition: BenchTimer.h:98
double total(int TIMER=CPU_TIMER) const
Definition: BenchTimer.h:106
The matrix class, also used for vectors and row-vectors.
Definition: Eigen/Eigen/src/Core/Matrix.h:186
Definition: matrices.h:74
RealScalar s
Definition: level1_cplx_impl.h:130
int * m
Definition: level2_cplx_impl.h:294
int queryTopLevelCacheSize()
Definition: Memory.h:1307
int queryL1CacheSize()
Definition: Memory.h:1299
@ REAL_TIMER
Definition: BenchTimer.h:46
@ CPU_TIMER
Definition: BenchTimer.h:46
EIGEN_DEPRECATED void initParallel()
Definition: Parallelizer.h:50
void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3)
Definition: products/GeneralBlockPanelKernel.h:3146
void setNbThreads(int v)
Definition: Parallelizer.h:62
r
Definition: UniformPSDSelfTest.py:20
list rc
Definition: plotDoE.py:16
t
Definition: plotPSD.py:36
Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
Definition: NumTraits.h:217

References a, b, BENCH, Eigen::BenchTimer::best(), calibrate::c, Eigen::CPU_TIMER, e(), gemm(), i, Eigen::initParallel(), m, matlab_cplx_cplx(), matlab_cplx_real(), matlab_real_cplx(), n, p, Eigen::internal::queryL1CacheSize(), Eigen::internal::queryTopLevelCacheSize(), UniformPSDSelfTest::r, plotDoE::rc, Eigen::REAL_TIMER, s, Eigen::setCpuCacheSizes(), Eigen::setNbThreads(), Eigen::PlainObjectBase< Derived >::setRandom(), plotPSD::t, and Eigen::BenchTimer::total().

◆ matlab_cplx_cplx()

void matlab_cplx_cplx ( const M ar,
const M ai,
const M br,
const M bi,
M cr,
M ci 
)
139  {
140  cr.noalias() += ar * br;
141  cr.noalias() -= ai * bi;
142  ci.noalias() += ar * bi;
143  ci.noalias() += ai * br;
144  // [cr ci] += [ar ai] * br + [-ai ar] * bi
145 }

Referenced by main().

◆ matlab_cplx_real()

void matlab_cplx_real ( const M ar,
const M ai,
const M b,
M cr,
M ci 
)
152  {
153  cr.noalias() += ar * b;
154  ci.noalias() += ai * b;
155 }

References b.

Referenced by main().

◆ matlab_real_cplx()

void matlab_real_cplx ( const M a,
const M br,
const M bi,
M cr,
M ci 
)
147  {
148  cr.noalias() += a * br;
149  ci.noalias() += a * bi;
150 }

References a.

Referenced by main().

Variable Documentation

◆ opt_A

const int opt_A = ColMajor

◆ opt_B

const int opt_B = ColMajor