benchmark-blocking-sizes.cpp File Reference
#include <iostream>
#include <cstdint>
#include <cstdlib>
#include <vector>
#include <fstream>
#include <memory>
#include <cstdio>
#include <Eigen/Core>
#include <bench/BenchTimer.h>

Classes

struct  size_triple_t
 
struct  benchmark_t
 
struct  action_t
 
struct  human_duration_t
 
struct  measure_all_pot_sizes_action_t
 
struct  measure_default_sizes_action_t
 

Macros

#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES   eigen_use_specific_block_size
 
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K   eigen_block_size_k
 
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M   eigen_block_size_m
 
#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N   eigen_block_size_n
 

Typedefs

typedef MatrixXf MatrixType
 
typedef MatrixType::Scalar Scalar
 
typedef internal::packet_traits< Scalar >::type Packet
 

Functions

uint8_t log2_pot (size_t x)
 
uint16_t compact_size_triple (size_t k, size_t m, size_t n)
 
uint16_t compact_size_triple (const size_triple_t &t)
 
ostream & operator<< (ostream &s, const benchmark_t &b)
 
bool operator< (const benchmark_t &b1, const benchmark_t &b2)
 
void print_cpuinfo ()
 
template<typename T >
string type_name ()
 
template<>
string type_name< float > ()
 
template<>
string type_name< double > ()
 
void show_usage_and_exit (int, char *argv[], const vector< unique_ptr< action_t >> &available_actions)
 
float measure_clock_speed ()
 
ostream & operator<< (ostream &s, const human_duration_t &d)
 
void serialize_benchmarks (const char *filename, const vector< benchmark_t > &benchmarks, size_t first_benchmark_to_run)
 
bool deserialize_benchmarks (const char *filename, vector< benchmark_t > &benchmarks, size_t &first_benchmark_to_run)
 
void try_run_some_benchmarks (vector< benchmark_t > &benchmarks, double time_start, size_t &first_benchmark_to_run)
 
void run_benchmarks (vector< benchmark_t > &benchmarks)
 
int main (int argc, char *argv[])
 

Variables

bool eigen_use_specific_block_size
 
int eigen_block_size_k
 
int eigen_block_size_m
 
int eigen_block_size_n
 
static BenchTimer timer
 
const int measurement_repetitions = 3
 
const float min_accurate_time = 1e-2f
 
size_t min_working_set_size = 0
 
float max_clock_speed = 0.0f
 
const size_t maxsize = 2048
 
const size_t minsize = 16
 
const char session_filename [] = "/data/local/tmp/benchmark-blocking-sizes-session.data"
 

Macro Definition Documentation

◆ EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K

#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K   eigen_block_size_k

◆ EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M

#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M   eigen_block_size_m

◆ EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N

#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N   eigen_block_size_n

◆ EIGEN_TEST_SPECIFIC_BLOCKING_SIZES

#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES   eigen_use_specific_block_size

Typedef Documentation

◆ MatrixType

typedef MatrixXf MatrixType

◆ Packet

typedef internal::packet_traits<Scalar>::type Packet

◆ Scalar

typedef MatrixType::Scalar Scalar

Function Documentation

◆ compact_size_triple() [1/2]

uint16_t compact_size_triple ( const size_triple_t & t)
87 { return compact_size_triple(t.k, t.m, t.n); }
uint16_t compact_size_triple(size_t k, size_t m, size_t n)
Definition: benchmark-blocking-sizes.cpp:83
t
Definition: plotPSD.py:36

References compact_size_triple(), and plotPSD::t.

◆ compact_size_triple() [2/2]

uint16_t compact_size_triple ( size_t  k,
size_t  m,
size_t  n 
)
83  {
84  return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
85 }
const unsigned n
Definition: CG3DPackingUnitTest.cpp:11
uint8_t log2_pot(size_t x)
Definition: benchmark-blocking-sizes.cpp:74
int * m
Definition: level2_cplx_impl.h:294
char char char int int * k
Definition: level2_impl.h:374

References k, log2_pot(), m, and n.

Referenced by compact_size_triple().

◆ deserialize_benchmarks()

bool deserialize_benchmarks ( const char * filename,
vector< benchmark_t > &  benchmarks,
size_t &  first_benchmark_to_run 
)
328  {
329  FILE* file = fopen(filename, "r");
330  if (!file) {
331  return false;
332  }
333  if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
334  return false;
335  }
336  size_t benchmarks_vector_size = 0;
337  if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
338  return false;
339  }
340  if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
341  return false;
342  }
343  benchmarks.resize(benchmarks_vector_size);
344  if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
345  return false;
346  }
347  unlink(filename);
348  return true;
349 }
float max_clock_speed
Definition: benchmark-blocking-sizes.cpp:46
string filename
Definition: MergeRestartFiles.py:39
Definition: benchmark-blocking-sizes.cpp:91

References MergeRestartFiles::filename, and max_clock_speed.

Referenced by run_benchmarks().

◆ log2_pot()

uint8_t log2_pot ( size_t  x)
74  {
75  size_t l = 0;
76  while (x >>= 1) l++;
77  return l;
78 }
list x
Definition: plotDoE.py:28

References plotDoE::x.

Referenced by compact_size_triple().

◆ main()

int main ( int  argc,
char *  argv[] 
)
562  {
563  double time_start = timer.getRealTime();
564  cout.precision(4);
565  cerr.precision(4);
566 
567  vector<unique_ptr<action_t>> available_actions;
568  available_actions.emplace_back(new measure_all_pot_sizes_action_t);
569  available_actions.emplace_back(new measure_default_sizes_action_t);
570 
571  auto action = available_actions.end();
572 
573  if (argc <= 1) {
574  show_usage_and_exit(argc, argv, available_actions);
575  }
576  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
577  if (!strcmp(argv[1], (*it)->invokation_name())) {
578  action = it;
579  break;
580  }
581  }
582 
583  if (action == available_actions.end()) {
584  show_usage_and_exit(argc, argv, available_actions);
585  }
586 
587  for (int i = 2; i < argc; i++) {
588  if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
589  const char* equals_sign = strchr(argv[i], '=');
590  min_working_set_size = strtoul(equals_sign + 1, nullptr, 10);
591  } else {
592  cerr << "unrecognized option: " << argv[i] << endl << endl;
593  show_usage_and_exit(argc, argv, available_actions);
594  }
595  }
596 
597  print_cpuinfo();
598 
599  cout << "benchmark parameters:" << endl;
600  cout << "pointer size: " << 8 * sizeof(void*) << " bits" << endl;
601  cout << "scalar type: " << type_name<Scalar>() << endl;
602  cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
603  cout << "minsize = " << minsize << endl;
604  cout << "maxsize = " << maxsize << endl;
605  cout << "measurement_repetitions = " << measurement_repetitions << endl;
606  cout << "min_accurate_time = " << min_accurate_time << endl;
607  cout << "min_working_set_size = " << min_working_set_size;
608  if (min_working_set_size == 0) {
609  cout << " (try to outsize caches)";
610  }
611  cout << endl << endl;
612 
613  (*action)->run();
614 
615  double time_end = timer.getRealTime();
616  cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
617 }
int i
Definition: BiCGSTAB_step_by_step.cpp:9
Scalar Scalar int size
Definition: benchVecAdd.cpp:17
size_t min_working_set_size
Definition: benchmark-blocking-sizes.cpp:44
const size_t minsize
Definition: benchmark-blocking-sizes.cpp:50
const size_t maxsize
Definition: benchmark-blocking-sizes.cpp:49
const float min_accurate_time
Definition: benchmark-blocking-sizes.cpp:41
void print_cpuinfo()
Definition: benchmark-blocking-sizes.cpp:206
void show_usage_and_exit(int, char *argv[], const vector< unique_ptr< action_t >> &available_actions)
Definition: benchmark-blocking-sizes.cpp:249
const int measurement_repetitions
Definition: benchmark-blocking-sizes.cpp:36
action
Definition: calibrate.py:47
Definition: benchmark-blocking-sizes.cpp:288
Definition: benchmark-blocking-sizes.cpp:510
Definition: benchmark-blocking-sizes.cpp:539

References calibrate::action, i, maxsize, measurement_repetitions, min_accurate_time, min_working_set_size, minsize, print_cpuinfo(), show_usage_and_exit(), and Eigen::internal::packet_traits< T >::size.

◆ measure_clock_speed()

float measure_clock_speed ( )
268  {
269  cerr << "Measuring clock speed... \r" << flush;
270 
271  vector<float> all_gflops;
272  for (int i = 0; i < 8; i++) {
273  benchmark_t b(1024, 1024, 1024);
274  b.run();
275  all_gflops.push_back(b.gflops);
276  }
277 
278  sort(all_gflops.begin(), all_gflops.end());
279  float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
280 
281  // multiply by an arbitrary constant to discourage trying to do anything with the
282  // returned values besides just comparing them with each other.
283  float result = stable_estimate * 123.456f;
284 
285  return result;
286 }
Scalar * b
Definition: benchVecAdd.cpp:17

References b, and i.

Referenced by run_benchmarks(), and try_run_some_benchmarks().

◆ operator<()

bool operator< ( const benchmark_t & b1,
const benchmark_t & b2 
)
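127  {
128  return b1.compact_product_size < b2.compact_product_size ||
129  (b1.compact_product_size == b2.compact_product_size &&
130  ((b1.compact_block_size < b2.compact_block_size ||
131  (b1.compact_block_size == b2.compact_block_size && b1.gflops > b2.gflops))));
132 }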
float gflops
Definition: benchmark-blocking-sizes.cpp:95
uint16_t compact_block_size
Definition: benchmark-blocking-sizes.cpp:93
uint16_t compact_product_size
Definition: benchmark-blocking-sizes.cpp:92

References benchmark_t::compact_block_size, benchmark_t::compact_product_size, and benchmark_t::gflops.

Referenced by Eigen::TensorBase< Derived, ReadOnlyAccessors >::operator<().

◆ operator<<() [1/2]

ostream& operator<< ( ostream &  s,
const benchmark_t & b 
)
111  {
112  s << hex << b.compact_product_size << dec;
113  if (b.use_default_block_size) {
114  size_triple_t t(b.compact_product_size);
115  Index k = t.k, m = t.m, n = t.n;
116  internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
117  s << " default(" << k << ", " << m << ", " << n << ")";
118  } else {
119  s << " " << hex << b.compact_block_size << dec;
120  }
121  s << " " << b.gflops;
122  return s;
123 }
RealScalar s
Definition: level1_cplx_impl.h:130
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
Definition: analyze-blocking-sizes.cpp:45

References b, k, m, n, s, and plotPSD::t.

◆ operator<<() [2/2]

ostream& operator<< ( ostream &  s,
const human_duration_t & d 
)
293  {
294  int remainder = d.seconds;
295  if (remainder > 3600) {
296  int hours = remainder / 3600;
297  s << hours << " h ";
298  remainder -= hours * 3600;
299  }
300  if (remainder > 60) {
301  int minutes = remainder / 60;
302  s << minutes << " min ";
303  remainder -= minutes * 60;
304  }
305  if (d.seconds < 600) {
306  s << remainder << " s";
307  }
308  return s;
309 }
int seconds
Definition: benchmark-blocking-sizes.cpp:289

References s, and human_duration_t::seconds.

◆ print_cpuinfo()

void print_cpuinfo ( )
206  {
207 #ifdef __linux__
208  cout << "contents of /proc/cpuinfo:" << endl;
209  string line;
210  ifstream cpuinfo("/proc/cpuinfo");
211  if (cpuinfo.is_open()) {
212  while (getline(cpuinfo, line)) {
213  cout << line << endl;
214  }
215  cpuinfo.close();
216  }
217  cout << endl;
218 #elif defined __APPLE__
219  cout << "output of sysctl hw:" << endl;
220  system("sysctl hw");
221  cout << endl;
222 #endif
223 }
line
Definition: calibrate.py:103

References calibrate::line.

Referenced by main().

◆ run_benchmarks()

void run_benchmarks ( vector< benchmark_t > &  benchmarks)
454  {
455  size_t first_benchmark_to_run;
456  vector<benchmark_t> deserialized_benchmarks;
457  bool use_deserialized_benchmarks = false;
458  if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
459  cerr << "Found serialized session with " << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
460  << " % already done" << endl;
461  if (deserialized_benchmarks.size() == benchmarks.size() && first_benchmark_to_run > 0 &&
462  first_benchmark_to_run < benchmarks.size()) {
463  use_deserialized_benchmarks = true;
464  }
465  }
466 
467  if (use_deserialized_benchmarks) {
468  benchmarks = deserialized_benchmarks;
469  } else {
470  // not using deserialized benchmarks, starting from scratch
471  first_benchmark_to_run = 0;
472 
473  // Randomly shuffling benchmarks allows us to get accurate enough progress info,
474  // as now the cheap/expensive benchmarks are randomly mixed so they average out.
475  // It also means that if data is corrupted for some time span, the odds are that
476  // not all repetitions of a given benchmark will be corrupted.
477  random_shuffle(benchmarks.begin(), benchmarks.end());
478  }
479 
480  for (int i = 0; i < 4; i++) {
481  max_clock_speed = max(max_clock_speed, measure_clock_speed());
482  }
483 
484  double time_start = 0.0;
485  while (first_benchmark_to_run < benchmarks.size()) {
486  if (first_benchmark_to_run == 0) {
487  time_start = timer.getRealTime();
488  }
489  try_run_some_benchmarks(benchmarks, time_start, first_benchmark_to_run);
490  }
491 
492  // Sort timings by increasing benchmark parameters, and decreasing gflops.
493  // The latter is very important. It means that we can ignore all but the first
494  // benchmark with given parameters.
495  sort(benchmarks.begin(), benchmarks.end());
496 
497  // Collect best (i.e. now first) results for each parameter values.
498  vector<benchmark_t> best_benchmarks;
499  for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
500  if (best_benchmarks.empty() || best_benchmarks.back().compact_product_size != it->compact_product_size ||
501  best_benchmarks.back().compact_block_size != it->compact_block_size) {
502  best_benchmarks.push_back(*it);
503  }
504  }
505 
506  // keep and return only the best benchmarks
507  benchmarks = best_benchmarks;
508 }
bool deserialize_benchmarks(const char *filename, vector< benchmark_t > &benchmarks, size_t &first_benchmark_to_run)
Definition: benchmark-blocking-sizes.cpp:328
float measure_clock_speed()
Definition: benchmark-blocking-sizes.cpp:268
const char session_filename[]
Definition: benchmark-blocking-sizes.cpp:311
void try_run_some_benchmarks(vector< benchmark_t > &benchmarks, double time_start, size_t &first_benchmark_to_run)
Definition: benchmark-blocking-sizes.cpp:351
#define max(a, b)
Definition: datatypes.h:23

References deserialize_benchmarks(), i, Eigen::max(), max_clock_speed, measure_clock_speed(), session_filename, and try_run_some_benchmarks().

Referenced by measure_all_pot_sizes_action_t::run(), and measure_default_sizes_action_t::run().

◆ serialize_benchmarks()

void serialize_benchmarks ( const char * filename,
const vector< benchmark_t > &  benchmarks,
size_t  first_benchmark_to_run 
)
313  {
314  FILE* file = fopen(filename, "w");
315  if (!file) {
316  cerr << "Could not open file " << filename << " for writing." << endl;
317  cerr << "Do you have write permissions on the current working directory?" << endl;
318  exit(1);
319  }
320  size_t benchmarks_vector_size = benchmarks.size();
321  fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
322  fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
323  fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
324  fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
325  fclose(file);
326 }

References MergeRestartFiles::filename, and max_clock_speed.

Referenced by try_run_some_benchmarks().

◆ show_usage_and_exit()

void show_usage_and_exit ( int  ,
char * argv[],
const vector< unique_ptr< action_t >> &  available_actions 
)
249  {
250  cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
251  cerr << "available actions:" << endl << endl;
252  for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
253  cerr << " " << (*it)->invokation_name() << endl;
254  }
255  cerr << endl;
256  cerr << "options:" << endl << endl;
257  cerr << " --min-working-set-size=N:" << endl;
258  cerr << " Set the minimum working set size to N bytes." << endl;
259  cerr << " This is rounded up as needed to a multiple of matrix size." << endl;
260  cerr << " A larger working set lowers the chance of a warm cache." << endl;
261  cerr << " The default value 0 means use a large enough working" << endl;
262  cerr << " set to likely outsize caches." << endl;
263  cerr << " A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
264  cerr << " avoid warm caches." << endl;
265  exit(1);
266 }

Referenced by main().

◆ try_run_some_benchmarks()

void try_run_some_benchmarks ( vector< benchmark_t > &  benchmarks,
double  time_start,
size_t &  first_benchmark_to_run 
)
351  {
352  if (first_benchmark_to_run == benchmarks.size()) {
353  return;
354  }
355 
356  double time_last_progress_update = 0;
357  double time_last_clock_speed_measurement = 0;
358  double time_now = 0;
359 
360  size_t benchmark_index = first_benchmark_to_run;
361 
362  while (true) {
363  float ratio_done = float(benchmark_index) / benchmarks.size();
364  time_now = timer.getRealTime();
365 
366  // We check clock speed every minute and at the end.
367  if (benchmark_index == benchmarks.size() || time_now > time_last_clock_speed_measurement + 60.0f) {
368  time_last_clock_speed_measurement = time_now;
369 
370  // Ensure that clock speed is as expected
371  float current_clock_speed = measure_clock_speed();
372 
373  // The tolerance needs to be smaller than the relative difference between
374  // clock speeds that a device could operate under.
375  // It seems unlikely that a device would be throttling clock speeds by
376  // amounts smaller than 2%.
377  // With a value of 1%, I was getting within noise on a Sandy Bridge.
378  const float clock_speed_tolerance = 0.02f;
379 
380  if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
381  // Clock speed is now higher than we previously measured.
382  // Either our initial measurement was inaccurate, which won't happen
383  // too many times as we are keeping the best clock speed value and
384  // allowing some tolerance; or something really weird happened,
385  // which invalidates all benchmark results collected so far.
386  // Either way, we better restart all over again now.
387  if (benchmark_index) {
388  cerr << "Restarting at " << 100.0f * ratio_done << " % because clock speed increased. " << endl;
389  }
390  max_clock_speed = current_clock_speed;
391  first_benchmark_to_run = 0;
392  return;
393  }
394 
395  bool rerun_last_tests = false;
396 
397  if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
398  cerr << "Measurements completed so far: " << 100.0f * ratio_done << " % " << endl;
399  cerr << "Clock speed seems to be only " << current_clock_speed / max_clock_speed << " times what it used to be."
400  << endl;
401 
402  unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
403 
404  while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
405  if (seconds_to_sleep_if_lower_clock_speed > 32) {
406  cerr << "Sleeping longer probably won't make a difference." << endl;
407  cerr << "Serializing benchmarks to " << session_filename << endl;
408  serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
409  cerr << "Now restart this benchmark, and it should pick up where we left." << endl;
410  exit(2);
411  }
412  rerun_last_tests = true;
413  cerr << "Sleeping " << seconds_to_sleep_if_lower_clock_speed << " s... \r"
414  << endl;
415  sleep(seconds_to_sleep_if_lower_clock_speed);
416  current_clock_speed = measure_clock_speed();
417  seconds_to_sleep_if_lower_clock_speed *= 2;
418  }
419  }
420 
421  if (rerun_last_tests) {
422  cerr << "Redoing the last " << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
423  << " % because clock speed had been low. " << endl;
424  return;
425  }
426 
427  // nothing wrong with the clock speed so far, so there won't be a need to rerun
428  // benchmarks run so far in case we later encounter a lower clock speed.
429  first_benchmark_to_run = benchmark_index;
430  }
431 
432  if (benchmark_index == benchmarks.size()) {
433  // We're done!
434  first_benchmark_to_run = benchmarks.size();
435  // Erase progress info
436  cerr << " " << endl;
437  return;
438  }
439 
440  // Display progress info on stderr
441  if (time_now > time_last_progress_update + 1.0f) {
442  time_last_progress_update = time_now;
443  cerr << "Measurements... " << 100.0f * ratio_done << " %, ETA "
444  << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
445  << " \r" << flush;
446  }
447 
448  // This is where we actually run a benchmark!
449  benchmarks[benchmark_index].run();
450  benchmark_index++;
451  }
452 }
void serialize_benchmarks(const char *filename, const vector< benchmark_t > &benchmarks, size_t first_benchmark_to_run)
Definition: benchmark-blocking-sizes.cpp:313

References max_clock_speed, measure_clock_speed(), serialize_benchmarks(), and session_filename.

Referenced by run_benchmarks().

◆ type_name()

template<typename T >
string type_name ( )

◆ type_name< double >()

template<>
string type_name< double > ( )
236  {
237  return "double";
238 }

◆ type_name< float >()

template<>
string type_name< float > ( )
231  {
232  return "float";
233 }

Variable Documentation

◆ eigen_block_size_k

int eigen_block_size_k

Referenced by benchmark_t::run().

◆ eigen_block_size_m

int eigen_block_size_m

Referenced by benchmark_t::run().

◆ eigen_block_size_n

int eigen_block_size_n

Referenced by benchmark_t::run().

◆ eigen_use_specific_block_size

bool eigen_use_specific_block_size

Referenced by benchmark_t::run().

◆ max_clock_speed

float max_clock_speed = 0.0f

◆ maxsize

const size_t maxsize = 2048

◆ measurement_repetitions

const int measurement_repetitions = 3

◆ min_accurate_time

const float min_accurate_time = 1e-2f

Referenced by main(), and benchmark_t::run().

◆ min_working_set_size

size_t min_working_set_size = 0

Referenced by main(), and benchmark_t::run().

◆ minsize

const size_t minsize = 16

◆ session_filename

const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data"

◆ timer

static BenchTimer timer