#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <memory>
#include <random>
#include <vector>
#include <Eigen/Core>
#include <bench/BenchTimer.h>

Classes
struct	size_triple_t

struct	benchmark_t

struct	action_t

struct	human_duration_t

struct	measure_all_pot_sizes_action_t

struct	measure_default_sizes_action_t

Macros
#define	EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size

#define	EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k

#define	EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m

#define	EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n

Typedefs
typedef MatrixXf	MatrixType

typedef MatrixType::Scalar	Scalar

typedef internal::packet_traits< Scalar >::type	Packet

Functions
uint8_t	log2_pot (size_t x)

uint16_t	compact_size_triple (size_t k, size_t m, size_t n)

uint16_t	compact_size_triple (const size_triple_t &t)

ostream &	operator<< (ostream &s, const benchmark_t &b)

bool	operator< (const benchmark_t &b1, const benchmark_t &b2)

void	print_cpuinfo ()

template<typename T >
string	type_name ()

template<>
string	type_name< float > ()

template<>
string	type_name< double > ()

void	show_usage_and_exit (int, char *argv[], const vector< unique_ptr< action_t >> &available_actions)

float	measure_clock_speed ()

ostream &	operator<< (ostream &s, const human_duration_t &d)

void	serialize_benchmarks (const char *filename, const vector< benchmark_t > &benchmarks, size_t first_benchmark_to_run)

bool	deserialize_benchmarks (const char *filename, vector< benchmark_t > &benchmarks, size_t &first_benchmark_to_run)

void	try_run_some_benchmarks (vector< benchmark_t > &benchmarks, double time_start, size_t &first_benchmark_to_run)

void	run_benchmarks (vector< benchmark_t > &benchmarks)

int	main (int argc, char *argv[])

Variables
bool	eigen_use_specific_block_size

int	eigen_block_size_k

int	eigen_block_size_m

int	eigen_block_size_n

static BenchTimer	timer

const int	measurement_repetitions = 3

const float	min_accurate_time = 1e-2f

size_t	min_working_set_size = 0

float	max_clock_speed = 0.0f

const size_t	maxsize = 2048

const size_t	minsize = 16

const char	session_filename [] = "/data/local/tmp/benchmark-blocking-sizes-session.data"

Macro Definition Documentation

◆ EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K

#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K eigen_block_size_k

◆ EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M

#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M eigen_block_size_m

◆ EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N

#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N eigen_block_size_n

◆ EIGEN_TEST_SPECIFIC_BLOCKING_SIZES

#define EIGEN_TEST_SPECIFIC_BLOCKING_SIZES eigen_use_specific_block_size

Typedef Documentation

◆ MatrixType

typedef MatrixXf MatrixType

◆ Packet

typedef internal::packet_traits<Scalar>::type Packet

◆ Scalar

typedef MatrixType::Scalar Scalar

Function Documentation

◆ compact_size_triple() [1/2]

uint16_t compact_size_triple ( const size_triple_t & t )

87 { return compact_size_triple(t.k, t.m, t.n); }

compact_size_triple

uint16_t compact_size_triple(size_t k, size_t m, size_t n)

Definition: benchmark-blocking-sizes.cpp:83

plotPSD.t

t

Definition: plotPSD.py:36

References compact_size_triple(), and plotPSD::t.

◆ compact_size_triple() [2/2]

uint16_t compact_size_triple	(	size_t	k,
		size_t	m,
		size_t	n
	)

                                                            {
   return (log2_pot(k) << 8) | (log2_pot(m) << 4) | log2_pot(n);
 }

References k, log2_pot(), m, and n.

Referenced by compact_size_triple().

◆ deserialize_benchmarks()

bool deserialize_benchmarks	(	const char *	filename,
		vector< benchmark_t > &	benchmarks,
		size_t &	first_benchmark_to_run
	)

                                                                                                                    {
   FILE* file = fopen(filename, "r");
   if (!file) {
     return false;
   }
   if (1 != fread(&max_clock_speed, sizeof(max_clock_speed), 1, file)) {
     return false;
   }
   size_t benchmarks_vector_size = 0;
   if (1 != fread(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file)) {
     return false;
   }
   if (1 != fread(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file)) {
     return false;
   }
   benchmarks.resize(benchmarks_vector_size);
   if (benchmarks.size() != fread(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file)) {
     return false;
   }
   unlink(filename);
   return true;
 }

References MergeRestartFiles::filename, and max_clock_speed.

Referenced by run_benchmarks().

◆ log2_pot()

// Returns the number of times x can be halved before reaching zero, minus one:
// floor(log2(x)) for x > 0 (the exact exponent when x is a power of two) and
// 0 for x == 0. The count is narrowed to uint8_t on return.
uint8_t log2_pot(size_t x) {
  size_t exponent = 0;
  for (x >>= 1; x != 0; x >>= 1) {
    ++exponent;
  }
  return static_cast<uint8_t>(exponent);
}

References plotDoE::x.

Referenced by compact_size_triple().

◆ main()

int main	(	int argc	,
		char *	argv[]
	)

                                  {
   double time_start = timer.getRealTime();
   cout.precision(4);
   cerr.precision(4);
  
   vector<unique_ptr<action_t>> available_actions;
   available_actions.emplace_back(new measure_all_pot_sizes_action_t);
   available_actions.emplace_back(new measure_default_sizes_action_t);
  
   auto action = available_actions.end();
  
   if (argc <= 1) {
     show_usage_and_exit(argc, argv, available_actions);
   }
   for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
     if (!strcmp(argv[1], (*it)->invokation_name())) {
       action = it;
       break;
     }
   }
  
   if (action == available_actions.end()) {
     show_usage_and_exit(argc, argv, available_actions);
   }
  
   for (int i = 2; i < argc; i++) {
     if (argv[i] == strstr(argv[i], "--min-working-set-size=")) {
       const char* equals_sign = strchr(argv[i], '=');
       min_working_set_size = strtoul(equals_sign + 1, nullptr, 10);
     } else {
       cerr << "unrecognized option: " << argv[i] << endl << endl;
       show_usage_and_exit(argc, argv, available_actions);
     }
   }
  
   print_cpuinfo();
  
   cout << "benchmark parameters:" << endl;
   cout << "pointer size: " << 8 * sizeof(void*) << " bits" << endl;
   cout << "scalar type: " << type_name<Scalar>() << endl;
   cout << "packet size: " << internal::packet_traits<MatrixType::Scalar>::size << endl;
   cout << "minsize = " << minsize << endl;
   cout << "maxsize = " << maxsize << endl;
   cout << "measurement_repetitions = " << measurement_repetitions << endl;
   cout << "min_accurate_time = " << min_accurate_time << endl;
   cout << "min_working_set_size = " << min_working_set_size;
   if (min_working_set_size == 0) {
     cout << " (try to outsize caches)";
   }
   cout << endl << endl;
  
   (*action)->run();
  
   double time_end = timer.getRealTime();
   cerr << "Finished in " << human_duration_t(time_end - time_start) << endl;
 }

References calibrate::action, i, maxsize, measurement_repetitions, min_accurate_time, min_working_set_size, minsize, print_cpuinfo(), show_usage_and_exit(), and Eigen::internal::packet_traits< T >::size.

◆ measure_clock_speed()

float measure_clock_speed ( )

                             {
   cerr << "Measuring clock speed...                              \r" << flush;
  
   vector<float> all_gflops;
   for (int i = 0; i < 8; i++) {
     benchmark_t b(1024, 1024, 1024);
     b.run();
     all_gflops.push_back(b.gflops);
   }
  
   sort(all_gflops.begin(), all_gflops.end());
   float stable_estimate = all_gflops[2] + all_gflops[3] + all_gflops[4] + all_gflops[5];
  
   // multiply by an arbitrary constant to discourage trying doing anything with the
   // returned values besides just comparing them with each other.
   float result = stable_estimate * 123.456f;
  
   return result;
 }

References b, and i.

Referenced by run_benchmarks(), and try_run_some_benchmarks().

◆ operator<()

bool operator<	(	const benchmark_t &	b1,
		const benchmark_t &	b2
	)

                                                              {
   return b1.compact_product_size < b2.compact_product_size ||
          (b1.compact_product_size == b2.compact_product_size &&
           ((b1.compact_block_size < b2.compact_block_size ||
             (b1.compact_block_size == b2.compact_block_size && b1.gflops > b2.gflops))));
 }

References benchmark_t::compact_block_size, benchmark_t::compact_product_size, and benchmark_t::gflops.

Referenced by Eigen::TensorBase< Derived, ReadOnlyAccessors >::operator<().

◆ operator<<() [1/2]

// Streams one benchmark result as "<product size> <block info> <gflops>".
// The product size is written in hex. The block info is either the hex
// compact_block_size or, when the benchmark used default blocking,
// "default(k, m, n)" with the values computeProductBlockingSizes() yields for
// the product size.
ostream& operator<<(ostream& s, const benchmark_t& b) {
  s << hex << b.compact_product_size << dec;
  if (!b.use_default_block_size) {
    s << " " << hex << b.compact_block_size << dec;
  } else {
    size_triple_t product_sizes(b.compact_product_size);
    Index k = product_sizes.k;
    Index m = product_sizes.m;
    Index n = product_sizes.n;
    // Adjusts k, m and n in place.
    internal::computeProductBlockingSizes<Scalar, Scalar>(k, m, n);
    s << " default(" << k << ", " << m << ", " << n << ")";
  }
  return s << " " << b.gflops;
}

References b, k, m, n, s, and plotPSD::t.

◆ operator<<() [2/2]

// Streams d.seconds as a human-readable duration.
//
// Output is built from up to three parts: "<H> h " when the duration is at
// least one hour, "<M> min " when at least one minute remains after removing
// the hours, and "<S> s" only when the whole duration is shorter than 600
// seconds (longer durations are not reported to second granularity).
ostream& operator<<(ostream& s, const human_duration_t& d) {
  const int seconds_per_minute = 60;
  const int seconds_per_hour = 3600;

  int remainder = d.seconds;
  // ">=" rather than ">": exactly one hour must print as "1 h ", not "60 min ".
  if (remainder >= seconds_per_hour) {
    const int hours = remainder / seconds_per_hour;
    s << hours << " h ";
    remainder -= hours * seconds_per_hour;
  }
  // Likewise, exactly one minute prints as "1 min 0 s", not "60 s".
  if (remainder >= seconds_per_minute) {
    const int minutes = remainder / seconds_per_minute;
    s << minutes << " min ";
    remainder -= minutes * seconds_per_minute;
  }
  if (d.seconds < 600) {
    s << remainder << " s";
  }
  return s;
}

References s, and human_duration_t::seconds.

◆ print_cpuinfo()

void print_cpuinfo ( )

                      {
 #ifdef __linux__
   cout << "contents of /proc/cpuinfo:" << endl;
   string line;
   ifstream cpuinfo("/proc/cpuinfo");
   if (cpuinfo.is_open()) {
     while (getline(cpuinfo, line)) {
       cout << line << endl;
     }
     cpuinfo.close();
   }
   cout << endl;
 #elif defined __APPLE__
   cout << "output of sysctl hw:" << endl;
   system("sysctl hw");
   cout << endl;
 #endif
 }

References calibrate::line.

Referenced by main().

◆ run_benchmarks()

void run_benchmarks ( vector< benchmark_t > & benchmarks )

                                                      {
   size_t first_benchmark_to_run;
   vector<benchmark_t> deserialized_benchmarks;
   bool use_deserialized_benchmarks = false;
   if (deserialize_benchmarks(session_filename, deserialized_benchmarks, first_benchmark_to_run)) {
     cerr << "Found serialized session with " << 100.0f * first_benchmark_to_run / deserialized_benchmarks.size()
          << " % already done" << endl;
     if (deserialized_benchmarks.size() == benchmarks.size() && first_benchmark_to_run > 0 &&
         first_benchmark_to_run < benchmarks.size()) {
       use_deserialized_benchmarks = true;
     }
   }
  
   if (use_deserialized_benchmarks) {
     benchmarks = deserialized_benchmarks;
   } else {
     // not using deserialized benchmarks, starting from scratch
     first_benchmark_to_run = 0;
  
     // Randomly shuffling benchmarks allows us to get accurate enough progress info,
     // as now the cheap/expensive benchmarks are randomly mixed so they average out.
     // It also means that if data is corrupted for some time span, the odds are that
     // not all repetitions of a given benchmark will be corrupted.
     random_shuffle(benchmarks.begin(), benchmarks.end());
   }
  
   for (int i = 0; i < 4; i++) {
     max_clock_speed = max(max_clock_speed, measure_clock_speed());
   }
  
   double time_start = 0.0;
   while (first_benchmark_to_run < benchmarks.size()) {
     if (first_benchmark_to_run == 0) {
       time_start = timer.getRealTime();
     }
     try_run_some_benchmarks(benchmarks, time_start, first_benchmark_to_run);
   }
  
   // Sort timings by increasing benchmark parameters, and decreasing gflops.
   // The latter is very important. It means that we can ignore all but the first
   // benchmark with given parameters.
   sort(benchmarks.begin(), benchmarks.end());
  
   // Collect best (i.e. now first) results for each parameter values.
   vector<benchmark_t> best_benchmarks;
   for (auto it = benchmarks.begin(); it != benchmarks.end(); ++it) {
     if (best_benchmarks.empty() || best_benchmarks.back().compact_product_size != it->compact_product_size ||
         best_benchmarks.back().compact_block_size != it->compact_block_size) {
       best_benchmarks.push_back(*it);
     }
   }
  
   // keep and return only the best benchmarks
   benchmarks = best_benchmarks;
 }

References deserialize_benchmarks(), i, Eigen::max(), max_clock_speed, measure_clock_speed(), session_filename, and try_run_some_benchmarks().

Referenced by measure_all_pot_sizes_action_t::run(), and measure_default_sizes_action_t::run().

◆ serialize_benchmarks()

void serialize_benchmarks	(	const char *	filename,
		const vector< benchmark_t > &	benchmarks,
		size_t	first_benchmark_to_run
	)

                                                                                                                       {
   FILE* file = fopen(filename, "w");
   if (!file) {
     cerr << "Could not open file " << filename << " for writing." << endl;
     cerr << "Do you have write permissions on the current working directory?" << endl;
     exit(1);
   }
   size_t benchmarks_vector_size = benchmarks.size();
   fwrite(&max_clock_speed, sizeof(max_clock_speed), 1, file);
   fwrite(&benchmarks_vector_size, sizeof(benchmarks_vector_size), 1, file);
   fwrite(&first_benchmark_to_run, sizeof(first_benchmark_to_run), 1, file);
   fwrite(benchmarks.data(), sizeof(benchmark_t), benchmarks.size(), file);
   fclose(file);
 }

References MergeRestartFiles::filename, and max_clock_speed.

Referenced by try_run_some_benchmarks().

◆ show_usage_and_exit()

void show_usage_and_exit	(	int	,
		char *	argv[],
		const vector< unique_ptr< action_t >> &	available_actions
	)

                                                                                                      {
   cerr << "usage: " << argv[0] << " <action> [options...]" << endl << endl;
   cerr << "available actions:" << endl << endl;
   for (auto it = available_actions.begin(); it != available_actions.end(); ++it) {
     cerr << "  " << (*it)->invokation_name() << endl;
   }
   cerr << endl;
   cerr << "options:" << endl << endl;
   cerr << "  --min-working-set-size=N:" << endl;
   cerr << "       Set the minimum working set size to N bytes." << endl;
   cerr << "       This is rounded up as needed to a multiple of matrix size." << endl;
   cerr << "       A larger working set lowers the chance of a warm cache." << endl;
   cerr << "       The default value 0 means use a large enough working" << endl;
   cerr << "       set to likely outsize caches." << endl;
   cerr << "       A value of 1 (that is, 1 byte) would mean don't do anything to" << endl;
   cerr << "       avoid warm caches." << endl;
   exit(1);
 }

Referenced by main().

◆ try_run_some_benchmarks()

// Runs benchmarks one after another starting at index first_benchmark_to_run,
// re-measuring the clock speed (measure_clock_speed()) whenever more than 60
// seconds have passed since the previous measurement, and once more after the
// last benchmark has run.
//
// first_benchmark_to_run is an in/out parameter telling the caller where the
// next call must resume:
//   - set to 0 when the measured clock speed exceeds max_clock_speed by more
//     than the tolerance (max_clock_speed is updated; everything is redone);
//   - left at the last index at which the clock speed was found acceptable
//     when the clock speed had dropped and recovered after sleeping;
//   - set to benchmarks.size() when all benchmarks have been run.
//
// If the clock speed is still low after sleeping 1, 2, 4, 8, 16 and 32
// seconds, the session is written to session_filename and the process exits
// with status 2.
//
// time_start is only used to compute the ETA shown in the progress line.
void try_run_some_benchmarks	(	vector< benchmark_t > &	benchmarks,
		double	time_start,
		size_t &	first_benchmark_to_run
	)

                                                                                                                  {
   if (first_benchmark_to_run == benchmarks.size()) {
     return;
   }
  
   // Both timestamps start at 0, so the first loop iteration is expected to
   // trigger a clock speed check and a progress update.
   // NOTE(review): this presumes timer.getRealTime() is well above 60 - confirm.
   double time_last_progress_update = 0;
   double time_last_clock_speed_measurement = 0;
   double time_now = 0;
  
   size_t benchmark_index = first_benchmark_to_run;
  
   while (true) {
     float ratio_done = float(benchmark_index) / benchmarks.size();
     time_now = timer.getRealTime();
  
     // We check clock speed every minute and at the end.
     if (benchmark_index == benchmarks.size() || time_now > time_last_clock_speed_measurement + 60.0f) {
       time_last_clock_speed_measurement = time_now;
  
       // Ensure that clock speed is as expected
       float current_clock_speed = measure_clock_speed();
  
       // The tolerance needs to be smaller than the relative difference between
       // clock speeds that a device could operate under.
       // It seems unlikely that a device would be throttling clock speeds by
       // amounts smaller than 2%.
       // With a value of 1%, I was getting within noise on a Sandy Bridge.
       const float clock_speed_tolerance = 0.02f;
  
       if (current_clock_speed > (1 + clock_speed_tolerance) * max_clock_speed) {
         // Clock speed is now higher than we previously measured.
         // Either our initial measurement was inaccurate, which won't happen
         // too many times as we are keeping the best clock speed value and
         // allowing some tolerance; or something really weird happened,
         // which invalidates all benchmark results collected so far.
         // Either way, we better restart all over again now.
         if (benchmark_index) {
           cerr << "Restarting at " << 100.0f * ratio_done << " % because clock speed increased.          " << endl;
         }
         max_clock_speed = current_clock_speed;
         first_benchmark_to_run = 0;
         return;
       }
  
       bool rerun_last_tests = false;
  
       if (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
         cerr << "Measurements completed so far: " << 100.0f * ratio_done << " %                             " << endl;
         cerr << "Clock speed seems to be only " << current_clock_speed / max_clock_speed << " times what it used to be."
              << endl;
  
         // Sleep duration doubles after every attempt (1, 2, 4, ... seconds).
         unsigned int seconds_to_sleep_if_lower_clock_speed = 1;
  
         while (current_clock_speed < (1 - clock_speed_tolerance) * max_clock_speed) {
           if (seconds_to_sleep_if_lower_clock_speed > 32) {
             // Give up waiting: save progress up to the last good checkpoint and quit.
             cerr << "Sleeping longer probably won't make a difference." << endl;
             cerr << "Serializing benchmarks to " << session_filename << endl;
             serialize_benchmarks(session_filename, benchmarks, first_benchmark_to_run);
             cerr << "Now restart this benchmark, and it should pick up where we left." << endl;
             exit(2);
           }
           rerun_last_tests = true;
           cerr << "Sleeping " << seconds_to_sleep_if_lower_clock_speed << " s...                                   \r"
                << endl;
           sleep(seconds_to_sleep_if_lower_clock_speed);
           current_clock_speed = measure_clock_speed();
           seconds_to_sleep_if_lower_clock_speed *= 2;
         }
       }
  
       if (rerun_last_tests) {
         // Results obtained since the last good checkpoint are not trusted;
         // returning without advancing first_benchmark_to_run makes the caller redo them.
         cerr << "Redoing the last " << 100.0f * float(benchmark_index - first_benchmark_to_run) / benchmarks.size()
              << " % because clock speed had been low.   " << endl;
         return;
       }
  
       // nothing wrong with the clock speed so far, so there won't be a need to rerun
       // benchmarks run so far in case we later encounter a lower clock speed.
       first_benchmark_to_run = benchmark_index;
     }
  
     if (benchmark_index == benchmarks.size()) {
       // We're done!
       first_benchmark_to_run = benchmarks.size();
       // Erase progress info
       cerr << "                                                            " << endl;
       return;
     }
  
     // Display progress info on stderr
     // NOTE(review): when benchmark_index is 0, ratio_done is 0 and the ETA
     // expression divides by zero, giving a non-finite float. How
     // human_duration_t handles that value is not visible here - confirm.
     if (time_now > time_last_progress_update + 1.0f) {
       time_last_progress_update = time_now;
       cerr << "Measurements... " << 100.0f * ratio_done << " %, ETA "
            << human_duration_t(float(time_now - time_start) * (1.0f - ratio_done) / ratio_done)
            << "                          \r" << flush;
     }
  
     // This is where we actually run a benchmark!
     benchmarks[benchmark_index].run();
     benchmark_index++;
   }
 }

References max_clock_speed, measure_clock_speed(), serialize_benchmarks(), and session_filename.

Referenced by run_benchmarks().

◆ type_name()

// Generic fallback: returns "unknown" for any type T without its own
// type_name specialization.
template <typename T>
string type_name() {
  return string("unknown");
}

Referenced by float_pow_test_impl(), and cast_test_impl< SrcType, DstType, RowsAtCompileTime, ColsAtCompileTime >::run().

◆ type_name< double >()

// Specialization of type_name for double: returns "double".
template <>
string type_name<double>() {
  return string("double");
}

◆ type_name< float >()

// Specialization of type_name for float: returns "float".
template <>
string type_name<float>() {
  return string("float");
}

Variable Documentation

◆ eigen_block_size_k

int eigen_block_size_k

Referenced by benchmark_t::run().

◆ eigen_block_size_m

int eigen_block_size_m

Referenced by benchmark_t::run().

◆ eigen_block_size_n

int eigen_block_size_n

Referenced by benchmark_t::run().

◆ eigen_use_specific_block_size

bool eigen_use_specific_block_size

Referenced by benchmark_t::run().

◆ max_clock_speed

float max_clock_speed = 0.0f

Referenced by deserialize_benchmarks(), run_benchmarks(), serialize_benchmarks(), and try_run_some_benchmarks().

◆ maxsize

const size_t maxsize = 2048

Referenced by EIGEN_DECLARE_TEST(), main(), measure_all_pot_sizes_action_t::run(), and measure_default_sizes_action_t::run().

◆ measurement_repetitions

const int measurement_repetitions = 3

Referenced by main(), measure_all_pot_sizes_action_t::run(), and measure_default_sizes_action_t::run().

◆ min_accurate_time

const float min_accurate_time = 1e-2f

Referenced by main(), and benchmark_t::run().

◆ min_working_set_size

size_t min_working_set_size = 0

Referenced by main(), and benchmark_t::run().

◆ minsize

const size_t minsize = 16

Referenced by main(), measure_all_pot_sizes_action_t::run(), and measure_default_sizes_action_t::run().

◆ session_filename

const char session_filename[] = "/data/local/tmp/benchmark-blocking-sizes-session.data"

Referenced by run_benchmarks(), and try_run_some_benchmarks().

◆ timer

BenchTimer timer

static

Classes

Macros

Typedefs

Functions

Variables

Macro Definition Documentation

◆ EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K

◆ EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M

◆ EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N

◆ EIGEN_TEST_SPECIFIC_BLOCKING_SIZES

Typedef Documentation

◆ MatrixType

◆ Packet

◆ Scalar

Function Documentation

◆ compact_size_triple() [1/2]

◆ compact_size_triple() [2/2]

◆ deserialize_benchmarks()

◆ log2_pot()

◆ main()

◆ measure_clock_speed()

◆ operator<()

◆ operator<<() [1/2]

◆ operator<<() [2/2]

◆ print_cpuinfo()

◆ run_benchmarks()

◆ serialize_benchmarks()

◆ show_usage_and_exit()

◆ try_run_some_benchmarks()

◆ type_name()

◆ type_name< double >()

◆ type_name< float >()

Variable Documentation

◆ eigen_block_size_k

◆ eigen_block_size_m

◆ eigen_block_size_n

◆ eigen_use_specific_block_size

◆ max_clock_speed

◆ maxsize

◆ measurement_repetitions

◆ min_accurate_time

◆ min_working_set_size

◆ minsize

◆ session_filename

◆ timer