tensor_contract_sycl_bench.cc File Reference
#include <CL/sycl.hpp>
#include <fstream>
#include <iostream>
#include <chrono>
#include <ctime>
#include <unsupported/Eigen/CXX11/Tensor>

Macros

#define EIGEN_BENCH_CONTRACT_SYCL
 
#define EIGEN_TEST_NO_LONGDOUBLE
 
#define EIGEN_TEST_NO_COMPLEX
 
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE   int64_t
 

Functions

std::ofstream out ("Result.txt")
 
std::chrono::time_point< std::chrono::system_clock > get_time ()
 
template<typename Start , typename End , typename TensorIndex >
void finalizeBenchmark (Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_, TensorIndex num_iters, std::string name)
 
template<typename T , typename Device , typename TensorIndex >
void contraction (const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
 
template<typename T , typename Device , typename TensorIndex >
void contractionRowMajor (const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
 
template<typename T , typename Device , typename TensorIndex >
void contractionAT (const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
 
template<typename T , typename Device , typename TensorIndex >
void contractionBT (const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
 
template<typename T , typename Device , typename TensorIndex >
void contractionABT (const Device &device_, TensorIndex num_iters, TensorIndex m_, TensorIndex k_, TensorIndex n_)
 
int main ()
 

Macro Definition Documentation

◆ EIGEN_BENCH_CONTRACT_SYCL

#define EIGEN_BENCH_CONTRACT_SYCL

◆ EIGEN_DEFAULT_DENSE_INDEX_TYPE

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE   int64_t

◆ EIGEN_TEST_NO_COMPLEX

#define EIGEN_TEST_NO_COMPLEX

◆ EIGEN_TEST_NO_LONGDOUBLE

#define EIGEN_TEST_NO_LONGDOUBLE

Function Documentation

◆ contraction()

// Benchmarks the tensor contraction C = A.contract(B, dims) on device_ and
// reports the timing through finalizeBenchmark() under the name "contraction".
//
// device_   : device used to allocate, fill and deallocate the three buffers
//             and to evaluate the contraction.
// num_iters : number of timed evaluations of the contraction.
// m_, k_, n_: extents; sizeA = (m_, k_), sizeB = (k_, n_), sizeC = (m_, n_).
//             dims[0] = DimPair(1, 0); sizeA[1] and sizeB[0] are both k_.
//
// NOTE: this listing skips original source lines 65, 68, 71, 77 and 80, so the
// declarations of sizeA, sizeB, sizeC, C and dims are not visible here.
// NOTE(review): `end` is sampled before device_.synchronize() is called, and
// the buffers are deallocated before that synchronize as well. If the device
// evaluates the assignments asynchronously, the timed interval may not cover
// the completion of the work -- confirm against the device implementation.
template<typename T , typename Device , typename TensorIndex >
void contraction ( const Device &  device_,
TensorIndex  num_iters,
TensorIndex  m_,
TensorIndex  k_,
TensorIndex  n_ 
)
51  {
52  T* a_;
53  T* b_;
54  T* c_;
55  a_ = (T*)device_.allocate(m_ * k_ * sizeof(T));
56  b_ = (T*)device_.allocate(k_ * n_ * sizeof(T));
57  c_ = (T*)device_.allocate(m_ * n_ * sizeof(T));
58 
59  // Initialize the content of the memory pools to prevent asan from
60  // complaining.
61  device_.fill(a_, a_ + (m_ * k_), T(12));
62  device_.fill(b_, b_ + (k_ * n_), T(23));
63  device_.fill(c_, c_ + (m_ * n_), T(31));
64 
66  sizeA[0] = m_;
67  sizeA[1] = k_;
69  sizeB[0] = k_;
70  sizeB[1] = n_;
72  sizeC[0] = m_;
73  sizeC[1] = n_;
74 
75  const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
76  const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
78 
79  typedef typename Tensor<T, 2>::DimensionPair DimPair;
81  dims[0] = DimPair(1, 0);
82 #ifdef EIGEN_USE_SYCL // warmup for sycl: 10 untimed evaluations
83  for (int iter = 0; iter < 10; ++iter) {
84  C.device(device_) = A.contract(B, dims);
85  }
86 #endif
87  auto start = get_time();
88  for (int iter = 0; iter < num_iters; ++iter) {
89  C.device(device_) = A.contract(B, dims);
90  }
91  auto end = get_time();
92  // Report the throughput in GFLOP/s: finalizeBenchmark() counts
93  // 2 * m_ * n_ * k_ operations per iteration (k_ multiplications and
93  // k_ additions for each of the m_ * n_ values in the resulting tensor).
94  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contraction");
95  device_.deallocate(a_);
96  device_.deallocate(b_);
97  device_.deallocate(c_);
98  device_.synchronize();
99 }
Eigen::Triplet< double > T
Definition: EigenUnitTest.cpp:11
Matrix< SCALARA, Dynamic, Dynamic, opt_A > A
Definition: bench_gemm.cpp:47
Matrix< Scalar, Dynamic, Dynamic > C
Definition: bench_gemm.cpp:49
Matrix< SCALARB, Dynamic, Dynamic, opt_B > B
Definition: bench_gemm.cpp:48
The matrix class, also used for vectors and row-vectors.
Definition: Eigen/Eigen/src/Core/Matrix.h:186
A tensor expression mapping an existing array of data.
Definition: TensorMap.h:33
The tensor class.
Definition: Tensor.h:68
Definition: matrices.h:74
Tensor< float, 1 >::DimensionPair DimPair
Definition: cxx11_tensor_contraction.cpp:17
static constexpr lastp1_t end
Definition: IndexedViewHelper.h:79
@ Aligned
Definition: Constants.h:242
std::array< T, N > array
Definition: EmulateArray.h:231
void start(const unsigned &i)
(Re-)start i-th timer
Definition: oomph_utilities.cc:243
void finalizeBenchmark(Start start, End end, TensorIndex m_, TensorIndex k_, TensorIndex n_, TensorIndex num_iters, std::string name)
Definition: tensor_contract_sycl_bench.cc:38
std::chrono::time_point< std::chrono::system_clock > get_time()
Definition: tensor_contract_sycl_bench.cc:32

References Eigen::Aligned, Eigen::placeholders::end, finalizeBenchmark(), get_time(), and oomph::CumulativeTimings::start().

◆ contractionABT()

// Benchmarks the tensor contraction C = A.contract(B, dims) on device_ with
// both operands stored with swapped extents, and reports the timing through
// finalizeBenchmark() under the name "contractionABT".
//
// device_   : device used to allocate, memset and deallocate the three buffers
//             and to evaluate the contraction.
// num_iters : number of timed evaluations of the contraction.
// m_, k_, n_: extents; sizeA = (k_, m_), sizeB = (n_, k_), sizeC = (m_, n_).
//             dims[0] = DimPair(0, 1); sizeA[0] and sizeB[1] are both k_.
//
// NOTE: this listing skips original source lines 269, 272, 275, 279-281 and
// 284, so the declarations of sizeA, sizeB, sizeC, A, B, C and dims are not
// visible here.
// NOTE(review): `end` is sampled before device_.synchronize() is called, and
// the buffers are deallocated before that synchronize as well. If the device
// evaluates the assignments asynchronously, the timed interval may not cover
// the completion of the work -- confirm against the device implementation.
template<typename T , typename Device , typename TensorIndex >
void contractionABT ( const Device &  device_,
TensorIndex  num_iters,
TensorIndex  m_,
TensorIndex  k_,
TensorIndex  n_ 
)
255  {
256  T* a_;
257  T* b_;
258  T* c_;
259  a_ = (T*)device_.allocate(m_ * k_ * sizeof(T));
260  b_ = (T*)device_.allocate(k_ * n_ * sizeof(T));
261  c_ = (T*)device_.allocate(m_ * n_ * sizeof(T));
262 
263  // Initialize the content of the memory pools to prevent asan from
264  // complaining.
264  // NOTE(review): unlike contraction(), which uses fill() with T(12) etc.,
264  // this passes 12/23/31 and a byte count to memset -- presumably a
264  // byte-wise fill, so the element values are not 12/23/31; confirm.
265  device_.memset(a_, 12, m_ * k_ * sizeof(T));
266  device_.memset(b_, 23, k_ * n_ * sizeof(T));
267  device_.memset(c_, 31, m_ * n_ * sizeof(T));
268 
270  sizeA[0] = k_;
271  sizeA[1] = m_;
273  sizeB[0] = n_;
274  sizeB[1] = k_;
276  sizeC[0] = m_;
277  sizeC[1] = n_;
278 
282 
283  typedef typename Tensor<T, 2>::DimensionPair DimPair;
285  dims[0] = DimPair(0, 1);
286 #ifdef EIGEN_USE_SYCL // warmup for sycl: 10 untimed evaluations
287  for (int iter = 0; iter < 10; ++iter) {
288  C.device(device_) = A.contract(B, dims);
289  }
290 #endif
291  auto start = get_time();
292  for (int iter = 0; iter < num_iters; ++iter) {
293  C.device(device_) = A.contract(B, dims);
294  }
295  auto end = get_time();
296  // Report the throughput in GFLOP/s: finalizeBenchmark() counts
297  // 2 * m_ * n_ * k_ operations per iteration (k_ multiplications and
297  // k_ additions for each of the m_ * n_ values in the resulting tensor).
298  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionABT");
299  device_.deallocate(a_);
300  device_.deallocate(b_);
301  device_.deallocate(c_);
302  device_.synchronize();
303 }

References Eigen::Aligned, Eigen::placeholders::end, finalizeBenchmark(), get_time(), and oomph::CumulativeTimings::start().

◆ contractionAT()

// Benchmarks the tensor contraction C = A.contract(B, dims) on device_ with
// the first operand stored with swapped extents, and reports the timing
// through finalizeBenchmark() under the name "contractionAT".
//
// device_   : device used to allocate, memset and deallocate the three buffers
//             and to evaluate the contraction.
// num_iters : number of timed evaluations of the contraction.
// m_, k_, n_: extents; sizeA = (k_, m_), sizeB = (k_, n_), sizeC = (m_, n_).
//             dims[0] = DimPair(0, 0); sizeA[0] and sizeB[0] are both k_.
//
// NOTE: this listing skips original source lines 167, 170, 173, 177-179 and
// 182, so the declarations of sizeA, sizeB, sizeC, A, B, C and dims are not
// visible here.
// NOTE(review): `end` is sampled before device_.synchronize() is called, and
// the buffers are deallocated before that synchronize as well. If the device
// evaluates the assignments asynchronously, the timed interval may not cover
// the completion of the work -- confirm against the device implementation.
template<typename T , typename Device , typename TensorIndex >
void contractionAT ( const Device &  device_,
TensorIndex  num_iters,
TensorIndex  m_,
TensorIndex  k_,
TensorIndex  n_ 
)
154  {
155  T* a_;
156  T* b_;
157  T* c_;
158  a_ = (T*)device_.allocate(m_ * k_ * sizeof(T));
159  b_ = (T*)device_.allocate(k_ * n_ * sizeof(T));
160  c_ = (T*)device_.allocate(m_ * n_ * sizeof(T));
161 
162  // Initialize the content of the memory pools to prevent asan from
163  // complaining.
163  // NOTE(review): unlike contraction(), which uses fill() with T(12) etc.,
163  // this passes 12/23/31 and a byte count to memset -- presumably a
163  // byte-wise fill, so the element values are not 12/23/31; confirm.
164  device_.memset(a_, 12, m_ * k_ * sizeof(T));
165  device_.memset(b_, 23, k_ * n_ * sizeof(T));
166  device_.memset(c_, 31, m_ * n_ * sizeof(T));
168  sizeA[0] = k_;
169  sizeA[1] = m_;
171  sizeB[0] = k_;
172  sizeB[1] = n_;
174  sizeC[0] = m_;
175  sizeC[1] = n_;
176 
180 
181  typedef typename Tensor<T, 2>::DimensionPair DimPair;
183  dims[0] = DimPair(0, 0);
184 #ifdef EIGEN_USE_SYCL // warmup for sycl: 10 untimed evaluations
185  for (int iter = 0; iter < 10; ++iter) {
186  C.device(device_) = A.contract(B, dims);
187  }
188 #endif
189  auto start = get_time();
190  for (int iter = 0; iter < num_iters; ++iter) {
191  C.device(device_) = A.contract(B, dims);
192  }
193  auto end = get_time();
194  // Report the throughput in GFLOP/s: finalizeBenchmark() counts
195  // 2 * m_ * n_ * k_ operations per iteration (k_ multiplications and
195  // k_ additions for each of the m_ * n_ values in the resulting tensor).
196  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionAT");
197  device_.deallocate(a_);
198  device_.deallocate(b_);
199  device_.deallocate(c_);
200  device_.synchronize();
201 }

References Eigen::Aligned, Eigen::placeholders::end, finalizeBenchmark(), get_time(), and oomph::CumulativeTimings::start().

◆ contractionBT()

// Benchmarks the tensor contraction C = A.contract(B, dims) on device_ with
// the second operand stored with swapped extents, and reports the timing
// through finalizeBenchmark() under the name "contractionBT".
//
// device_   : device used to allocate, memset and deallocate the three buffers
//             and to evaluate the contraction.
// num_iters : number of timed evaluations of the contraction.
// m_, k_, n_: extents; sizeA = (m_, k_), sizeB = (n_, k_), sizeC = (m_, n_).
//             dims[0] = DimPair(1, 1); sizeA[1] and sizeB[1] are both k_.
//
// NOTE: this listing skips original source lines 218, 221, 224, 228-230 and
// 233, so the declarations of sizeA, sizeB, sizeC, A, B, C and dims are not
// visible here.
// NOTE(review): `end` is sampled before device_.synchronize() is called, and
// the buffers are deallocated before that synchronize as well. If the device
// evaluates the assignments asynchronously, the timed interval may not cover
// the completion of the work -- confirm against the device implementation.
template<typename T , typename Device , typename TensorIndex >
void contractionBT ( const Device &  device_,
TensorIndex  num_iters,
TensorIndex  m_,
TensorIndex  k_,
TensorIndex  n_ 
)
204  {
205  T* a_;
206  T* b_;
207  T* c_;
208  a_ = (T*)device_.allocate(m_ * k_ * sizeof(T));
209  b_ = (T*)device_.allocate(k_ * n_ * sizeof(T));
210  c_ = (T*)device_.allocate(m_ * n_ * sizeof(T));
211 
212  // Initialize the content of the memory pools to prevent asan from
213  // complaining.
213  // NOTE(review): unlike contraction(), which uses fill() with T(12) etc.,
213  // this passes 12/23/31 and a byte count to memset -- presumably a
213  // byte-wise fill, so the element values are not 12/23/31; confirm.
214  device_.memset(a_, 12, m_ * k_ * sizeof(T));
215  device_.memset(b_, 23, k_ * n_ * sizeof(T));
216  device_.memset(c_, 31, m_ * n_ * sizeof(T));
217 
219  sizeA[0] = m_;
220  sizeA[1] = k_;
222  sizeB[0] = n_;
223  sizeB[1] = k_;
225  sizeC[0] = m_;
226  sizeC[1] = n_;
227 
231 
232  typedef typename Tensor<T, 2>::DimensionPair DimPair;
234  dims[0] = DimPair(1, 1);
235 #ifdef EIGEN_USE_SYCL // warmup for sycl: 10 untimed evaluations
236  for (int iter = 0; iter < 10; ++iter) {
237  C.device(device_) = A.contract(B, dims);
238  }
239 #endif
240  auto start = get_time();
241  for (int iter = 0; iter < num_iters; ++iter) {
242  C.device(device_) = A.contract(B, dims);
243  }
244  auto end = get_time();
245  // Report the throughput in GFLOP/s: finalizeBenchmark() counts
246  // 2 * m_ * n_ * k_ operations per iteration (k_ multiplications and
246  // k_ additions for each of the m_ * n_ values in the resulting tensor).
247  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionBT");
248  device_.deallocate(a_);
249  device_.deallocate(b_);
250  device_.deallocate(c_);
251  device_.synchronize();
252 }

References Eigen::Aligned, Eigen::placeholders::end, finalizeBenchmark(), get_time(), and oomph::CumulativeTimings::start().

◆ contractionRowMajor()

// Benchmarks the tensor contraction C = A.contract(B, dims) on device_ and
// reports the timing through finalizeBenchmark() under the name
// "contractionRowMajor".
//
// device_   : device used to allocate, memset and deallocate the three buffers
//             and to evaluate the contraction.
// num_iters : number of timed evaluations of the contraction.
// m_, k_, n_: extents; sizeA = (m_, k_), sizeB = (k_, n_), sizeC = (m_, n_).
//             dims[0] = DimPair(1, 0); sizeA[1] and sizeB[0] are both k_.
//
// NOTE: this listing skips original source lines 117, 120, 123, 127-129 and
// 132, so the declarations of sizeA, sizeB, sizeC, A, B, C and dims -- and
// therefore the storage layout implied by the function name -- are not
// visible here.
// NOTE(review): `end` is sampled before device_.synchronize() is called, and
// the buffers are deallocated before that synchronize as well. If the device
// evaluates the assignments asynchronously, the timed interval may not cover
// the completion of the work -- confirm against the device implementation.
template<typename T , typename Device , typename TensorIndex >
void contractionRowMajor ( const Device &  device_,
TensorIndex  num_iters,
TensorIndex  m_,
TensorIndex  k_,
TensorIndex  n_ 
)
103  {
104  T* a_;
105  T* b_;
106  T* c_;
107  a_ = (T*)device_.allocate(m_ * k_ * sizeof(T));
108  b_ = (T*)device_.allocate(k_ * n_ * sizeof(T));
109  c_ = (T*)device_.allocate(m_ * n_ * sizeof(T));
110 
111  // Initialize the content of the memory pools to prevent asan from
112  // complaining.
112  // NOTE(review): both the fill value and the byte count are converted to T
112  // before being passed to memset, whereas contractionAT/BT/ABT pass a plain
112  // int and an integral byte count. For a floating-point T the byte count
112  // goes through a floating-point conversion -- confirm this is intended.
113  device_.memset(a_, T(12), T(m_ * k_ * sizeof(T)));
114  device_.memset(b_, T(23), T(k_ * n_ * sizeof(T)));
115  device_.memset(c_, T(31), T(m_ * n_ * sizeof(T)));
116 
118  sizeA[0] = m_;
119  sizeA[1] = k_;
121  sizeB[0] = k_;
122  sizeB[1] = n_;
124  sizeC[0] = m_;
125  sizeC[1] = n_;
126 
130 
131  typedef typename Tensor<T, 2>::DimensionPair DimPair;
133  dims[0] = DimPair(1, 0);
134 #ifdef EIGEN_USE_SYCL // warmup for sycl: 10 untimed evaluations
135  for (int iter = 0; iter < 10; ++iter) {
136  C.device(device_) = A.contract(B, dims);
137  }
138 #endif
139  auto start = get_time();
140  for (int iter = 0; iter < num_iters; ++iter) {
141  C.device(device_) = A.contract(B, dims);
142  }
143  auto end = get_time();
144  // Report the throughput in GFLOP/s: finalizeBenchmark() counts
145  // 2 * m_ * n_ * k_ operations per iteration (k_ multiplications and
145  // k_ additions for each of the m_ * n_ values in the resulting tensor).
146  finalizeBenchmark(start, end, m_, k_, n_, num_iters, "contractionRowMajor");
147  device_.deallocate(a_);
148  device_.deallocate(b_);
149  device_.deallocate(c_);
150  device_.synchronize();
151 }

References Eigen::Aligned, Eigen::placeholders::end, finalizeBenchmark(), get_time(), and oomph::CumulativeTimings::start().

◆ finalizeBenchmark()

template<typename Start , typename End , typename TensorIndex >
void finalizeBenchmark ( Start  start,
End  end,
TensorIndex  m_,
TensorIndex  k_,
TensorIndex  n_,
TensorIndex  num_iters,
std::string  name 
)
39  {
40  std::chrono::duration<double> elapsed_seconds = end - start;
41  std::cout << "Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : "
42  << static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters) / elapsed_seconds.count()) * 1e-9
43  << "\n";
44  out << "Kernel Name : " << name << ", M : " << m_ << ", N : " << n_ << ", K : " << k_ << " GFLOP/s : "
45  << static_cast<float>((static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters) / elapsed_seconds.count()) * 1e-9
46  << "\n";
47 }
Array< double, 1, 3 > e(1./3., 0.5, 2.)
std::int64_t int64_t
Definition: Meta.h:43
string name
Definition: plotDoE.py:33
std::ofstream out("Result.txt")

References e(), Eigen::placeholders::end, plotDoE::name, out(), and oomph::CumulativeTimings::start().

Referenced by contraction(), contractionABT(), contractionAT(), contractionBT(), and contractionRowMajor().

◆ get_time()

// Returns the current time point of std::chrono::system_clock.
//
// The benchmark functions call this immediately before and after their timed
// loop and pass both results to finalizeBenchmark().
std::chrono::time_point<std::chrono::system_clock> get_time() {
  // The original also declared two unused locals (`start`, `end`) here;
  // they had no effect and have been removed.
  return std::chrono::system_clock::now();
}

References Eigen::placeholders::end, and oomph::CumulativeTimings::start().

Referenced by contraction(), contractionABT(), contractionAT(), contractionBT(), and contractionRowMajor().

◆ main()

int main ( )
305  {
306  cl::sycl::gpu_selector selector;
307  Eigen::QueueInterface queue(selector);
308  Eigen::SyclDevice device(&queue);
309  int64_t num_iters = 20;
310  for (int64_t m = 32; m <= 4096; m *= 2)
311  for (int64_t k = 32; k <= 4096; k *= 2)
312  for (int64_t n = 32; n <= 4096; n *= 2) {
313  (contraction<float>(device, num_iters, m, k, n));
314  (contractionRowMajor<float>(device, num_iters, m, k, n));
315  (contractionAT<float>(device, num_iters, m, k, n));
316  (contractionBT<float>(device, num_iters, m, k, n));
317  (contractionABT<float>(device, num_iters, m, k, n));
318  }
319  return 0;
320 }
const unsigned n
Definition: CG3DPackingUnitTest.cpp:11
int * m
Definition: level2_cplx_impl.h:294
char char char int int * k
Definition: level2_impl.h:374

References k, m, and n.

◆ out()

std::ofstream out ( "Result.txt"  )

Referenced by ChangingTOIParticle::actionsAfterTimeStep(), statistics_while_running< T >::appendToFile(), Eigen::internal::OP< MatrixSolver, MatrixType, Scalar, true >::applyOP(), Eigen::TensorEvaluator< const TensorPaddingOp< PaddingDimensions, ArgType >, Device >::block(), CFile::CFile(), check_sparse_solving(), Eigen::ArpackGeneralizedSelfAdjointEigenSolver< MatrixType, MatrixSolver, BisSPD >::compute(), CFile::copy_data(), CFile::copy_fstat(), create_fluid_and_solid_surface_mesh_from_fluid_xda_mesh(), cwise_ref(), oomph::GeneralisedElement::describe_dofs(), oomph::Mesh::describe_dofs(), oomph::Data::describe_dofs(), oomph::SolidNode::describe_dofs(), oomph::Problem::describe_dofs(), oomph::ElementWithExternalElement::describe_local_dofs(), oomph::HeatedLinearSurfaceContactElement< ELEMENT >::describe_local_dofs(), oomph::ElementWithMovingNodes::describe_local_dofs(), oomph::ElementWithSpecificMovingNodes< ELEMENT, NODE_TYPE >::describe_local_dofs(), oomph::GeneralisedElement::describe_local_dofs(), oomph::FiniteElement::describe_local_dofs(), oomph::SolidFiniteElement::describe_local_dofs(), oomph::FaceElementAsGeomObject< ELEMENT >::describe_local_dofs(), oomph::FSIWallElement::describe_local_dofs(), oomph::Mesh::describe_local_dofs(), oomph::ProjectableElement< ELEMENT >::describe_local_dofs(), oomph::PseudoSolidNodeUpdateElement< BASIC, SOLID >::describe_local_dofs(), oomph::RefineablePseudoSolidNodeUpdateElement< BASIC, SOLID >::describe_local_dofs(), oomph::SpectralElement::describe_local_dofs(), oomph::RefineableQSpectralPoissonElement< DIM, NNODE_1D >::describe_local_dofs(), oomph::FSIImposeDisplacementByLagrangeMultiplierElement< ELEMENT >::describe_local_dofs(), oomph::FiniteElement::describe_nodal_local_dofs(), oomph::SolidFiniteElement::describe_solid_local_dofs(), oomph::SpineMesh::describe_spine_dofs(), oomph::MyProblem::doc_boundaries(), oomph::MyProblem::doc_solution(), EIGEN_DECLARE_TEST(), Detail::VTKPointDescriptorEntryImpl< T, V >::emit(), 
Detail::emitProxy(), File::File(), finalizeBenchmark(), format(), helpers::getLineFromStringStream(), Eigen::PastixLU< MatrixType_, IsStrSym >::grabMatrix(), Eigen::PastixLLT< MatrixType_, UpLo_ >::grabMatrix(), Eigen::PastixLDLT< MatrixType_, UpLo_ >::grabMatrix(), HstopCurve(), DPMBase::incrementRunNumberInFile(), oomph::MyProblem::initial_doc(), DPMBase::initialiseSolve(), mag2(), main(), BaseCluster::makeAmatFile(), BaseCluster::makeCdatFile(), BaseCluster::makeGnuplotFile(), BaseCluster::makeIntenalStructureFile(), BaseCluster::makeOverlFile(), BaseVTKWriter< H >::makeVTKFileWithHeader(), VTKData::makeVTKFileWithHeader(), oomph::StreamfunctionProblem::my_output(), File::openWrite(), File::openWriteNoAppend(), Eigen::internal::pfrexp(), Eigen::internal::pldexp< Packet4d >(), Eigen::internal::pldexp< Packet8d >(), Eigen::internal::pldexp_generic(), printBenchStyle(), printStatheader(), Eigen::internal::ptranspose(), Eigen::internal::putDenseElt(), Eigen::internal::PutMatrixElt(), helpers::readArrayFromCommandLine(), helpers::readVectorFromCommandLine(), REF_FREXP(), run_and_compare_to_gpu(), run_and_verify(), run_on_cpu(), run_on_gpu(), run_on_gpu_meta_kernel(), Membrane::saveAsOFF(), Membrane::saveAsSTL(), Eigen::saveMarket(), Eigen::saveMarketDense(), strcicmp(), test_async_multithread_chip(), test_async_multithread_elementwise(), test_async_multithread_volume_patch(), test_binary_builtins_fixed_arg2(), test_binary_builtins_func(), test_broadcast_sycl(), test_broadcast_sycl_fixed(), test_coeff_wise(), test_complex_operators(), test_complex_sqrt(), test_custom_binary_op_sycl(), test_custom_unary_op_sycl(), test_device_exceptions(), test_diagonal(), test_eigenvalues_direct(), test_forced_eval_sycl(), test_innermost_first_dims(), test_innermost_last_dims(), test_matrix_inverse(), test_multithread_chip(), test_multithread_compound_assignment(), test_multithread_elementwise(), test_multithread_volume_patch(), test_numeric_limits(), test_product(), 
test_reduce_middle_dims(), test_redux(), test_replicate(), test_sigmoid_sycl(), test_static_dims(), test_sycl_cast(), test_sycl_computations(), test_sycl_mem_sync(), test_sycl_mem_sync_offsets(), test_sycl_memset_offsets(), test_sycl_random_normal(), test_sycl_random_uniform(), test_tanh_sycl(), test_unary_builtins_for_scalar(), test_unary_builtins_return_bool(), To32BitDims(), to_string_padded(), Eigen::TensorEvaluator< const TensorPaddingOp< PaddingDimensions, ArgType >, Device >::updateCostPerDimension(), CFile::writeP4P(), Calibration::writePSDToFile(), CFile::writeRestart(), helpers::writeToFile(), and statistics_while_running< T >::writeToFile().