1 #ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
2 #define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
5 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
7 #include "unsupported/Eigen/CXX11/Tensor"
10 #define BENCHMARK_RANGE(bench, lo, hi) BENCHMARK(bench)->Range(lo, hi)
17 template <
typename Device,
typename T>
37 for (
int iter = 0; iter < 10; ++iter) {
42 for (
int iter = 0; iter < num_iters; ++iter) {
52 if (
sizeof(
T) >=
sizeof(
int)) {
56 sizes[0] =
m_ *
sizeof(
T) /
sizeof(
int);
57 sizes[1] =
k_ *
sizeof(
T) /
sizeof(
int);
62 for (
int iter = 0; iter < 10; ++iter) {
67 for (
int iter = 0; iter < num_iters; ++iter) {
81 for (
int iter = 0; iter < 10; ++iter) {
86 for (
int iter = 0; iter < num_iters; ++iter) {
107 #ifdef EIGEN_USE_SYCL
108 for (
int iter = 0; iter < 10; ++iter) {
109 C.slice(first_quadrant, quarter_sizes).device(
device_) =
A.slice(first_quadrant, quarter_sizes);
110 C.slice(second_quadrant, quarter_sizes).device(
device_) =
B.slice(second_quadrant, quarter_sizes);
111 C.slice(third_quadrant, quarter_sizes).device(
device_) =
A.slice(third_quadrant, quarter_sizes);
112 C.slice(fourth_quadrant, quarter_sizes).device(
device_) =
B.slice(fourth_quadrant, quarter_sizes);
116 for (
int iter = 0; iter < num_iters; ++iter) {
117 C.slice(first_quadrant, quarter_sizes).device(
device_) =
A.slice(first_quadrant, quarter_sizes);
118 C.slice(second_quadrant, quarter_sizes).device(
device_) =
B.slice(second_quadrant, quarter_sizes);
119 C.slice(third_quadrant, quarter_sizes).device(
device_) =
A.slice(third_quadrant, quarter_sizes);
120 C.slice(fourth_quadrant, quarter_sizes).device(
device_) =
B.slice(fourth_quadrant, quarter_sizes);
135 #ifdef EIGEN_USE_SYCL
136 for (
int iter = 0; iter < 10; ++iter) {
141 for (
int iter = 0; iter < num_iters; ++iter) {
156 #ifdef EIGEN_USE_SYCL
157 for (
int iter = 0; iter < 10; ++iter) {
162 for (
int iter = 0; iter < num_iters; ++iter) {
183 #ifdef EIGEN_USE_SYCL
184 for (
int iter = 0; iter < 10; ++iter) {
189 for (
int iter = 0; iter < num_iters; ++iter) {
208 #ifdef EIGEN_USE_SYCL
209 for (
int iter = 0; iter < 10; ++iter) {
214 for (
int iter = 0; iter < num_iters; ++iter) {
234 #ifdef EIGEN_USE_SYCL
235 for (
int iter = 0; iter < 10; ++iter) {
240 for (
int iter = 0; iter < num_iters; ++iter) {
257 broadcast.
set(1,
n_);
259 #ifdef EIGEN_USE_SYCL
260 for (
int iter = 0; iter < 10; ++iter) {
261 C.device(
device_) =
A.broadcast(broadcast);
265 for (
int iter = 0; iter < num_iters; ++iter) {
266 C.device(
device_) =
A.broadcast(broadcast);
280 #ifdef EIGEN_USE_SYCL
281 for (
int iter = 0; iter < 10; ++iter) {
282 C.device(
device_) =
A *
A.constant(
static_cast<T>(3.14)) +
B *
B.constant(
static_cast<T>(2.7));
286 for (
int iter = 0; iter < num_iters; ++iter) {
287 C.device(
device_) =
A *
A.constant(
static_cast<T>(3.14)) +
B *
B.constant(
static_cast<T>(2.7));
303 #ifdef EIGEN_USE_SYCL
304 for (
int iter = 0; iter < 10; ++iter) {
305 C.device(
device_) =
A.rsqrt() +
B.sqrt() *
B.square();
309 for (
int iter = 0; iter < num_iters; ++iter) {
310 C.device(
device_) =
A.rsqrt() +
B.sqrt() *
B.square();
325 #ifdef EIGEN_USE_SYCL
326 for (
int iter = 0; iter < 10; ++iter) {
331 for (
int iter = 0; iter < num_iters; ++iter) {
349 #ifdef EIGEN_USE_SYCL
350 for (
int iter = 0; iter < 10; ++iter) {
351 C.device(
device_) =
B.sum(sum_along_dim);
355 for (
int iter = 0; iter < num_iters; ++iter) {
356 C.device(
device_) =
B.sum(sum_along_dim);
373 #ifndef EIGEN_HAS_INDEX_LIST
375 sum_along_dim[0] = 1;
381 #ifdef EIGEN_USE_SYCL
382 for (
int iter = 0; iter < 10; ++iter) {
383 A.device(
device_) =
B.sum(sum_along_dim);
387 for (
int iter = 0; iter < num_iters; ++iter) {
388 A.device(
device_) =
B.sum(sum_along_dim);
403 #ifdef EIGEN_USE_SYCL
404 for (
int iter = 0; iter < 10; ++iter) {
409 for (
int iter = 0; iter < num_iters; ++iter) {
434 kernel_sizes[0] = kernel_x;
435 kernel_sizes[1] = kernel_y;
438 result_sizes[0] =
m_ - kernel_x + 1;
439 result_sizes[1] =
n_ - kernel_y + 1;
444 #ifdef EIGEN_USE_SYCL
445 for (
int iter = 0; iter < 10; ++iter) {
450 for (
int iter = 0; iter < num_iters; ++iter) {
461 template <
int Layout>
464 sizeA[0] = (trans_a ?
k_ :
m_);
465 sizeA[1] = (trans_a ?
m_ :
k_);
467 sizeB[0] = (trans_b ?
n_ :
k_);
468 sizeB[1] = (trans_b ?
k_ :
n_);
481 dims[0] =
DimPair(a_contract_dim, b_contract_dim);
482 #ifdef EIGEN_USE_SYCL
483 for (
int iter = 0; iter < 10; ++iter) {
488 for (
int iter = 0; iter < num_iters; ++iter) {
509 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
513 #elif defined(EIGEN_USE_SYCL)
const unsigned n
Definition: CG3DPackingUnitTest.cpp:11
Eigen::Triplet< double > T
Definition: EigenUnitTest.cpp:11
#define eigen_assert(x)
Definition: Macros.h:910
Matrix< SCALARA, Dynamic, Dynamic, opt_A > A
Definition: bench_gemm.cpp:47
Matrix< Scalar, Dynamic, Dynamic > C
Definition: bench_gemm.cpp:49
Matrix< SCALARB, Dynamic, Dynamic, opt_B > B
Definition: bench_gemm.cpp:48
void SetBenchmarkFlopsProcessed(int64_t)
Definition: benchmark_main.cc:189
void StopBenchmarkTiming()
Definition: benchmark_main.cc:190
void StartBenchmarkTiming()
Definition: benchmark_main.cc:196
Definition: tensor_benchmarks.h:18
void padding(int num_iters)
Definition: tensor_benchmarks.h:196
BenchmarkSuite(const Device &device, size_t m, size_t k)
Definition: tensor_benchmarks.h:26
void typeCasting(int num_iters)
Definition: tensor_benchmarks.h:49
T * c_
Definition: tensor_benchmarks.h:528
void colReduction(int num_iters)
Definition: tensor_benchmarks.h:364
void rowChip(int num_iters)
Definition: tensor_benchmarks.h:127
void contractionRowMajorABT(int num_iters)
Definition: tensor_benchmarks.h:426
~BenchmarkSuite()
Definition: tensor_benchmarks.h:28
void contraction(int num_iters, bool trans_a, bool trans_b)
Definition: tensor_benchmarks.h:462
void striding(int num_iters)
Definition: tensor_benchmarks.h:221
void slicing(int num_iters)
Definition: tensor_benchmarks.h:93
void coeffWiseOp(int num_iters)
Definition: tensor_benchmarks.h:272
void contraction(int num_iters)
Definition: tensor_benchmarks.h:418
void rowReduction(int num_iters)
Definition: tensor_benchmarks.h:340
void initialize()
Definition: tensor_benchmarks.h:496
void transcendentalFunc(int num_iters)
Definition: tensor_benchmarks.h:317
Device device_
Definition: tensor_benchmarks.h:529
BenchmarkSuite(const Device &device, size_t m, size_t k, size_t n)
Definition: tensor_benchmarks.h:20
void broadcasting(int num_iters)
Definition: tensor_benchmarks.h:247
T * b_
Definition: tensor_benchmarks.h:527
void memcpy(int num_iters)
Definition: tensor_benchmarks.h:34
void fullReduction(int num_iters)
Definition: tensor_benchmarks.h:396
void algebraicFunc(int num_iters)
Definition: tensor_benchmarks.h:294
void contractionRowMajor(int num_iters)
Definition: tensor_benchmarks.h:420
void contractionRowMajorAT(int num_iters)
Definition: tensor_benchmarks.h:422
T * a_
Definition: tensor_benchmarks.h:526
BenchmarkSuite(const Device &device, size_t m)
Definition: tensor_benchmarks.h:24
void contractionRowMajorBT(int num_iters)
Definition: tensor_benchmarks.h:424
TensorIndex k_
Definition: tensor_benchmarks.h:524
void shuffling(int num_iters)
Definition: tensor_benchmarks.h:169
void random(int num_iters)
Definition: tensor_benchmarks.h:74
void finalizeBenchmark(int64_t num_items)
Definition: tensor_benchmarks.h:508
TensorIndex m_
Definition: tensor_benchmarks.h:523
void colChip(int num_iters)
Definition: tensor_benchmarks.h:148
void convolution(int num_iters, int kernel_x, int kernel_y)
Definition: tensor_benchmarks.h:428
TensorIndex n_
Definition: tensor_benchmarks.h:525
The matrix class, also used for vectors and row-vectors.
Definition: Eigen/Eigen/src/Core/Matrix.h:186
A tensor expression mapping an existing array of data.
Definition: TensorMap.h:33
The tensor class.
Definition: Tensor.h:68
Definition: matrices.h:74
Tensor< float, 1 >::DimensionPair DimPair
Definition: cxx11_tensor_contraction.cpp:17
std::vector< Array2i > sizes
Definition: dense_solvers.cpp:12
@ Aligned
Definition: Constants.h:242
@ ColMajor
Definition: Constants.h:318
@ RowMajor
Definition: Constants.h:320
int * m
Definition: level2_cplx_impl.h:294
char char char int int * k
Definition: level2_impl.h:374
EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d &m, const Packet2d &n, int mask)
Definition: LSX/PacketMath.h:150
EIGEN_ALWAYS_INLINE DSizes< IndexType, NumDims > strides(const DSizes< IndexType, NumDims > &dimensions)
Definition: TensorBlock.h:29
std::int64_t int64_t
Definition: Meta.h:43
std::array< T, N > array
Definition: EmulateArray.h:231
Definition: TensorDimensions.h:161
Definition: TensorIndexList.h:271
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value)
Definition: TensorIndexList.h:280
Definition: TensorIndexList.h:325
Definition: TensorIndexList.h:39
Definition: TensorIndexList.h:48
int TensorIndex
Definition: tensor_benchmarks.h:4