cxx11_tensor_reduction_sycl.cpp File Reference
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

Classes

struct  CustomReducer< InT, OutT >
 

Macros

#define EIGEN_TEST_NO_LONGDOUBLE
 
#define EIGEN_TEST_NO_COMPLEX
 
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE   int64_t
 
#define EIGEN_USE_SYCL
 

Functions

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_sum_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_sum_with_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_max_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_max_with_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_mean_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_mean_with_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_mean_with_odd_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_min_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_min_with_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_max_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_max_with_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_last_dim_reductions_max_with_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_sum_sycl (const Eigen::SyclDevice &sycl_device, IndexType dim_x, IndexType dim_y)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_mean_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_last_dim_reductions_mean_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_last_dim_reductions_sum_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_last_reductions_sum_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_last_reductions_mean_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , typename AccumType , int DataLayout, typename IndexType >
static void test_full_reductions_custom_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , typename Dev >
void sycl_reduction_test_full_per_device (const Dev &sycl_device)
 
template<typename DataType , typename Dev >
void sycl_reduction_full_offset_per_device (const Dev &sycl_device)
 
template<typename DataType , typename Dev >
void sycl_reduction_test_first_dim_per_device (const Dev &sycl_device)
 
template<typename DataType , typename Dev >
void sycl_reduction_test_last_dim_per_device (const Dev &sycl_device)
 
 EIGEN_DECLARE_TEST (cxx11_tensor_reduction_sycl)
 

Macro Definition Documentation

◆ EIGEN_DEFAULT_DENSE_INDEX_TYPE

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE   int64_t

◆ EIGEN_TEST_NO_COMPLEX

#define EIGEN_TEST_NO_COMPLEX

◆ EIGEN_TEST_NO_LONGDOUBLE

#define EIGEN_TEST_NO_LONGDOUBLE

◆ EIGEN_USE_SYCL

#define EIGEN_USE_SYCL
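
These configuration macros only take effect if they are defined before the Eigen headers are included. A minimal sketch of the expected ordering, mirroring the defines and includes listed at the top of this page (the ordering is the point; nothing else is implied):

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>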

Function Documentation

◆ EIGEN_DECLARE_TEST()

EIGEN_DECLARE_TEST ( cxx11_tensor_reduction_sycl  )
847  {
848  for (const auto& device : Eigen::get_sycl_supported_devices()) {
849  std::cout << "Running on " << device.template get_info<cl::sycl::info::device::name>() << std::endl;
850  QueueInterface queueInterface(device);
851  auto sycl_device = Eigen::SyclDevice(&queueInterface);
852  CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
853  CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
854  CALL_SUBTEST_3(sycl_reduction_test_first_dim_per_device<float>(sycl_device));
855  CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
856  }
857 }

References CALL_SUBTEST_1, CALL_SUBTEST_2, CALL_SUBTEST_3, and CALL_SUBTEST_4.
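
Every subtest dispatched above follows the same host/device comparison pattern that the function listings below spell out in full: compute a reference reduction on the host, run the identical reduction through a TensorMap bound to device memory, copy the result back, and compare. A condensed sketch of that pattern (a full sum reduction over a float matrix; it assumes a sycl_device constructed as in the loop above, and the variable names are illustrative only):

  // Host reference.
  Eigen::Tensor<float, 2> in(753, 537);
  Eigen::Tensor<float, 0> full_redux, full_redux_gpu;
  in.setRandom();
  full_redux = in.sum();

  // Device buffers mapped as tensors.
  float* gpu_in = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
  float* gpu_out = static_cast<float*>(sycl_device.allocate(sizeof(float)));
  Eigen::TensorMap<Eigen::Tensor<float, 2>> in_gpu(gpu_in, 753, 537);
  Eigen::TensorMap<Eigen::Tensor<float, 0>> out_gpu(gpu_out);

  // Run the reduction on the SYCL device and copy the scalar result back.
  sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
  out_gpu.device(sycl_device) = in_gpu.sum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out, sizeof(float));

  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in);
  sycl_device.deallocate(gpu_out);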

◆ sycl_reduction_full_offset_per_device()

template<typename DataType , typename Dev >
void sycl_reduction_full_offset_per_device ( const Dev &  sycl_device)
813  {
814  test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
815  test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(sycl_device);
816  test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
817  test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(sycl_device);
818  test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(sycl_device);
819  test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
820  test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
821  test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(sycl_device);
822  test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
823  sycl_device.synchronize();
824 }

◆ sycl_reduction_test_first_dim_per_device()

template<typename DataType , typename Dev >
void sycl_reduction_test_first_dim_per_device ( const Dev &  sycl_device)
827  {
828  test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device, 4197, 4097);
829  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device, 4197, 4097);
830  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device, 129, 8);
831  test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
832  test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
833  sycl_device.synchronize();
834 }

◆ sycl_reduction_test_full_per_device()

template<typename DataType , typename Dev >
void sycl_reduction_test_full_per_device ( const Dev &  sycl_device)
797  {
798  test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
799  test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
800  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
801  test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
802  test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
803  test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
804 
805  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
806  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
807  test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(sycl_device);
808  test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(sycl_device);
809  sycl_device.synchronize();
810 }

◆ sycl_reduction_test_last_dim_per_device()

template<typename DataType , typename Dev >
void sycl_reduction_test_last_dim_per_device ( const Dev &  sycl_device)
837  {
838  test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
839  test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
840  test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
841  test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
842  test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
843  test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
844  sycl_device.synchronize();
845 }

◆ test_first_dim_reductions_max_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_max_sycl ( const Eigen::SyclDevice &  sycl_device)
static
379  {
380  IndexType dim_x = 145;
381  IndexType dim_y = 1;
382  IndexType dim_z = 67;
383 
384  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
385  Eigen::array<IndexType, 1> red_axis;
386  red_axis[0] = 0;
387  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
388 
389  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
390  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
391  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
392 
393  in.setRandom();
394 
395  redux = in.maximum(red_axis);
396 
397  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
398  DataType* gpu_out_data =
399  static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
400 
401  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
402  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, reduced_tensorRange);
403 
404  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
405  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
406  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize() * sizeof(DataType));
407 
408  // Check that the CPU and GPU reductions return the same result.
409  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
410  for (IndexType k = 0; k < reduced_tensorRange[1]; k++) VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
411 
412  sycl_device.deallocate(gpu_in_data);
413  sycl_device.deallocate(gpu_out_data);
414 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), j, k, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_first_dim_reductions_max_with_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_max_with_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
417  {
418  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
419  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
420 
421  const IndexType num_rows = 64;
422  const IndexType num_cols = 64;
423  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
424  array<IndexType, 1> reduced_range = {{num_cols}};
425  const IndexType n_elems = internal::array_prod(tensor_range);
426  const IndexType n_reduced = num_cols;
427 
428  data_tensor in(tensor_range);
429  reduced_tensor redux;
430  reduced_tensor redux_gpu(reduced_range);
431 
432  in.setRandom();
433  array<IndexType, 2> tensor_offset_range(tensor_range);
434  tensor_offset_range[0] -= 1;
435  // Set maximum value outside of the considered range.
436  for (IndexType i = 0; i < n_reduced; i++) {
437  in(i) = static_cast<DataType>(2);
438  }
439 
440  Eigen::array<IndexType, 1> red_axis;
441  red_axis[0] = 0;
442 
443  const IndexType offset = 64;
444  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
445  redux = in_offset.maximum(red_axis);
446  for (IndexType i = 0; i < n_reduced; i++) {
447  VERIFY_IS_NOT_EQUAL(redux(i), in(i));
448  }
449 
450  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
451  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(n_reduced * sizeof(DataType)));
452 
453  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
454  TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range);
455  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
456  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
457  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, n_reduced * sizeof(DataType));
458 
459  // Check that the CPU and GPU reductions return the same result.
460  for (IndexType i = 0; i < n_reduced; i++) {
461  VERIFY_IS_APPROX(redux_gpu(i), redux(i));
462  }
463 
464  sycl_device.deallocate(gpu_in_data);
465  sycl_device.deallocate(gpu_out_data);
466 }

References Eigen::internal::array_prod(), Eigen::TensorBase< Derived, AccessLevel >::device(), i, VERIFY_IS_APPROX, and VERIFY_IS_NOT_EQUAL.

◆ test_first_dim_reductions_mean_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_mean_sycl ( const Eigen::SyclDevice &  sycl_device)
static
562  {
563  IndexType dim_x = 145;
564  IndexType dim_y = 1;
565  IndexType dim_z = 67;
566 
567  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
568  Eigen::array<IndexType, 1> red_axis;
569  red_axis[0] = 0;
570  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
571 
572  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
573  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
574  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
575 
576  in.setRandom();
577 
578  redux = in.mean(red_axis);
579 
580  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
581  DataType* gpu_out_data =
582  static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
583 
584  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
585  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, reduced_tensorRange);
586 
587  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
588  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
589  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize() * sizeof(DataType));
590 
591  // Check that the CPU and GPU reductions return the same result.
592  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
593  for (IndexType k = 0; k < reduced_tensorRange[1]; k++) VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
594 
595  sycl_device.deallocate(gpu_in_data);
596  sycl_device.deallocate(gpu_out_data);
597 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), j, k, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_first_dim_reductions_sum_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_sum_sycl ( const Eigen::SyclDevice &  sycl_device,
IndexType  dim_x,
IndexType  dim_y 
)
static
529  {
530  array<IndexType, 2> tensorRange = {{dim_x, dim_y}};
531  Eigen::array<IndexType, 1> red_axis;
532  red_axis[0] = 0;
533  array<IndexType, 1> reduced_tensorRange = {{dim_y}};
534 
535  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
536  Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange);
537  Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
538 
539  in.setRandom();
540  redux = in.sum(red_axis);
541 
542  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
543  DataType* gpu_out_data =
544  static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
545 
546  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
547  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu(gpu_out_data, reduced_tensorRange);
548 
549  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
550  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
551  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize() * sizeof(DataType));
552 
553  // Check that the CPU and GPU reductions return the same result.
554  for (IndexType i = 0; i < redux.size(); i++) {
555  VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]);
556  }
557  sycl_device.deallocate(gpu_in_data);
558  sycl_device.deallocate(gpu_out_data);
559 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), i, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::size(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_full_reductions_custom_sycl()

template<typename DataType , typename AccumType , int DataLayout, typename IndexType >
static void test_full_reductions_custom_sycl ( const Eigen::SyclDevice &  sycl_device)
static
768  {
769  constexpr IndexType InSize = 64;
770  auto tensorRange = Sizes<InSize>(InSize);
771  Eigen::IndexList<Eigen::type2index<0>> dims;
772  auto reduced_tensorRange = Sizes<>();
773  TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix(tensorRange);
774  TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix(reduced_tensorRange);
775 
776  CustomReducer<DataType, AccumType> reducer;
777 
778  in_fix.setRandom();
779 
780  size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
781  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
782  AccumType* gpu_out_data = static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));
783 
784  TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(gpu_in_data, tensorRange);
785  TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(gpu_out_data, reduced_tensorRange);
786 
787  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
788  out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
789  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data, sizeof(AccumType));
790  VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));
791 
792  sycl_device.deallocate(gpu_in_data);
793  sycl_device.deallocate(gpu_out_data);
794 }

References Eigen::TensorFixedSize< Scalar_, Dimensions_, Options_, IndexType >::data(), DataLayout, Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::TensorFixedSize< Scalar_, Dimensions_, Options_, IndexType >::dimensions(), Eigen::TensorBase< Derived, AccessLevel >::setRandom(), and VERIFY_IS_EQUAL.
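
The CustomReducer< InT, OutT > used by this test is defined in the test source itself (line 746) and is not reproduced in this listing. As a rough sketch only, and assuming the scalar reducer interface that Eigen's TensorBase::reduce() expects (initialize/reduce/finalize, as in Eigen::internal::SumReducer), a user-defined reducer has roughly this shape; the body below is illustrative, not the test's actual CustomReducer:

  template <typename InT, typename OutT>
  struct ExampleReducer {  // hypothetical name, not the test's CustomReducer
    // Value the accumulator starts from.
    EIGEN_DEVICE_FUNC OutT initialize() const { return OutT(0); }
    // Fold one input element into the accumulator.
    EIGEN_DEVICE_FUNC void reduce(const InT x, OutT* accum) const { *accum += static_cast<OutT>(x); }
    // Produce the final result from the accumulator.
    EIGEN_DEVICE_FUNC OutT finalize(const OutT accum) const { return accum; }
  };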

◆ test_full_reductions_max_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_max_sycl ( const Eigen::SyclDevice &  sycl_device)
static
95  {
96  const IndexType num_rows = 4096;
97  const IndexType num_cols = 4096;
98  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
99 
100  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
101  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
102  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
103 
104  in.setRandom();
105 
106  full_redux = in.maximum();
107 
108  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
109  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
110 
111  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
112  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
113  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
114  out_gpu.device(sycl_device) = in_gpu.maximum();
115  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
116  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
117  sycl_device.deallocate(gpu_in_data);
118  sycl_device.deallocate(gpu_out_data);
119 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_full_reductions_max_with_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_max_with_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
122  {
123  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
124  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
125  const IndexType num_rows = 64;
126  const IndexType num_cols = 64;
127  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
128  const IndexType n_elems = internal::array_prod(tensor_range);
129 
130  data_tensor in(tensor_range);
131  scalar_tensor full_redux;
132  scalar_tensor full_redux_gpu;
133 
134  in.setRandom();
135  array<IndexType, 2> tensor_offset_range(tensor_range);
136  tensor_offset_range[0] -= 1;
137  // Set the initial value to be the max.
138  // As we don't include this in the reduction the result should not be 2.
139  in(0) = static_cast<DataType>(2);
140 
141  const IndexType offset = 64;
142  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
143  full_redux = in_offset.maximum();
144  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
145 
146  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
147  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
148 
149  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
150  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
151  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
152  out_gpu.device(sycl_device) = in_gpu.maximum();
153  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
154 
155  // Check that the CPU and GPU reductions return the same result.
156  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
157 
158  sycl_device.deallocate(gpu_in_data);
159  sycl_device.deallocate(gpu_out_data);
160 }

References Eigen::internal::array_prod(), Eigen::TensorBase< Derived, AccessLevel >::device(), VERIFY_IS_APPROX, and VERIFY_IS_NOT_EQUAL.

◆ test_full_reductions_mean_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_mean_sycl ( const Eigen::SyclDevice &  sycl_device)
static
163  {
164  const IndexType num_rows = 4096;
165  const IndexType num_cols = 4096;
166  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
167  array<IndexType, 1> argRange = {{num_cols}};
168  Eigen::array<IndexType, 1> red_axis;
169  red_axis[0] = 0;
170  // red_axis[1]=1;
171  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
172  Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange);
173  Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange);
174  Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange);
175  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange);
176  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange);
177  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
178  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
179 
180  in.setRandom();
181  in_arg1.setRandom();
182  in_arg2.setRandom();
183 
184  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
185  DataType* gpu_in_arg1_data =
186  static_cast<DataType*>(sycl_device.allocate(in_arg1.dimensions().TotalSize() * sizeof(DataType)));
187  DataType* gpu_in_arg2_data =
188  static_cast<DataType*>(sycl_device.allocate(in_arg2.dimensions().TotalSize() * sizeof(DataType)));
189  bool* gpu_out_arg__gpu_helper_data =
190  static_cast<bool*>(sycl_device.allocate(out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
191  bool* gpu_out_arg_data =
192  static_cast<bool*>(sycl_device.allocate(out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
193 
194  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
195 
196  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
197  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu(gpu_in_arg1_data, tensorRange);
198  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu(gpu_in_arg2_data, tensorRange);
199  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu(gpu_out_arg_data, argRange);
200  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper(gpu_out_arg__gpu_helper_data, argRange);
201  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
202 
203  // CPU VERSION
204  out_arg_cpu =
205  (in_arg1.argmax(1) == in_arg2.argmax(1)).select(out_arg_cpu.constant(true), out_arg_cpu.constant(false));
206  full_redux = (out_arg_cpu.template cast<float>()).reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
207 
208  // GPU VERSION
209  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
210  sycl_device.memcpyHostToDevice(gpu_in_arg1_data, in_arg1.data(),
211  (in_arg1.dimensions().TotalSize()) * sizeof(DataType));
212  sycl_device.memcpyHostToDevice(gpu_in_arg2_data, in_arg2.data(),
213  (in_arg2.dimensions().TotalSize()) * sizeof(DataType));
214  out_Argout_gpu_helper.device(sycl_device) = (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1));
215  out_Argout_gpu.device(sycl_device) =
216  (out_Argout_gpu_helper).select(out_Argout_gpu.constant(true), out_Argout_gpu.constant(false));
217  out_gpu.device(sycl_device) =
218  (out_Argout_gpu.template cast<float>()).reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
219  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
220  // Check that the CPU and GPU reductions return the same result.
221  std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux() << '\n';
222  VERIFY_IS_EQUAL(full_redux_gpu(), full_redux());
223  sycl_device.deallocate(gpu_in_data);
224  sycl_device.deallocate(gpu_in_arg1_data);
225  sycl_device.deallocate(gpu_in_arg2_data);
226  sycl_device.deallocate(gpu_out_arg__gpu_helper_data);
227  sycl_device.deallocate(gpu_out_arg_data);
228  sycl_device.deallocate(gpu_out_data);
229 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_EQUAL.

◆ test_full_reductions_mean_with_odd_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_mean_with_odd_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
270  {
271  // This is a particular case which illustrates a possible problem when the
272  // number of local threads in a workgroup is even, but is not a power of two.
273  using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
274  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
275  // 2177 = (17 * 128) + 1 gives rise to 18 local threads.
276  // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads.
277  const IndexType n_elems = 8707;
278  array<IndexType, 1> tensor_range = {{n_elems}};
279 
280  data_tensor in(tensor_range);
281  DataType full_redux;
282  DataType full_redux_gpu;
283  TensorMap<scalar_tensor> red_cpu(&full_redux);
284  TensorMap<scalar_tensor> red_gpu(&full_redux_gpu);
285 
286  const DataType const_val = static_cast<DataType>(0.6391);
287  in = in.constant(const_val);
288 
289  Eigen::IndexList<Eigen::type2index<0>> red_axis;
290  red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
291  VERIFY_IS_APPROX(const_val, red_cpu());
292 
293  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
294  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
295 
296  TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range);
297  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
298  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
299  out_gpu.device(sycl_device) = in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
300  sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data, sizeof(DataType));
301 
302  // Check that the CPU and GPU reductions return the same result.
303  VERIFY_IS_APPROX(full_redux_gpu, full_redux);
304 
305  sycl_device.deallocate(gpu_in_data);
306  sycl_device.deallocate(gpu_out_data);
307 }

References Eigen::TensorMap< PlainObjectType, Options_, MakePointer_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), and VERIFY_IS_APPROX.

◆ test_full_reductions_mean_with_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_mean_with_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
232  {
233  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
234  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
235  const IndexType num_rows = 64;
236  const IndexType num_cols = 64;
237  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
238  const IndexType n_elems = internal::array_prod(tensor_range);
239 
240  data_tensor in(tensor_range);
241  scalar_tensor full_redux;
242  scalar_tensor full_redux_gpu;
243 
244  in.setRandom();
245  array<IndexType, 2> tensor_offset_range(tensor_range);
246  tensor_offset_range[0] -= 1;
247 
248  const IndexType offset = 64;
249  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
250  full_redux = in_offset.mean();
251  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
252 
253  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
254  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
255 
256  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
257  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
258  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
259  out_gpu.device(sycl_device) = in_gpu.mean();
260  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
261 
262  // Check that the CPU and GPU reductions return the same result.
263  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
264 
265  sycl_device.deallocate(gpu_in_data);
266  sycl_device.deallocate(gpu_out_data);
267 }

References Eigen::internal::array_prod(), Eigen::TensorBase< Derived, AccessLevel >::device(), VERIFY_IS_APPROX, and VERIFY_IS_NOT_EQUAL.

◆ test_full_reductions_min_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_min_sycl ( const Eigen::SyclDevice &  sycl_device)
static
310  {
311  const IndexType num_rows = 876;
312  const IndexType num_cols = 953;
313  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
314 
315  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
316  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
317  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
318 
319  in.setRandom();
320 
321  full_redux = in.minimum();
322 
323  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
324  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
325 
326  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
327  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
328 
329  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
330  out_gpu.device(sycl_device) = in_gpu.minimum();
331  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
332  // Check that the CPU and GPU reductions return the same result.
333  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
334  sycl_device.deallocate(gpu_in_data);
335  sycl_device.deallocate(gpu_out_data);
336 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_full_reductions_min_with_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_min_with_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
339  {
340  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
341  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
342  const IndexType num_rows = 64;
343  const IndexType num_cols = 64;
344  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
345  const IndexType n_elems = internal::array_prod(tensor_range);
346 
347  data_tensor in(tensor_range);
348  scalar_tensor full_redux;
349  scalar_tensor full_redux_gpu;
350 
351  in.setRandom();
352  array<IndexType, 2> tensor_offset_range(tensor_range);
353  tensor_offset_range[0] -= 1;
354  // Set the initial value to be the min.
355  // As we don't include this in the reduction the result should not be -2.
356  in(0) = static_cast<DataType>(-2);
357 
358  const IndexType offset = 64;
359  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
360  full_redux = in_offset.minimum();
361  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
362 
363  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
364  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
365 
366  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
367  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
368  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
369  out_gpu.device(sycl_device) = in_gpu.minimum();
370  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
371 
372  // Check that the CPU and GPU reductions return the same result.
373  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
374 
375  sycl_device.deallocate(gpu_in_data);
376  sycl_device.deallocate(gpu_out_data);
377 }

References Eigen::internal::array_prod(), Eigen::TensorBase< Derived, AccessLevel >::device(), VERIFY_IS_APPROX, and VERIFY_IS_NOT_EQUAL.

◆ test_full_reductions_sum_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_sum_sycl ( const Eigen::SyclDevice &  sycl_device)
static
25  {
26  const IndexType num_rows = 753;
27  const IndexType num_cols = 537;
28  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
29 
30  array<IndexType, 2> outRange = {{1, 1}};
31 
32  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
33  Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange);
34  Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange);
35 
36  in.setRandom();
37  auto dim = DSizes<IndexType, 2>(1, 1);
38  full_redux = in.sum().reshape(dim);
39 
40  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
41  DataType* gpu_out_data =
42  (DataType*)sycl_device.allocate(sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize()));
43 
44  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
45  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, outRange);
46  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
47  out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
48  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
49  (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
50  // Check that the CPU and GPU reductions return the same result.
51  std::cout << "SYCL FULL :" << full_redux_gpu(0, 0) << ", CPU FULL: " << full_redux(0, 0) << "\n";
52  VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0));
53  sycl_device.deallocate(gpu_in_data);
54  sycl_device.deallocate(gpu_out_data);
55 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), Eigen::TensorBase< Derived, AccessLevel >::reshape(), Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_full_reductions_sum_with_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_sum_with_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
58  {
59  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
60  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
61  const IndexType num_rows = 64;
62  const IndexType num_cols = 64;
63  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
64  const IndexType n_elems = internal::array_prod(tensor_range);
65 
66  data_tensor in(tensor_range);
67  scalar_tensor full_redux;
68  scalar_tensor full_redux_gpu;
69 
70  in.setRandom();
71  array<IndexType, 2> tensor_offset_range(tensor_range);
72  tensor_offset_range[0] -= 1;
73 
74  const IndexType offset = 64;
75  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
76  full_redux = in_offset.sum();
77 
78  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
79  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
80 
81  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
82  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
83  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
84  out_gpu.device(sycl_device) = in_gpu.sum();
85  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
86 
87  // Check that the CPU and GPU reductions return the same result.
88  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
89 
90  sycl_device.deallocate(gpu_in_data);
91  sycl_device.deallocate(gpu_out_data);
92 }

References Eigen::internal::array_prod(), Eigen::TensorBase< Derived, AccessLevel >::device(), and VERIFY_IS_APPROX.

◆ test_last_dim_reductions_max_with_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_last_dim_reductions_max_with_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
469  {
470  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
471  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
472 
473  const IndexType num_rows = 64;
474  const IndexType num_cols = 64;
475  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
476  array<IndexType, 1> full_reduced_range = {{num_rows}};
477  array<IndexType, 1> reduced_range = {{num_rows - 1}};
478  const IndexType n_elems = internal::array_prod(tensor_range);
479  const IndexType n_reduced = reduced_range[0];
480 
481  data_tensor in(tensor_range);
482  reduced_tensor redux(full_reduced_range);
483  reduced_tensor redux_gpu(reduced_range);
484 
485  in.setRandom();
486  redux.setZero();
487  array<IndexType, 2> tensor_offset_range(tensor_range);
488  tensor_offset_range[0] -= 1;
489  // Set maximum value outside of the considered range.
490  for (IndexType i = 0; i < n_reduced; i++) {
491  in(i) = static_cast<DataType>(2);
492  }
493 
494  Eigen::array<IndexType, 1> red_axis;
495  red_axis[0] = 1;
496 
497  const IndexType offset = 64;
498  // Introduce an offset in both the input and the output.
499  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
500  TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range);
501  red_offset = in_offset.maximum(red_axis);
502 
503  // Check that the first value hasn't been changed and that the reduced values
504  // are not equal to the previously set maximum in the input outside the range.
505  VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0));
506  for (IndexType i = 0; i < n_reduced; i++) {
507  VERIFY_IS_NOT_EQUAL(red_offset(i), in(i));
508  }
509 
510  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
511  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate((n_reduced + 1) * sizeof(DataType)));
512 
513  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
514  TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range);
515  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
516  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
517  sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(), n_reduced * sizeof(DataType));
518 
519  // Check that the CPU and GPU reductions return the same result.
520  for (IndexType i = 0; i < n_reduced; i++) {
521  VERIFY_IS_APPROX(redux_gpu(i), red_offset(i));
522  }
523 
524  sycl_device.deallocate(gpu_in_data);
525  sycl_device.deallocate(gpu_out_data);
526 }

References Eigen::internal::array_prod(), Eigen::TensorMap< PlainObjectType, Options_, MakePointer_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), i, VERIFY_IS_APPROX, VERIFY_IS_EQUAL, and VERIFY_IS_NOT_EQUAL.

◆ test_last_dim_reductions_mean_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_last_dim_reductions_mean_sycl ( const Eigen::SyclDevice &  sycl_device)
static
600  {
601  IndexType dim_x = 64;
602  IndexType dim_y = 1;
603  IndexType dim_z = 32;
604 
605  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
606  Eigen::array<IndexType, 1> red_axis;
607  red_axis[0] = 2;
608  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
609 
610  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
611  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
612  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
613 
614  in.setRandom();
615 
616  redux = in.mean(red_axis);
617 
618  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
619  DataType* gpu_out_data =
620  static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
621 
622  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
623  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, reduced_tensorRange);
624 
625  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
626  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
627  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize() * sizeof(DataType));
628  // Check that the CPU and GPU reductions return the same result.
629  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
630  for (IndexType k = 0; k < reduced_tensorRange[1]; k++) VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
631 
632  sycl_device.deallocate(gpu_in_data);
633  sycl_device.deallocate(gpu_out_data);
634 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), j, k, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_last_dim_reductions_sum_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_last_dim_reductions_sum_sycl ( const Eigen::SyclDevice &  sycl_device)
static
637  {
638  IndexType dim_x = 64;
639  IndexType dim_y = 1;
640  IndexType dim_z = 32;
641 
642  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
643  Eigen::array<IndexType, 1> red_axis;
644  red_axis[0] = 2;
645  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
646 
647  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
648  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
649  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
650 
651  in.setRandom();
652 
653  redux = in.sum(red_axis);
654 
655  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
656  DataType* gpu_out_data =
657  static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
658 
659  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
660  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, reduced_tensorRange);
661 
662  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
663  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
664  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize() * sizeof(DataType));
665  // Check that the CPU and GPU reductions return the same result.
666  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
667  for (IndexType k = 0; k < reduced_tensorRange[1]; k++) VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
668 
669  sycl_device.deallocate(gpu_in_data);
670  sycl_device.deallocate(gpu_out_data);
671 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), j, k, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_last_reductions_mean_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_last_reductions_mean_sycl ( const Eigen::SyclDevice &  sycl_device)
static
709  {
710  auto tensorRange = Sizes<64, 32>(64, 32);
711  Eigen::IndexList<Eigen::type2index<1>> red_axis;
712  auto reduced_tensorRange = Sizes<64>(64);
713  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix(tensorRange);
714  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix(reduced_tensorRange);
715  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix(reduced_tensorRange);
716 
717  in_fix.setRandom();
718  redux_fix = in_fix.mean(red_axis);
719 
720  DataType* gpu_in_data =
721  static_cast<DataType*>(sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
722  DataType* gpu_out_data =
723  static_cast<DataType*>(sycl_device.allocate(redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
724 
725  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(gpu_in_data, tensorRange);
726  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(gpu_out_data, reduced_tensorRange);
727 
728  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), (in_fix.dimensions().TotalSize()) * sizeof(DataType));
729  out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
730  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
731  redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
732  sycl_device.synchronize();
733  // Check that the CPU and GPU reductions return the same result.
734  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
735  VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
736  }
737 
738  sycl_device.deallocate(gpu_in_data);
739  sycl_device.deallocate(gpu_out_data);
740 }

References Eigen::TensorFixedSize< Scalar_, Dimensions_, Options_, IndexType >::data(), DataLayout, Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::TensorFixedSize< Scalar_, Dimensions_, Options_, IndexType >::dimensions(), j, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), and VERIFY_IS_APPROX.

◆ test_last_reductions_sum_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_last_reductions_sum_sycl ( const Eigen::SyclDevice &  sycl_device)
static
674  {
675  auto tensorRange = Sizes<64, 32>(64, 32);
676  // auto red_axis = Sizes<0,1>(0,1);
677  Eigen::IndexList<Eigen::type2index<1>> red_axis;
678  auto reduced_tensorRange = Sizes<64>(64);
679  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix(tensorRange);
680  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix(reduced_tensorRange);
681  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix(reduced_tensorRange);
682 
683  in_fix.setRandom();
684 
685  redux_fix = in_fix.sum(red_axis);
686 
687  DataType* gpu_in_data =
688  static_cast<DataType*>(sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
689  DataType* gpu_out_data =
690  static_cast<DataType*>(sycl_device.allocate(redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
691 
692  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(gpu_in_data, tensorRange);
693  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(gpu_out_data, reduced_tensorRange);
694 
695  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), (in_fix.dimensions().TotalSize()) * sizeof(DataType));
696  out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis);
697  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
698  redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
699  // Check that the CPU and GPU reductions return the same result.
700  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
701  VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
702  }
703 
704  sycl_device.deallocate(gpu_in_data);
705  sycl_device.deallocate(gpu_out_data);
706 }

References Eigen::TensorFixedSize< Scalar_, Dimensions_, Options_, IndexType >::data(), DataLayout, Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::TensorFixedSize< Scalar_, Dimensions_, Options_, IndexType >::dimensions(), j, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), and VERIFY_IS_APPROX.