cxx11_tensor_reduction_sycl.cpp File Reference
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

Classes

struct  CustomReducer< InT, OutT >
 

Macros

#define EIGEN_TEST_NO_LONGDOUBLE
 
#define EIGEN_TEST_NO_COMPLEX
 
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE   int64_t
 
#define EIGEN_USE_SYCL
 

Functions

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_sum_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_sum_with_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_max_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_max_with_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_mean_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_mean_with_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_mean_with_odd_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_min_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_min_with_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_max_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_max_with_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_last_dim_reductions_max_with_offset_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_sum_sycl (const Eigen::SyclDevice &sycl_device, IndexType dim_x, IndexType dim_y)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_mean_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_last_dim_reductions_mean_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_last_dim_reductions_sum_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_last_reductions_sum_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
static void test_last_reductions_mean_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , typename AccumType , int DataLayout, typename IndexType >
static void test_full_reductions_custom_sycl (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , typename Dev >
void sycl_reduction_test_full_per_device (const Dev &sycl_device)
 
template<typename DataType , typename Dev >
void sycl_reduction_full_offset_per_device (const Dev &sycl_device)
 
template<typename DataType , typename Dev >
void sycl_reduction_test_first_dim_per_device (const Dev &sycl_device)
 
template<typename DataType , typename Dev >
void sycl_reduction_test_last_dim_per_device (const Dev &sycl_device)
 
 EIGEN_DECLARE_TEST (cxx11_tensor_reduction_sycl)
 

Macro Definition Documentation

◆ EIGEN_DEFAULT_DENSE_INDEX_TYPE

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE   int64_t

◆ EIGEN_TEST_NO_COMPLEX

#define EIGEN_TEST_NO_COMPLEX

◆ EIGEN_TEST_NO_LONGDOUBLE

#define EIGEN_TEST_NO_LONGDOUBLE

◆ EIGEN_USE_SYCL

#define EIGEN_USE_SYCL
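
These configuration macros only take effect if they are defined before the Eigen headers are included. A minimal sketch of the expected ordering, mirroring the defines and includes listed at the top of this page (the ordering is the point; nothing else is implied):

#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
#define EIGEN_USE_SYCL

#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>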

Function Documentation

◆ EIGEN_DECLARE_TEST()

EIGEN_DECLARE_TEST ( cxx11_tensor_reduction_sycl  )
847  {
848  for (const auto& device : Eigen::get_sycl_supported_devices()) {
849  std::cout << "Running on " << device.template get_info<cl::sycl::info::device::name>() << std::endl;
850  QueueInterface queueInterface(device);
851  auto sycl_device = Eigen::SyclDevice(&queueInterface);
852  CALL_SUBTEST_1(sycl_reduction_test_full_per_device<float>(sycl_device));
853  CALL_SUBTEST_2(sycl_reduction_full_offset_per_device<float>(sycl_device));
854  CALL_SUBTEST_3(sycl_reduction_test_first_dim_per_device<float>(sycl_device));
855  CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device<float>(sycl_device));
856  }
857 }

References CALL_SUBTEST_1, CALL_SUBTEST_2, CALL_SUBTEST_3, and CALL_SUBTEST_4.
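
Every subtest dispatched above follows the same host/device comparison pattern that the function listings below spell out in full: compute a reference reduction on the host, run the identical reduction through a TensorMap bound to device memory, copy the result back, and compare. A condensed sketch of that pattern (a full sum reduction over a float matrix; it assumes a sycl_device constructed as in the loop above, and the variable names are illustrative only):

  // Host reference.
  Eigen::Tensor<float, 2> in(753, 537);
  Eigen::Tensor<float, 0> full_redux, full_redux_gpu;
  in.setRandom();
  full_redux = in.sum();

  // Device buffers mapped as tensors.
  float* gpu_in = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
  float* gpu_out = static_cast<float*>(sycl_device.allocate(sizeof(float)));
  Eigen::TensorMap<Eigen::Tensor<float, 2>> in_gpu(gpu_in, 753, 537);
  Eigen::TensorMap<Eigen::Tensor<float, 0>> out_gpu(gpu_out);

  // Run the reduction on the SYCL device and copy the scalar result back.
  sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
  out_gpu.device(sycl_device) = in_gpu.sum();
  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out, sizeof(float));

  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
  sycl_device.deallocate(gpu_in);
  sycl_device.deallocate(gpu_out);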

◆ sycl_reduction_full_offset_per_device()

template<typename DataType , typename Dev >
void sycl_reduction_full_offset_per_device ( const Dev &  sycl_device)
813  {
814  test_full_reductions_sum_with_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
815  test_full_reductions_sum_with_offset_sycl<DataType, ColMajor, int64_t>(sycl_device);
816  test_full_reductions_min_with_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
817  test_full_reductions_min_with_offset_sycl<DataType, ColMajor, int64_t>(sycl_device);
818  test_full_reductions_max_with_offset_sycl<DataType, ColMajor, int64_t>(sycl_device);
819  test_full_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
820  test_full_reductions_mean_with_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
821  test_full_reductions_mean_with_offset_sycl<DataType, ColMajor, int64_t>(sycl_device);
822  test_full_reductions_mean_with_odd_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
823  sycl_device.synchronize();
824 }

◆ sycl_reduction_test_first_dim_per_device()

template<typename DataType , typename Dev >
void sycl_reduction_test_first_dim_per_device ( const Dev &  sycl_device)
827  {
828  test_first_dim_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device, 4197, 4097);
829  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device, 4197, 4097);
830  test_first_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device, 129, 8);
831  test_first_dim_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
832  test_first_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
833  sycl_device.synchronize();
834 }

◆ sycl_reduction_test_full_per_device()

template<typename DataType , typename Dev >
void sycl_reduction_test_full_per_device ( const Dev &  sycl_device)
797  {
798  test_full_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
799  test_full_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
800  test_full_reductions_min_sycl<DataType, ColMajor, int64_t>(sycl_device);
801  test_full_reductions_min_sycl<DataType, RowMajor, int64_t>(sycl_device);
802  test_full_reductions_max_sycl<DataType, ColMajor, int64_t>(sycl_device);
803  test_full_reductions_max_sycl<DataType, RowMajor, int64_t>(sycl_device);
804 
805  test_full_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
806  test_full_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
807  test_full_reductions_custom_sycl<DataType, int, RowMajor, int64_t>(sycl_device);
808  test_full_reductions_custom_sycl<DataType, int, ColMajor, int64_t>(sycl_device);
809  sycl_device.synchronize();
810 }

◆ sycl_reduction_test_last_dim_per_device()

template<typename DataType , typename Dev >
void sycl_reduction_test_last_dim_per_device ( const Dev &  sycl_device)
837  {
838  test_last_dim_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
839  test_last_dim_reductions_max_with_offset_sycl<DataType, RowMajor, int64_t>(sycl_device);
840  test_last_reductions_sum_sycl<DataType, ColMajor, int64_t>(sycl_device);
841  test_last_reductions_sum_sycl<DataType, RowMajor, int64_t>(sycl_device);
842  test_last_reductions_mean_sycl<DataType, ColMajor, int64_t>(sycl_device);
843  test_last_reductions_mean_sycl<DataType, RowMajor, int64_t>(sycl_device);
844  sycl_device.synchronize();
845 }

◆ test_first_dim_reductions_max_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_max_sycl ( const Eigen::SyclDevice &  sycl_device)
static
379  {
380  IndexType dim_x = 145;
381  IndexType dim_y = 1;
382  IndexType dim_z = 67;
383 
384  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
385  Eigen::array<IndexType, 1> red_axis;
386  red_axis[0] = 0;
387  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
388 
389  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
390  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
391  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
392 
393  in.setRandom();
394 
395  redux = in.maximum(red_axis);
396 
397  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
398  DataType* gpu_out_data =
399  static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
400 
401  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
402  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, reduced_tensorRange);
403 
404  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
405  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
406  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize() * sizeof(DataType));
407 
408  // Check that the CPU and GPU reductions return the same result.
409  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
410  for (IndexType k = 0; k < reduced_tensorRange[1]; k++) VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
411 
412  sycl_device.deallocate(gpu_in_data);
413  sycl_device.deallocate(gpu_out_data);
414 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), j, k, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_first_dim_reductions_max_with_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_max_with_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
417  {
418  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
419  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
420 
421  const IndexType num_rows = 64;
422  const IndexType num_cols = 64;
423  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
424  array<IndexType, 1> reduced_range = {{num_cols}};
425  const IndexType n_elems = internal::array_prod(tensor_range);
426  const IndexType n_reduced = num_cols;
427 
428  data_tensor in(tensor_range);
429  reduced_tensor redux;
430  reduced_tensor redux_gpu(reduced_range);
431 
432  in.setRandom();
433  array<IndexType, 2> tensor_offset_range(tensor_range);
434  tensor_offset_range[0] -= 1;
435  // Set maximum value outside of the considered range.
436  for (IndexType i = 0; i < n_reduced; i++) {
437  in(i) = static_cast<DataType>(2);
438  }
439 
440  Eigen::array<IndexType, 1> red_axis;
441  red_axis[0] = 0;
442 
443  const IndexType offset = 64;
444  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
445  redux = in_offset.maximum(red_axis);
446  for (IndexType i = 0; i < n_reduced; i++) {
447  VERIFY_IS_NOT_EQUAL(redux(i), in(i));
448  }
449 
450  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
451  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(n_reduced * sizeof(DataType)));
452 
453  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
454  TensorMap<reduced_tensor> out_gpu(gpu_out_data, reduced_range);
455  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
456  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
457  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, n_reduced * sizeof(DataType));
458 
459  // Check that the CPU and GPU reductions return the same result.
460  for (IndexType i = 0; i < n_reduced; i++) {
461  VERIFY_IS_APPROX(redux_gpu(i), redux(i));
462  }
463 
464  sycl_device.deallocate(gpu_in_data);
465  sycl_device.deallocate(gpu_out_data);
466 }

References Eigen::internal::array_prod(), Eigen::TensorBase< Derived, AccessLevel >::device(), i, VERIFY_IS_APPROX, and VERIFY_IS_NOT_EQUAL.

◆ test_first_dim_reductions_mean_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_mean_sycl ( const Eigen::SyclDevice &  sycl_device)
static
562  {
563  IndexType dim_x = 145;
564  IndexType dim_y = 1;
565  IndexType dim_z = 67;
566 
567  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
568  Eigen::array<IndexType, 1> red_axis;
569  red_axis[0] = 0;
570  array<IndexType, 2> reduced_tensorRange = {{dim_y, dim_z}};
571 
572  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
573  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
574  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
575 
576  in.setRandom();
577 
578  redux = in.mean(red_axis);
579 
580  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
581  DataType* gpu_out_data =
582  static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
583 
584  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
585  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, reduced_tensorRange);
586 
587  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
588  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
589  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize() * sizeof(DataType));
590 
591  // Check that the CPU and GPU reductions return the same result.
592  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
593  for (IndexType k = 0; k < reduced_tensorRange[1]; k++) VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
594 
595  sycl_device.deallocate(gpu_in_data);
596  sycl_device.deallocate(gpu_out_data);
597 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), j, k, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_first_dim_reductions_sum_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_first_dim_reductions_sum_sycl ( const Eigen::SyclDevice &  sycl_device,
IndexType  dim_x,
IndexType  dim_y 
)
static
529  {
530  array<IndexType, 2> tensorRange = {{dim_x, dim_y}};
531  Eigen::array<IndexType, 1> red_axis;
532  red_axis[0] = 0;
533  array<IndexType, 1> reduced_tensorRange = {{dim_y}};
534 
535  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
536  Tensor<DataType, 1, DataLayout, IndexType> redux(reduced_tensorRange);
537  Tensor<DataType, 1, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
538 
539  in.setRandom();
540  redux = in.sum(red_axis);
541 
542  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
543  DataType* gpu_out_data =
544  static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
545 
546  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
547  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> out_gpu(gpu_out_data, reduced_tensorRange);
548 
549  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
550  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
551  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize() * sizeof(DataType));
552 
553  // Check that the CPU and GPU reductions return the same result.
554  for (IndexType i = 0; i < redux.size(); i++) {
555  VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]);
556  }
557  sycl_device.deallocate(gpu_in_data);
558  sycl_device.deallocate(gpu_out_data);
559 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), i, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::size(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_full_reductions_custom_sycl()

template<typename DataType , typename AccumType , int DataLayout, typename IndexType >
static void test_full_reductions_custom_sycl ( const Eigen::SyclDevice &  sycl_device)
static
768  {
769  constexpr IndexType InSize = 64;
770  auto tensorRange = Sizes<InSize>(InSize);
771  Eigen::IndexList<Eigen::type2index<0>> dims;
772  auto reduced_tensorRange = Sizes<>();
773  TensorFixedSize<DataType, Sizes<InSize>, DataLayout> in_fix(tensorRange);
774  TensorFixedSize<AccumType, Sizes<>, DataLayout> redux_gpu_fix(reduced_tensorRange);
775 
776  CustomReducer<DataType, AccumType> reducer;
777 
778  in_fix.setRandom();
779 
780  size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType);
781  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in_size_bytes));
782  AccumType* gpu_out_data = static_cast<AccumType*>(sycl_device.allocate(sizeof(AccumType)));
783 
784  TensorMap<TensorFixedSize<DataType, Sizes<InSize>, DataLayout>> in_gpu_fix(gpu_in_data, tensorRange);
785  TensorMap<TensorFixedSize<AccumType, Sizes<>, DataLayout>> out_gpu_fix(gpu_out_data, reduced_tensorRange);
786 
787  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes);
788  out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer);
789  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data, sizeof(AccumType));
790  VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0));
791 
792  sycl_device.deallocate(gpu_in_data);
793  sycl_device.deallocate(gpu_out_data);
794 }

References Eigen::TensorFixedSize< Scalar_, Dimensions_, Options_, IndexType >::data(), DataLayout, Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::TensorFixedSize< Scalar_, Dimensions_, Options_, IndexType >::dimensions(), Eigen::TensorBase< Derived, AccessLevel >::setRandom(), and VERIFY_IS_EQUAL.
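
The CustomReducer< InT, OutT > used by this test is defined in the test source itself (line 746) and is not reproduced in this listing. As a rough sketch only, and assuming the scalar reducer interface that Eigen's TensorBase::reduce() expects (initialize/reduce/finalize, as in Eigen::internal::SumReducer), a user-defined reducer has roughly this shape; the body below is illustrative, not the test's actual CustomReducer:

  template <typename InT, typename OutT>
  struct ExampleReducer {  // hypothetical name, not the test's CustomReducer
    // Value the accumulator starts from.
    EIGEN_DEVICE_FUNC OutT initialize() const { return OutT(0); }
    // Fold one input element into the accumulator.
    EIGEN_DEVICE_FUNC void reduce(const InT x, OutT* accum) const { *accum += static_cast<OutT>(x); }
    // Produce the final result from the accumulator.
    EIGEN_DEVICE_FUNC OutT finalize(const OutT accum) const { return accum; }
  };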

◆ test_full_reductions_max_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_max_sycl ( const Eigen::SyclDevice &  sycl_device)
static
95  {
96  const IndexType num_rows = 4096;
97  const IndexType num_cols = 4096;
98  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
99 
100  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
101  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
102  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
103 
104  in.setRandom();
105 
106  full_redux = in.maximum();
107 
108  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
109  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
110 
111  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
112  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
113  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
114  out_gpu.device(sycl_device) = in_gpu.maximum();
115  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
116  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
117  sycl_device.deallocate(gpu_in_data);
118  sycl_device.deallocate(gpu_out_data);
119 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_full_reductions_max_with_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_max_with_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
122  {
123  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
124  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
125  const IndexType num_rows = 64;
126  const IndexType num_cols = 64;
127  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
128  const IndexType n_elems = internal::array_prod(tensor_range);
129 
130  data_tensor in(tensor_range);
131  scalar_tensor full_redux;
132  scalar_tensor full_redux_gpu;
133 
134  in.setRandom();
135  array<IndexType, 2> tensor_offset_range(tensor_range);
136  tensor_offset_range[0] -= 1;
137  // Set the initial value to be the max.
138  // As we don't include this in the reduction the result should not be 2.
139  in(0) = static_cast<DataType>(2);
140 
141  const IndexType offset = 64;
142  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
143  full_redux = in_offset.maximum();
144  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
145 
146  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
147  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
148 
149  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
150  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
151  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
152  out_gpu.device(sycl_device) = in_gpu.maximum();
153  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
154 
155  // Check that the CPU and GPU reductions return the same result.
156  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
157 
158  sycl_device.deallocate(gpu_in_data);
159  sycl_device.deallocate(gpu_out_data);
160 }

References Eigen::internal::array_prod(), Eigen::TensorBase< Derived, AccessLevel >::device(), VERIFY_IS_APPROX, and VERIFY_IS_NOT_EQUAL.

◆ test_full_reductions_mean_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_mean_sycl ( const Eigen::SyclDevice &  sycl_device)
static
163  {
164  const IndexType num_rows = 4096;
165  const IndexType num_cols = 4096;
166  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
167  array<IndexType, 1> argRange = {{num_cols}};
168  Eigen::array<IndexType, 1> red_axis;
169  red_axis[0] = 0;
170  // red_axis[1]=1;
171  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
172  Tensor<DataType, 2, DataLayout, IndexType> in_arg1(tensorRange);
173  Tensor<DataType, 2, DataLayout, IndexType> in_arg2(tensorRange);
174  Tensor<bool, 1, DataLayout, IndexType> out_arg_cpu(argRange);
175  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu(argRange);
176  Tensor<bool, 1, DataLayout, IndexType> out_arg_gpu_helper(argRange);
177  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
178  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
179 
180  in.setRandom();
181  in_arg1.setRandom();
182  in_arg2.setRandom();
183 
184  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
185  DataType* gpu_in_arg1_data =
186  static_cast<DataType*>(sycl_device.allocate(in_arg1.dimensions().TotalSize() * sizeof(DataType)));
187  DataType* gpu_in_arg2_data =
188  static_cast<DataType*>(sycl_device.allocate(in_arg2.dimensions().TotalSize() * sizeof(DataType)));
189  bool* gpu_out_arg__gpu_helper_data =
190  static_cast<bool*>(sycl_device.allocate(out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
191  bool* gpu_out_arg_data =
192  static_cast<bool*>(sycl_device.allocate(out_arg_gpu.dimensions().TotalSize() * sizeof(DataType)));
193 
194  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
195 
196  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
197  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg1_gpu(gpu_in_arg1_data, tensorRange);
198  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_Arg2_gpu(gpu_in_arg2_data, tensorRange);
199  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu(gpu_out_arg_data, argRange);
200  TensorMap<Tensor<bool, 1, DataLayout, IndexType>> out_Argout_gpu_helper(gpu_out_arg__gpu_helper_data, argRange);
201  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
202 
203  // CPU VERSION
204  out_arg_cpu =
205  (in_arg1.argmax(1) == in_arg2.argmax(1)).select(out_arg_cpu.constant(true), out_arg_cpu.constant(false));
206  full_redux = (out_arg_cpu.template cast<float>()).reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
207 
208  // GPU VERSION
209  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
210  sycl_device.memcpyHostToDevice(gpu_in_arg1_data, in_arg1.data(),
211  (in_arg1.dimensions().TotalSize()) * sizeof(DataType));
212  sycl_device.memcpyHostToDevice(gpu_in_arg2_data, in_arg2.data(),
213  (in_arg2.dimensions().TotalSize()) * sizeof(DataType));
214  out_Argout_gpu_helper.device(sycl_device) = (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1));
215  out_Argout_gpu.device(sycl_device) =
216  (out_Argout_gpu_helper).select(out_Argout_gpu.constant(true), out_Argout_gpu.constant(false));
217  out_gpu.device(sycl_device) =
218  (out_Argout_gpu.template cast<float>()).reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
219  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
220  // Check that the CPU and GPU reductions return the same result.
221  std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux() << '\n';
222  VERIFY_IS_EQUAL(full_redux_gpu(), full_redux());
223  sycl_device.deallocate(gpu_in_data);
224  sycl_device.deallocate(gpu_in_arg1_data);
225  sycl_device.deallocate(gpu_in_arg2_data);
226  sycl_device.deallocate(gpu_out_arg__gpu_helper_data);
227  sycl_device.deallocate(gpu_out_arg_data);
228  sycl_device.deallocate(gpu_out_data);
229 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_EQUAL.

◆ test_full_reductions_mean_with_odd_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_mean_with_odd_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
270  {
271  // This is a particular case which illustrates a possible problem when the
272  // number of local threads in a workgroup is even, but is not a power of two.
273  using data_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
274  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
275  // 2177 = (17 * 128) + 1 gives rise to 18 local threads.
276  // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads.
277  const IndexType n_elems = 8707;
278  array<IndexType, 1> tensor_range = {{n_elems}};
279 
280  data_tensor in(tensor_range);
281  DataType full_redux;
282  DataType full_redux_gpu;
283  TensorMap<scalar_tensor> red_cpu(&full_redux);
284  TensorMap<scalar_tensor> red_gpu(&full_redux_gpu);
285 
286  const DataType const_val = static_cast<DataType>(0.6391);
287  in = in.constant(const_val);
288 
289  Eigen::IndexList<Eigen::type2index<0>> red_axis;
290  red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
291  VERIFY_IS_APPROX(const_val, red_cpu());
292 
293  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
294  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
295 
296  TensorMap<data_tensor> in_gpu(gpu_in_data, tensor_range);
297  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
298  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
299  out_gpu.device(sycl_device) = in_gpu.reduce(red_axis, Eigen::internal::MeanReducer<DataType>());
300  sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data, sizeof(DataType));
301 
302  // Check that the CPU and GPU reductions return the same result.
303  VERIFY_IS_APPROX(full_redux_gpu, full_redux);
304 
305  sycl_device.deallocate(gpu_in_data);
306  sycl_device.deallocate(gpu_out_data);
307 }

References Eigen::TensorMap< PlainObjectType, Options_, MakePointer_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), and VERIFY_IS_APPROX.

◆ test_full_reductions_mean_with_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_mean_with_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
232  {
233  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
234  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
235  const IndexType num_rows = 64;
236  const IndexType num_cols = 64;
237  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
238  const IndexType n_elems = internal::array_prod(tensor_range);
239 
240  data_tensor in(tensor_range);
241  scalar_tensor full_redux;
242  scalar_tensor full_redux_gpu;
243 
244  in.setRandom();
245  array<IndexType, 2> tensor_offset_range(tensor_range);
246  tensor_offset_range[0] -= 1;
247 
248  const IndexType offset = 64;
249  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
250  full_redux = in_offset.mean();
251  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
252 
253  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
254  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
255 
256  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
257  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
258  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
259  out_gpu.device(sycl_device) = in_gpu.mean();
260  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
261 
262  // Check that the CPU and GPU reductions return the same result.
263  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
264 
265  sycl_device.deallocate(gpu_in_data);
266  sycl_device.deallocate(gpu_out_data);
267 }

References Eigen::internal::array_prod(), Eigen::TensorBase< Derived, AccessLevel >::device(), VERIFY_IS_APPROX, and VERIFY_IS_NOT_EQUAL.

◆ test_full_reductions_min_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_min_sycl ( const Eigen::SyclDevice &  sycl_device)
static
310  {
311  const IndexType num_rows = 876;
312  const IndexType num_cols = 953;
313  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
314 
315  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
316  Tensor<DataType, 0, DataLayout, IndexType> full_redux;
317  Tensor<DataType, 0, DataLayout, IndexType> full_redux_gpu;
318 
319  in.setRandom();
320 
321  full_redux = in.minimum();
322 
323  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
324  DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType));
325 
326  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
327  TensorMap<Tensor<DataType, 0, DataLayout, IndexType>> out_gpu(gpu_out_data);
328 
329  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
330  out_gpu.device(sycl_device) = in_gpu.minimum();
331  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
332  // Check that the CPU and GPU reductions return the same result.
333  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
334  sycl_device.deallocate(gpu_in_data);
335  sycl_device.deallocate(gpu_out_data);
336 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_full_reductions_min_with_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_min_with_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
339  {
340  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
341  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
342  const IndexType num_rows = 64;
343  const IndexType num_cols = 64;
344  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
345  const IndexType n_elems = internal::array_prod(tensor_range);
346 
347  data_tensor in(tensor_range);
348  scalar_tensor full_redux;
349  scalar_tensor full_redux_gpu;
350 
351  in.setRandom();
352  array<IndexType, 2> tensor_offset_range(tensor_range);
353  tensor_offset_range[0] -= 1;
354  // Set the initial value to be the min.
355  // As we don't include this in the reduction the result should not be -2.
356  in(0) = static_cast<DataType>(-2);
357 
358  const IndexType offset = 64;
359  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
360  full_redux = in_offset.minimum();
361  VERIFY_IS_NOT_EQUAL(full_redux(), in(0));
362 
363  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
364  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
365 
366  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
367  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
368  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
369  out_gpu.device(sycl_device) = in_gpu.minimum();
370  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
371 
372  // Check that the CPU and GPU reductions return the same result.
373  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
374 
375  sycl_device.deallocate(gpu_in_data);
376  sycl_device.deallocate(gpu_out_data);
377 }

References Eigen::internal::array_prod(), Eigen::TensorBase< Derived, AccessLevel >::device(), VERIFY_IS_APPROX, and VERIFY_IS_NOT_EQUAL.

◆ test_full_reductions_sum_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_sum_sycl ( const Eigen::SyclDevice &  sycl_device)
static
25  {
26  const IndexType num_rows = 753;
27  const IndexType num_cols = 537;
28  array<IndexType, 2> tensorRange = {{num_rows, num_cols}};
29 
30  array<IndexType, 2> outRange = {{1, 1}};
31 
32  Tensor<DataType, 2, DataLayout, IndexType> in(tensorRange);
33  Tensor<DataType, 2, DataLayout, IndexType> full_redux(outRange);
34  Tensor<DataType, 2, DataLayout, IndexType> full_redux_gpu(outRange);
35 
36  in.setRandom();
37  auto dim = DSizes<IndexType, 2>(1, 1);
38  full_redux = in.sum().reshape(dim);
39 
40  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
41  DataType* gpu_out_data =
42  (DataType*)sycl_device.allocate(sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize()));
43 
44  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
45  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, outRange);
46  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
47  out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
48  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data,
49  (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
50  // Check that the CPU and GPU reductions return the same result.
51  std::cout << "SYCL FULL :" << full_redux_gpu(0, 0) << ", CPU FULL: " << full_redux(0, 0) << "\n";
52  VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0));
53  sycl_device.deallocate(gpu_in_data);
54  sycl_device.deallocate(gpu_out_data);
55 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), Eigen::TensorBase< Derived, AccessLevel >::reshape(), Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_full_reductions_sum_with_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_full_reductions_sum_with_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
58  {
59  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
60  using scalar_tensor = Tensor<DataType, 0, DataLayout, IndexType>;
61  const IndexType num_rows = 64;
62  const IndexType num_cols = 64;
63  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
64  const IndexType n_elems = internal::array_prod(tensor_range);
65 
66  data_tensor in(tensor_range);
67  scalar_tensor full_redux;
68  scalar_tensor full_redux_gpu;
69 
70  in.setRandom();
71  array<IndexType, 2> tensor_offset_range(tensor_range);
72  tensor_offset_range[0] -= 1;
73 
74  const IndexType offset = 64;
75  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
76  full_redux = in_offset.sum();
77 
78  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
79  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(sizeof(DataType)));
80 
81  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
82  TensorMap<scalar_tensor> out_gpu(gpu_out_data);
83  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
84  out_gpu.device(sycl_device) = in_gpu.sum();
85  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType));
86 
87  // Check that the CPU and GPU reductions return the same result.
88  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
89 
90  sycl_device.deallocate(gpu_in_data);
91  sycl_device.deallocate(gpu_out_data);
92 }

References Eigen::internal::array_prod(), Eigen::TensorBase< Derived, AccessLevel >::device(), and VERIFY_IS_APPROX.

◆ test_last_dim_reductions_max_with_offset_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_last_dim_reductions_max_with_offset_sycl ( const Eigen::SyclDevice &  sycl_device)
static
469  {
470  using data_tensor = Tensor<DataType, 2, DataLayout, IndexType>;
471  using reduced_tensor = Tensor<DataType, 1, DataLayout, IndexType>;
472 
473  const IndexType num_rows = 64;
474  const IndexType num_cols = 64;
475  array<IndexType, 2> tensor_range = {{num_rows, num_cols}};
476  array<IndexType, 1> full_reduced_range = {{num_rows}};
477  array<IndexType, 1> reduced_range = {{num_rows - 1}};
478  const IndexType n_elems = internal::array_prod(tensor_range);
479  const IndexType n_reduced = reduced_range[0];
480 
481  data_tensor in(tensor_range);
482  reduced_tensor redux(full_reduced_range);
483  reduced_tensor redux_gpu(reduced_range);
484 
485  in.setRandom();
486  redux.setZero();
487  array<IndexType, 2> tensor_offset_range(tensor_range);
488  tensor_offset_range[0] -= 1;
489  // Set maximum value outside of the considered range.
490  for (IndexType i = 0; i < n_reduced; i++) {
491  in(i) = static_cast<DataType>(2);
492  }
493 
494  Eigen::array<IndexType, 1> red_axis;
495  red_axis[0] = 1;
496 
497  const IndexType offset = 64;
498  // Introduce an offset in both the input and the output.
499  TensorMap<data_tensor> in_offset(in.data() + offset, tensor_offset_range);
500  TensorMap<reduced_tensor> red_offset(redux.data() + 1, reduced_range);
501  red_offset = in_offset.maximum(red_axis);
502 
503  // Check that the first value hasn't been changed and that the reduced values
504  // are not equal to the previously set maximum in the input outside the range.
505  VERIFY_IS_EQUAL(redux(0), static_cast<DataType>(0));
506  for (IndexType i = 0; i < n_reduced; i++) {
507  VERIFY_IS_NOT_EQUAL(red_offset(i), in(i));
508  }
509 
510  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(n_elems * sizeof(DataType)));
511  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate((n_reduced + 1) * sizeof(DataType)));
512 
513  TensorMap<data_tensor> in_gpu(gpu_in_data + offset, tensor_offset_range);
514  TensorMap<reduced_tensor> out_gpu(gpu_out_data + 1, reduced_range);
515  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), n_elems * sizeof(DataType));
516  out_gpu.device(sycl_device) = in_gpu.maximum(red_axis);
517  sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(), n_reduced * sizeof(DataType));
518 
519  // Check that the CPU and GPU reductions return the same result.
520  for (IndexType i = 0; i < n_reduced; i++) {
521  VERIFY_IS_APPROX(redux_gpu(i), red_offset(i));
522  }
523 
524  sycl_device.deallocate(gpu_in_data);
525  sycl_device.deallocate(gpu_out_data);
526 }

References Eigen::internal::array_prod(), Eigen::TensorMap< PlainObjectType, Options_, MakePointer_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), i, VERIFY_IS_APPROX, VERIFY_IS_EQUAL, and VERIFY_IS_NOT_EQUAL.

◆ test_last_dim_reductions_mean_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_last_dim_reductions_mean_sycl ( const Eigen::SyclDevice &  sycl_device)
static
600  {
601  IndexType dim_x = 64;
602  IndexType dim_y = 1;
603  IndexType dim_z = 32;
604 
605  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
606  Eigen::array<IndexType, 1> red_axis;
607  red_axis[0] = 2;
608  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
609 
610  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
611  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
612  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
613 
614  in.setRandom();
615 
616  redux = in.mean(red_axis);
617 
618  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
619  DataType* gpu_out_data =
620  static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
621 
622  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
623  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, reduced_tensorRange);
624 
625  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
626  out_gpu.device(sycl_device) = in_gpu.mean(red_axis);
627  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize() * sizeof(DataType));
628  // Check that the CPU and GPU reductions return the same result.
629  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
630  for (IndexType k = 0; k < reduced_tensorRange[1]; k++) VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
631 
632  sycl_device.deallocate(gpu_in_data);
633  sycl_device.deallocate(gpu_out_data);
634 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), j, k, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_last_dim_reductions_sum_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_last_dim_reductions_sum_sycl ( const Eigen::SyclDevice &  sycl_device)
static
637  {
638  IndexType dim_x = 64;
639  IndexType dim_y = 1;
640  IndexType dim_z = 32;
641 
642  array<IndexType, 3> tensorRange = {{dim_x, dim_y, dim_z}};
643  Eigen::array<IndexType, 1> red_axis;
644  red_axis[0] = 2;
645  array<IndexType, 2> reduced_tensorRange = {{dim_x, dim_y}};
646 
647  Tensor<DataType, 3, DataLayout, IndexType> in(tensorRange);
648  Tensor<DataType, 2, DataLayout, IndexType> redux(reduced_tensorRange);
649  Tensor<DataType, 2, DataLayout, IndexType> redux_gpu(reduced_tensorRange);
650 
651  in.setRandom();
652 
653  redux = in.sum(red_axis);
654 
655  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
656  DataType* gpu_out_data =
657  static_cast<DataType*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize() * sizeof(DataType)));
658 
659  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> in_gpu(gpu_in_data, tensorRange);
660  TensorMap<Tensor<DataType, 2, DataLayout, IndexType>> out_gpu(gpu_out_data, reduced_tensorRange);
661 
662  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
663  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
664  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize() * sizeof(DataType));
665  // Check that the CPU and GPU reductions return the same result.
666  for (IndexType j = 0; j < reduced_tensorRange[0]; j++)
667  for (IndexType k = 0; k < reduced_tensorRange[1]; k++) VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k));
668 
669  sycl_device.deallocate(gpu_in_data);
670  sycl_device.deallocate(gpu_out_data);
671 }

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::dimensions(), j, k, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), Eigen::DSizes< DenseIndex, NumDims >::TotalSize(), and VERIFY_IS_APPROX.

◆ test_last_reductions_mean_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_last_reductions_mean_sycl ( const Eigen::SyclDevice &  sycl_device)
static
709  {
710  auto tensorRange = Sizes<64, 32>(64, 32);
711  Eigen::IndexList<Eigen::type2index<1>> red_axis;
712  auto reduced_tensorRange = Sizes<64>(64);
713  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix(tensorRange);
714  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix(reduced_tensorRange);
715  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix(reduced_tensorRange);
716 
717  in_fix.setRandom();
718  redux_fix = in_fix.mean(red_axis);
719 
720  DataType* gpu_in_data =
721  static_cast<DataType*>(sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
722  DataType* gpu_out_data =
723  static_cast<DataType*>(sycl_device.allocate(redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
724 
725  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(gpu_in_data, tensorRange);
726  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(gpu_out_data, reduced_tensorRange);
727 
728  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), (in_fix.dimensions().TotalSize()) * sizeof(DataType));
729  out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis);
730  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
731  redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
732  sycl_device.synchronize();
733  // Check that the CPU and GPU reductions return the same result.
734  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
735  VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
736  }
737 
738  sycl_device.deallocate(gpu_in_data);
739  sycl_device.deallocate(gpu_out_data);
740 }

References Eigen::TensorFixedSize< Scalar_, Dimensions_, Options_, IndexType >::data(), DataLayout, Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::TensorFixedSize< Scalar_, Dimensions_, Options_, IndexType >::dimensions(), j, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), and VERIFY_IS_APPROX.

◆ test_last_reductions_sum_sycl()

template<typename DataType , int DataLayout, typename IndexType >
static void test_last_reductions_sum_sycl ( const Eigen::SyclDevice &  sycl_device)
static
674  {
675  auto tensorRange = Sizes<64, 32>(64, 32);
676  // auto red_axis = Sizes<0,1>(0,1);
677  Eigen::IndexList<Eigen::type2index<1>> red_axis;
678  auto reduced_tensorRange = Sizes<64>(64);
679  TensorFixedSize<DataType, Sizes<64, 32>, DataLayout> in_fix(tensorRange);
680  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_fix(reduced_tensorRange);
681  TensorFixedSize<DataType, Sizes<64>, DataLayout> redux_gpu_fix(reduced_tensorRange);
682 
683  in_fix.setRandom();
684 
685  redux_fix = in_fix.sum(red_axis);
686 
687  DataType* gpu_in_data =
688  static_cast<DataType*>(sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType)));
689  DataType* gpu_out_data =
690  static_cast<DataType*>(sycl_device.allocate(redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)));
691 
692  TensorMap<TensorFixedSize<DataType, Sizes<64, 32>, DataLayout>> in_gpu_fix(gpu_in_data, tensorRange);
693  TensorMap<TensorFixedSize<DataType, Sizes<64>, DataLayout>> out_gpu_fix(gpu_out_data, reduced_tensorRange);
694 
695  sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), (in_fix.dimensions().TotalSize()) * sizeof(DataType));
696  out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis);
697  sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data,
698  redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType));
699  // Check that the CPU and GPU reductions return the same result.
700  for (IndexType j = 0; j < reduced_tensorRange[0]; j++) {
701  VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j));
702  }
703 
704  sycl_device.deallocate(gpu_in_data);
705  sycl_device.deallocate(gpu_out_data);
706 }

References Eigen::TensorFixedSize< Scalar_, Dimensions_, Options_, IndexType >::data(), DataLayout, Eigen::TensorBase< Derived, AccessLevel >::device(), Eigen::TensorFixedSize< Scalar_, Dimensions_, Options_, IndexType >::dimensions(), j, Eigen::TensorBase< Derived, AccessLevel >::setRandom(), and VERIFY_IS_APPROX.