cxx11_tensor_sycl.cpp File Reference
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>

Macros

#define EIGEN_TEST_NO_LONGDOUBLE
 
#define EIGEN_TEST_NO_COMPLEX
 
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE   int64_t
 
#define EIGEN_USE_SYCL
 

Functions

template<typename DataType , int DataLayout, typename IndexType >
void test_sycl_mem_transfers (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
void test_sycl_mem_sync (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
void test_sycl_mem_sync_offsets (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
void test_sycl_memset_offsets (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , int DataLayout, typename IndexType >
void test_sycl_computations (const Eigen::SyclDevice &sycl_device)
 
template<typename Scalar1 , typename Scalar2 , int DataLayout, typename IndexType >
static void test_sycl_cast (const Eigen::SyclDevice &sycl_device)
 
template<typename DataType , typename dev_Selector >
void sycl_computing_test_per_device (dev_Selector s)
 
 EIGEN_DECLARE_TEST (cxx11_tensor_sycl)
 

Macro Definition Documentation

◆ EIGEN_DEFAULT_DENSE_INDEX_TYPE

#define EIGEN_DEFAULT_DENSE_INDEX_TYPE   int64_t

◆ EIGEN_TEST_NO_COMPLEX

#define EIGEN_TEST_NO_COMPLEX

◆ EIGEN_TEST_NO_LONGDOUBLE

#define EIGEN_TEST_NO_LONGDOUBLE

◆ EIGEN_USE_SYCL

#define EIGEN_USE_SYCL
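
These macros only take effect if they are defined before the Tensor module is included; a minimal sketch of the ordering used at the top of this test file:

  #define EIGEN_TEST_NO_LONGDOUBLE
  #define EIGEN_TEST_NO_COMPLEX
  #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
  #define EIGEN_USE_SYCL

  #include "main.h"
  #include <unsupported/Eigen/CXX11/Tensor>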

Function Documentation

◆ EIGEN_DECLARE_TEST()

EIGEN_DECLARE_TEST ( cxx11_tensor_sycl  )
{
  for (const auto& device : Eigen::get_sycl_supported_devices()) {
    CALL_SUBTEST(sycl_computing_test_per_device<float>(device));
  }
}

References CALL_SUBTEST.
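
Outside the test harness, the same device enumeration can drive tensor expressions directly. A minimal standalone sketch, assuming only the APIs already shown on this page (Eigen::get_sycl_supported_devices(), QueueInterface, SyclDevice); the scaling expression is illustrative only:

  #define EIGEN_USE_SYCL
  #include <unsupported/Eigen/CXX11/Tensor>

  int main() {
    using namespace Eigen;
    for (const auto& device : get_sycl_supported_devices()) {
      QueueInterface queue_interface(device);
      SyclDevice sycl_device(&queue_interface);

      Tensor<float, 1> in(64), out(64);
      in.setRandom();

      // Device buffers plus TensorMap views over them.
      float* d_in = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
      float* d_out = static_cast<float*>(sycl_device.allocate(out.size() * sizeof(float)));
      TensorMap<Tensor<float, 1>> gpu_in(d_in, 64);
      TensorMap<Tensor<float, 1>> gpu_out(d_out, 64);

      sycl_device.memcpyHostToDevice(d_in, in.data(), in.size() * sizeof(float));
      gpu_out.device(sycl_device) = gpu_in * 2.0f;  // evaluate on the SYCL device
      sycl_device.memcpyDeviceToHost(out.data(), d_out, out.size() * sizeof(float));
      sycl_device.synchronize();

      sycl_device.deallocate(d_in);
      sycl_device.deallocate(d_out);
    }
    return 0;
  }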

◆ sycl_computing_test_per_device()

template<typename DataType , typename dev_Selector >
void sycl_computing_test_per_device ( dev_Selector  s)
{
  QueueInterface queueInterface(s);
  auto sycl_device = Eigen::SyclDevice(&queueInterface);
  test_sycl_mem_transfers<DataType, RowMajor, int64_t>(sycl_device);
  test_sycl_computations<DataType, RowMajor, int64_t>(sycl_device);
  test_sycl_mem_sync<DataType, RowMajor, int64_t>(sycl_device);
  test_sycl_mem_sync_offsets<DataType, RowMajor, int64_t>(sycl_device);
  test_sycl_memset_offsets<DataType, RowMajor, int64_t>(sycl_device);
  test_sycl_mem_transfers<DataType, ColMajor, int64_t>(sycl_device);
  test_sycl_computations<DataType, ColMajor, int64_t>(sycl_device);
  test_sycl_mem_sync<DataType, ColMajor, int64_t>(sycl_device);
  test_sycl_cast<DataType, int, RowMajor, int64_t>(sycl_device);
  test_sycl_cast<DataType, int, ColMajor, int64_t>(sycl_device);
}

References s.

◆ test_sycl_cast()

template<typename Scalar1 , typename Scalar2 , int DataLayout, typename IndexType >
static void test_sycl_cast ( const Eigen::SyclDevice &  sycl_device)
static
{
  IndexType size = 20;
  array<IndexType, 1> tensorRange = {{size}};
  Tensor<Scalar1, 1, DataLayout, IndexType> in(tensorRange);
  Tensor<Scalar2, 1, DataLayout, IndexType> out(tensorRange);
  Tensor<Scalar2, 1, DataLayout, IndexType> out_host(tensorRange);

  in = in.random();

  Scalar1* gpu_in_data = static_cast<Scalar1*>(sycl_device.allocate(in.size() * sizeof(Scalar1)));
  Scalar2* gpu_out_data = static_cast<Scalar2*>(sycl_device.allocate(out.size() * sizeof(Scalar2)));

  TensorMap<Tensor<Scalar1, 1, DataLayout, IndexType>> gpu_in(gpu_in_data, tensorRange);
  TensorMap<Tensor<Scalar2, 1, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);
  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), (in.size()) * sizeof(Scalar1));
  gpu_out.device(sycl_device) = gpu_in.template cast<Scalar2>();
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, out.size() * sizeof(Scalar2));
  out_host = in.template cast<Scalar2>();
  for (IndexType i = 0; i < size; i++) {
    VERIFY_IS_APPROX(out(i), out_host(i));
  }
  printf("cast Test Passed\n");
  sycl_device.deallocate(gpu_in_data);
  sycl_device.deallocate(gpu_out_data);
}

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), i, out(), size, Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::size(), and VERIFY_IS_APPROX.
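
The device expression mirrors the host-side computation of out_host; a minimal host-only sketch of the same element-wise conversion, with no SYCL device involved (hypothetical standalone example):

  #include <unsupported/Eigen/CXX11/Tensor>

  void host_cast_example() {
    Eigen::Tensor<float, 1> in(20);
    in.setRandom();
    // Element-wise float -> int cast, evaluated on the host.
    Eigen::Tensor<int, 1> out = in.cast<int>();
  }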

◆ test_sycl_computations()

template<typename DataType , int DataLayout, typename IndexType >
void test_sycl_computations ( const Eigen::SyclDevice &  sycl_device)

Checks the following coefficient-wise expressions on the SYCL device:

a=1.2f

a=b*1.2f

c=a*b

c=a+b

c=a*a

a*3.14f + b*2.7f

d= (a>0.5? b:c)

{
  IndexType sizeDim1 = 100;
  IndexType sizeDim2 = 10;
  IndexType sizeDim3 = 20;
  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
  Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
  Tensor<DataType, 3, DataLayout, IndexType> in2(tensorRange);
  Tensor<DataType, 3, DataLayout, IndexType> in3(tensorRange);
  Tensor<DataType, 3, DataLayout, IndexType> out(tensorRange);

  in2 = in2.random();
  in3 = in3.random();

  DataType* gpu_in1_data = static_cast<DataType*>(sycl_device.allocate(in1.size() * sizeof(DataType)));
  DataType* gpu_in2_data = static_cast<DataType*>(sycl_device.allocate(in2.size() * sizeof(DataType)));
  DataType* gpu_in3_data = static_cast<DataType*>(sycl_device.allocate(in3.size() * sizeof(DataType)));
  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(out.size() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in1(gpu_in1_data, tensorRange);
  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in2(gpu_in2_data, tensorRange);
  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_in3(gpu_in3_data, tensorRange);
  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu_out(gpu_out_data, tensorRange);

  // a=1.2f
  gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f);
  sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data, (in1.size()) * sizeof(DataType));
  sycl_device.synchronize();

  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(in1(i, j, k), 1.2f);
      }
    }
  }
  printf("a=1.2f Test passed\n");

  // a=b*1.2f
  gpu_out.device(sycl_device) = gpu_in1 * 1.2f;
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, (out.size()) * sizeof(DataType));
  sycl_device.synchronize();

  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(out(i, j, k), in1(i, j, k) * 1.2f);
      }
    }
  }
  printf("a=b*1.2f Test Passed\n");

  // c=a*b
  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(), (in2.size()) * sizeof(DataType));
  gpu_out.device(sycl_device) = gpu_in1 * gpu_in2;
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, (out.size()) * sizeof(DataType));
  sycl_device.synchronize();

  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(out(i, j, k), in1(i, j, k) * in2(i, j, k));
      }
    }
  }
  printf("c=a*b Test Passed\n");

  // c=a+b
  gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, (out.size()) * sizeof(DataType));
  sycl_device.synchronize();
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(out(i, j, k), in1(i, j, k) + in2(i, j, k));
      }
    }
  }
  printf("c=a+b Test Passed\n");

  // c=a*a
  gpu_out.device(sycl_device) = gpu_in1 * gpu_in1;
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, (out.size()) * sizeof(DataType));
  sycl_device.synchronize();
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(out(i, j, k), in1(i, j, k) * in1(i, j, k));
      }
    }
  }
  printf("c= a*a Test Passed\n");

  // a*3.14f + b*2.7f
  gpu_out.device(sycl_device) = gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f);
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, (out.size()) * sizeof(DataType));
  sycl_device.synchronize();
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(out(i, j, k), in1(i, j, k) * 3.14f + in2(i, j, k) * 2.7f);
      }
    }
  }
  printf("a*3.14f + b*2.7f Test Passed\n");

  // d= (a>0.5? b:c)
  sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(), (in3.size()) * sizeof(DataType));
  gpu_out.device(sycl_device) = (gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3);
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data, (out.size()) * sizeof(DataType));
  sycl_device.synchronize();
  for (IndexType i = 0; i < sizeDim1; ++i) {
    for (IndexType j = 0; j < sizeDim2; ++j) {
      for (IndexType k = 0; k < sizeDim3; ++k) {
        VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f) ? in2(i, j, k) : in3(i, j, k));
      }
    }
  }
  printf("d= (a>0.5? b:c) Test Passed\n");
  sycl_device.deallocate(gpu_in1_data);
  sycl_device.deallocate(gpu_in2_data);
  sycl_device.deallocate(gpu_in3_data);
  sycl_device.deallocate(gpu_out_data);
}

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), i, j, k, out(), Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::size(), and VERIFY_IS_APPROX.
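
The final check exercises TensorBase::select; a minimal host-side sketch of the same ternary expression (hypothetical standalone example):

  #include <unsupported/Eigen/CXX11/Tensor>

  void host_select_example() {
    Eigen::Tensor<float, 3> a(4, 4, 4), b(4, 4, 4), c(4, 4, 4);
    a.setRandom();
    b.setRandom();
    c.setRandom();
    // d(i,j,k) = a(i,j,k) > 0.5f ? b(i,j,k) : c(i,j,k)
    Eigen::Tensor<float, 3> d = (a > a.constant(0.5f)).select(b, c);
  }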

◆ test_sycl_mem_sync()

template<typename DataType , int DataLayout, typename IndexType >
void test_sycl_mem_sync ( const Eigen::SyclDevice &  sycl_device)
{
  IndexType size = 20;
  array<IndexType, 1> tensorRange = {{size}};
  Tensor<DataType, 1, DataLayout, IndexType> in1(tensorRange);
  Tensor<DataType, 1, DataLayout, IndexType> in2(tensorRange);
  Tensor<DataType, 1, DataLayout, IndexType> out(tensorRange);

  in1 = in1.random();
  in2 = in1;

  DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(in1.size() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 1, DataLayout, IndexType>> gpu1(gpu_data, tensorRange);
  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), (in1.size()) * sizeof(DataType));
  sycl_device.synchronize();
  in1.setZero();

  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, out.size() * sizeof(DataType));
  sycl_device.synchronize();

  for (IndexType i = 0; i < in1.size(); ++i) {
    VERIFY_IS_APPROX(out(i), in2(i));
  }

  sycl_device.deallocate(gpu_data);
}

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), i, out(), Eigen::TensorBase< Derived, AccessLevel >::setZero(), size, Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::size(), and VERIFY_IS_APPROX.

◆ test_sycl_mem_sync_offsets()

template<typename DataType , int DataLayout, typename IndexType >
void test_sycl_mem_sync_offsets ( const Eigen::SyclDevice &  sycl_device)
{
  using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
  IndexType full_size = 32;
  IndexType half_size = full_size / 2;
  array<IndexType, 1> tensorRange = {{full_size}};
  tensor_type in1(tensorRange);
  tensor_type out(tensorRange);

  DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
  TensorMap<tensor_type> gpu1(gpu_data, tensorRange);

  in1 = in1.random();
  // Copy all data to device, then permute on copy back to host
  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out.data(), gpu_data + half_size, half_size * sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out.data() + half_size, gpu_data, half_size * sizeof(DataType));

  for (IndexType i = 0; i < half_size; ++i) {
    VERIFY_IS_APPROX(out(i), in1(i + half_size));
    VERIFY_IS_APPROX(out(i + half_size), in1(i));
  }

  in1 = in1.random();
  out.setZero();
  // Permute copies to device, then copy all back to host
  sycl_device.memcpyHostToDevice(gpu_data + half_size, in1.data(), half_size * sizeof(DataType));
  sycl_device.memcpyHostToDevice(gpu_data, in1.data() + half_size, half_size * sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));

  for (IndexType i = 0; i < half_size; ++i) {
    VERIFY_IS_APPROX(out(i), in1(i + half_size));
    VERIFY_IS_APPROX(out(i + half_size), in1(i));
  }

  in1 = in1.random();
  out.setZero();
  DataType* gpu_data_out = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
  TensorMap<tensor_type> gpu2(gpu_data_out, tensorRange);
  // Copy all to device, permute copies on device, then copy all back to host
  sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType));
  sycl_device.memcpy(gpu_data_out + half_size, gpu_data, half_size * sizeof(DataType));
  sycl_device.memcpy(gpu_data_out, gpu_data + half_size, half_size * sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, full_size * sizeof(DataType));

  for (IndexType i = 0; i < half_size; ++i) {
    VERIFY_IS_APPROX(out(i), in1(i + half_size));
    VERIFY_IS_APPROX(out(i + half_size), in1(i));
  }

  sycl_device.deallocate(gpu_data_out);
  sycl_device.deallocate(gpu_data);
}

References i, out(), and VERIFY_IS_APPROX.
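
The test relies on three copy directions exposed by Eigen::SyclDevice. A compact sketch of that API surface as used above; the helper name and parameters are illustrative only:

  #include <cstddef>
  #include <unsupported/Eigen/CXX11/Tensor>  // with EIGEN_USE_SYCL defined first

  // Hypothetical helper summarizing the three copy directions used by the test.
  template <typename DataType>
  void copy_directions_sketch(const Eigen::SyclDevice& sycl_device, DataType* gpu_dst,
                              DataType* gpu_src, DataType* host_buf, std::size_t n) {
    sycl_device.memcpyHostToDevice(gpu_dst, host_buf, n * sizeof(DataType));  // host -> device
    sycl_device.memcpyDeviceToHost(host_buf, gpu_src, n * sizeof(DataType));  // device -> host
    sycl_device.memcpy(gpu_dst, gpu_src, n * sizeof(DataType));               // device -> device
  }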

◆ test_sycl_mem_transfers()

template<typename DataType , int DataLayout, typename IndexType >
void test_sycl_mem_transfers ( const Eigen::SyclDevice &  sycl_device)
{
  IndexType sizeDim1 = 5;
  IndexType sizeDim2 = 5;
  IndexType sizeDim3 = 1;
  array<IndexType, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
  Tensor<DataType, 3, DataLayout, IndexType> in1(tensorRange);
  Tensor<DataType, 3, DataLayout, IndexType> out1(tensorRange);
  Tensor<DataType, 3, DataLayout, IndexType> out2(tensorRange);
  Tensor<DataType, 3, DataLayout, IndexType> out3(tensorRange);

  in1 = in1.random();

  DataType* gpu_data1 = static_cast<DataType*>(sycl_device.allocate(in1.size() * sizeof(DataType)));
  DataType* gpu_data2 = static_cast<DataType*>(sycl_device.allocate(out1.size() * sizeof(DataType)));

  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu1(gpu_data1, tensorRange);
  TensorMap<Tensor<DataType, 3, DataLayout, IndexType>> gpu2(gpu_data2, tensorRange);

  sycl_device.memcpyHostToDevice(gpu_data1, in1.data(), (in1.size()) * sizeof(DataType));
  sycl_device.memcpyHostToDevice(gpu_data2, in1.data(), (in1.size()) * sizeof(DataType));
  gpu1.device(sycl_device) = gpu1 * 3.14f;
  gpu2.device(sycl_device) = gpu2 * 2.7f;
  sycl_device.memcpyDeviceToHost(out1.data(), gpu_data1, (out1.size()) * sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out2.data(), gpu_data1, (out2.size()) * sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out3.data(), gpu_data2, (out3.size()) * sizeof(DataType));
  sycl_device.synchronize();

  for (IndexType i = 0; i < in1.size(); ++i) {
    // std::cout << "SYCL DATA : " << out1(i) << " vs CPU DATA : " << in1(i) * 3.14f << "\n";
    VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f);
    VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f);
    VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f);
  }

  sycl_device.deallocate(gpu_data1);
  sycl_device.deallocate(gpu_data2);
}

References Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::data(), Eigen::TensorBase< Derived, AccessLevel >::device(), i, Eigen::Tensor< Scalar_, NumIndices_, Options_, IndexType_ >::size(), and VERIFY_IS_APPROX.

◆ test_sycl_memset_offsets()

template<typename DataType , int DataLayout, typename IndexType >
void test_sycl_memset_offsets ( const Eigen::SyclDevice &  sycl_device)
{
  using tensor_type = Tensor<DataType, 1, DataLayout, IndexType>;
  IndexType full_size = 32;
  IndexType half_size = full_size / 2;
  array<IndexType, 1> tensorRange = {{full_size}};
  tensor_type cpu_out(tensorRange);
  tensor_type out(tensorRange);

  cpu_out.setZero();

  std::memset(cpu_out.data(), 0, half_size * sizeof(DataType));
  std::memset(cpu_out.data() + half_size, 1, half_size * sizeof(DataType));

  DataType* gpu_data = static_cast<DataType*>(sycl_device.allocate(full_size * sizeof(DataType)));
  TensorMap<tensor_type> gpu1(gpu_data, tensorRange);

  sycl_device.memset(gpu_data, 0, half_size * sizeof(DataType));
  sycl_device.memset(gpu_data + half_size, 1, half_size * sizeof(DataType));
  sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType));

  for (IndexType i = 0; i < full_size; ++i) {
    VERIFY_IS_APPROX(out(i), cpu_out(i));
  }

  sycl_device.deallocate(gpu_data);
}

References i, out(), and VERIFY_IS_APPROX.
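
Note that both std::memset and the device memset fill bytes, not elements: with DataType = float, the byte value 1 produces the bit pattern 0x01010101 in each element (roughly 2.4e-38), not 1.0f, and the test only checks that the device fill matches the host fill. A small host-only illustration of this byte-fill behaviour (hypothetical example):

  #include <cstdio>
  #include <cstring>

  int main() {
    float x;
    std::memset(&x, 1, sizeof(x));  // every byte of x becomes 0x01
    std::printf("%g\n", x);         // prints a tiny value (~2.4e-38), not 1
    return 0;
  }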