#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H

#include <unordered_set>

#include "./InternalHeaderCheck.h"

namespace Eigen {

namespace TensorSycl {
namespace internal {
// Caches the device properties queried through get_info so they are fetched
// only once per queue.
struct SyclDeviceInfo {
  SyclDeviceInfo(cl::sycl::queue queue)
      : local_mem_type(queue.get_device().template get_info<cl::sycl::info::device::local_mem_type>()),
        max_work_item_sizes(queue.get_device().template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
        max_mem_alloc_size(queue.get_device().template get_info<cl::sycl::info::device::max_mem_alloc_size>()),
        max_compute_units(queue.get_device().template get_info<cl::sycl::info::device::max_compute_units>()),
        max_work_group_size(queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>()),
        local_mem_size(queue.get_device().template get_info<cl::sycl::info::device::local_mem_size>()),
        platform_name(queue.get_device().get_platform().template get_info<cl::sycl::info::platform::name>()),
        device_name(queue.get_device().template get_info<cl::sycl::info::device::name>()),
        device_vendor(queue.get_device().template get_info<cl::sycl::info::device::vendor>()) {}

  cl::sycl::info::local_mem_type local_mem_type;
  cl::sycl::id<3> max_work_item_sizes;
  unsigned long max_mem_alloc_size;
  unsigned long max_compute_units;
  unsigned long max_work_group_size;
  size_t local_mem_size;
  std::string platform_name;
  std::string device_name;
  std::string device_vendor;
};

}  // end namespace internal
}  // end namespace TensorSycl
// Enumerate the SYCL devices usable by the Eigen SYCL backend, filtering out
// combinations known not to work.
EIGEN_STRONG_INLINE auto get_sycl_supported_devices() -> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
  return {cl::sycl::device(cl::sycl::default_selector())};
#else
  std::vector<cl::sycl::device> supported_devices;
  auto platform_list = cl::sycl::platform::get_platforms();
  for (const auto &platform : platform_list) {
    auto device_list = platform.get_devices();
    auto platform_name = platform.template get_info<cl::sycl::info::platform::name>();
    std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower);
    for (const auto &device : device_list) {
      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
      // Exclude AMD CPU devices (unless they are APUs), experimental platforms
      // and the host device.
      bool unsupported_condition = (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
                                    vendor.find("apu") == std::string::npos) ||
                                   (platform_name.find("experimental") != std::string::npos) || device.is_host();
      if (!unsupported_condition) {
        supported_devices.push_back(device);
      }
    }
  }
  return supported_devices;
#endif
}
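
// Usage sketch (illustrative, not part of the header's API): pick one of the
// filtered devices and build a plain SYCL queue from it.
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   if (!devices.empty()) {
//     cl::sycl::queue q(devices[0]);
//   }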

class QueueInterface {
 public:
  /// Create the queue from a device or a device selector, installing an
  /// explicit asynchronous exception handler.
  template <typename DeviceOrSelector>
  explicit QueueInterface(const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
                          unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue{dev_or_sel, handler, {sycl::property::queue::in_order()}},
        m_thread_pool(num_threads),
        m_device_info(m_queue) {}

  /// Same as above, with a default handler that only records whether an
  /// asynchronous exception was raised.
  template <typename DeviceOrSelector>
  explicit QueueInterface(const DeviceOrSelector &dev_or_sel,
                          unsigned num_threads = std::thread::hardware_concurrency())
      : QueueInterface(
            dev_or_sel, [this](cl::sycl::exception_list l) { this->exception_caught_ = this->sycl_async_handler(l); },
            num_threads) {}

  /// Wrap an existing queue.
  explicit QueueInterface(const cl::sycl::queue &q, unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue(q), m_thread_pool(num_threads), m_device_info(m_queue) {}
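
  // Usage sketch (illustrative only): the interface can be built from a
  // selector, a device, or an existing queue; names below are hypothetical.
  //
  //   Eigen::QueueInterface from_selector(cl::sycl::default_selector());
  //   Eigen::QueueInterface from_device(Eigen::get_sycl_supported_devices()[0]);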

  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    return (void *)cl::sycl::aligned_alloc_device(EIGEN_MAX_ALIGN_BYTES, num_bytes, m_queue);
#else
    return (void *)cl::sycl::malloc_device(num_bytes, m_queue);
#endif
  }

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return (void *)cl::sycl::malloc_device<uint8_t>(num_bytes, m_queue);
  }

  template <typename data_t>
  EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
    return data;
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { deallocate(p); }
  EIGEN_STRONG_INLINE void deallocate_temp(const void *p) const { deallocate_temp(const_cast<void *>(p)); }
  EIGEN_STRONG_INLINE void deallocate(void *p) const { cl::sycl::free(p, m_queue); }

  EIGEN_STRONG_INLINE void memcpyHostToDevice(void *dst, const void *src, size_t n,
                                              std::function<void()> callback) const {
    auto e = m_queue.memcpy(dst, src, n);
    synchronize_and_callback(e, callback);
  }

  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const void *src, size_t n,
                                              std::function<void()> callback) const {
    if (n == 0) {
      if (callback) callback();
      return;
    }
    auto e = m_queue.memcpy(dst, src, n);
    synchronize_and_callback(e, callback);
  }

  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    m_queue.memcpy(dst, src, n).wait();
  }

  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    m_queue.memset(data, c, n).wait();
  }
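
  // Usage sketch (illustrative, `queue_interface` is a hypothetical instance):
  // allocate device memory, copy a host buffer in, and release it again.
  // All sizes are byte counts, mirroring the calls above.
  //
  //   std::vector<float> host(128, 1.0f);
  //   void *dev = queue_interface.allocate(host.size() * sizeof(float));
  //   queue_interface.memcpyHostToDevice(dev, host.data(),
  //                                      host.size() * sizeof(float), {});
  //   queue_interface.deallocate(dev);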

  template <typename T>
  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
    const size_t count = end - begin;
    m_queue.fill(begin, value, count).wait();
  }

  template <typename OutScalar, typename sycl_kernel, typename Lhs, typename Rhs, typename OutPtr, typename Range,
            typename Index, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(const Lhs &lhs, const Rhs &rhs, OutPtr outptr,
                                                             Range thread_range, Index scratchSize, T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
          LocalAccessor;
      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
    };
    return m_queue.submit(kernel_functor);
  }

  template <typename OutScalar, typename sycl_kernel, typename InPtr, typename OutPtr, typename Range, typename Index,
            typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(const InPtr &inptr, OutPtr &outptr, Range thread_range,
                                                            Index scratchSize, T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
          LocalAccessor;
      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, outptr, var...));
    };
    return m_queue.submit(kernel_functor);
  }

  template <typename OutScalar, typename sycl_kernel, typename InPtr, typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(const InPtr &inptr, Range thread_range,
                                                              Index scratchSize, T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
          LocalAccessor;
      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, var...));
    };
    return m_queue.submit(kernel_functor);
  }
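
  // The three launchers above share one pattern: the command group creates a
  // work-group-local scratch accessor of `scratchSize` elements and hands it,
  // together with the user pointers and any extra arguments, to the kernel
  // functor's constructor before submitting on m_queue. Sketch of the functor
  // shape the unary launcher expects (hypothetical example):
  //
  //   struct MyKernel {
  //     MyKernel(LocalAccessor scratch, const float *in, float *out, int n);
  //     void operator()(cl::sycl::nd_item<1> item) const;
  //   };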

  EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
    m_queue.wait_and_throw();
#else
    m_queue.wait();
#endif
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
                        static_cast<Index>(tileSize));
    rng = n;
    if (rng == 0) rng = static_cast<Index>(1);
    GRange = rng;
    if (tileSize > GRange)
      tileSize = GRange;
    else if (GRange > tileSize) {
      Index xMode = static_cast<Index>(GRange % tileSize);
      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
    }
  }
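
  // Worked example (illustrative): with a nearest power-of-two work-group
  // size of 256 and n = 1000, tileSize becomes 256, rng = 1000, and GRange is
  // rounded up to the next multiple of the tile (1000 % 256 = 232, so
  // GRange += 24), giving 1024; the last work-group covers the tail.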

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
                                              cl::sycl::range<2> &local_range) const {
    std::array<Index, 2> input_range = input_dim;
    Index max_workgroup_Size = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
                                  static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[1] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0) global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0) global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
                                              cl::sycl::range<3> &local_range) const {
    std::array<Index, 3> input_range = input_dim;
    Index max_workgroup_Size = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
                                  static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[2] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
    input_range[2] = input_dim[2];
    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
    global_range[2] = input_range[2];
    if (local_range[2] > global_range[2])
      local_range[2] = global_range[2];
    else if (global_range[2] > local_range[2]) {
      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
      if (xMode != 0) global_range[2] += static_cast<Index>(local_range[2] - xMode);
    }
    pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
    local_range[1] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0) global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / (local_range[1] * local_range[2]));
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0) global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }

  EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return true;
#else
    return m_device_info.local_mem_type == cl::sycl::info::local_mem_type::local;
#endif
  }

  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { return m_device_info.max_mem_alloc_size; }

  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { return m_device_info.max_compute_units; }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { return m_device_info.max_work_group_size; }

  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { return m_device_info.max_work_item_sizes; }

  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return m_device_info.local_mem_size; }

  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return getPowerOfTwo(m_device_info.max_work_group_size, false);
  }

  // Returns the power of two closest to wGSize: the next one when roundUp is
  // true, the previous one otherwise (classic bit-smearing trick).
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
    if (roundUp) --wGSize;
    wGSize |= (wGSize >> 1);
    wGSize |= (wGSize >> 2);
    wGSize |= (wGSize >> 4);
    wGSize |= (wGSize >> 8);
    wGSize |= (wGSize >> 16);
#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
    wGSize |= (wGSize >> 32);
#endif
    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
  }
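
  // Worked example (illustrative): for wGSize = 600 the shifts smear the
  // leading bit so wGSize becomes 1023; rounding down yields
  // 1023 - 511 = 512, while rounding up first decrements 600 to 599, smears
  // it to 1023, then increments to 1024.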

  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }

  // Returns true if no asynchronous error has been recorded on this queue.
  EIGEN_STRONG_INLINE bool ok() const {
    if (!exception_caught_) {
      synchronize();
    }
    return !exception_caught_;
  }

  // Wait for the given event and then invoke the callback from the thread
  // pool; when no callback is provided, simply block on the whole queue.
  void synchronize_and_callback(cl::sycl::event e, const std::function<void()> &callback) const {
    if (callback) {
      auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
        cl::sycl::event(e).wait_and_throw();
#else
        cl::sycl::event(e).wait();
#endif
        callback();
      };
      m_thread_pool.Schedule(std::move(callback_));
    } else {
#ifdef EIGEN_EXCEPTIONS
      m_queue.wait_and_throw();
#else
      m_queue.wait();
#endif
    }
  }

  // Async handler installed on the queue: records whether any asynchronous
  // exception was reported.
  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
    bool exception_caught = false;
    for (const auto &e : exceptions) {
      if (e) {
        exception_caught = true;
        EIGEN_THROW_X(e);
      }
    }
    return exception_caught;
  }

  /// class members:
  bool exception_caught_ = false;

  mutable cl::sycl::queue m_queue;
  // Thread pool used to wait on events and to run user callbacks without
  // blocking the caller.
  mutable Eigen::ThreadPool m_thread_pool;
  const TensorSycl::internal::SyclDeviceInfo m_device_info;
};

struct SyclDeviceBase {
  // The QueueInterface is not owned: the caller must keep it alive for as
  // long as the device is used and destroy it afterwards.
  const QueueInterface *m_queue_stream;
  explicit SyclDeviceBase(const QueueInterface *queue_stream) : m_queue_stream(queue_stream) {}
  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { return m_queue_stream; }
};

// The SYCL device used by TensorEvaluator; it forwards everything to the
// underlying QueueInterface.
struct SyclDevice : public SyclDeviceBase {
  explicit SyclDevice(const QueueInterface *queue_stream) : SyclDeviceBase(queue_stream) {}

  // Split a 1-dimensional workload of size n into tileSize, rng and GRange.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
                                              cl::sycl::range<2> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
                                              cl::sycl::range<3> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  /// Allocate device memory.
  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { return queue_stream()->allocate(num_bytes); }

  /// Allocate temporary device memory.
  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { return queue_stream()->allocate_temp(num_bytes); }

  /// Deallocate device memory.
  EIGEN_STRONG_INLINE void deallocate(void *p) const { queue_stream()->deallocate(p); }

  /// Deallocate temporary device memory.
  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { queue_stream()->deallocate_temp(buffer); }
  EIGEN_STRONG_INLINE void deallocate_temp(const void *buffer) const { queue_stream()->deallocate_temp(buffer); }

  template <typename data_t>
  EIGEN_STRONG_INLINE data_t *get(data_t *data) const { return queue_stream()->get(data); }

  /// Copy n bytes from host memory (src) to device memory (dst); the optional
  /// callback runs once the copy has completed.
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n,
                                              std::function<void()> callback = {}) const {
    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
  }

  /// Copy n bytes from device memory (src) back to host memory (dst).
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n,
                                              std::function<void()> callback = {}) const {
    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
  }

  /// Copy n bytes between two device buffers.
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
    queue_stream()->memcpy(dst, src, n);
  }

  /// Fill the device range [begin, end) with value.
  template <typename T>
  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
    queue_stream()->fill(begin, end, value);
  }

  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return queue_stream()->sycl_queue(); }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // There is no portable way to query the cache hierarchy of a SYCL device,
    // so fall back to the first-level estimate.
    return firstLevelCacheSize();
  }

  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return queue_stream()->getNumSyclMultiProcessors();
  }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { return queue_stream()->maxSyclThreadsPerBlock(); }

  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { return queue_stream()->maxWorkItemSizes(); }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    return queue_stream()->maxSyclThreadsPerMultiProcessor();
  }

  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return queue_stream()->sharedMemPerBlock(); }

  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
  }

  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
    return queue_stream()->getPowerOfTwo(val, roundUp);
  }

  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return queue_stream()->majorDeviceVersion(); }

  EIGEN_STRONG_INLINE bool has_local_memory() const { return queue_stream()->has_local_memory(); }

  EIGEN_STRONG_INLINE long max_buffer_size() const { return queue_stream()->max_buffer_size(); }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(T... var) const {
    return queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(var...);
  }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(T... var) const {
    return queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(var...);
  }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(T... var) const {
    return queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(var...);
  }
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H