TensorDeviceSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
#include <unordered_set>

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {

namespace TensorSycl {
namespace internal {

// Caches frequently used device properties once at construction so that hot
// paths do not have to query the SYCL runtime repeatedly.
struct SyclDeviceInfo {
  SyclDeviceInfo(cl::sycl::queue queue)
      : local_mem_type(queue.get_device().template get_info<cl::sycl::info::device::local_mem_type>()),
        max_work_item_sizes(queue.get_device().template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
        max_mem_alloc_size(queue.get_device().template get_info<cl::sycl::info::device::max_mem_alloc_size>()),
        max_compute_units(queue.get_device().template get_info<cl::sycl::info::device::max_compute_units>()),
        max_work_group_size(queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>()),
        local_mem_size(queue.get_device().template get_info<cl::sycl::info::device::local_mem_size>()),
        platform_name(queue.get_device().get_platform().template get_info<cl::sycl::info::platform::name>()),
        device_name(queue.get_device().template get_info<cl::sycl::info::device::name>()),
        device_vendor(queue.get_device().template get_info<cl::sycl::info::device::vendor>()) {}

  cl::sycl::info::local_mem_type local_mem_type;
  cl::sycl::id<3> max_work_item_sizes;
  unsigned long max_mem_alloc_size;
  unsigned long max_compute_units;
  unsigned long max_work_group_size;
  size_t local_mem_size;
  std::string platform_name;
  std::string device_name;
  std::string device_vendor;
};

}  // end namespace internal
}  // end namespace TensorSycl

// All devices (even an AMD CPU exposed through the Intel OpenCL runtime) that
// support OpenCL and can consume SPIR or SPIR-V can use the Eigen SYCL
// backend, and consequently TensorFlow via the Eigen SYCL backend.
EIGEN_STRONG_INLINE auto get_sycl_supported_devices() -> decltype(cl::sycl::device::get_devices()) {
#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
  return {cl::sycl::device(cl::sycl::default_selector())};
#else
  std::vector<cl::sycl::device> supported_devices;
  auto platform_list = cl::sycl::platform::get_platforms();
  for (const auto &platform : platform_list) {
    auto device_list = platform.get_devices();
    auto platform_name = platform.template get_info<cl::sycl::info::platform::name>();
    std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower);
    for (const auto &device : device_list) {
      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
      // Exclude CPU devices exposed through AMD's OpenCL platform (unless the
      // vendor string marks an APU), experimental platforms, and the host device.
      bool unsupported_condition = (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
                                    vendor.find("apu") == std::string::npos) ||
                                   (platform_name.find("experimental") != std::string::npos) || device.is_host();
      if (!unsupported_condition) {
        supported_devices.push_back(device);
      }
    }
  }
  return supported_devices;
#endif
}
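
// Example (illustrative only, not part of the header): enumerating the
// devices Eigen considers usable. Assumes a working SYCL runtime; the
// reported names depend on the installed platforms.
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   for (const auto &d : devices) {
//     std::cout << d.get_info<cl::sycl::info::device::name>() << std::endl;
//   }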

class QueueInterface {
 public:
  // Create the queue from a cl::sycl::device or a device selector, installing
  // the given asynchronous-exception handler. The queue is created in-order.
  template <typename DeviceOrSelector>
  explicit QueueInterface(const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
                          unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue{dev_or_sel, handler, {sycl::property::queue::in_order()}},
        m_thread_pool(num_threads),
        m_device_info(m_queue) {}

  // Same, but install this object's own handler, which records whether an
  // asynchronous exception has been caught (see ok()).
  template <typename DeviceOrSelector>
  explicit QueueInterface(const DeviceOrSelector &dev_or_sel,
                          unsigned num_threads = std::thread::hardware_concurrency())
      : QueueInterface(
            dev_or_sel, [this](cl::sycl::exception_list l) { this->exception_caught_ = this->sycl_async_handler(l); },
            num_threads) {}

  // Wrap an existing queue.
  explicit QueueInterface(const cl::sycl::queue &q, unsigned num_threads = std::thread::hardware_concurrency())
      : m_queue(q), m_thread_pool(num_threads), m_device_info(m_queue) {}

  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
#if EIGEN_MAX_ALIGN_BYTES > 0
    return (void *)cl::sycl::aligned_alloc_device(EIGEN_MAX_ALIGN_BYTES, num_bytes, m_queue);
#else
    return (void *)cl::sycl::malloc_device(num_bytes, m_queue);
#endif
  }

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
    return (void *)cl::sycl::malloc_device<uint8_t>(num_bytes, m_queue);
  }

  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
    return data;
  }

  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { deallocate(p); }

  EIGEN_STRONG_INLINE void deallocate_temp(const void *p) const { deallocate_temp(const_cast<void *>(p)); }

  EIGEN_STRONG_INLINE void deallocate(void *p) const { cl::sycl::free(p, m_queue); }
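
  // Example (illustrative only): round-tripping data through device memory.
  // Assumes `qi` is a valid QueueInterface; the returned pointer is a USM
  // device allocation tied to qi's queue.
  //
  //   float host_in[4] = {1.f, 2.f, 3.f, 4.f}, host_out[4];
  //   void *dev = qi.allocate(sizeof(host_in));
  //   qi.memcpyHostToDevice(dev, host_in, sizeof(host_in), {});   // blocks
  //   qi.memcpyDeviceToHost(host_out, dev, sizeof(host_out), {}); // blocks
  //   qi.deallocate(dev);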

  // Copy n bytes from host memory to device memory. If a callback is given,
  // the wait on the copy is offloaded to the internal thread pool and the
  // callback runs once the copy has finished; otherwise the call blocks.
  EIGEN_STRONG_INLINE void memcpyHostToDevice(void *dst, const void *src, size_t n,
                                              std::function<void()> callback) const {
    auto e = m_queue.memcpy(dst, src, n);
    synchronize_and_callback(e, callback);
  }

  // Copy n bytes from device memory to host memory, with the same callback
  // semantics as memcpyHostToDevice.
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const void *src, size_t n,
                                              std::function<void()> callback) const {
    if (n == 0) {
      if (callback) callback();
      return;
    }
    auto e = m_queue.memcpy(dst, src, n);
    synchronize_and_callback(e, callback);
  }

  // Blocking copy of n bytes; waits for the copy to complete.
  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
    if (n == 0) {
      return;
    }
    m_queue.memcpy(dst, src, n).wait();
  }

  // Blocking memset on device memory.
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
    if (n == 0) {
      return;
    }
    m_queue.memset(data, c, n).wait();
  }

  // Blocking fill of the device range [begin, end) with value.
  template <typename T>
  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
    if (begin == end) {
      return;
    }
    const size_t count = end - begin;
    m_queue.fill(begin, value, count).wait();
  }

  // The kernel launchers below allocate `scratchSize` elements of local
  // (work-group) memory and pass the resulting accessor to the kernel functor
  // along with the operand pointers.
  template <typename OutScalar, typename sycl_kernel, typename Lhs, typename Rhs, typename OutPtr, typename Range,
            typename Index, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(const Lhs &lhs, const Rhs &rhs, OutPtr outptr,
                                                             Range thread_range, Index scratchSize, T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
    };

    return m_queue.submit(kernel_functor);
  }

  template <typename OutScalar, typename sycl_kernel, typename InPtr, typename OutPtr, typename Range, typename Index,
            typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(const InPtr &inptr, OutPtr &outptr, Range thread_range,
                                                            Index scratchSize, T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, outptr, var...));
    };
    return m_queue.submit(kernel_functor);
  }

  template <typename OutScalar, typename sycl_kernel, typename InPtr, typename Range, typename Index, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(const InPtr &inptr, Range thread_range, Index scratchSize,
                                                              T... var) const {
    auto kernel_functor = [=](cl::sycl::handler &cgh) {
      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
          LocalAccessor;

      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
      cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, var...));
    };

    return m_queue.submit(kernel_functor);
  }
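
  // Example (illustrative only; names are hypothetical): the shape of a
  // functor that unary_kernel_launcher expects for an nd_range launch. Real
  // kernels are defined by the TensorSycl expression evaluators.
  //
  //   template <typename OutScalar, typename InPtr, typename OutPtr>
  //   struct MyUnaryKernel {
  //     using LocalAcc = cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write,
  //                                         cl::sycl::access::target::local>;
  //     LocalAcc scratch;
  //     InPtr in;
  //     OutPtr out;
  //     MyUnaryKernel(LocalAcc s, InPtr i, OutPtr o) : scratch(s), in(i), out(o) {}
  //     void operator()(cl::sycl::nd_item<1> item) const {
  //       // stage partial results in `scratch`, then write to `out`
  //     }
  //   };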

  EIGEN_STRONG_INLINE void synchronize() const {
#ifdef EIGEN_EXCEPTIONS
    m_queue.wait_and_throw();
#else
    m_queue.wait();
#endif
  }

  // Compute a 1-D launch configuration: a power-of-two tile (work-group) size
  // capped by EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1, and
  // a global range GRange rounded up to the next multiple of the tile size.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
                        static_cast<Index>(tileSize));
    rng = n;
    if (rng == 0) rng = static_cast<Index>(1);
    GRange = rng;
    if (tileSize > GRange)
      tileSize = GRange;
    else if (GRange > tileSize) {
      Index xMode = static_cast<Index>(GRange % tileSize);
      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
    }
  }
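
  // Worked example (assuming the device's work-group size rounds down to 256
  // and the EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 cap is
  // at least 256): for n = 1000, tileSize = 256 and rng = 1000, so GRange is
  // rounded up to 1000 + (256 - 1000 % 256) = 1024.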

  // Compute a 2-D launch configuration: the work-group budget is split across
  // the two dimensions, and each global range is rounded up to a multiple of
  // the corresponding local range.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
                                              cl::sycl::range<2> &local_range) const {
    std::array<Index, 2> input_range = input_dim;
    Index max_workgroup_Size = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
                                  static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[1] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0) global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0) global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }

  // Compute a 3-D launch configuration, analogous to the 2-D case.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
                                              cl::sycl::range<3> &local_range) const {
    std::array<Index, 3> input_range = input_dim;
    Index max_workgroup_Size = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
    max_workgroup_Size = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
                                  static_cast<Index>(max_workgroup_Size));
    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
    local_range[2] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
    input_range[2] = input_dim[2];
    if (input_range[2] == 0) input_range[2] = static_cast<Index>(1);
    global_range[2] = input_range[2];
    if (local_range[2] > global_range[2])
      local_range[2] = global_range[2];
    else if (global_range[2] > local_range[2]) {
      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
      if (xMode != 0) global_range[2] += static_cast<Index>(local_range[2] - xMode);
    }
    pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
    local_range[1] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
    input_range[1] = input_dim[1];
    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
    global_range[1] = input_range[1];
    if (local_range[1] > global_range[1])
      local_range[1] = global_range[1];
    else if (global_range[1] > local_range[1]) {
      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
      if (xMode != 0) global_range[1] += static_cast<Index>(local_range[1] - xMode);
    }
    local_range[0] = static_cast<Index>(max_workgroup_Size / (local_range[1] * local_range[2]));
    input_range[0] = input_dim[0];
    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
    global_range[0] = input_range[0];
    if (local_range[0] > global_range[0])
      local_range[0] = global_range[0];
    else if (global_range[0] > local_range[0]) {
      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
      if (xMode != 0) global_range[0] += static_cast<Index>(local_range[0] - xMode);
    }
  }

  EIGEN_STRONG_INLINE bool has_local_memory() const {
#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return false;
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
    return true;
#else
    return m_device_info.local_mem_type == cl::sycl::info::local_mem_type::local;
#endif
  }

  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { return m_device_info.max_mem_alloc_size; }

  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { return m_device_info.max_compute_units; }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { return m_device_info.max_work_group_size; }

  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { return m_device_info.max_work_item_sizes; }

  // SYCL/OpenCL has no analogue of the CUDA compute-capability version, so a
  // constant is reported.
  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }

  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    // OpenCL does not have such a concept.
    return 2;
  }

  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return m_device_info.local_mem_size; }

  // Returns the largest power-of-two work-group size that is <= the maximum
  // device work-group size.
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return getPowerOfTwo(m_device_info.max_work_group_size, false);
  }

  EIGEN_STRONG_INLINE std::string getPlatformName() const { return m_device_info.platform_name; }

  EIGEN_STRONG_INLINE std::string getDeviceName() const { return m_device_info.device_name; }

  EIGEN_STRONG_INLINE std::string getDeviceVendor() const { return m_device_info.device_vendor; }

  // Returns the nearest power of two: if roundUp is true the result is
  // >= wGSize, otherwise it is <= wGSize. The shifts smear the highest set
  // bit into every lower position.
  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
    if (roundUp) --wGSize;
    wGSize |= (wGSize >> 1);
    wGSize |= (wGSize >> 2);
    wGSize |= (wGSize >> 4);
    wGSize |= (wGSize >> 8);
    wGSize |= (wGSize >> 16);
#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
    wGSize |= (wGSize >> 32);
#endif
    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
  }
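
  // Worked example: getPowerOfTwo(100, false) smears 100 (0b1100100) into 127
  // (0b1111111); 127 - (127 >> 1) = 64, the largest power of two <= 100. With
  // roundUp = true, 100 - 1 = 99 also smears to 127 and ++wGSize gives 128,
  // the smallest power of two >= 100.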

  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }

  // This function checks if the runtime recorded an error for the underlying
  // stream device.
  EIGEN_STRONG_INLINE bool ok() const {
    if (!exception_caught_) {
      synchronize();
    }
    return !exception_caught_;
  }

 protected:
  void synchronize_and_callback(cl::sycl::event e, const std::function<void()> &callback) const {
    if (callback) {
      auto callback_ = [=]() {
#ifdef EIGEN_EXCEPTIONS
        cl::sycl::event(e).wait_and_throw();
#else
        cl::sycl::event(e).wait();
#endif
        callback();
      };
      m_thread_pool.Schedule(std::move(callback_));
    } else {
#ifdef EIGEN_EXCEPTIONS
      m_queue.wait_and_throw();
#else
      m_queue.wait();
#endif
    }
  }

  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
    bool exception_caught = false;
    for (const auto &e : exceptions) {
      if (e) {
        exception_caught = true;
        EIGEN_THROW_X(e);
      }
    }
    return exception_caught;
  }

  // Records whether the async handler has caught an exception.
  bool exception_caught_ = false;
  // The underlying in-order SYCL queue.
  mutable cl::sycl::queue m_queue;
  // Thread pool used to run memcpy callbacks without blocking the caller.
  mutable Eigen::ThreadPool m_thread_pool;

  const TensorSycl::internal::SyclDeviceInfo m_device_info;
};

struct SyclDeviceBase {
  // The QueueInterface is not owned; the caller must keep it alive for the
  // lifetime of the device.
  const QueueInterface *m_queue_stream;
  explicit SyclDeviceBase(const QueueInterface *queue_stream) : m_queue_stream(queue_stream) {}
  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { return m_queue_stream; }
};

// The SYCL device struct: it accepts a SYCL queue interface as input and
// forwards all device operations to it.
struct SyclDevice : public SyclDeviceBase {
  explicit SyclDevice(const QueueInterface *queue_stream) : SyclDeviceBase(queue_stream) {}

  // 1-D launch configuration (see QueueInterface::parallel_for_setup).
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
  }

  // 2-D launch configuration.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
                                              cl::sycl::range<2> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  // 3-D launch configuration.
  template <typename Index>
  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
                                              cl::sycl::range<3> &local_range) const {
    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
  }

  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { return queue_stream()->allocate(num_bytes); }

  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { return queue_stream()->allocate_temp(num_bytes); }

  EIGEN_STRONG_INLINE void deallocate(void *p) const { queue_stream()->deallocate(p); }

  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { queue_stream()->deallocate_temp(buffer); }

  EIGEN_STRONG_INLINE void deallocate_temp(const void *buffer) const { queue_stream()->deallocate_temp(buffer); }

  template <typename data_t>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
    return data;
  }

  // Some runtime conditions could be checked here.
  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }

  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n,
                                              std::function<void()> callback = {}) const {
    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
  }
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n,
                                              std::function<void()> callback = {}) const {
    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
  }
  template <typename Index>
  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
    queue_stream()->memcpy(dst, src, n);
  }
  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { queue_stream()->memset(data, c, n); }
  template <typename T>
  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
    queue_stream()->fill(begin, end, value);
  }
  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return queue_stream()->sycl_queue(); }

  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }

  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
    // We won't try to take advantage of the L2 cache for the time being, and
    // there is no L3 cache on SYCL devices.
    return firstLevelCacheSize();
  }
  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
    return queue_stream()->getNumSyclMultiProcessors();
  }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { return queue_stream()->maxSyclThreadsPerBlock(); }
  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { return queue_stream()->maxWorkItemSizes(); }
  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
    // OpenCL does not have such a concept.
    return queue_stream()->maxSyclThreadsPerMultiProcessor();
  }
  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return queue_stream()->sharedMemPerBlock(); }
  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
  }

  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
    return queue_stream()->getPowerOfTwo(val, roundUp);
  }
  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return queue_stream()->majorDeviceVersion(); }

  EIGEN_STRONG_INLINE void synchronize() const { queue_stream()->synchronize(); }

  // This function checks if the runtime recorded an error for the underlying
  // stream device.
  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }

  EIGEN_STRONG_INLINE bool has_local_memory() const { return queue_stream()->has_local_memory(); }
  // Returns unsigned long to match QueueInterface::max_buffer_size (the
  // original `long` return type silently narrowed the value).
  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { return queue_stream()->max_buffer_size(); }
  EIGEN_STRONG_INLINE std::string getPlatformName() const { return queue_stream()->getPlatformName(); }
  EIGEN_STRONG_INLINE std::string getDeviceName() const { return queue_stream()->getDeviceName(); }
  EIGEN_STRONG_INLINE std::string getDeviceVendor() const { return queue_stream()->getDeviceVendor(); }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(T... var) const {
    return queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(var...);
  }
  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(T... var) const {
    return queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(var...);
  }

  template <typename OutScalar, typename KernelType, typename... T>
  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(T... var) const {
    return queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(var...);
  }
};
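
// Example (illustrative only): constructing a SyclDevice and round-tripping a
// buffer through it. Assumes at least one supported device is present.
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   Eigen::QueueInterface queue_interface(devices[0]);
//   Eigen::SyclDevice device(&queue_interface);
//   float in[8] = {1, 2, 3, 4, 5, 6, 7, 8}, out[8];
//   float *gpu = static_cast<float *>(device.allocate(sizeof(in)));
//   device.memcpyHostToDevice(gpu, in, sizeof(in));
//   device.memcpyDeviceToHost(out, gpu, sizeof(out));
//   device.deallocate(gpu);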
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H