37 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
38 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
44 namespace TensorSycl {
47 #ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
48 #define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
51 template <
typename index_t>
67 ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
68 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
69 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
84 template <
typename Evaluator,
typename CoeffReturnType,
typename OutAccessor,
typename Op,
typename Index,
87 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
99 OutAccessor out_accessor_, OutAccessor temp_accessor_,
101 const bool inclusive_)
110 template <scan_step sst = stp,
typename Input>
112 const Input &inpt,
Index global_id)
const {
113 return inpt.coeff(global_id);
116 template <scan_step sst = stp,
typename Input>
118 const Input &inpt,
Index global_id)
const {
119 return inpt[global_id];
122 template <scan_step sst = stp,
typename InclusiveOp>
124 InclusiveOp inclusive_op)
const {
128 template <scan_step sst = stp,
typename InclusiveOp>
130 InclusiveOp)
const {}
134 Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
145 CoeffReturnType inclusive_scan;
153 const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
154 Index next_elements = 0;
156 for (
int i = 0; i < ScanParameters<Index>::ScanPerThread;
i++) {
157 Index global_id = global_offset + next_elements;
172 for (
int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex +=
PacketSize) {
173 Index private_offset = 1;
178 for (
Index l = 0; l < d; l++) {
179 Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
180 Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
194 private_offset >>= 1;
196 for (
Index l = 0; l < d; l++) {
197 Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
198 Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
202 private_scan[ai] = private_scan[bi];
210 for (
Index d = scratch_stride >> 1; d > 0; d >>= 1) {
212 itemID.barrier(cl::sycl::access::fence_space::local_space);
214 Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
215 Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
224 itemID.barrier(cl::sycl::access::fence_space::local_space);
238 for (
Index d = 1; d < scratch_stride; d *= 2) {
241 itemID.barrier(cl::sycl::access::fence_space::local_space);
243 Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
244 Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
253 itemID.barrier(cl::sycl::access::fence_space::local_space);
256 for (
int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex +=
PacketSize) {
259 CoeffReturnType accum = private_scan[packetIndex +
i];
261 private_scan[packetIndex +
i] =
accumulator.finalize(accum);
267 private_scan[0] =
accumulator.finalize(inclusive_scan);
273 for (
Index i = 0; i < ScanParameters<Index>::ScanPerThread;
i++) {
274 Index global_id = global_offset + next_elements;
279 out_ptr[global_id] = private_scan[private_id];
287 template <
typename CoeffReturnType,
typename InAccessor,
typename OutAccessor,
typename Op,
typename Index>
289 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
297 OutAccessor out_accessor_,
304 Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
320 const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
323 CoeffReturnType adjust_val =
in_ptr[in_id];
325 Index next_elements = 0;
327 for (
Index i = 0; i < ScanParameters<Index>::ScanPerThread;
i++) {
328 Index global_id = global_offset + next_elements;
332 CoeffReturnType accum = adjust_val;
342 template <
typename Index>
361 const Eigen::SyclDevice &
dev;
363 const Index &non_scan_size_,
const Index &scan_stride_,
const Index &non_scan_stride_,
364 const Eigen::SyclDevice &dev_)
374 Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
386 #ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
404 template <
typename EvaluatorPo
interType,
typename CoeffReturnType,
typename Reducer,
typename Index>
407 Reducer &accumulator,
const Index total_size,
408 const Index scan_size,
const Index panel_size,
409 const Index non_scan_size,
const Index scan_stride,
410 const Index non_scan_stride,
const Eigen::SyclDevice &dev) {
412 ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
416 dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
417 scan_info.max_elements_per_block,
418 scan_info.get_scan_parameter(), accumulator)
423 template <
typename CoeffReturnType, scan_step stp>
425 template <
typename Input,
typename EvaluatorPo
interType,
typename Reducer,
typename Index>
428 const Index non_scan_size,
const Index scan_stride,
429 const Index non_scan_stride,
const bool inclusive,
430 const Eigen::SyclDevice &dev) {
432 ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
433 const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
435 CoeffReturnType *temp_pointer =
436 static_cast<CoeffReturnType *
>(dev.allocate_temp(temp_pointer_size *
sizeof(CoeffReturnType)));
437 EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
440 dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
441 in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
442 scan_info.get_scan_parameter(), accumulator, inclusive)
445 if (scan_info.block_size > 1) {
447 tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
448 non_scan_size,
Index(1), scan_info.block_size,
false, dev);
451 tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
452 non_scan_stride, dev);
454 dev.deallocate_temp(temp_pointer);
461 template <
typename Self,
typename Reducer,
bool vectorize>
469 const Index scan_size =
self.size();
470 const Index scan_stride =
self.stride();
472 auto accumulator =
self.accumulator();
473 auto inclusive = !
self.exclusive();
474 auto consume_dim =
self.consume_dim();
475 auto dev =
self.device();
477 auto dims =
self.inner().dimensions();
479 Index non_scan_size = 1;
480 Index panel_size = 1;
481 if (
static_cast<int>(Self::Layout) ==
static_cast<int>(
ColMajor)) {
482 for (
int i = 0;
i < consume_dim;
i++) {
483 non_scan_size *= dims[
i];
485 for (
int i = consume_dim + 1;
i < Self::NumDims;
i++) {
486 panel_size *= dims[
i];
489 for (
int i = Self::NumDims - 1;
i > consume_dim;
i--) {
490 non_scan_size *= dims[
i];
492 for (
int i = consume_dim - 1;
i >= 0;
i--) {
493 panel_size *= dims[
i];
496 const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
497 auto eval_impl =
self.inner();
499 eval_impl,
data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_UNROLL_LOOP
Definition: Macros.h:1298
#define EIGEN_CONSTEXPR
Definition: Macros.h:758
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
int data[]
Definition: Map_placement_new.cpp:1
#define EIGEN_SYCL_MAX_GLOBAL_RANGE
Definition: TensorScanSycl.h:48
#define min(a, b)
Definition: datatypes.h:22
@ ColMajor
Definition: Constants.h:318
Eigen::Matrix< Scalar, Dynamic, Dynamic, ColMajor > tmp
Definition: level3_impl.h:365
scan_step
Definition: TensorScanSycl.h:83
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16 &a)
Definition: BFloat16.h:644
constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE auto array_prod(const array< T, N > &arr) -> decltype(array_reduce< product_op, T, N >(arr, static_cast< T >(1)))
Definition: MoreMeta.h:497
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
Definition: Eigen_Colamd.h:49
Definition: TensorScanSycl.h:405
static EIGEN_STRONG_INLINE void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, const Index total_size, const Index scan_size, const Index panel_size, const Index non_scan_size, const Index scan_stride, const Index non_scan_stride, const Eigen::SyclDevice &dev)
Definition: TensorScanSycl.h:406
Definition: TensorScanSycl.h:288
const ScanParameters< Index > scanParameters
Definition: TensorScanSycl.h:294
OutAccessor out_ptr
Definition: TensorScanSycl.h:293
static EIGEN_CONSTEXPR int PacketSize
Definition: TensorScanSycl.h:291
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition: TensorScanSycl.h:290
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_, OutAccessor out_accessor_, const ScanParameters< Index > scanParameters_, Op accumulator_)
Definition: TensorScanSycl.h:296
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID) const
Definition: TensorScanSycl.h:302
InAccessor in_ptr
Definition: TensorScanSycl.h:292
Op accumulator
Definition: TensorScanSycl.h:295
Definition: TensorScanSycl.h:343
Index max_elements_per_block
Definition: TensorScanSycl.h:351
const Index & total_size
Definition: TensorScanSycl.h:344
const Index & non_scan_stride
Definition: TensorScanSycl.h:349
Index group_threads
Definition: TensorScanSycl.h:354
const Index & non_scan_size
Definition: TensorScanSycl.h:347
const Index & scan_stride
Definition: TensorScanSycl.h:348
const Eigen::SyclDevice & dev
Definition: TensorScanSycl.h:361
Index local_range
Definition: TensorScanSycl.h:360
cl::sycl::nd_range< 1 > get_thread_range()
Definition: TensorScanSycl.h:399
EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_, const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_, const Eigen::SyclDevice &dev_)
Definition: TensorScanSycl.h:362
Index elements_per_block
Definition: TensorScanSycl.h:357
Index block_threads
Definition: TensorScanSycl.h:355
Index block_size
Definition: TensorScanSycl.h:352
Index panel_threads
Definition: TensorScanSycl.h:353
Index elements_per_group
Definition: TensorScanSycl.h:356
Index loop_range
Definition: TensorScanSycl.h:358
const Index & panel_size
Definition: TensorScanSycl.h:346
const Index & scan_size
Definition: TensorScanSycl.h:345
Index global_range
Definition: TensorScanSycl.h:359
ScanParameters< Index > get_scan_parameter()
Definition: TensorScanSycl.h:395
Definition: TensorScanSycl.h:86
Op accumulator
Definition: TensorScanSycl.h:96
static EIGEN_CONSTEXPR int PacketSize
Definition: TensorScanSycl.h:89
std::enable_if_t< sst !=scan_step::first, CoeffReturnType > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(const Input &inpt, Index global_id) const
Definition: TensorScanSycl.h:117
Evaluator dev_eval
Definition: TensorScanSycl.h:92
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition: TensorScanSycl.h:88
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_, OutAccessor out_accessor_, OutAccessor temp_accessor_, const ScanParameters< Index > scanParameters_, Op accumulator_, const bool inclusive_)
Definition: TensorScanSycl.h:98
OutAccessor out_ptr
Definition: TensorScanSycl.h:93
LocalAccessor scratch
Definition: TensorScanSycl.h:91
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID) const
Definition: TensorScanSycl.h:132
OutAccessor tmp_ptr
Definition: TensorScanSycl.h:94
std::enable_if_t< sst !=scan_step::first > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(InclusiveOp) const
Definition: TensorScanSycl.h:129
std::enable_if_t< sst==scan_step::first, CoeffReturnType > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(const Input &inpt, Index global_id) const
Definition: TensorScanSycl.h:111
std::enable_if_t< sst==scan_step::first > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(InclusiveOp inclusive_op) const
Definition: TensorScanSycl.h:123
const bool inclusive
Definition: TensorScanSycl.h:97
const ScanParameters< Index > scanParameters
Definition: TensorScanSycl.h:95
Definition: TensorScanSycl.h:424
static EIGEN_STRONG_INLINE void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator, const Index total_size, const Index scan_size, const Index panel_size, const Index non_scan_size, const Index scan_stride, const Index non_scan_stride, const bool inclusive, const Eigen::SyclDevice &dev)
Definition: TensorScanSycl.h:426
Definition: TensorScanSycl.h:52
const index_t non_scan_stride
Definition: TensorScanSycl.h:58
const index_t scan_stride
Definition: TensorScanSycl.h:59
const index_t total_size
Definition: TensorScanSycl.h:55
const index_t non_scan_size
Definition: TensorScanSycl.h:56
const index_t block_threads
Definition: TensorScanSycl.h:62
static EIGEN_CONSTEXPR index_t ScanPerThread
Definition: TensorScanSycl.h:54
const index_t group_threads
Definition: TensorScanSycl.h:61
const index_t panel_threads
Definition: TensorScanSycl.h:60
ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_, index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_, index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
Definition: TensorScanSycl.h:67
const index_t elements_per_group
Definition: TensorScanSycl.h:63
const index_t elements_per_block
Definition: TensorScanSycl.h:64
const index_t scan_size
Definition: TensorScanSycl.h:57
const index_t loop_range
Definition: TensorScanSycl.h:65
Self::EvaluatorPointerType EvaluatorPointerType
Definition: TensorScanSycl.h:466
Self::Index Index
Definition: TensorScanSycl.h:463
Self::CoeffReturnType CoeffReturnType
Definition: TensorScanSycl.h:464
void operator()(Self &self, EvaluatorPointerType data) const
Definition: TensorScanSycl.h:467
Self::Storage Storage
Definition: TensorScanSycl.h:465
Definition: TensorScan.h:179