134 Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
145 CoeffReturnType inclusive_scan;
153 const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
154 Index next_elements = 0;
156 for (
int i = 0; i < ScanParameters<Index>::ScanPerThread;
i++) {
157 Index global_id = global_offset + next_elements;
172 for (
int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex +=
PacketSize) {
173 Index private_offset = 1;
178 for (
Index l = 0; l < d; l++) {
179 Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
180 Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
194 private_offset >>= 1;
196 for (
Index l = 0; l < d; l++) {
197 Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
198 Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
202 private_scan[ai] = private_scan[bi];
210 for (
Index d = scratch_stride >> 1; d > 0; d >>= 1) {
212 itemID.barrier(cl::sycl::access::fence_space::local_space);
214 Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
215 Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
224 itemID.barrier(cl::sycl::access::fence_space::local_space);
238 for (
Index d = 1; d < scratch_stride; d *= 2) {
241 itemID.barrier(cl::sycl::access::fence_space::local_space);
243 Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
244 Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
253 itemID.barrier(cl::sycl::access::fence_space::local_space);
256 for (
int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex +=
PacketSize) {
259 CoeffReturnType accum = private_scan[packetIndex +
i];
261 private_scan[packetIndex +
i] =
accumulator.finalize(accum);
267 private_scan[0] =
accumulator.finalize(inclusive_scan);
273 for (
Index i = 0; i < ScanParameters<Index>::ScanPerThread;
i++) {
274 Index global_id = global_offset + next_elements;
279 out_ptr[global_id] = private_scan[private_id];
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_UNROLL_LOOP
Definition: Macros.h:1298
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
Eigen::Matrix< Scalar, Dynamic, Dynamic, ColMajor > tmp
Definition: level3_impl.h:365
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
static EIGEN_CONSTEXPR int PacketSize
Definition: TensorScanSycl.h:89
std::enable_if_t< sst==scan_step::first, CoeffReturnType > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(const Input &inpt, Index global_id) const
Definition: TensorScanSycl.h:111
std::enable_if_t< sst==scan_step::first > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(InclusiveOp inclusive_op) const
Definition: TensorScanSycl.h:123
const index_t non_scan_stride
Definition: TensorScanSycl.h:58
const index_t scan_stride
Definition: TensorScanSycl.h:59
const index_t total_size
Definition: TensorScanSycl.h:55
const index_t non_scan_size
Definition: TensorScanSycl.h:56
const index_t block_threads
Definition: TensorScanSycl.h:62
static EIGEN_CONSTEXPR Index ScanPerThread
Definition: TensorScanSycl.h:54
const index_t group_threads
Definition: TensorScanSycl.h:61
const index_t panel_threads
Definition: TensorScanSycl.h:60
const index_t elements_per_group
Definition: TensorScanSycl.h:63
const index_t elements_per_block
Definition: TensorScanSycl.h:64
const index_t scan_size
Definition: TensorScanSycl.h:57
const index_t loop_range
Definition: TensorScanSycl.h:65