Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp > Struct Template Reference

#include <TensorScanSycl.h>

Public Types

typedef cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
 

Public Member Functions

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor (LocalAccessor scratch_, const Evaluator dev_eval_, OutAccessor out_accessor_, OutAccessor temp_accessor_, const ScanParameters< Index > scanParameters_, Op accumulator_, const bool inclusive_)
 
template<scan_step sst = stp, typename Input >
std::enable_if_t< sst==scan_step::first, CoeffReturnType > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read (const Input &inpt, Index global_id) const
 
template<scan_step sst = stp, typename Input >
std::enable_if_t< sst !=scan_step::first, CoeffReturnType > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read (const Input &inpt, Index global_id) const
 
template<scan_step sst = stp, typename InclusiveOp >
std::enable_if_t< sst==scan_step::first > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation (InclusiveOp inclusive_op) const
 
template<scan_step sst = stp, typename InclusiveOp >
std::enable_if_t< sst !=scan_step::first > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation (InclusiveOp) const
 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator() (cl::sycl::nd_item< 1 > itemID) const
 

Public Attributes

LocalAccessor scratch
 
Evaluator dev_eval
 
OutAccessor out_ptr
 
OutAccessor tmp_ptr
 
const ScanParameters< Index > scanParameters
 
Op accumulator
 
const bool inclusive
 

Static Public Attributes

static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2
 

Member Typedef Documentation

◆ LocalAccessor

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::LocalAccessor
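LocalAccessor is the SYCL local-memory (work-group shared) accessor type that backs the scratch member used for the block-level scan tree. A minimal, hypothetical sketch of creating and using an accessor with the same template arguments, instantiated here for float; the queue, sizes, and kernel name below are illustrative and not part of Eigen:

    #include <CL/sycl.hpp>
    #include <cstddef>

    int main() {
      cl::sycl::queue q;
      constexpr std::size_t group_size = 64;  // illustrative work-group size

      q.submit([&](cl::sycl::handler &cgh) {
        // Same accessor template arguments as ScanKernelFunctor::LocalAccessor,
        // instantiated for float: one scratch slot per work-item in the group.
        cl::sycl::accessor<float, 1, cl::sycl::access::mode::read_write,
                           cl::sycl::access::target::local>
            scratch(cl::sycl::range<1>(group_size), cgh);

        cgh.parallel_for<class local_scratch_demo>(
            cl::sycl::nd_range<1>(cl::sycl::range<1>(group_size),
                                  cl::sycl::range<1>(group_size)),
            [=](cl::sycl::nd_item<1> item) {
              scratch[item.get_local_id(0)] = 0.0f;  // visible to the whole work-group
              item.barrier(cl::sycl::access::fence_space::local_space);
            });
      });
      q.wait();
    }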

Constructor & Destructor Documentation

◆ ScanKernelFunctor()

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::ScanKernelFunctor ( LocalAccessor  scratch_,
const Evaluator  dev_eval_,
OutAccessor  out_accessor_,
OutAccessor  temp_accessor_,
const ScanParameters< Index >  scanParameters_,
Op  accumulator_,
const bool  inclusive_ 
)
inline
  : scratch(scratch_),
    dev_eval(dev_eval_),
    out_ptr(out_accessor_),
    tmp_ptr(temp_accessor_),
    scanParameters(scanParameters_),
    accumulator(accumulator_),
    inclusive(inclusive_) {}
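The constructor only captures its arguments; the functor is meant to be passed to cl::sycl::handler::parallel_for over a one-dimensional nd_range, which invokes operator() once per work-item. The sketch below shows one plausible way to wire that up; enqueue_scan_step, its parameters, and the buffer-based accessor setup are hypothetical stand-ins for what Eigen's SYCL scan launcher does internally, not Eigen API:

    #include <CL/sycl.hpp>
    #include <cstddef>

    // Hedged sketch: Kernel is assumed to be a ScanKernelFunctor instantiation whose
    // OutAccessor type matches the read_write global accessors created below.
    template <typename Kernel, typename Evaluator, typename CoeffReturnType,
              typename Params, typename Op>
    void enqueue_scan_step(cl::sycl::queue &q, Evaluator evaluator,
                           cl::sycl::buffer<CoeffReturnType, 1> &out_buf,
                           cl::sycl::buffer<CoeffReturnType, 1> &tmp_buf,
                           Params params, Op op, std::size_t scratch_size,
                           std::size_t global_size, std::size_t local_size,
                           bool inclusive) {
      q.submit([&](cl::sycl::handler &cgh) {
        // Work-group local scratch memory used by the block-level scan tree.
        typename Kernel::LocalAccessor scratch(cl::sycl::range<1>(scratch_size), cgh);
        auto out_acc = out_buf.template get_access<cl::sycl::access::mode::read_write>(cgh);
        auto tmp_acc = tmp_buf.template get_access<cl::sycl::access::mode::read_write>(cgh);
        // The constructor documented above simply captures these arguments.
        Kernel kernel(scratch, evaluator, out_acc, tmp_acc, params, op, inclusive);
        cgh.parallel_for<Kernel>(
            cl::sycl::nd_range<1>(cl::sycl::range<1>(global_size),
                                  cl::sycl::range<1>(local_size)),
            kernel);  // operator()(cl::sycl::nd_item<1>) is invoked per work-item
      });
    }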

Member Function Documentation

◆ first_step_inclusive_Operation() [1/2]

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
template<scan_step sst = stp, typename InclusiveOp >
std::enable_if_t<sst == scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::first_step_inclusive_Operation ( InclusiveOp  inclusive_op) const
inline

◆ first_step_inclusive_Operation() [2/2]

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
template<scan_step sst = stp, typename InclusiveOp >
std::enable_if_t<sst != scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::first_step_inclusive_Operation ( InclusiveOp  ) const
inline
 {}

◆ operator()()

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::operator() ( cl::sycl::nd_item< 1 >  itemID) const
inline
 {
    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
      Index tmp = data_offset % scanParameters.panel_threads;
      const Index panel_id = data_offset / scanParameters.panel_threads;
      const Index group_id = tmp / scanParameters.group_threads;
      tmp = tmp % scanParameters.group_threads;
      const Index block_id = tmp / scanParameters.block_threads;
      const Index local_id = tmp % scanParameters.block_threads;
      // we put one element per packet in scratch_mem
      const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
      const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
      CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
      CoeffReturnType inclusive_scan;
      // the actual panel size is scan_size * non_scan_size.
      // elements_per_panel is roundup to power of 2 for binary tree
      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
      const Index group_offset = group_id * scanParameters.non_scan_stride;
      // This will be effective when the size is bigger than elements_per_block
      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
      const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
      Index next_elements = 0;
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
                             (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
                           (global_id < scanParameters.total_size))
                              ? read(dev_eval, global_id)
                              : accumulator.initialize();
        next_elements += scanParameters.scan_stride;
      }
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
        }
      });
      // This for loop must be 2
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        Index private_offset = 1;
        // build sum in place up the tree
        EIGEN_UNROLL_LOOP
        for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[bi] = accumulator.finalize(accum);
          }
          private_offset *= 2;
        }
        scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
            private_scan[PacketSize - 1 + packetIndex];
        private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
        // traverse down tree & build scan
        EIGEN_UNROLL_LOOP
        for (Index d = 1; d < PacketSize; d *= 2) {
          private_offset >>= 1;
          EIGEN_UNROLL_LOOP
          for (Index l = 0; l < d; l++) {
            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
            CoeffReturnType accum = accumulator.initialize();
            accumulator.reduce(private_scan[ai], &accum);
            accumulator.reduce(private_scan[bi], &accum);
            private_scan[ai] = private_scan[bi];
            private_scan[bi] = accumulator.finalize(accum);
          }
        }
      }

      Index offset = 1;
      // build sum in place up the tree
      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch[ai], &accum);
          accumulator.reduce(scratch[bi], &accum);
          scratch[bi] = accumulator.finalize(accum);
        }
        offset *= 2;
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // next step optimisation
      if (local_id == 0) {
        if ((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1) {
          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
                                    scanParameters.non_scan_size +
                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
                                block_id;
          tmp_ptr[temp_id] = scratch[scratch_stride - 1 + scratch_offset];
        }
        // clear the last element
        scratch[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
      }
      // traverse down tree & build scan
      for (Index d = 1; d < scratch_stride; d *= 2) {
        offset >>= 1;
        // Synchronise
        itemID.barrier(cl::sycl::access::fence_space::local_space);
        if (local_id < d) {
          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
          CoeffReturnType accum = accumulator.initialize();
          accumulator.reduce(scratch[ai], &accum);
          accumulator.reduce(scratch[bi], &accum);
          scratch[ai] = scratch[bi];
          scratch[bi] = accumulator.finalize(accum);
        }
      }
      // Synchronise
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      // This for loop must be 2
      EIGEN_UNROLL_LOOP
      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
        EIGEN_UNROLL_LOOP
        for (Index i = 0; i < PacketSize; i++) {
          CoeffReturnType accum = private_scan[packetIndex + i];
          accumulator.reduce(scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
          private_scan[packetIndex + i] = accumulator.finalize(accum);
        }
      }
      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
        if (inclusive) {
          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
          private_scan[0] = accumulator.finalize(inclusive_scan);
        }
      });
      next_elements = 0;
      // write the first set of private results
      EIGEN_UNROLL_LOOP
      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
        Index global_id = global_offset + next_elements;
        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
                 scanParameters.scan_size) &&
            (global_id < scanParameters.total_size)) {
          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
          out_ptr[global_id] = private_scan[private_id];
        }
        next_elements += scanParameters.scan_stride;
      }
    }  // end for loop
  }

References Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::accumulator, Eigen::TensorSycl::internal::ScanParameters< index_t >::block_threads, Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::dev_eval, EIGEN_DEVICE_FUNC, EIGEN_UNROLL_LOOP, Eigen::TensorSycl::internal::ScanParameters< index_t >::elements_per_block, Eigen::TensorSycl::internal::ScanParameters< index_t >::elements_per_group, Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::first_step_inclusive_Operation(), Eigen::TensorSycl::internal::ScanParameters< index_t >::group_threads, i, Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::inclusive, Eigen::TensorSycl::internal::ScanParameters< index_t >::loop_range, Eigen::TensorSycl::internal::ScanParameters< index_t >::non_scan_size, Eigen::TensorSycl::internal::ScanParameters< index_t >::non_scan_stride, Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::out_ptr, Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::PacketSize, Eigen::TensorSycl::internal::ScanParameters< index_t >::panel_threads, Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::read(), Eigen::TensorSycl::internal::ScanParameters< index_t >::scan_size, Eigen::TensorSycl::internal::ScanParameters< index_t >::scan_stride, Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::scanParameters, Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::scratch, tmp, Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::tmp_ptr, and Eigen::TensorSycl::internal::ScanParameters< index_t >::total_size.
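operator() implements a work-efficient (Blelloch-style) scan: each work-item first up-sweeps and down-sweeps its ScanPerThread private elements, the per-packet totals are then combined in local memory (scratch) with the same two phases separated by work-group barriers, the per-block totals are stored to tmp_ptr for the next scan step, and the adjusted private results are written to out_ptr. The standalone sketch below illustrates only the up-sweep/down-sweep pattern on a plain array, with ordinary addition standing in for the Op accumulator; it is an illustration, not Eigen code.

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Work-efficient exclusive scan (Blelloch) on a power-of-two sized array,
    // mirroring the up-sweep/down-sweep index arithmetic used in operator().
    void exclusive_scan(std::vector<int>& a) {
      const std::size_t n = a.size();  // assumed to be a power of two
      std::size_t offset = 1;
      for (std::size_t d = n >> 1; d > 0; d >>= 1) {  // up-sweep: build partial sums
        for (std::size_t l = 0; l < d; ++l) {
          std::size_t ai = offset * (2 * l + 1) - 1;
          std::size_t bi = offset * (2 * l + 2) - 1;
          a[bi] += a[ai];
        }
        offset *= 2;
      }
      a[n - 1] = 0;                                   // clear the last element
      for (std::size_t d = 1; d < n; d *= 2) {        // down-sweep: distribute sums
        offset >>= 1;
        for (std::size_t l = 0; l < d; ++l) {
          std::size_t ai = offset * (2 * l + 1) - 1;
          std::size_t bi = offset * (2 * l + 2) - 1;
          int t = a[ai];
          a[ai] = a[bi];
          a[bi] += t;
        }
      }
    }

    int main() {
      std::vector<int> v{3, 1, 7, 0, 4, 1, 6, 3};
      exclusive_scan(v);
      for (int x : v) std::cout << x << ' ';  // prints: 0 3 4 11 11 15 16 22
      std::cout << '\n';
    }

Running it prints the exclusive prefix sums of the input; the kernel applies the same ai/bi pairing and offset doubling/halving, but through accumulator.reduce/finalize and with local-memory barriers between phases.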

◆ read() [1/2]

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
template<scan_step sst = stp, typename Input >
std::enable_if_t<sst == scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::read ( const Input &  inpt,
Index  global_id 
) const
inline

◆ read() [2/2]

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
template<scan_step sst = stp, typename Input >
std::enable_if_t<sst != scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::read ( const Input &  inpt,
Index  global_id 
) const
inline
 {
    return inpt[global_id];
  }
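read() and first_step_inclusive_Operation() rely on std::enable_if_t over the scan_step template parameter, so exactly one overload survives per instantiation: the first-step overloads receive the device evaluator and run the inclusive fix-up, while the later-step overloads read through a raw accessor (inpt[global_id]) and make the fix-up a no-op. A toy, standalone illustration of the same return-type SFINAE dispatch; the enum and struct below are hypothetical, not Eigen's:

    #include <iostream>
    #include <type_traits>

    enum class scan_step { first, second };

    template <scan_step stp>
    struct StepDispatcher {
      // Enabled only when this instantiation is the first scan step.
      template <scan_step sst = stp>
      std::enable_if_t<sst == scan_step::first, const char*> describe() const {
        return "first step: read through the evaluator";
      }
      // Enabled for every other step.
      template <scan_step sst = stp>
      std::enable_if_t<sst != scan_step::first, const char*> describe() const {
        return "later step: read through the accessor";
      }
    };

    int main() {
      std::cout << StepDispatcher<scan_step::first>{}.describe() << '\n';
      std::cout << StepDispatcher<scan_step::second>{}.describe() << '\n';
    }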

Member Data Documentation

◆ accumulator

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
Op Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::accumulator

◆ dev_eval

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
Evaluator Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::dev_eval

◆ inclusive

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
const bool Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::inclusive

◆ out_ptr

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
OutAccessor Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::out_ptr

◆ PacketSize

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
EIGEN_CONSTEXPR int Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::PacketSize = ScanParameters<Index>::ScanPerThread / 2
static

◆ scanParameters

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
const ScanParameters<Index> Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::scanParameters

◆ scratch

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
LocalAccessor Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::scratch

◆ tmp_ptr

template<typename Evaluator , typename CoeffReturnType , typename OutAccessor , typename Op , typename Index , scan_step stp>
OutAccessor Eigen::TensorSycl::internal::ScanKernelFunctor< Evaluator, CoeffReturnType, OutAccessor, Op, Index, stp >::tmp_ptr

The documentation for this struct was generated from the following file:

TensorScanSycl.h