28 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
29 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
34 namespace TensorSycl {
37 template <
typename Op,
typename CoeffReturnType,
typename Index,
bool Vectorizable>
39 typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType
PacketReturnType;
49 template <
typename CoeffReturnType,
typename Index>
59 return quotient_op(accumulator, CoeffReturnType(scale));
63 template <
typename CoeffReturnType,
typename Index>
65 typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType
PacketReturnType;
77 template <
typename CoeffReturnType,
typename OpType,
typename InputAccessor,
typename OutputAccessor,
typename Index,
80 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
99 const Index localid = itemID.get_local_id(0);
100 auto aInPtr =
aI + localid;
102 CoeffReturnType *scratchptr =
scratch.get_pointer();
103 CoeffReturnType accumulator = *aInPtr;
105 scratchptr[localid] =
op.finalize(accumulator);
106 for (
Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
107 itemID.barrier(cl::sycl::access::fence_space::local_space);
108 if (localid < offset) {
109 op.reduce(scratchptr[localid + offset], &accumulator);
110 scratchptr[localid] =
op.finalize(accumulator);
113 if (localid == 0) *aOutPtr =
op.finalize(accumulator);
119 template <
typename Evaluator,
typename OpType,
typename Evaluator::Index local_range>
124 typedef OpDefiner<OpType,
typename Evaluator::CoeffReturnType,
Index,
125 (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
131 typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
PacketReturnType,
134 typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
143 Index rng_, OpType op_)
148 template <
bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
150 const cl::sycl::nd_item<1> &itemID)
const {
152 Index VectorizedRange = (
rng / Evaluator::PacketSize) * Evaluator::PacketSize;
153 Index globalid = itemID.get_global_id(0);
154 Index localid = itemID.get_local_id(0);
155 Index step = Evaluator::PacketSize * itemID.get_global_range(0);
156 Index start = Evaluator::PacketSize * globalid;
158 PacketReturnType packetAccumulator =
op.template initializePacket<PacketReturnType>();
160 op.template reducePacket<PacketReturnType>(
evaluator.impl().template packet<Unaligned>(
i), &packetAccumulator);
162 globalid += VectorizedRange;
164 for (
Index i = globalid;
i <
rng;
i += itemID.get_global_range(0)) {
165 op.template reducePacket<PacketReturnType>(
170 scratch[localid] = packetAccumulator =
174 for (
Index offset = local_range / 2; offset > 0; offset /= 2) {
175 itemID.barrier(cl::sycl::access::fence_space::local_space);
176 if (localid < offset) {
177 op.template reducePacket<PacketReturnType>(
scratch[localid + offset], &packetAccumulator);
178 scratch[localid] =
op.template finalizePacket<PacketReturnType>(packetAccumulator);
182 output_ptr[itemID.get_group(0)] =
183 op.finalizeBoth(
op.initialize(),
op.template finalizePacket<PacketReturnType>(packetAccumulator));
187 template <
bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
189 const cl::sycl::nd_item<1> &itemID)
const {
191 Index globalid = itemID.get_global_id(0);
192 Index localid = itemID.get_local_id(0);
196 for (
Index i = globalid;
i <
rng;
i += itemID.get_global_range(0)) {
203 for (
Index offset = local_range / 2; offset > 0; offset /= 2) {
204 itemID.barrier(cl::sycl::access::fence_space::local_space);
205 if (localid < offset) {
206 op.reduce(
scratch[localid + offset], &accumulator);
207 scratch[localid] =
op.finalize(accumulator);
211 output_ptr[itemID.get_group(0)] =
op.finalize(accumulator);
216 template <
typename Evaluator,
typename OpType>
224 template <
typename Scratch>
235 Op non_const_functor;
236 std::memcpy(&non_const_functor, &
functor,
sizeof(
Op));
238 Index globalid =
static_cast<Index>(itemID.get_global_linear_id());
239 if (globalid <
range) {
257 template <
typename Evaluator,
typename OpType,
typename PannelParameters, reduction_dim rt>
264 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
276 const Index preserve_elements_num_groups_,
const Index reduce_elements_num_groups_,
277 const Index num_coeffs_to_preserve_,
const Index num_coeffs_to_reduce_)
294 Index localOffset = globalRId;
297 const Index per_thread_global_stride =
300 op.reduce(
evaluator.impl().coeff(global_offset), &accumulator);
301 localOffset += per_thread_local_stride;
302 global_offset += per_thread_global_stride;
306 const Index linearLocalThreadId = itemID.get_local_id(0);
308 : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
310 : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
316 Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
317 const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
325 scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
328 pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
329 rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
330 globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
335 auto out_scratch_ptr =
336 scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
337 itemID.barrier(cl::sycl::access::fence_space::local_space);
339 accumulator = *out_scratch_ptr;
343 for (
Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
344 if (rLocalThreadId < offset) {
345 op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
348 *out_scratch_ptr =
op.finalize(accumulator);
355 itemID.barrier(cl::sycl::access::fence_space::local_space);
359 outPtr[globalPId] =
op.finalize(accumulator);
364 template <
typename OutScalar,
typename Index,
typename InputAccessor,
typename OutputAccessor,
typename OpType>
368 typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
377 OutputAccessor output_accessor_, OpType op_,
378 const Index num_coeffs_to_preserve_,
379 const Index num_coeffs_to_reduce_)
387 const Index globalId = itemID.get_global_id(0);
393 OutScalar accumulator =
op.initialize();
396 op.reduce(*in_ptr, &accumulator);
403 template <
typename Index, Index LTP, Index LTR,
bool BC_>
410 template <
typename Self,
typename Op, TensorSycl::
internal::reduction_dim rt>
422 Index num_coeffs_to_reduce,
Index num_coeffs_to_preserve) {
430 "The Local thread size must be a power of 2 for the reduction "
441 const Index reductionPerThread = 64;
442 Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(),
true);
444 Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
445 const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ?
std::min(rGroups, localRange) : 1;
446 const Index globalRange = pNumGroups * rNumGroups * localRange;
450 auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
451 if (rNumGroups > 1) {
453 dev.allocate_temp(num_coeffs_to_preserve * rNumGroups *
sizeof(
CoeffReturnType)));
455 dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
456 self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
457 num_coeffs_to_reduce)
460 SecondStepPartialReductionKernel;
461 dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
463 cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)),
464 Index(1), reducer, num_coeffs_to_preserve, rNumGroups)
466 self.device().deallocate_temp(temp_pointer);
468 dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
469 self,
output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
470 num_coeffs_to_reduce)
481 template <
typename Self,
typename Op,
bool Vectorizable>
488 typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
489 static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
490 (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
491 "The Local thread size must be a power of 2 for the reduction "
493 EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
495 typename Self::Index inputSize =
self.impl().dimensions().TotalSize();
499 const Index reductionPerThread = 2048;
501 Index reductionGroup = dev.getPowerOfTwo(
502 (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range),
true);
503 const Index num_work_group =
std::min(reductionGroup, local_range);
507 const Index global_range = num_work_group * local_range;
509 auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
511 if (num_work_group > 1) {
514 typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
515 dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(
self, tmp_global_accessor, thread_range,
516 local_range, inputSize, reducer)
521 dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
522 tmp_global_accessor,
data,
523 cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)),
524 num_work_group, reducer)
526 dev.deallocate_temp(temp_pointer);
528 dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(
self,
data, thread_range, local_range, inputSize,
536 template <
typename Self,
typename Op>
540 static bool run(
const Self &
self,
const Op &reducer,
const Eigen::SyclDevice &dev,
541 typename Self::EvaluatorPointerType
output,
typename Self::Index num_coeffs_to_reduce,
543 return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
544 Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>
::run(
self, reducer, dev,
output,
545 num_coeffs_to_reduce,
546 num_coeffs_to_preserve);
550 template <
typename Self,
typename Op>
554 static bool run(
const Self &
self,
const Op &reducer,
const Eigen::SyclDevice &dev,
555 typename Self::EvaluatorPointerType
output,
typename Self::Index num_coeffs_to_reduce,
557 return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
558 Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>
::run(
self, reducer, dev,
output,
559 num_coeffs_to_reduce,
560 num_coeffs_to_preserve);
567 template <
typename Self,
typename Op>
568 struct GenericReducer<Self, Op,
Eigen::SyclDevice> {
570 static bool run(
const Self &
self,
const Op &reducer,
const Eigen::SyclDevice &dev,
571 typename Self::EvaluatorPointerType
output,
typename Self::Index num_values_to_reduce,
574 dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
576 dev.template unary_kernel_launcher<
typename Self::CoeffReturnType,
578 self,
output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)),
Index(1),
579 reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce :
static_cast<Index>(1))
int i
Definition: BiCGSTAB_step_by_step.cpp:9
#define EIGEN_UNROLL_LOOP
Definition: Macros.h:1298
#define EIGEN_CONSTEXPR
Definition: Macros.h:758
#define EIGEN_DEVICE_FUNC
Definition: Macros.h:892
#define EIGEN_STRONG_INLINE
Definition: Macros.h:834
int data[]
Definition: Map_placement_new.cpp:1
Definition: TensorReductionSycl.h:120
void operator()(cl::sycl::nd_item< 1 > itemID) const
Definition: TensorReductionSycl.h:146
std::conditional_t<(Evaluator::ReducerTraits::PacketAccess &Evaluator::InputPacketAccess), PacketReturnType, CoeffReturnType > OutType
Definition: TensorReductionSycl.h:133
FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_, Index rng_, OpType op_)
Definition: TensorReductionSycl.h:142
EvaluatorPointerType final_output
Definition: TensorReductionSycl.h:138
Evaluator::PacketReturnType PacketReturnType
Definition: TensorReductionSycl.h:130
Evaluator::EvaluatorPointerType EvaluatorPointerType
Definition: TensorReductionSycl.h:129
cl::sycl::accessor< OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition: TensorReductionSycl.h:135
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect > compute_reduction(const cl::sycl::nd_item< 1 > &itemID) const
Definition: TensorReductionSycl.h:188
Evaluator evaluator
Definition: TensorReductionSycl.h:137
Evaluator::Index Index
Definition: TensorReductionSycl.h:123
Index rng
Definition: TensorReductionSycl.h:139
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t< Vect > compute_reduction(const cl::sycl::nd_item< 1 > &itemID) const
Definition: TensorReductionSycl.h:149
OpDef::type Op
Definition: TensorReductionSycl.h:128
Evaluator::CoeffReturnType CoeffReturnType
Definition: TensorReductionSycl.h:122
LocalAccessor scratch
Definition: TensorReductionSycl.h:136
Op op
Definition: TensorReductionSycl.h:140
Definition: TensorReductionSycl.h:217
GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_, Index range_, Index num_values_to_reduce_)
Definition: TensorReductionSycl.h:225
Op functor
Definition: TensorReductionSycl.h:250
Evaluator::Index Index
Definition: TensorReductionSycl.h:221
Evaluator::CoeffReturnType CoeffReturnType
Definition: TensorReductionSycl.h:219
Index range
Definition: TensorReductionSycl.h:251
OpDef::type Op
Definition: TensorReductionSycl.h:223
OpDefiner< OpType, CoeffReturnType, Index, false > OpDef
Definition: TensorReductionSycl.h:222
void operator()(cl::sycl::nd_item< 1 > itemID) const
Definition: TensorReductionSycl.h:233
Index num_values_to_reduce
Definition: TensorReductionSycl.h:252
Evaluator::EvaluatorPointerType EvaluatorPointerType
Definition: TensorReductionSycl.h:220
Evaluator evaluator
Definition: TensorReductionSycl.h:248
EvaluatorPointerType output_accessor
Definition: TensorReductionSycl.h:249
#define min(a, b)
Definition: datatypes.h:22
char char * op
Definition: level2_impl.h:374
reduction_dim
Definition: TensorReductionSycl.h:255
EIGEN_DEVICE_FUNC Packet pdiv(const Packet &a, const Packet &b)
Definition: GenericPacketMath.h:368
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:70
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:83
Definition: Eigen_Colamd.h:49
void start(const unsigned &i)
(Re-)start i-th timer
Definition: oomph_utilities.cc:243
void output(std::ostream &outfile, const unsigned &nplot)
Overload output function.
Definition: overloaded_element_body.h:490
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer< CoeffReturnType > &)
Definition: TensorReductionSycl.h:67
Vectorise< CoeffReturnType, Eigen::SyclDevice, true >::PacketReturnType PacketReturnType
Definition: TensorReductionSycl.h:65
Eigen::internal::SumReducer< CoeffReturnType > type
Definition: TensorReductionSycl.h:66
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, const Index &scale)
Definition: TensorReductionSycl.h:71
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer< CoeffReturnType > &)
Definition: TensorReductionSycl.h:52
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator, const Index &scale)
Definition: TensorReductionSycl.h:56
Eigen::internal::SumReducer< CoeffReturnType > type
Definition: TensorReductionSycl.h:51
Definition: TensorReductionSycl.h:38
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op)
Definition: TensorReductionSycl.h:41
Op type
Definition: TensorReductionSycl.h:40
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator, const Index &)
Definition: TensorReductionSycl.h:43
Vectorise< CoeffReturnType, Eigen::SyclDevice, Vectorizable >::PacketReturnType PacketReturnType
Definition: TensorReductionSycl.h:39
Definition: InteropHeaders.h:135
Definition: TensorReductionSycl.h:411
Self::EvaluatorPointerType EvaluatorPointerType
Definition: TensorReductionSycl.h:412
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output, Index num_coeffs_to_reduce, Index num_coeffs_to_preserve)
Definition: TensorReductionSycl.h:421
Self::Index Index
Definition: TensorReductionSycl.h:415
Self::CoeffReturnType CoeffReturnType
Definition: TensorReductionSycl.h:413
PartialReductionKernel< Self, Op, PannelParameters, rt > SyclReducerKerneType
Definition: TensorReductionSycl.h:419
Self::Storage Storage
Definition: TensorReductionSycl.h:414
ReductionPannel< typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true > PannelParameters
Definition: TensorReductionSycl.h:417
Definition: TensorReductionSycl.h:258
const Index preserve_elements_num_groups
Definition: TensorReductionSycl.h:270
Evaluator::Index Index
Definition: TensorReductionSycl.h:261
const Index num_coeffs_to_reduce
Definition: TensorReductionSycl.h:273
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId, CoeffReturnType &accumulator) const
Definition: TensorReductionSycl.h:287
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > ScratchAcc
Definition: TensorReductionSycl.h:265
ScratchAcc scratch
Definition: TensorReductionSycl.h:266
const Index num_coeffs_to_preserve
Definition: TensorReductionSycl.h:272
EvaluatorPointerType output_accessor
Definition: TensorReductionSycl.h:268
OpDef::type Op
Definition: TensorReductionSycl.h:263
const Index reduce_elements_num_groups
Definition: TensorReductionSycl.h:271
Evaluator::EvaluatorPointerType EvaluatorPointerType
Definition: TensorReductionSycl.h:260
PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_, const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_, const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
Definition: TensorReductionSycl.h:275
Op op
Definition: TensorReductionSycl.h:269
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID) const
Definition: TensorReductionSycl.h:305
Evaluator evaluator
Definition: TensorReductionSycl.h:267
Evaluator::CoeffReturnType CoeffReturnType
Definition: TensorReductionSycl.h:259
OpDefiner< OpType, CoeffReturnType, Index, false > OpDef
Definition: TensorReductionSycl.h:262
Definition: TensorReductionSycl.h:404
static EIGEN_CONSTEXPR Index LocalThreadSizeR
Definition: TensorReductionSycl.h:406
static EIGEN_CONSTEXPR Index LocalThreadSizeP
Definition: TensorReductionSycl.h:405
static EIGEN_CONSTEXPR bool BC
Definition: TensorReductionSycl.h:407
Definition: TensorReductionSycl.h:79
SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
Definition: TensorReductionSycl.h:88
OpDefiner< OpType, CoeffReturnType, Index, true > OpDef
Definition: TensorReductionSycl.h:82
cl::sycl::accessor< CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > LocalAccessor
Definition: TensorReductionSycl.h:81
Op op
Definition: TensorReductionSycl.h:87
void operator()(cl::sycl::nd_item< 1 > itemID) const
Definition: TensorReductionSycl.h:91
OutputAccessor outAcc
Definition: TensorReductionSycl.h:86
OpDef::type Op
Definition: TensorReductionSycl.h:83
LocalAccessor scratch
Definition: TensorReductionSycl.h:84
InputAccessor aI
Definition: TensorReductionSycl.h:85
Definition: TensorReductionSycl.h:365
OutputAccessor output_accessor
Definition: TensorReductionSycl.h:371
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item< 1 > itemID) const
Definition: TensorReductionSycl.h:386
const Index num_coeffs_to_preserve
Definition: TensorReductionSycl.h:373
cl::sycl::accessor< OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local > ScratchAccessor
Definition: TensorReductionSycl.h:369
OpDefiner< OpType, OutScalar, Index, false > OpDef
Definition: TensorReductionSycl.h:366
const Index num_coeffs_to_reduce
Definition: TensorReductionSycl.h:374
Op op
Definition: TensorReductionSycl.h:372
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_, OutputAccessor output_accessor_, OpType op_, const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
Definition: TensorReductionSycl.h:376
InputAccessor input_accessor
Definition: TensorReductionSycl.h:370
OpDef::type Op
Definition: TensorReductionSycl.h:367
Self::CoeffReturnType CoeffReturnType
Definition: TensorReductionSycl.h:483
static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data)
Definition: TensorReductionSycl.h:487
Self::EvaluatorPointerType EvaluatorPointerType
Definition: TensorReductionSycl.h:484
Definition: TensorReduction.h:356
static constexpr bool HasOptimizedImplementation
Definition: TensorReduction.h:357
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self &self, typename Self::Index firstIndex, Op &reducer, typename Self::CoeffReturnType *accum)
Definition: TensorReduction.h:139
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce, typename Self::Index num_coeffs_to_preserve)
Definition: TensorReductionSycl.h:570
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_coeffs_to_preserve)
Definition: TensorReductionSycl.h:554
Definition: TensorReduction.h:430
static constexpr bool HasOptimizedImplementation
Definition: TensorReduction.h:431
static EIGEN_DEVICE_FUNC bool run(const Self &, Op &, const Device &, typename Self::CoeffReturnType *, typename Self::Index, typename Self::Index)
Definition: TensorReduction.h:433
Definition: TensorFunctors.h:107
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_coeffs_to_preserve)
Definition: TensorReductionSycl.h:540
Definition: TensorReduction.h:442
static constexpr bool HasOptimizedImplementation
Definition: TensorReduction.h:443
static EIGEN_DEVICE_FUNC bool run(const Self &, Op &, const Device &, typename Self::CoeffReturnType *, typename Self::Index, typename Self::Index)
Definition: TensorReduction.h:445
Definition: TensorFunctors.h:66
Template functor to compute the quotient of two scalars.
Definition: BinaryFunctors.h:430