#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
#else
#define KERNEL_FRIEND friend
#endif
namespace Eigen {
namespace internal {

template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> > : traits<XprType> {
  typedef traits<XprType> XprTraits;
  // ... (Scalar, StorageKind, Index and PointerType typedefs elided)
  typedef typename XprType::Nested Nested;
  static constexpr int Layout = XprTraits::Layout;
  // ... (MakePointer helper elided)
};
template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense> {
  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
};
template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1,
              typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type> {
  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
};
template <typename OutputDims>
struct DimInitializer {
  template <typename InputDims, typename ReducedDims>
  EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims,
                                    const array<bool, internal::array_size<InputDims>::value>& reduced,
                                    OutputDims* output_dims, ReducedDims* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    int outputIndex = 0;
    int reduceIndex = 0;
    // Split the input dimensions into the reduced and the preserved ones.
    for (int i = 0; i < NumInputDims; ++i) {
      if (reduced[i]) {
        (*reduced_dims)[reduceIndex] = input_dims[i];
        ++reduceIndex;
      } else {
        (*output_dims)[outputIndex] = input_dims[i];
        ++outputIndex;
      }
    }
  }
};
// Full reductions have an empty output dimension list: only the reduced
// dimensions need to be collected.
template <>
struct DimInitializer<Sizes<> > {
  template <typename InputDims, typename Index, size_t Rank>
  EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims, const array<bool, Rank>&, Sizes<>*,
                                    array<Index, Rank>* reduced_dims) {
    const int NumInputDims = internal::array_size<InputDims>::value;
    for (int i = 0; i < NumInputDims; ++i) {
      (*reduced_dims)[i] = input_dims[i];
    }
  }
};
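// Worked example (illustrative sketch, not part of the original header): for an
// input of dimensions {2, 3, 5, 7} with dimensions 1 and 3 reduced, the bitmap
// passed to DimInitializer is {false, true, false, true} and the partition is
//
//   output_dims  == {2, 5}   // the preserved dimensions, in order
//   reduced_dims == {3, 7}   // the reduced dimensions, in order
//
// so a rank-4 reduction over two dimensions evaluates to a rank-2 result,
// matching NumOutputDims = NumInputDims - NumReducedDims in the evaluator below.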
template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
  static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
  static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
  static const bool tmp3 =
      index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, array_size<ReducedDims>::value - 1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
  static const bool value = tmp1 & tmp2;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
  static const bool value = tmp1 & tmp2;
};
template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
                                                           Op& reducer, typename Self::CoeffReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      GenericDimReducer<DimIndex - 1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
                                                           Op& reducer, typename Self::CoeffReturnType* accum) {
    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
      reducer.reduce(self.m_impl.coeff(input), accum);
    }
  }
};
// Used for full reductions (no reduced dimensions left to iterate over).
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer,
                                                           typename Self::CoeffReturnType* accum) {
    reducer.reduce(self.m_impl.coeff(index), accum);
  }
};
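// Conceptually (illustrative sketch only), GenericDimReducer<1, Self, Op>::reduce
// expands to the following loop nest over the reduced dimensions, with the
// innermost specialization feeding each input coefficient to the reducer:
//
//   for (Index j1 = 0; j1 < self.m_reducedDims[1]; ++j1)
//     for (Index j0 = 0; j0 < self.m_reducedDims[0]; ++j0)
//       reducer.reduce(self.m_impl.coeff(firstIndex + j1 * self.m_reducedStrides[1]
//                                                   + j0 * self.m_reducedStrides[0]),
//                      &accum);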
template <typename Self, typename Op,
          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
          bool UseTreeReduction = (!Self::ReducerTraits::IsStateful && !Self::ReducerTraits::IsExactlyAssociative &&
                                   // GPU threads can quickly run out of stack space
                                   // for moderately sized inputs.
                                   !Self::RunningOnGPU)>
struct InnerMostDimReducer {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
      const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    typename Self::CoeffReturnType accum = reducer.initialize();
    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer.finalize(accum);
  }
};
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, false> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
      const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer0) {
    const Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    Index start = 0;
    typename Self::PacketReturnType paccum0 = reducer0.template initializePacket<typename Self::PacketReturnType>();
    if (!Self::ReducerTraits::IsStateful && numValuesToReduce >= 4 * packetSize) {
      const Index VectorizedSize4 = (numValuesToReduce / (4 * packetSize)) * (4 * packetSize);
      typename Self::PacketReturnType paccum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
      typename Self::PacketReturnType paccum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
      typename Self::PacketReturnType paccum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
      const Index offset0 = firstIndex;
      const Index offset1 = firstIndex + packetSize;
      const Index offset2 = firstIndex + 2 * packetSize;
      const Index offset3 = firstIndex + 3 * packetSize;
      // Process four packets per iteration into four independent accumulators.
      for (Index j = 0; j < VectorizedSize4; j += 4 * packetSize) {
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset0 + j), &paccum0);
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset1 + j), &paccum1);
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset2 + j), &paccum2);
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset3 + j), &paccum3);
      }
      reducer0.reducePacket(paccum1, &paccum0);
      reducer0.reducePacket(paccum2, &paccum0);
      reducer0.reducePacket(paccum3, &paccum0);
      start = VectorizedSize4;
    }
    if (start <= (numValuesToReduce - packetSize)) {
      const Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
      for (Index j = start; j < VectorizedSize; j += packetSize) {
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum0);
      }
      start = VectorizedSize;
    }
    // Scalar tail.
    typename Self::CoeffReturnType accum = reducer0.initialize();
    for (Index j = start; j < numValuesToReduce; ++j) {
      reducer0.reduce(self.m_impl.coeff(firstIndex + j), &accum);
    }
    return reducer0.finalizeBoth(accum, paccum0);
  }
};
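// The four packet accumulators above are an unrolling/latency-hiding device:
// keeping independent partial results lets consecutive reducePacket calls
// overlap instead of forming one serial dependency chain. A scalar sketch of
// the same idea (illustrative only, assuming a plain float sum over x[0..n)):
//
//   float acc0 = 0.f, acc1 = 0.f, acc2 = 0.f, acc3 = 0.f;
//   Index j = 0;
//   for (; j + 4 <= n; j += 4) {
//     acc0 += x[j]; acc1 += x[j + 1]; acc2 += x[j + 2]; acc3 += x[j + 3];
//   }
//   float sum = acc0 + acc1 + acc2 + acc3;
//   for (; j < n; ++j) sum += x[j];  // scalar tail, as in the code above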
#if !defined(EIGEN_HIPCC)

// Tree-based reduction: improves the accuracy of sum and mean reductions,
// since each leaf of the reduction tree reduces at most LeafSize<T>() values
// and partial results are combined pairwise.
template <typename T>
EIGEN_DEVICE_FUNC Index LeafSize();
// ... (generic definition plus smaller-leaf specializations for half and bfloat16 elided)
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, false, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
      const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
    typename Self::CoeffReturnType accum = reducer.initialize();
    if (numValuesToReduce > kLeafSize) {
      const typename Self::Index half = numValuesToReduce / 2;
      // Reduce the two halves recursively and combine the partial results.
      reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
      reducer.reduce(reduce(self, firstIndex + half, numValuesToReduce - half, reducer), &accum);
      return reducer.finalize(accum);
    } else {
      // Small ranges fall back to the plain sequential reducer.
      return InnerMostDimReducer<Self, Op, false, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
      const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
    const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
    const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
    typename Self::CoeffReturnType accum = reducer.initialize();
    if (numValuesToReduce > packetSize * kLeafSize) {
      // Pick a packet-aligned split point near the midpoint, then reduce the
      // two sub-ranges recursively.
      // ... (computation of `split` and `num_left` elided)
      reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
      if (num_left < numValuesToReduce) {
        reducer.reduce(reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
      }
      return reducer.finalize(accum);
    } else {
      // Small ranges fall back to the vectorized sequential reducer.
      return InnerMostDimReducer<Self, Op, true, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
    }
  }
};

#endif  // !defined(EIGEN_HIPCC)
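// Why a tree: for a floating-point sum of n terms, sequential accumulation has
// worst-case rounding error growing roughly linearly in n, while pairwise
// (tree) combination grows roughly logarithmically. Illustrative sketch of the
// recursion used above, for a plain array:
//
//   float tree_sum(const float* x, ptrdiff_t n) {
//     if (n <= kLeafSize) {            // small ranges: plain loop
//       float s = 0.f;
//       for (ptrdiff_t i = 0; i < n; ++i) s += x[i];
//       return s;
//     }
//     ptrdiff_t half = n / 2;          // otherwise combine two halves pairwise
//     return tree_sum(x, half) + tree_sum(x + half, n - half);
//   }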
template <int DimIndex, typename Self, typename Op,
          bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct InnerMostDimPreserver {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&,
                                                           typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};
template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
                                                           Op& reducer, typename Self::PacketReturnType* accum) {
    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
      InnerMostDimPreserver<DimIndex - 1, Self, Op>::reduce(self, input, reducer, accum);
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
                                                           Op& reducer0, typename Self::PacketReturnType* accum0) {
    const Index stride = self.m_reducedStrides[0];
    const Index size = self.m_reducedDims[0];
    if (!Self::ReducerTraits::IsStateful && size >= 16) {
      // Unroll by four with independent packet accumulators to hide latency.
      const Index unrolled_size4 = (size / 4) * 4;
      typename Self::PacketReturnType accum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
      typename Self::PacketReturnType accum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
      typename Self::PacketReturnType accum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
      for (Index j = 0; j < unrolled_size4; j += 4) {
        const Index input0 = firstIndex + j * stride;
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input0), accum0);
        const Index input1 = firstIndex + (j + 1) * stride;
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input1), &accum1);
        const Index input2 = firstIndex + (j + 2) * stride;
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input2), &accum2);
        const Index input3 = firstIndex + (j + 3) * stride;
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input3), &accum3);
      }
      reducer0.reducePacket(accum1, accum0);
      reducer0.reducePacket(accum2, accum0);
      reducer0.reducePacket(accum3, accum0);
      // Remainder of the unrolled loop.
      for (Index j = unrolled_size4; j < size; ++j) {
        Index input = firstIndex + j * stride;
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
      }
    } else {
      for (Index j = 0; j < size; ++j) {
        Index input = firstIndex + j * stride;
        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
      }
    }
  }
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> {
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&,
                                                           typename Self::PacketReturnType*) {
    eigen_assert(false && "should never be called");
  }
};
// Default full reducer: reduces everything on the calling thread.
template <typename Self, typename Op, typename Device,
          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct FullReducer {
  static constexpr bool HasOptimizedImplementation = false;

  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&,
                                    typename Self::EvaluatorPointerType output) {
    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
  }
};
#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers: each shard reduces a contiguous range of
// coefficients.
template <typename Self, typename Op,
          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct FullReducerShard {
  static EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                                      typename Self::Index numValuesToReduce, Op& reducer,
                                      typename Self::CoeffReturnType* output) {
    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, firstIndex, numValuesToReduce, reducer);
  }
};
// Multithreaded full reducer for the thread-pool device.
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
  static constexpr Index PacketSize = unpacket_traits<typename Self::PacketReturnType>::size;

  // Launch one shard per block and accumulate the per-block results.
  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
                  typename Self::CoeffReturnType* output) {
    typedef typename Self::Index Index;
    const Index num_coeffs = array_prod(self.m_impl.dimensions());
    if (num_coeffs == 0) {
      *output = reducer.finalize(reducer.initialize());
      return;
    }
    const TensorOpCost cost = self.m_impl.costPerCoeff(Vectorizable) +
                              TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, PacketSize);
    const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(num_coeffs, cost, device.numThreads());
    if (num_threads == 1) {
      *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
      return;
    }
    const Index blocksize = num_coeffs / num_threads;
    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
    eigen_assert(num_coeffs >= numblocks * blocksize);

    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
    for (Index i = 0; i < numblocks; ++i) {
      device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run, self, i * blocksize,
                                  blocksize, reducer, &shards[i]);
    }
    // Reduce the coefficients that do not fit into a full block on this thread.
    typename Self::CoeffReturnType finalShard;
    if (numblocks * blocksize < num_coeffs) {
      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, numblocks * blocksize,
                                                                       num_coeffs - numblocks * blocksize, reducer);
    } else {
      finalShard = reducer.initialize();
    }
    barrier.Wait();

    for (Index i = 0; i < numblocks; ++i) {
      reducer.reduce(shards[i], &finalShard);
    }
    *output = reducer.finalize(finalShard);
  }
};
#endif  // EIGEN_USE_THREADS
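// Sharding arithmetic (illustrative): with num_coeffs = 1000 and num_threads = 8,
// blocksize = 125 and numblocks = 8, so the shards reduce the ranges
// [0,125), [125,250), ..., [875,1000) and finalShard stays at reducer.initialize()
// because numblocks * blocksize == num_coeffs. With num_coeffs = 1003 instead,
// the last 3 coefficients are folded into finalShard on the calling thread
// before the per-shard results are merged.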
// Default inner reducer: no device-specific optimized implementation.
template <typename Self, typename Op, typename Device>
struct InnerReducer {
  static constexpr bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
                                    typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};

// Default outer reducer: no device-specific optimized implementation.
template <typename Self, typename Op, typename Device>
struct OuterReducer {
  static constexpr bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
                                    typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};
#ifdef EIGEN_USE_SYCL
// Default generic reducer, used by SYCL when the reduction is neither a full,
// inner nor outer reduction.
template <typename Self, typename Op, typename Device>
struct GenericReducer {
  static constexpr bool HasOptimizedImplementation = false;

  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
                                    typename Self::Index, typename Self::Index) {
    eigen_assert(false && "Not implemented");
    return true;
  }
};
#endif
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
// Forward declarations of the GPU reduction kernels.
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
                                                                 unsigned int*);

#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
    R, const S, I_, internal::packet_traits<half>::type*);
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*,
                                                                          internal::packet_traits<half>::type*);
template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif

template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);

template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif
// For SYCL, the return type of the reduction is deduced from the reducer's
// initialize() method; otherwise it is the (non-const) coefficient type.
template <typename Op, typename CoeffReturnType>
struct ReductionReturnType {
#if defined(EIGEN_USE_SYCL)
  typedef std::remove_const_t<decltype(std::declval<Op>().initialize())> type;
#else
  typedef std::remove_const_t<CoeffReturnType> type;
#endif
};
}  // end namespace internal

template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
 public:
  // ... (Scalar, RealScalar, CoeffReturnType, Nested, StorageKind and Index typedefs elided)

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims)
      : m_expr(expr), m_dims(dims) {}
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer)
      : m_expr(expr), m_dims(dims), m_reducer(reducer) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& reducer() const { return m_reducer; }

 protected:
  typename XprType::Nested m_expr;
  const Dims m_dims;
  const Op m_reducer;
};
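// Usage sketch (illustrative, not part of the original header): reduction
// expressions are normally built through the TensorBase helpers rather than by
// instantiating TensorReductionOp directly, e.g.
//
//   Eigen::Tensor<float, 3> t(2, 3, 4);
//   t.setRandom();
//   Eigen::array<Eigen::Index, 2> dims = {{0, 2}};
//   Eigen::Tensor<float, 1> sums = t.sum(dims);  // rank 3 - 2 = rank 1 result
//   Eigen::Tensor<float, 0> m = t.maximum();     // full reduction to rank 0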
template <typename ArgType, typename Device>
struct TensorReductionEvaluatorBase;

// Evaluator for the reduction expression.
template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
  typedef internal::reducer_traits<Op, Device> ReducerTraits;
  typedef Dims ReducedDims;
  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
  typedef typename XprType::Index Index;
  typedef ArgType ChildType;
  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
  static constexpr int NumInputDims = internal::array_size<InputDimensions>::value;
  static constexpr int NumReducedDims = internal::array_size<Dims>::value;
  static constexpr int NumOutputDims = NumInputDims - NumReducedDims;
  // ... (Self, Dimensions, Scalar, CoeffReturnType, PacketReturnType, PacketSize, Layout,
  //      Storage, EvaluatorPointerType, InputPacketAccess and NumPreservedStrides
  //      declarations elided)

#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
  static constexpr bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
  static constexpr bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
  static constexpr bool RunningOnSycl = internal::is_same<internal::remove_all_t<Device>, Eigen::SyclDevice>::value;
  static constexpr bool RunningOnGPU = false;
#else
  static constexpr bool RunningOnGPU = false;
  static constexpr bool RunningOnSycl = false;
#endif

  enum {
    IsAligned = false,
    PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
    BlockAccess = false,
    PreferBlockAccess = true,
    CoordAccess = false,
    RawAccess = false
  };

  typedef internal::TensorBlockNotImplemented TensorBlock;

  static constexpr bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static constexpr bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
  static constexpr bool RunningFullReduction = (NumOutputDims == 0);
  EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) {
    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    // Build the bitmap indicating whether an input dimension is reduced.
    for (int i = 0; i < NumInputDims; ++i) {
      m_reduced[i] = false;
    }
    for (int i = 0; i < NumReducedDims; ++i) {
      eigen_assert(op.dims()[i] >= 0);
      eigen_assert(op.dims()[i] < NumInputDims);
      m_reduced[op.dims()[i]] = true;
    }

    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);

    // Precompute output strides.
    if (NumOutputDims > 0) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        m_outputStrides[0] = 1;
        for (int i = 1; i < NumOutputDims; ++i) {
          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
        }
      } else {
        m_outputStrides[static_cast<size_t>(NumOutputDims - 1)] = 1;
        for (int i = NumOutputDims - 2; i >= 0; --i) {
          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
        }
      }
    }

    // Precompute input strides.
    if (NumInputDims > 0) {
      array<Index, NumInputDims> input_strides;
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        input_strides[0] = 1;
        for (int i = 1; i < NumInputDims; ++i) {
          input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
        }
      } else {
        input_strides.back() = 1;
        for (int i = NumInputDims - 2; i >= 0; --i) {
          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
        }
      }

      // Partition the input strides into reduced and preserved strides.
      int outputIndex = 0;
      int reduceIndex = 0;
      for (int i = 0; i < NumInputDims; ++i) {
        if (m_reduced[i]) {
          m_reducedStrides[reduceIndex] = input_strides[i];
          ++reduceIndex;
        } else {
          m_preservedStrides[outputIndex] = input_strides[i];
          m_output_to_input_dim_map[outputIndex] = i;
          ++outputIndex;
        }
      }
    }

    // Special case for full reductions.
    if (NumOutputDims == 0) {
      m_preservedStrides[0] = internal::array_prod(input_dims);
    }

    m_numValuesToReduce = NumOutputDims == 0 ? internal::array_prod(input_dims)
                          : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
                              ? m_preservedStrides[0]
                              : m_preservedStrides[static_cast<size_t>(NumOutputDims - 1)];
  }
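  // Worked example (illustrative): for a ColMajor input of dimensions {4, 5, 6}
  // reduced over dimension {1}, the input strides are {1, 4, 20}; the
  // constructor above therefore produces
  //
  //   m_dimensions       == {4, 6}
  //   m_outputStrides    == {1, 4}
  //   m_preservedStrides == {1, 20}   // input strides of the kept dimensions
  //   m_reducedDims      == {5},  m_reducedStrides == {4}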
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
    // Use the FullReducer if possible.
    if ((RunningFullReduction && RunningOnSycl) ||
        (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
         ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || !RunningOnGPU))) {
      bool need_assign = false;
      if (!data) {
        // ... (allocate a temporary buffer for the single result coefficient and keep it in m_result)
        need_assign = true;
      }
      Op reducer(m_reducer);
      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
      return need_assign;
    }
    // Attempt to use an optimized reduction.
    else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
      bool reducing_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          reducing_inner_dims &= m_reduced[i];
        } else {
          reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
        }
      }
      if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
          (reducing_inner_dims || ReducingInnerMostDims)) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve &&
               num_values_to_reduce > 128) ||
              (RunningOnSycl)) {
            // ... (allocate a temporary output buffer and keep it in m_result)
          } else {
            return true;
          }
        }
        Op reducer(m_reducer);
        if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
                                                          num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate_temp(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }

      bool preserving_inner_dims = true;
      for (int i = 0; i < NumReducedDims; ++i) {
        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
          preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
        } else {
          preserving_inner_dims &= m_reduced[i];
        }
      }
      if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation && preserving_inner_dims) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        if (!data) {
          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve &&
               num_values_to_reduce > 32) ||
              (RunningOnSycl)) {
            // ... (allocate a temporary output buffer and keep it in m_result)
          } else {
            return true;
          }
        }
        Op reducer(m_reducer);
        if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
                                                          num_coeffs_to_preserve)) {
          if (m_result) {
            m_device.deallocate_temp(m_result);
            m_result = NULL;
          }
          return true;
        } else {
          return (m_result != NULL);
        }
      }

#if defined(EIGEN_USE_SYCL)
      // Fall back to the generic SYCL reducer when no specialized path applies.
      if (RunningOnSycl) {
        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
        // ... (allocate a temporary output buffer if none was supplied)
        Op reducer(m_reducer);
        internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
                                                        num_coeffs_to_preserve);
        return (m_result != NULL);
      }
#endif
    }
    return true;
  }
#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { done(evalSubExprsIfNeededCommon(data)); });
  }
#endif

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return evalSubExprsIfNeededCommon(data);
  }
  EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
    if (m_result) {
      m_device.deallocate_temp(m_result);
      m_result = NULL;
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    if ((RunningFullReduction || RunningOnGPU) && m_result) {
      return *(m_result + index);
    }
    Op reducer(m_reducer);
    if (ReducingInnerMostDims || RunningFullReduction) {
      const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor))
                                             ? m_preservedStrides[0]
                                             : m_preservedStrides[NumPreservedStrides - 1];
      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index), num_values_to_reduce, reducer);
    } else {
      typename Self::CoeffReturnType accum = reducer.initialize();
      internal::GenericDimReducer<NumReducedDims - 1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
      return reducer.finalize(accum);
    }
  }
  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));

    if (RunningOnGPU && m_result) {
      return internal::pload<PacketReturnType>(m_result + index);
    }

    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
    if (ReducingInnerMostDims) {
      const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor))
                                             ? m_preservedStrides[0]
                                             : m_preservedStrides[NumPreservedStrides - 1];
      const Index firstIndex = firstInput(index);
      for (Index i = 0; i < PacketSize; ++i) {
        Op reducer(m_reducer);
        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
                                                                    num_values_to_reduce, reducer);
      }
    } else if (PreservingInnerMostDims) {
      const Index firstIndex = firstInput(index);
      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
      // The entire packet lies within a single preserved innermost column.
      if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
        Op reducer(m_reducer);
        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
        internal::InnerMostDimPreserver<NumReducedDims - 1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
        return reducer.finalizePacket(accum);
      } else {
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = coeff(index + i);
        }
      }
    } else {
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = coeff(index + i);
      }
    }
    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
    return rslt;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    if (RunningFullReduction && m_result) {
      return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
    }
    const Index num_values_to_reduce = m_numValuesToReduce;
    const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
    return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
           TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
  }

  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
 private:
  template <int, typename, typename>
  friend struct internal::GenericDimReducer;
  template <typename, typename, bool, bool>
  friend struct internal::InnerMostDimReducer;
  template <int, typename, typename, bool>
  friend struct internal::InnerMostDimPreserver;
  template <typename S, typename O, typename D, bool V>
  friend struct internal::FullReducer;
#ifdef EIGEN_USE_THREADS
  template <typename S, typename O, bool V>
  friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
  template <int B, int N, typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
  template <typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_,
                                                                     internal::packet_traits<half>::type*);
  template <int B, int N, typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*,
                                                            internal::packet_traits<half>::type*);
  template <int NPT, typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif
  template <int NPT, typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);

  template <int NPT, typename S, typename R, typename I_>
  KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif

#if defined(EIGEN_USE_SYCL)
  template <typename Evaluator_, typename Op__>
  friend class TensorSycl::internal::GenericNondeterministicReducer;
  // SYCL needs the generic reducer for reductions that are neither inner, outer nor full.
  template <typename, typename, typename>
  friend struct internal::GenericReducer;
#endif

  template <typename S, typename O, typename D>
  friend struct internal::InnerReducer;

  struct BlockIteratorState {
    Index input_dim;
    Index output_size;
    Index output_count;
  };
  // Returns the index in the input tensor of the first value that needs to be
  // used to compute the reduction at output index "index".
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    if (ReducingInnerMostDims) {
      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
        return index * m_preservedStrides[0];
      } else {
        return index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumOutputDims - 1; i > 0; --i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[0] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[0];
      }
    } else {
      for (int i = 0; i < NumOutputDims - 1; ++i) {
        // This is index_i in the output tensor.
        const Index idx = index / m_outputStrides[i];
        startInput += idx * m_preservedStrides[i];
        index -= idx * m_outputStrides[i];
      }
      if (PreservingInnerMostDims) {
        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
        startInput += index;
      } else {
        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
      }
    }
    return startInput;
  }
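  // Continuing the {4, 5, 6} ColMajor example reduced over dimension {1}
  // (illustrative): output index 9 has output coordinates (1, 2), so the loop
  // above computes idx = 9 / m_outputStrides[1] = 2, startInput = 2 * 20 = 40
  // and index = 9 - 2 * 4 = 1; the preserved innermost stride is 1, so the
  // final step adds 1, giving firstInput(9) == 41, which is the linear input
  // index of coordinates (1, 0, 2).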
  // ... (member variables m_reduced, m_dimensions, m_outputStrides, m_fastOutputStrides,
  //      m_preservedStrides, m_output_to_input_dim_map, m_numValuesToReduce, m_reducedStrides,
  //      m_reducedDims, m_impl, m_reducer, m_result and m_device elided)
};

template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device) {}
};
template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>,
                                          Eigen::SyclDevice> {
  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
      Base;
  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device)
      : Base(op, device) {}

  // The recursive coeff/packet implementations in the base class cannot be used
  // inside a SYCL kernel, so they are overridden to read directly from the
  // already-evaluated result buffer.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
    return *(this->data() + index);
  }
  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
    return internal::pload<typename Base::PacketReturnType>(this->data() + index);
  }
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H