TensorReduction.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 // Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
6 //
7 // This Source Code Form is subject to the terms of the Mozilla
8 // Public License v. 2.0. If a copy of the MPL was not distributed
9 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
10 
11 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
12 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
13 
14 // clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
15 // so we'll use a macro to make clang happy.
16 #ifndef KERNEL_FRIEND
17 #if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
18 #define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
19 #else
20 #define KERNEL_FRIEND friend
21 #endif
22 #endif
23 
24 // IWYU pragma: private
25 #include "./InternalHeaderCheck.h"
26 
27 namespace Eigen {
28 
36 namespace internal {
37 template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
38 struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> > : traits<XprType> {
39  typedef traits<XprType> XprTraits;
40  typedef typename XprTraits::Scalar Scalar;
41  typedef typename XprTraits::StorageKind StorageKind;
42  typedef typename XprTraits::Index Index;
43  typedef typename XprType::Nested Nested;
44  static constexpr int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
45  static constexpr int Layout = XprTraits::Layout;
46  typedef typename XprTraits::PointerType PointerType;
47 
48  template <class T>
49  struct MakePointer {
50  // Intermediate typedef to work around an MSVC issue.
51  typedef MakePointer_<T> MakePointerT;
52  typedef typename MakePointerT::Type Type;
53  };
54 };
55 
56 template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
57 struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense> {
58  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
59 };
60 
61 template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
62 struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1,
63  typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type> {
64  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
65 };
66 
67 template <typename OutputDims>
68 struct DimInitializer {
69  template <typename InputDims, typename ReducedDims>
70  EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims,
71  const array<bool, internal::array_size<InputDims>::value>& reduced,
72  OutputDims* output_dims, ReducedDims* reduced_dims) {
73  const int NumInputDims = internal::array_size<InputDims>::value;
74  int outputIndex = 0;
75  int reduceIndex = 0;
76  for (int i = 0; i < NumInputDims; ++i) {
77  if (reduced[i]) {
78  (*reduced_dims)[reduceIndex] = input_dims[i];
79  ++reduceIndex;
80  } else {
81  (*output_dims)[outputIndex] = input_dims[i];
82  ++outputIndex;
83  }
84  }
85  }
86 };
87 
88 template <>
89 struct DimInitializer<Sizes<> > {
90  template <typename InputDims, typename Index, size_t Rank>
91  EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims, const array<bool, Rank>&, Sizes<>*,
92  array<Index, Rank>* reduced_dims) {
93  const int NumInputDims = internal::array_size<InputDims>::value;
94  for (int i = 0; i < NumInputDims; ++i) {
95  (*reduced_dims)[i] = input_dims[i];
96  }
97  }
98 };
99 
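// (Editorial note, not part of the original source.) The two traits below detect, when the
// reduction indices are statically known, whether the reduced dimensions are exactly the
// innermost dimensions of the input (are_inner_most_dims) and whether the innermost input
// dimension is left untouched (preserve_inner_most_dims). The unspecialized templates
// conservatively evaluate to false.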
100 template <typename ReducedDims, int NumTensorDims, int Layout>
101 struct are_inner_most_dims {
102  static const bool value = false;
103 };
104 template <typename ReducedDims, int NumTensorDims, int Layout>
105 struct preserve_inner_most_dims {
106  static const bool value = false;
107 };
108 
109 template <typename ReducedDims, int NumTensorDims>
110 struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
111  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
112  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
113  static const bool tmp3 =
114  index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, array_size<ReducedDims>::value - 1);
115  static const bool value = tmp1 & tmp2 & tmp3;
116 };
117 template <typename ReducedDims, int NumTensorDims>
118 struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
119  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
120  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
121  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
122  static const bool value = tmp1 & tmp2 & tmp3;
123 };
124 template <typename ReducedDims, int NumTensorDims>
125 struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
126  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
127  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
128  static const bool value = tmp1 & tmp2;
129 };
130 template <typename ReducedDims, int NumTensorDims>
131 struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
132  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
133  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
134  static const bool value = tmp1 & tmp2;
135 };
136 
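// (Editorial note, not part of the original source.) GenericDimReducer recursively walks the
// reduced dimensions from DimIndex down to 0, feeding every addressed input coefficient to the
// reducer; the DimIndex == -1 specialization reduces a single coefficient and is used when there
// are no reduced dimensions left to iterate.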
137 template <int DimIndex, typename Self, typename Op>
138 struct GenericDimReducer {
139  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
140  Op& reducer, typename Self::CoeffReturnType* accum) {
141  EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
142  for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
143  const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
144  GenericDimReducer<DimIndex - 1, Self, Op>::reduce(self, input, reducer, accum);
145  }
146  }
147 };
148 template <typename Self, typename Op>
149 struct GenericDimReducer<0, Self, Op> {
150  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
151  Op& reducer, typename Self::CoeffReturnType* accum) {
152  for (int j = 0; j < self.m_reducedDims[0]; ++j) {
153  const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
154  reducer.reduce(self.m_impl.coeff(input), accum);
155  }
156  }
157 };
158 template <typename Self, typename Op>
159 struct GenericDimReducer<-1, Self, Op> {
160  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer,
161  typename Self::CoeffReturnType* accum) {
162  reducer.reduce(self.m_impl.coeff(index), accum);
163  }
164 };
165 
166 template <typename Self, typename Op,
167  bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
168  bool UseTreeReduction = (!Self::ReducerTraits::IsStateful && !Self::ReducerTraits::IsExactlyAssociative &&
169  // GPU threads can quickly run out of stack space
170  // for moderately sized inputs.
171  !Self::RunningOnGPU)>
172 struct InnerMostDimReducer {
173  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
174  const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
175  typename Self::CoeffReturnType accum = reducer.initialize();
176  for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
177  reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
178  }
179  return reducer.finalize(accum);
180  }
181 };
182 
183 template <typename Self, typename Op>
184 struct InnerMostDimReducer<Self, Op, true, false> {
185  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
186  const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer0) {
187  using Index = typename Self::Index;
188  const Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
189  Index start = 0;
190  typename Self::PacketReturnType paccum0 = reducer0.template initializePacket<typename Self::PacketReturnType>();
191  if (!Self::ReducerTraits::IsStateful && numValuesToReduce >= 4 * packetSize) {
192  const Index VectorizedSize4 = (numValuesToReduce / (4 * packetSize)) * (4 * packetSize);
193  typename Self::PacketReturnType paccum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
194  typename Self::PacketReturnType paccum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
195  typename Self::PacketReturnType paccum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
196  const Index offset0 = firstIndex;
197  const Index offset1 = firstIndex + packetSize;
198  const Index offset2 = firstIndex + 2 * packetSize;
199  const Index offset3 = firstIndex + 3 * packetSize;
200  for (Index j = 0; j < VectorizedSize4; j += 4 * packetSize) {
201  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset0 + j), &paccum0);
202  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset1 + j), &paccum1);
203  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset2 + j), &paccum2);
204  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset3 + j), &paccum3);
205  }
206  reducer0.reducePacket(paccum1, &paccum0);
207  reducer0.reducePacket(paccum2, &paccum0);
208  reducer0.reducePacket(paccum3, &paccum0);
209  start = VectorizedSize4;
210  }
211  if (start <= (numValuesToReduce - packetSize)) {
212  const Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
213  for (Index j = start; j < VectorizedSize; j += packetSize) {
214  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum0);
215  }
216  start = VectorizedSize;
217  }
218  typename Self::CoeffReturnType accum = reducer0.initialize();
219  for (Index j = start; j < numValuesToReduce; ++j) {
220  reducer0.reduce(self.m_impl.coeff(firstIndex + j), &accum);
221  }
222  return reducer0.finalizeBoth(accum, paccum0);
223  }
224 };
225 
226 #if !defined(EIGEN_HIPCC)
227 
228 // The following implements tree-based reduction, which improves the accuracy
229 // of sum and mean reductions, since each of the n inputs only participates in
230 // O(log n) additions.
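// (Editorial sketch, not Eigen code.) The recursion implemented below, written for a hypothetical
// reducer r over values v[0..n):
//   reduce(v, n) = n <= LeafSize<T>() ? linear_reduce(v, n)
//                                     : r(reduce(v, n/2), reduce(v + n/2, n - n/2))
// so each input flows through O(log n) accumulations instead of up to O(n) for a running sum.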
231 template <typename T>
232 EIGEN_DEVICE_FUNC inline Index LeafSize() {
233  return 1024;
234 }
235 template <>
236 EIGEN_DEVICE_FUNC inline Index LeafSize<half>() {
237  return 200;
238 }
239 template <>
240 EIGEN_DEVICE_FUNC inline Index LeafSize<bfloat16>() {
241  return 128;
242 }
243 
244 template <typename Self, typename Op>
245 struct InnerMostDimReducer<Self, Op, false, true> {
246  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
247  const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
248  const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
249  typename Self::CoeffReturnType accum = reducer.initialize();
250  if (numValuesToReduce > kLeafSize) {
251  const typename Self::Index half = numValuesToReduce / 2;
252  // Recursively reduce the two halves.
253  reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
254  reducer.reduce(reduce(self, firstIndex + half, numValuesToReduce - half, reducer), &accum);
255  return reducer.finalize(accum);
256  } else {
257  return InnerMostDimReducer<Self, Op, false, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
258  }
259  }
260 };
261 
262 template <typename Self, typename Op>
263 struct InnerMostDimReducer<Self, Op, true, true> {
264  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
265  const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
266  const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
267  const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
268  typename Self::CoeffReturnType accum = reducer.initialize();
269  if (numValuesToReduce > packetSize * kLeafSize) {
270  // Make sure the split point is aligned on a packet boundary.
271  const typename Self::Index split =
272  packetSize *
273  numext::div_ceil(firstIndex + numext::div_ceil(numValuesToReduce, typename Self::Index(2)), packetSize);
274  const typename Self::Index num_left = numext::mini(split - firstIndex, numValuesToReduce);
275  reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
276  if (num_left < numValuesToReduce) {
277  reducer.reduce(reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
278  }
279  return reducer.finalize(accum);
280  } else {
281  return InnerMostDimReducer<Self, Op, true, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
282  }
283  }
284 };
285 #endif
286 
287 template <int DimIndex, typename Self, typename Op,
288  bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
289 struct InnerMostDimPreserver {
290  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&,
291  typename Self::PacketReturnType*) {
292  eigen_assert(false && "should never be called");
293  }
294 };
295 
296 template <int DimIndex, typename Self, typename Op>
297 struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
298  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
299  Op& reducer, typename Self::PacketReturnType* accum) {
300  EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
301  for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
302  const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
303  InnerMostDimPreserver<DimIndex - 1, Self, Op>::reduce(self, input, reducer, accum);
304  }
305  }
306 };
307 
308 template <typename Self, typename Op>
309 struct InnerMostDimPreserver<0, Self, Op, true> {
310  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
311  Op& reducer0, typename Self::PacketReturnType* accum0) {
312  using Index = typename Self::Index;
313  const Index stride = self.m_reducedStrides[0];
314  const Index size = self.m_reducedDims[0];
315  if (!Self::ReducerTraits::IsStateful && size >= 16) {
316  const Index unrolled_size4 = (size / 4) * 4;
317  typename Self::PacketReturnType accum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
318  typename Self::PacketReturnType accum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
319  typename Self::PacketReturnType accum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
320  for (Index j = 0; j < unrolled_size4; j += 4) {
321  const Index input0 = firstIndex + j * stride;
322  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input0), accum0);
323  const Index input1 = firstIndex + (j + 1) * stride;
324  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input1), &accum1);
325  const Index input2 = firstIndex + (j + 2) * stride;
326  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input2), &accum2);
327  const Index input3 = firstIndex + (j + 3) * stride;
328  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input3), &accum3);
329  }
330  reducer0.reducePacket(accum1, accum0);
331  reducer0.reducePacket(accum2, accum0);
332  reducer0.reducePacket(accum3, accum0);
333  for (Index j = unrolled_size4; j < size; ++j) {
334  Index input = firstIndex + j * stride;
335  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
336  }
337  } else {
338  for (Index j = 0; j < size; ++j) {
339  Index input = firstIndex + j * stride;
340  reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
341  }
342  }
343  }
344 };
345 template <typename Self, typename Op>
346 struct InnerMostDimPreserver<-1, Self, Op, true> {
347  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&,
348  typename Self::PacketReturnType*) {
349  eigen_assert(false && "should never be called");
350  }
351 };
352 
353 // Default full reducer
354 template <typename Self, typename Op, typename Device,
355  bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
356 struct FullReducer {
357  static constexpr bool HasOptimizedImplementation = false;
358 
359  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&,
360  typename Self::EvaluatorPointerType output) {
361  const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
362  *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
363  }
364 };
365 
366 #ifdef EIGEN_USE_THREADS
367 // Multithreaded full reducers
368 template <typename Self, typename Op,
369  bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
370 struct FullReducerShard {
371  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
372  typename Self::Index numValuesToReduce, Op& reducer,
373  typename Self::CoeffReturnType* output) {
374  *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, firstIndex, numValuesToReduce, reducer);
375  }
376 };
377 
378 // Multithreaded full reducer
379 template <typename Self, typename Op, bool Vectorizable>
380 struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
381  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
382  static constexpr Index PacketSize = unpacket_traits<typename Self::PacketReturnType>::size;
383 
384  // launch one reducer per thread and accumulate the result.
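 // (Editorial note, not part of the original source.) Example of the sharding arithmetic below,
 // assuming num_coeffs == 1000 and the cost model selects 3 threads: blocksize = 333 and
 // numblocks = 3, so the shards cover coefficients [0, 999) and the remaining coefficient is
 // folded into finalShard on the calling thread.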
385  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
386  typename Self::CoeffReturnType* output) {
387  typedef typename Self::Index Index;
388  const Index num_coeffs = array_prod(self.m_impl.dimensions());
389  if (num_coeffs == 0) {
390  *output = reducer.finalize(reducer.initialize());
391  return;
392  }
393  const TensorOpCost cost = self.m_impl.costPerCoeff(Vectorizable) +
394  TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, PacketSize);
395  const Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(num_coeffs, cost, device.numThreads());
396  if (num_threads == 1) {
397  *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
398  return;
399  }
400  const Index blocksize = num_coeffs / num_threads;
401  const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
402  eigen_assert(num_coeffs >= numblocks * blocksize);
403 
404  Barrier barrier(internal::convert_index<unsigned int>(numblocks));
405  MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
406  for (Index i = 0; i < numblocks; ++i) {
407  device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run, self, i * blocksize,
408  blocksize, reducer, &shards[i]);
409  }
410  typename Self::CoeffReturnType finalShard;
411  if (numblocks * blocksize < num_coeffs) {
412  finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, numblocks * blocksize,
413  num_coeffs - numblocks * blocksize, reducer);
414  } else {
415  finalShard = reducer.initialize();
416  }
417  barrier.Wait();
418 
419  for (Index i = 0; i < numblocks; ++i) {
420  reducer.reduce(shards[i], &finalShard);
421  }
422  *output = reducer.finalize(finalShard);
423  }
424 };
425 
426 #endif
427 
428 // Default inner reducer
429 template <typename Self, typename Op, typename Device>
430 struct InnerReducer {
431  static constexpr bool HasOptimizedImplementation = false;
432 
433  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
434  typename Self::Index, typename Self::Index) {
435  eigen_assert(false && "Not implemented");
436  return true;
437  }
438 };
439 
440 // Default outer reducer
441 template <typename Self, typename Op, typename Device>
442 struct OuterReducer {
443  static constexpr bool HasOptimizedImplementation = false;
444 
445  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
446  typename Self::Index, typename Self::Index) {
447  eigen_assert(false && "Not implemented");
448  return true;
449  }
450 };
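// (Editorial note, not part of the original source.) InnerReducer covers the case where the
// reduced dimensions are the innermost, contiguous ones, and OuterReducer the case where they
// are the outermost ones; device-specific specializations (GPU, SYCL) supply optimized
// implementations, while these defaults merely signal "not implemented".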
451 
452 #ifdef EIGEN_USE_SYCL
453 // Default Generic reducer
454 template <typename Self, typename Op, typename Device>
455 struct GenericReducer {
456  static constexpr bool HasOptimizedImplementation = false;
457 
458  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
459  typename Self::Index, typename Self::Index) {
460  eigen_assert(false && "Not implemented");
461  return true;
462  }
463 };
464 #endif
465 
466 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
467 template <int B, int N, typename S, typename R, typename I_>
468 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
469  unsigned int*);
470 
471 #if defined(EIGEN_HAS_GPU_FP16)
472 template <typename S, typename R, typename I_>
473 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
474  R, const S, I_, packet_traits<Eigen::half>::type*);
475 template <int B, int N, typename S, typename R, typename I_>
476 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*,
477  packet_traits<Eigen::half>::type*);
478 template <int NPT, typename S, typename R, typename I_>
479 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
480 
481 #endif
482 
483 template <int NPT, typename S, typename R, typename I_>
484 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
485 
486 template <int NPT, typename S, typename R, typename I_>
487 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
488 #endif
489 
498 template <typename Op, typename CoeffReturnType>
499 struct ReductionReturnType {
500 #if defined(EIGEN_USE_SYCL)
501  typedef std::remove_const_t<decltype(std::declval<Op>().initialize())> type;
502 #else
503  typedef std::remove_const_t<CoeffReturnType> type;
504 #endif
505 };
506 
507 } // end namespace internal
508 
509 template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
510 class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
511  public:
512  typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
513  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
514  typedef std::remove_const_t<typename XprType::CoeffReturnType> CoeffReturnType;
515  typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
516  typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
517  typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
518 
519  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims)
520  : m_expr(expr), m_dims(dims) {}
521  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer)
522  : m_expr(expr), m_dims(dims), m_reducer(reducer) {}
523 
524  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; }
525  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; }
526  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& reducer() const { return m_reducer; }
527 
528  protected:
529  typename XprType::Nested m_expr;
530  const Dims m_dims;
531  const Op m_reducer;
532 };
533 
534 template <typename ArgType, typename Device>
535 struct TensorReductionEvaluatorBase;
536 
537 // Eval as rvalue
538 template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
539 struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
540  typedef internal::reducer_traits<Op, Device> ReducerTraits;
541  typedef Dims ReducedDims;
542  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
543  typedef typename XprType::Index Index;
544  typedef ArgType ChildType;
545  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
546  static constexpr int NumInputDims = internal::array_size<InputDimensions>::value;
547  static constexpr int NumReducedDims = internal::array_size<Dims>::value;
548  static constexpr int NumOutputDims = NumInputDims - NumReducedDims;
549  typedef std::conditional_t<NumOutputDims == 0, Sizes<>, DSizes<Index, NumOutputDims> > Dimensions;
550  typedef typename XprType::Scalar Scalar;
551  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
552  static constexpr bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
553  typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
554  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
555  static constexpr Index PacketSize = PacketType<CoeffReturnType, Device>::size;
556 
557  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
558  typedef StorageMemory<CoeffReturnType, Device> Storage;
559  typedef typename Storage::Type EvaluatorPointerType;
560 
561  // Subset of strides of the input tensor for the non-reduced dimensions.
562  // Indexed by output dimensions.
563  static constexpr int NumPreservedStrides = max_n_1<NumOutputDims>::size;
564 
565  // For full reductions
566 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
567  static constexpr bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
568  static constexpr bool RunningOnSycl = false;
569 #elif defined(EIGEN_USE_SYCL)
570  static constexpr bool RunningOnSycl = internal::is_same<internal::remove_all_t<Device>, Eigen::SyclDevice>::value;
571  static constexpr bool RunningOnGPU = false;
572 #else
573  static constexpr bool RunningOnGPU = false;
574  static constexpr bool RunningOnSycl = false;
575 #endif
576 
577  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
578  enum {
579  IsAligned = false,
580  PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
581  BlockAccess = false,
582  PreferBlockAccess = true,
583  CoordAccess = false, // to be implemented
584  RawAccess = false
585  };
586 
587  typedef std::remove_const_t<Scalar> ScalarNoConst;
588 
589  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
590  typedef internal::TensorBlockNotImplemented TensorBlock;
591  //===--------------------------------------------------------------------===//
592 
593  static constexpr bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
594  static constexpr bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
595  static constexpr bool RunningFullReduction = (NumOutputDims == 0);
596 
597  EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
598  : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) {
599  EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
600  EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
601  YOU_MADE_A_PROGRAMMING_MISTAKE);
602 
603  // Build the bitmap indicating if an input dimension is reduced or not.
604  for (int i = 0; i < NumInputDims; ++i) {
605  m_reduced[i] = false;
606  }
607  for (int i = 0; i < NumReducedDims; ++i) {
608  eigen_assert(op.dims()[i] >= 0);
609  eigen_assert(op.dims()[i] < NumInputDims);
610  m_reduced[op.dims()[i]] = true;
611  }
612 
613  const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
614  internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);
615 
616  // Precompute output strides.
617  if (NumOutputDims > 0) {
618  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
619  m_outputStrides[0] = 1;
620  for (int i = 1; i < NumOutputDims; ++i) {
621  m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
622  m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
623  }
624  } else {
625  m_outputStrides[static_cast<size_t>(NumOutputDims - 1)] = 1;
626  for (int i = NumOutputDims - 2; i >= 0; --i) {
627  m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
628  m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
629  }
630  }
631  }
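 // (Editorial note, not part of the original source.) For example, a ColMajor output of
 // dimensions (2, 3, 4) yields m_outputStrides = {1, 2, 6}, while the same RowMajor output
 // yields m_outputStrides = {12, 4, 1}.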
632 
633  // Precompute input strides.
634  if (NumInputDims > 0) {
635  array<Index, NumInputDims> input_strides;
636  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
637  input_strides[0] = 1;
638  for (int i = 1; i < NumInputDims; ++i) {
639  input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
640  }
641  } else {
642  input_strides.back() = 1;
643  for (int i = NumInputDims - 2; i >= 0; --i) {
644  input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
645  }
646  }
647 
648  int outputIndex = 0;
649  int reduceIndex = 0;
650  for (int i = 0; i < NumInputDims; ++i) {
651  if (m_reduced[i]) {
652  m_reducedStrides[reduceIndex] = input_strides[i];
653  ++reduceIndex;
654  } else {
655  m_preservedStrides[outputIndex] = input_strides[i];
656  m_output_to_input_dim_map[outputIndex] = i;
657  ++outputIndex;
658  }
659  }
660  }
661 
662  // Special case for full reductions
663  if (NumOutputDims == 0) {
664  m_preservedStrides[0] = internal::array_prod(input_dims);
665  }
666 
667  m_numValuesToReduce = NumOutputDims == 0 ? internal::array_prod(input_dims)
668  : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
669  ? m_preservedStrides[0]
670  : m_preservedStrides[static_cast<size_t>(NumOutputDims - 1)];
671  }
672 
673  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
674 
675  EIGEN_STRONG_INLINE bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
676  // Use the FullReducer if possible.
677  if ((RunningFullReduction && RunningOnSycl) ||
678  (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
679  ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || !RunningOnGPU))) {
680  bool need_assign = false;
681  if (!data) {
682  m_result = static_cast<EvaluatorPointerType>(
683  m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
684  data = m_result;
685  need_assign = true;
686  }
687  Op reducer(m_reducer);
688  internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
689  return need_assign;
690  }
691 
692  // Attempt to use an optimized reduction.
693  else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
694  bool reducing_inner_dims = true;
695  for (int i = 0; i < NumReducedDims; ++i) {
696  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
697  reducing_inner_dims &= m_reduced[i];
698  } else {
699  reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
700  }
701  }
702  if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
703  (reducing_inner_dims || ReducingInnerMostDims)) {
704  const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
705  const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
706  if (!data) {
707  if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve &&
708  num_values_to_reduce > 128) ||
709  (RunningOnSycl)) {
710  data = static_cast<EvaluatorPointerType>(m_device.get(
711  (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
712  m_result = data;
713  } else {
714  return true;
715  }
716  }
717  Op reducer(m_reducer);
718  // For SYCL this call always returns false.
719  if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
720  num_coeffs_to_preserve)) {
721  if (m_result) {
722  m_device.deallocate_temp(m_result);
723  m_result = NULL;
724  }
725  return true;
726  } else {
727  return (m_result != NULL);
728  }
729  }
730 
731  bool preserving_inner_dims = true;
732  for (int i = 0; i < NumReducedDims; ++i) {
733  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
734  preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
735  } else {
736  preserving_inner_dims &= m_reduced[i];
737  }
738  }
739  if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation && preserving_inner_dims) {
740  const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
741  const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
742  if (!data) {
743  if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve &&
744  num_values_to_reduce > 32) ||
745  (RunningOnSycl)) {
746  data = static_cast<EvaluatorPointerType>(m_device.get(
747  (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
748  m_result = data;
749  } else {
750  return true;
751  }
752  }
753  Op reducer(m_reducer);
754  // For SYCL this call always returns false.
755  if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
756  num_coeffs_to_preserve)) {
757  if (m_result) {
758  m_device.deallocate_temp(m_result);
759  m_result = NULL;
760  }
761  return true;
762  } else {
763  return (m_result != NULL);
764  }
765  }
766 #if defined(EIGEN_USE_SYCL)
767  // If there is no optimised version for SYCL, the reduction expression
768  // must be broken into two subexpressions and use the generic SYCL reducer on the device.
769  if (RunningOnSycl) {
770  const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
771  const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
772  if (!data) {
773  data = static_cast<EvaluatorPointerType>(
774  m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
775  m_result = data;
776  }
777  Op reducer(m_reducer);
778  internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
779  num_coeffs_to_preserve);
780  return (m_result != NULL);
781  }
782 #endif
783  }
784  return true;
785  }
786 
787 #ifdef EIGEN_USE_THREADS
788  template <typename EvalSubExprsCallback>
789  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) {
790  m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { done(evalSubExprsIfNeededCommon(data)); });
791  }
792 #endif
793 
794  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
795  m_impl.evalSubExprsIfNeeded(NULL);
796  return evalSubExprsIfNeededCommon(data);
797  }
798 
799  EIGEN_STRONG_INLINE void cleanup() {
800  m_impl.cleanup();
801  if (m_result) {
802  m_device.deallocate_temp(m_result);
803  m_result = NULL;
804  }
805  }
806 
807  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
808  if ((RunningFullReduction || RunningOnGPU) && m_result) {
809  return *(m_result + index);
810  }
811  Op reducer(m_reducer);
812  if (ReducingInnerMostDims || RunningFullReduction) {
813  const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor))
814  ? m_preservedStrides[0]
815  : m_preservedStrides[NumPreservedStrides - 1];
816  return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index), num_values_to_reduce, reducer);
817  } else {
818  typename Self::CoeffReturnType accum = reducer.initialize();
819  internal::GenericDimReducer<NumReducedDims - 1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
820  return reducer.finalize(accum);
821  }
822  }
823 
824  // TODO(bsteiner): provide a more efficient implementation.
825  template <int LoadMode>
826  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
827  eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
828 
829  if (RunningOnGPU && m_result) {
830  return internal::pload<PacketReturnType>(m_result + index);
831  }
832 
833  EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
834  if (ReducingInnerMostDims) {
835  const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor))
836  ? m_preservedStrides[0]
837  : m_preservedStrides[NumPreservedStrides - 1];
838  const Index firstIndex = firstInput(index);
839  for (Index i = 0; i < PacketSize; ++i) {
840  Op reducer(m_reducer);
841  values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
842  num_values_to_reduce, reducer);
843  }
844  } else if (PreservingInnerMostDims) {
845  const Index firstIndex = firstInput(index);
846  const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
847  // TBD: extend this to the n innermost dimensions that we preserve.
848  if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
849  Op reducer(m_reducer);
850  typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
851  internal::InnerMostDimPreserver<NumReducedDims - 1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
852  return reducer.finalizePacket(accum);
853  } else {
854  for (int i = 0; i < PacketSize; ++i) {
855  values[i] = coeff(index + i);
856  }
857  }
858  } else {
859  for (int i = 0; i < PacketSize; ++i) {
860  values[i] = coeff(index + i);
861  }
862  }
863  PacketReturnType rslt = internal::pload<PacketReturnType>(values);
864  return rslt;
865  }
866 
867  // Must be called after evalSubExprsIfNeeded().
868  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
869  if (RunningFullReduction && m_result) {
870  return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
871  } else {
872  const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
873  const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
874  return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
875  TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
876  }
877  }
878 
879  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
880  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
881  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
882 
883  private:
884  template <int, typename, typename>
885  friend struct internal::GenericDimReducer;
886  template <typename, typename, bool, bool>
887  friend struct internal::InnerMostDimReducer;
888  template <int, typename, typename, bool>
889  friend struct internal::InnerMostDimPreserver;
890  template <typename S, typename O, typename D, bool V>
891  friend struct internal::FullReducer;
892 #ifdef EIGEN_USE_THREADS
893  template <typename S, typename O, bool V>
894  friend struct internal::FullReducerShard;
895 #endif
896 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
897  template <int B, int N, typename S, typename R, typename I_>
898  KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
899 #if defined(EIGEN_HAS_GPU_FP16)
900  template <typename S, typename R, typename I_>
901  KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_,
902  internal::packet_traits<Eigen::half>::type*);
903  template <int B, int N, typename S, typename R, typename I_>
904  KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*,
905  internal::packet_traits<Eigen::half>::type*);
906  template <int NPT, typename S, typename R, typename I_>
907  KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
908 #endif
909  template <int NPT, typename S, typename R, typename I_>
910  KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
911 
912  template <int NPT, typename S, typename R, typename I_>
913  KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
914 #endif
915 
916 #if defined(EIGEN_USE_SYCL)
917  template <typename Evaluator_, typename Op__>
919  // SYCL needs the generic reducer when the reduction algorithm is neither the inner, outer, nor full reducer.
920  template <typename, typename, typename>
921  friend struct internal::GenericReducer;
922 #endif
923 
924  template <typename S, typename O, typename D>
925  friend struct internal::InnerReducer;
926 
927  struct BlockIteratorState {
931  };
932 
933  // Returns the Index in the input tensor of the first value that needs to be
934  // used to compute the reduction at output index "index".
935  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
936  if (ReducingInnerMostDims) {
937  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
938  return index * m_preservedStrides[0];
939  } else {
940  return index * m_preservedStrides[NumPreservedStrides - 1];
941  }
942  }
943  // TBD: optimize the case where we preserve the innermost dimensions.
944  Index startInput = 0;
945  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
946  for (int i = NumOutputDims - 1; i > 0; --i) {
947  // This is index_i in the output tensor.
948  const Index idx = index / m_outputStrides[i];
949  startInput += idx * m_preservedStrides[i];
950  index -= idx * m_outputStrides[i];
951  }
952  if (PreservingInnerMostDims) {
953  eigen_assert(m_preservedStrides[0] == 1);
954  startInput += index;
955  } else {
956  startInput += index * m_preservedStrides[0];
957  }
958  } else {
959  for (int i = 0; i < NumOutputDims - 1; ++i) {
960  // This is index_i in the output tensor.
961  const Index idx = index / m_outputStrides[i];
962  startInput += idx * m_preservedStrides[i];
963  index -= idx * m_outputStrides[i];
964  }
965  if (PreservingInnerMostDims) {
966  eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
967  startInput += index;
968  } else {
969  startInput += index * m_preservedStrides[NumPreservedStrides - 1];
970  }
971  }
972  return startInput;
973  }
974 
975  // Bitmap indicating if an input dimension is reduced or not.
976  array<bool, NumInputDims> m_reduced;
977  // Dimensions of the output of the operation.
978  Dimensions m_dimensions;
979  // Precomputed strides for the output tensor.
980  // Avoid zero-sized arrays, since element access fails to compile on GPU.
981  array<Index, (std::max)(NumOutputDims, 1)> m_outputStrides;
982  array<internal::TensorIntDivisor<Index>, (std::max)(NumOutputDims, 1)> m_fastOutputStrides;
983  array<Index, (std::max)(NumPreservedStrides, 1)> m_preservedStrides;
984  // Map from output to input dimension index.
985  array<Index, (std::max)(NumOutputDims, 1)> m_output_to_input_dim_map;
986  // How many values go into each reduction
987  Index m_numValuesToReduce;
988 
989  // Subset of strides of the input tensor for the reduced dimensions.
990  // Indexed by reduced dimensions.
991  array<Index, NumReducedDims> m_reducedStrides;
992  // Size of the input dimensions that are reduced.
993  // Indexed by reduced dimensions.
994  array<Index, NumReducedDims> m_reducedDims;
995 
996  // Evaluator for the input expression.
997  TensorEvaluator<ArgType, Device> m_impl;
998 
999  // Operation to apply for computing the reduction.
1000  Op m_reducer;
1001 
1002  EvaluatorPointerType m_result;
1003 
1004  const Device EIGEN_DEVICE_REF m_device;
1005 };
1006 
1007 template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
1008 struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
1009  : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
1010  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
1011  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device) {}
1012 };
1013 
1014 template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
1015 struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
1016  : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
1017  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
1018  Base;
1019  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device)
1020  : Base(op, device) {}
1021  // The coeff function in the base class uses a recursive method that cannot be used inside a
1022  // SYCL kernel.
1023  // Therefore the coeff function is overridden here for the SYCL backend.
1024  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
1025  return *(this->data() + index);
1026  }
1027  // The packet function in the base class uses a recursive method that cannot be used inside a
1028  // SYCL kernel.
1029  // Therefore the packet function is overridden here for the SYCL backend.
1030  template <int LoadMode>
1031  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
1032  return internal::pload<typename Base::PacketReturnType>(this->data() + index);
1033  }
1034 };
1035 
1036 } // end namespace Eigen
1037 
1038 #endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
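
A minimal usage sketch (an editorial addition, not part of the Eigen source above). It shows how the TensorReductionOp expression defined in this header is normally created through the public Tensor API: Tensor::sum(dims) builds a TensorReductionOp with internal::SumReducer as the Op, and the evaluator in this file performs the reduction.

#include <iostream>
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 3> t(2, 3, 4);
  t.setRandom();

  // Partial reduction: sum over dimensions 1 and 2; the result keeps dimension 0 (size 2).
  Eigen::array<Eigen::Index, 2> dims{{1, 2}};
  Eigen::Tensor<float, 1> partial = t.sum(dims);

  // Full reduction: sum over all dimensions, yielding a rank-0 tensor.
  Eigen::Tensor<float, 0> total = t.sum();

  std::cout << "partial(0) = " << partial(0) << ", total = " << total() << std::endl;
  return 0;
}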