TensorScan.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

// IWYU pragma: private
#include "./InternalHeaderCheck.h"

namespace Eigen {

namespace internal {

template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> > : public traits<XprType> {
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprType::Nested Nested;
  typedef std::remove_reference_t<Nested> Nested_;
  static constexpr int NumDimensions = XprTraits::NumDimensions;
  static constexpr int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};

template <typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense> {
  typedef const TensorScanOp<Op, XprType>& type;
};

template <typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1, typename eval<TensorScanOp<Op, XprType> >::type> {
  typedef TensorScanOp<Op, XprType> type;
};
}  // end namespace internal

/** \class TensorScan
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor scan class.
 */
template <typename Op, typename XprType>
class TensorScanOp : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
 public:
  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(const XprType& expr, const Index& axis, bool exclusive = false,
                                                     const Op& op = Op())
      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index axis() const { return m_axis; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op accumulator() const { return m_accumulator; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; }

 protected:
  typename XprType::Nested m_expr;
  const Index m_axis;
  const Op m_accumulator;
  const bool m_exclusive;
};

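// A minimal usage sketch (assuming the TensorBase::cumsum / TensorBase::cumprod entry
// points, which construct a TensorScanOp with a sum or product reducer over the chosen
// axis; the expression is evaluated when it is assigned to a tensor):
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::Tensor<float, 2> t(3, 4);
//   t.setRandom();
//   Eigen::Tensor<float, 2> running_sum = t.cumsum(1);                      // inclusive scan along axis 1
//   Eigen::Tensor<float, 2> shifted_sum = t.cumsum(1, /*exclusive=*/true);  // starts from the reducer's identity
//   Eigen::Tensor<float, 2> running_prod = t.cumprod(0);                    // product scan along axis 0
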
namespace internal {

template <typename Self>
EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset, typename Self::CoeffReturnType* data) {
  // Compute the scan along the axis, starting at the given offset
  typename Self::CoeffReturnType accum = self.accumulator().initialize();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        Index curr = offset + idx3 * self.stride();
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        Index curr = offset + idx3 * self.stride();
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
}
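
// Worked example for ReduceScalar, assuming a sum reducer whose initialize() is 0:
// with size() == 4 and stride() == s, the scan visits data[offset], data[offset + s],
// data[offset + 2 * s] and data[offset + 3 * s]. For inner values {1, 2, 3, 4} the
// inclusive scan stores {1, 3, 6, 10}, while the exclusive scan stores {0, 1, 3, 6},
// because each output is finalized before the current element is reduced into accum.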

template <typename Self>
EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset, typename Self::CoeffReturnType* data) {
  using Scalar = typename Self::CoeffReturnType;
  using Packet = typename Self::PacketReturnType;
  // Compute the scan along the axis, starting at the calculated offset
  Packet accum = self.accumulator().template initializePacket<Packet>();
  if (self.stride() == 1) {
    if (self.exclusive()) {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index curr = offset; curr < offset + self.size(); ++curr) {
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  } else {
    if (self.exclusive()) {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
      }
    } else {
      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
        const Index curr = offset + idx3 * self.stride();
        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
      }
    }
  }
}

template <typename Self, bool Vectorize, bool Parallel>
struct ReduceBlock {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) {
    for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
      // Calculate the starting offset for the scan
      Index offset = idx1 + idx2;
      ReduceScalar(self, offset, data);
    }
  }
};

// Specialization for vectorized reduction.
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) {
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index idx2 = 0;
    for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
      // Calculate the starting offset for the packet scan
      Index offset = idx1 + idx2;
      ReducePacket(self, offset, data);
    }
    for (; idx2 < self.stride(); idx2++) {
      // Calculate the starting offset for the scan
      Index offset = idx1 + idx2;
      ReduceScalar(self, offset, data);
    }
  }
};

// Single-threaded CPU implementation of scan
template <typename Self, typename Reducer, typename Device,
          bool Vectorize = (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
                            internal::reducer_traits<Reducer, Device>::PacketAccess)>
struct ScanLauncher {
  void operator()(Self& self, typename Self::CoeffReturnType* data) const {
    Index total_size = internal::array_prod(self.dimensions());

    // We fix the index along the scan axis to 0 and perform a
    // scan per remaining entry. The iteration is split into two nested
    // loops to avoid an integer division by keeping track of each idx1 and
    // idx2.
    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
      ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
      block_reducer(self, idx1, data);
    }
  }
};
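
// Indexing example for the launcher above: for a ColMajor tensor of dimensions
// (2, 3, 4) scanned along axis 1, stride() == 2 and size() == 3, so idx1 takes the
// values {0, 6, 12, 18} and ReduceBlock iterates idx2 over [0, stride()) == [0, 2).
// Each offset idx1 + idx2 is the linear index of the first element of one scan line,
// and that line covers the coefficients offset, offset + 2 and offset + 4.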

#ifdef EIGEN_USE_THREADS

// Adjust block_size to avoid false sharing of cachelines among
// threads. Currently set to twice the cache line size on Intel and ARM
// processors.
EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
  const Index items_per_cacheline = numext::maxi<Index>(1, kBlockAlignment / item_size);
  return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline);
}
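
// Example: for item_size == sizeof(float) == 4, items_per_cacheline == 128 / 4 == 32,
// so a requested block_size of 100 items is rounded up to 128. Shard boundaries then
// fall on 128-byte multiples relative to the start of `data`, so neighboring shards
// do not write to the same cacheline (up to the alignment of `data` itself).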

template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    Index num_scalars = self.stride();
    Index num_packets = 0;
    if (self.stride() >= PacketSize) {
      num_packets = self.stride() / PacketSize;
      self.device().parallelFor(
          num_packets,
          TensorOpCost(PacketSize * self.size(), PacketSize * self.size(), 16 * PacketSize * self.size(), true,
                       PacketSize),
          // Make the shard size large enough that two neighboring threads
          // won't write to the same cacheline of `data`.
          [=](Index blk_size) { return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size); },
          [&](Index first, Index last) {
            for (Index packet = first; packet < last; ++packet) {
              const Index idx2 = packet * PacketSize;
              ReducePacket(self, idx1 + idx2, data);
            }
          });
      num_scalars -= num_packets * PacketSize;
    }
    self.device().parallelFor(
        num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); },
        [&](Index first, Index last) {
          for (Index scalar = first; scalar < last; ++scalar) {
            const Index idx2 = num_packets * PacketSize + scalar;
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};

template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    self.device().parallelFor(
        self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
        // Make the shard size large enough that two neighboring threads
        // won't write to the same cacheline of `data`.
        [=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); },
        [&](Index first, Index last) {
          for (Index idx2 = first; idx2 < last; ++idx2) {
            ReduceScalar(self, idx1 + idx2, data);
          }
        });
  }
};

// Specialization for multi-threaded execution.
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
  void operator()(Self& self, typename Self::CoeffReturnType* data) {
    using Scalar = typename Self::CoeffReturnType;
    using Packet = typename Self::PacketReturnType;
    const int PacketSize = internal::unpacket_traits<Packet>::size;
    const Index total_size = internal::array_prod(self.dimensions());
    const Index inner_block_size = self.stride() * self.size();
    bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));

    if ((parallelize_by_outer_blocks && total_size <= 4096) ||
        (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
      ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
      launcher(self, data);
      return;
    }

    if (parallelize_by_outer_blocks) {
      // Parallelize over outer blocks.
      const Index num_outer_blocks = total_size / inner_block_size;
      self.device().parallelFor(
          num_outer_blocks,
          TensorOpCost(inner_block_size, inner_block_size, 16 * PacketSize * inner_block_size, Vectorize, PacketSize),
          [=](Index blk_size) { return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size); },
          [&](Index first, Index last) {
            for (Index idx1 = first; idx1 < last; ++idx1) {
              ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
              block_reducer(self, idx1 * inner_block_size, data);
            }
          });
    } else {
      // Parallelize over inner packets/scalars dimensions when the reduction
      // axis is not an inner dimension.
      ReduceBlock<Self, Vectorize, /*Parallel=*/true> block_reducer;
      for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
        block_reducer(self, idx1, data);
      }
    }
  }
};
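
// Note on the heuristic above: since inner_block_size == stride() * size(), the test
// total_size >= stride() * inner_block_size is equivalent to
// num_outer_blocks >= stride(), i.e. the launcher parallelizes over outer blocks
// whenever there are at least as many blocks as there are scan lines inside one block.
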
#endif  // EIGEN_USE_THREADS

#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))

// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
template <typename Self, typename Reducer>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size,
                                                        typename Self::CoeffReturnType* data) {
  // Compute offset as in the CPU version
  Index val = threadIdx.x + blockIdx.x * blockDim.x;
  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();

  if (offset + (self.size() - 1) * self.stride() < total_size) {
    // Compute the scan along the axis, starting at the calculated offset
    typename Self::CoeffReturnType accum = self.accumulator().initialize();
    for (Index idx = 0; idx < self.size(); idx++) {
      Index curr = offset + idx * self.stride();
      if (self.exclusive()) {
        data[curr] = self.accumulator().finalize(accum);
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
      } else {
        self.accumulator().reduce(self.inner().coeff(curr), &accum);
        data[curr] = self.accumulator().finalize(accum);
      }
    }
  }
  __syncthreads();
}
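
// Offset example for ScanKernel: each value of `val` identifies one scan line. With
// stride() == 2 and size() == 3, val == 3 yields offset == (3 / 2) * 6 + 3 % 2 == 7,
// and that thread scans the coefficients 7, 9 and 11. The bounds check in the kernel
// skips threads whose last coefficient, offset + (size() - 1) * stride(), would fall
// outside total_size.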

template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
    Index total_size = internal::array_prod(self.dimensions());
    Index num_blocks = (total_size / self.size() + 63) / 64;
    Index block_size = 64;

    LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
  }
};
#endif  // EIGEN_USE_GPU && (EIGEN_GPUCC)

}  // namespace internal

// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
  typedef TensorScanOp<Op, ArgType> XprType;
  typedef typename XprType::Index Index;
  typedef const ArgType ChildTypeNoConst;
  typedef const ArgType ChildType;
  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
  typedef StorageMemory<Scalar, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
  enum {
    IsAligned = false,
    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess = false,
    PreferBlockAccess = false,
    CoordAccess = false,
    RawAccess = true
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_impl(op.expression(), device),
        m_device(device),
        m_exclusive(op.exclusive()),
        m_accumulator(op.accumulator()),
        m_size(m_impl.dimensions()[op.axis()]),
        m_stride(1),
        m_consume_dim(op.axis()),
        m_output(NULL) {
    // Accumulating a scalar isn't supported.
    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);

    // Compute stride of scan axis
    const Dimensions& dims = m_impl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < op.axis(); ++i) {
        m_stride = m_stride * dims[i];
      }
    } else {
      // dims can only be indexed through unsigned integers,
      // so let's use an unsigned type to let the compiler know.
      // This prevents spurious warnings such as: "'*((void*)(& evaluator)+64)[18446744073709551615]' may be used
      // uninitialized in this function"
      unsigned int axis = internal::convert_index<unsigned int>(op.axis());
      for (unsigned int i = NumDims - 1; i > axis; --i) {
        m_stride = m_stride * dims[i];
      }
    }
  }
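
  // Stride example for the constructor above: for dimensions (2, 3, 4) and axis 1,
  // m_stride is 2 in ColMajor (the product of the dimensions before the axis) and 4 in
  // RowMajor (the product of the dimensions after the axis); consecutive coefficients
  // along the scan axis are therefore m_stride entries apart in memory.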

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { return m_stride; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const { return m_consume_dim; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { return m_size; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const { return m_accumulator; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const { return m_impl; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    m_impl.evalSubExprsIfNeeded(NULL);
    internal::ScanLauncher<Self, Op, Device> launcher;
    if (data) {
      launcher(*this, data);
      return false;
    }

    const Index total_size = internal::array_prod(dimensions());
    m_output =
        static_cast<EvaluatorPointerType>(m_device.get((Scalar*)m_device.allocate_temp(total_size * sizeof(Scalar))));
    launcher(*this, m_output);
    return true;
  }
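
  // Returning false signals that the scan results were written directly into the
  // destination buffer `data`, so the caller does not need to copy anything; returning
  // true means a temporary buffer was allocated and coeff()/packet() below serve the
  // results from m_output until cleanup() releases it.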

  template <int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_output; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_output[index]; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
  }

  EIGEN_STRONG_INLINE void cleanup() {
    if (m_output) {
      m_device.deallocate_temp(m_output);
      m_output = NULL;
    }
    m_impl.cleanup();
  }

 protected:
  TensorEvaluator<ArgType, Device> m_impl;
  const Device EIGEN_DEVICE_REF m_device;
  const bool m_exclusive;
  Op m_accumulator;
  const Index m_size;
  Index m_stride;
  Index m_consume_dim;
  EvaluatorPointerType m_output;
};

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H