TensorBroadcasting.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
12 
13 // IWYU pragma: private
14 #include "./InternalHeaderCheck.h"
15 
16 namespace Eigen {
17 
18 /** \class TensorBroadcasting
19  * \ingroup CXX11_Tensor_Module
20  *
21  * \brief Tensor broadcasting class.
22  *
23  *
24  */
25 namespace internal {
26 template <typename Broadcast, typename XprType>
27 struct traits<TensorBroadcastingOp<Broadcast, XprType>> : public traits<XprType> {
28  typedef typename XprType::Scalar Scalar;
29  typedef traits<XprType> XprTraits;
30  typedef typename XprTraits::StorageKind StorageKind;
31  typedef typename XprTraits::Index Index;
32  typedef typename XprType::Nested Nested;
33  typedef std::remove_reference_t<Nested> Nested_;
34  static constexpr int NumDimensions = XprTraits::NumDimensions;
35  static constexpr int Layout = XprTraits::Layout;
36  typedef typename XprTraits::PointerType PointerType;
37 };
38 
39 template <typename Broadcast, typename XprType>
40 struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense> {
41  typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
42 };
43 
44 template <typename Broadcast, typename XprType>
45 struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1,
46  typename eval<TensorBroadcastingOp<Broadcast, XprType>>::type> {
47  typedef TensorBroadcastingOp<Broadcast, XprType> type;
48 };
49 
50 template <typename Dims>
51 struct is_input_scalar {
52  static const bool value = false;
53 };
54 template <>
55 struct is_input_scalar<Sizes<>> {
56  static const bool value = true;
57 };
58 template <typename std::ptrdiff_t... Indices>
59 struct is_input_scalar<Sizes<Indices...>> {
60  static constexpr bool value = (Sizes<Indices...>::total_size == 1);
61 };
62 
63 } // end namespace internal
64 
65 template <typename Broadcast, typename XprType>
66 class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors> {
67  public:
68  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
69  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
70  typedef typename XprType::CoeffReturnType CoeffReturnType;
71  typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
72  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
73  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
74 
75  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
76      : m_xpr(expr), m_broadcast(broadcast) {}
77 
78  EIGEN_DEVICE_FUNC const Broadcast& broadcast() const { return m_broadcast; }
79 
80  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
81 
82  protected:
83  typename XprType::Nested m_xpr;
84  const Broadcast m_broadcast;
85 };
86 
87 // Eval as rvalue
88 template <typename Broadcast, typename ArgType, typename Device>
89 struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> {
90  typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
91  typedef typename XprType::Index Index;
92  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
93  typedef DSizes<Index, NumDims> Dimensions;
94  typedef typename XprType::Scalar Scalar;
95  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
96  typedef typename XprType::CoeffReturnType CoeffReturnType;
97  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
98  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
99 
100  protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator won't be
101  // standard layout;
102  bool isCopy, nByOne, oneByN;
103 
104  public:
105  typedef StorageMemory<CoeffReturnType, Device> Storage;
106  typedef typename Storage::Type EvaluatorPointerType;
107 
108  enum {
109   IsAligned = false,
110   PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
111   BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
112   PreferBlockAccess = true,
113  RawAccess = false
114  };
115  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
116 
117  typedef std::remove_const_t<Scalar> ScalarNoConst;
118 
119  // We do block based broadcasting using a trick with 2x tensor rank and 0
120  // strides. See block method implementation for details.
121  typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
122 
123  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
124  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
125  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
126 
127  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock ArgTensorBlock;
128 
129  typedef internal::TensorMaterializedBlock<ScalarNoConst, NumDims, Layout, Index> TensorBlock;
130  //===--------------------------------------------------------------------===//
131 
132  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
133  : isCopy(false),
134  nByOne(false),
135  oneByN(false),
136  m_device(device),
137  m_broadcast(op.broadcast()),
138  m_impl(op.expression(), device) {
139  // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
140  // and store the result in a scalar. Instead one should first reshape the scalar into an N-D
141  // tensor (N >= 1) with a single element, and then broadcast.
142  EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
143  const InputDimensions& input_dims = m_impl.dimensions();
144  isCopy = true;
145  for (int i = 0; i < NumDims; ++i) {
146  eigen_assert(input_dims[i] > 0);
147  m_dimensions[i] = input_dims[i] * m_broadcast[i];
148  if (m_broadcast[i] != 1) {
149  isCopy = false;
150  }
151  }
152 
153  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
154  m_inputStrides[0] = 1;
155  m_outputStrides[0] = 1;
156  for (int i = 1; i < NumDims; ++i) {
157  m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
158  m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
159  }
160  } else {
161  m_inputStrides[NumDims - 1] = 1;
162  m_outputStrides[NumDims - 1] = 1;
163  for (int i = NumDims - 2; i >= 0; --i) {
164  m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
165  m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
166  }
167  }
168 
169  if (input_dims[0] == 1) {
170  oneByN = true;
171  for (int i = 1; i < NumDims; ++i) {
172  if (m_broadcast[i] != 1) {
173  oneByN = false;
174  break;
175  }
176  }
177  } else if (input_dims[NumDims - 1] == 1) {
178  nByOne = true;
179  for (int i = 0; i < NumDims - 1; ++i) {
180  if (m_broadcast[i] != 1) {
181  nByOne = false;
182  break;
183  }
184  }
185  }
186 
187  // Handle special formats like NCHW, whose input shape is '[1, N..., 1]' and
188  // whose broadcast shape is '[N, 1..., N]'.
189  if (!oneByN && !nByOne) {
190  if (input_dims[0] == 1 && input_dims[NumDims - 1] == 1 && NumDims > 2) {
191  nByOne = true;
192  oneByN = true;
193  for (int i = 1; i < NumDims - 1; ++i) {
194  if (m_broadcast[i] != 1) {
195  nByOne = false;
196  oneByN = false;
197  break;
198  }
199  }
200  }
201  }
202  }
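  // Editorial note (illustrative examples, not part of the original source):
  //   input dims [1, 5],    broadcast [4, 1]    -> oneByN (only dim 0 is broadcast)
  //   input dims [5, 1],    broadcast [1, 4]    -> nByOne (only the last dim is broadcast)
  //   input dims [1, 5, 1], broadcast [2, 1, 3] -> oneByN and nByOne (the NCHW-like case above)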
203 
204  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
205 
206  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
207   m_impl.evalSubExprsIfNeeded(NULL);
208  return true;
209  }
210 
211 #ifdef EIGEN_USE_THREADS
212  template <typename EvalSubExprsCallback>
213  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
214  m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
215  }
216 #endif // EIGEN_USE_THREADS
217 
218  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
219 
220  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const {
221   if (internal::is_input_scalar<internal::remove_all_t<InputDimensions>>::value) {
222    return m_impl.coeff(0);
223  }
224 
225  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
226  if (isCopy) {
227  return m_impl.coeff(index);
228  } else {
229  return coeffColMajor(index);
230  }
231  } else {
232  if (isCopy) {
233  return m_impl.coeff(index);
234  } else {
235  return coeffRowMajor(index);
236  }
237  }
238  }
239 
240  // TODO: attempt to speed this up. The integer divisions and modulo are slow
241  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
242   Index inputIndex = 0;
243   EIGEN_UNROLL_LOOP
244   for (int i = NumDims - 1; i > 0; --i) {
245  const Index idx = index / m_outputStrides[i];
246  if (internal::index_statically_eq<Broadcast>(i, 1)) {
247  eigen_assert(idx < m_impl.dimensions()[i]);
248  inputIndex += idx * m_inputStrides[i];
249  } else {
250  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
251  eigen_assert(idx % m_impl.dimensions()[i] == 0);
252  } else {
253  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
254  }
255  }
256  index -= idx * m_outputStrides[i];
257  }
258  if (internal::index_statically_eq<Broadcast>(0, 1)) {
259  eigen_assert(index < m_impl.dimensions()[0]);
260  inputIndex += index;
261  } else {
262  if (internal::index_statically_eq<InputDimensions>(0, 1)) {
263  eigen_assert(index % m_impl.dimensions()[0] == 0);
264  } else {
265  inputIndex += (index % m_impl.dimensions()[0]);
266  }
267  }
268  return inputIndex;
269  }
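  // Editorial note, a worked example of the mapping above (illustrative, col-major):
  // with input dims [2, 3] and broadcast [2, 1], the output dims are [4, 3],
  // m_outputStrides = [1, 4] and m_inputStrides = [1, 2]. For output index 7
  // (row 3, col 1): idx = 7 / 4 = 1 selects input column 1 % 3 = 1, contributing
  // 1 * 2 = 2; the remainder 7 - 1 * 4 = 3 is folded into the broadcast dimension
  // as 3 % 2 = 1, giving inputIndex = 3, i.e. row 1, col 1 of the input.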
270 
271  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const {
272   return m_impl.coeff(indexColMajor(index));
273  }
274 
275  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
276   Index inputIndex = 0;
277   EIGEN_UNROLL_LOOP
278   for (int i = 0; i < NumDims - 1; ++i) {
279  const Index idx = index / m_outputStrides[i];
280  if (internal::index_statically_eq<Broadcast>(i, 1)) {
281  eigen_assert(idx < m_impl.dimensions()[i]);
282  inputIndex += idx * m_inputStrides[i];
283  } else {
284  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
285  eigen_assert(idx % m_impl.dimensions()[i] == 0);
286  } else {
287  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
288  }
289  }
290  index -= idx * m_outputStrides[i];
291  }
292  if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
293  eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
294  inputIndex += index;
295  } else {
296  if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
297  eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
298  } else {
299  inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
300  }
301  }
302  return inputIndex;
303  }
304 
305  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const {
306   return m_impl.coeff(indexRowMajor(index));
307  }
308 
309  template <int LoadMode>
310  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const {
311   if (internal::is_input_scalar<internal::remove_all_t<InputDimensions>>::value) {
312    return internal::pset1<PacketReturnType>(m_impl.coeff(0));
313  }
314 
315  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
316  if (isCopy) {
317 #ifdef EIGEN_GPU_COMPILE_PHASE
318  // See PR 437: on NVIDIA P100 and K20m we observed a 3-4x speed-up by enforcing
319  // unaligned loads here. The reason is unclear though.
320  return m_impl.template packet<Unaligned>(index);
321 #else
322  return m_impl.template packet<LoadMode>(index);
323 #endif
324  } else if (oneByN && !nByOne) {
325  return packetNByOne<LoadMode>(index);
326  } else if (!oneByN && nByOne) {
327  return packetOneByN<LoadMode>(index);
328  } else if (oneByN && nByOne) {
329  return packetOneByNByOne<LoadMode>(index);
330  } else {
331  return packetColMajor<LoadMode>(index);
332  }
333  } else {
334  if (isCopy) {
335 #ifdef EIGEN_GPU_COMPILE_PHASE
336  // See above.
337  return m_impl.template packet<Unaligned>(index);
338 #else
339  return m_impl.template packet<LoadMode>(index);
340 #endif
341  } else if (oneByN && !nByOne) {
342  return packetOneByN<LoadMode>(index);
343  } else if (!oneByN && nByOne) {
344  return packetNByOne<LoadMode>(index);
345  } else if (oneByN && nByOne) {
346  return packetOneByNByOne<LoadMode>(index);
347  } else {
348  return packetRowMajor<LoadMode>(index);
349  }
350  }
351  }
352 
353  template <int LoadMode>
354  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne(Index index) const {
355   eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
356 
357  EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
358  Index startDim, endDim;
359  Index inputIndex, outputOffset, batchedIndex;
360 
361  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
362  startDim = NumDims - 1;
363  endDim = 1;
364  } else {
365  startDim = 0;
366  endDim = NumDims - 2;
367  }
368 
369  batchedIndex = index % m_outputStrides[startDim];
370  inputIndex = batchedIndex / m_outputStrides[endDim];
371  outputOffset = batchedIndex % m_outputStrides[endDim];
372 
373  if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
374  values[0] = m_impl.coeff(inputIndex);
375  return internal::pload1<PacketReturnType>(values);
376  } else {
377    EIGEN_UNROLL_LOOP
378    for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
379  if (outputOffset + cur < m_outputStrides[endDim]) {
380  values[i] = m_impl.coeff(inputIndex);
381  } else {
382  ++inputIndex;
383  inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
384  values[i] = m_impl.coeff(inputIndex);
385  outputOffset = 0;
386  cur = 0;
387  }
388  }
389  return internal::pload<PacketReturnType>(values);
390  }
391  }
392 
393  template <int LoadMode>
394  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const {
395   // Consider the flattened tensor [v0, ..., vN],
396  // Concatenates m_broadcast[dim] copies,
397  // [v0, ..., vN, v0, ..., vN, ... ]
398  // with dim == NumDims - 1 for col-major, dim == 0 for row-major.
399  eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
400 
401  // Size of flattened tensor.
402  const Index M =
403  (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_inputStrides[NumDims - 1] : m_inputStrides[0];
404  Index inputIndex = index % M;
405  if (inputIndex + PacketSize <= M) {
406  return m_impl.template packet<Unaligned>(inputIndex);
407  } else {
408  EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
409    EIGEN_UNROLL_LOOP
410    for (int i = 0; i < PacketSize; ++i) {
411  if (inputIndex > M - 1) {
412  inputIndex = 0;
413  }
414  values[i] = m_impl.coeff(inputIndex++);
415  }
416  return internal::pload<PacketReturnType>(values);
417  }
418  }
419 
420  template <int LoadMode>
421  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const {
422   // Consider the flattened tensor [v0, ..., vN],
423  // Interleaves m_broadcast[dim] copies,
424  // [v0, v0, ..., v1, v1, ..., vN, vN, ... ]
425  // with dim == 0 for col-major, dim == NumDims - 1 for row-major.
426  eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
427 
428  const Index M =
429  (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_broadcast[0] : m_broadcast[NumDims - 1];
430 
431  Index inputIndex = index / M;
432  Index outputOffset = index % M;
433  if (outputOffset + PacketSize <= M) {
434  return internal::pset1<PacketReturnType>(m_impl.coeff(inputIndex));
435  } else {
436  EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
437    EIGEN_UNROLL_LOOP
438    for (int i = 0; i < PacketSize; ++i) {
439  if (outputOffset < M) {
440  values[i] = m_impl.coeff(inputIndex);
441  ++outputOffset;
442  } else {
443  values[i] = m_impl.coeff(++inputIndex);
444  outputOffset = 1; // Next offset.
445  }
446  }
447  return internal::pload<PacketReturnType>(values);
448  }
449  }
450 
451  // Ignore the LoadMode and always use unaligned loads since we can't guarantee
452  // the alignment at compile time.
453  template <int LoadMode>
454  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const {
455   eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
456 
457  const Index originalIndex = index;
458 
459  Index inputIndex = 0;
460   EIGEN_UNROLL_LOOP
461   for (int i = NumDims - 1; i > 0; --i) {
462  const Index idx = index / m_outputStrides[i];
463  if (internal::index_statically_eq<Broadcast>(i, 1)) {
464  eigen_assert(idx < m_impl.dimensions()[i]);
465  inputIndex += idx * m_inputStrides[i];
466  } else {
467  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
468  eigen_assert(idx % m_impl.dimensions()[i] == 0);
469  } else {
470  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
471  }
472  }
473  index -= idx * m_outputStrides[i];
474  }
475  Index innermostLoc;
476  if (internal::index_statically_eq<Broadcast>(0, 1)) {
477  eigen_assert(index < m_impl.dimensions()[0]);
478  innermostLoc = index;
479  } else {
480  if (internal::index_statically_eq<InputDimensions>(0, 1)) {
481  eigen_assert(index % m_impl.dimensions()[0] == 0);
482  innermostLoc = 0;
483  } else {
484  innermostLoc = index % m_impl.dimensions()[0];
485  }
486  }
487  inputIndex += innermostLoc;
488 
489  // Todo: this could be extended to the second dimension if we're not
490  // broadcasting alongside the first dimension, and so on.
491  if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
492  return m_impl.template packet<Unaligned>(inputIndex);
493  } else {
494  EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
495  values[0] = m_impl.coeff(inputIndex);
496    EIGEN_UNROLL_LOOP
497    for (int i = 1; i < PacketSize; ++i) {
498  if (innermostLoc + i < m_impl.dimensions()[0]) {
499  values[i] = m_impl.coeff(inputIndex + i);
500  } else {
501  values[i] = coeffColMajor(originalIndex + i);
502  }
503  }
504  PacketReturnType rslt = internal::pload<PacketReturnType>(values);
505  return rslt;
506  }
507  }
508 
509  template <int LoadMode>
510  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const {
511   eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
512 
513  const Index originalIndex = index;
514 
515  Index inputIndex = 0;
516   EIGEN_UNROLL_LOOP
517   for (int i = 0; i < NumDims - 1; ++i) {
518  const Index idx = index / m_outputStrides[i];
519  if (internal::index_statically_eq<Broadcast>(i, 1)) {
520  eigen_assert(idx < m_impl.dimensions()[i]);
521  inputIndex += idx * m_inputStrides[i];
522  } else {
523  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
524  eigen_assert(idx % m_impl.dimensions()[i] == 0);
525  } else {
526  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
527  }
528  }
529  index -= idx * m_outputStrides[i];
530  }
531  Index innermostLoc;
532  if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
533  eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
534  innermostLoc = index;
535  } else {
536  if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
537  eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
538  innermostLoc = 0;
539  } else {
540  innermostLoc = index % m_impl.dimensions()[NumDims - 1];
541  }
542  }
543  inputIndex += innermostLoc;
544 
545  // Todo: this could be extended to the second dimension if we're not
546  // broadcasting alongside the first dimension, and so on.
547  if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims - 1]) {
548  return m_impl.template packet<Unaligned>(inputIndex);
549  } else {
550  EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
551  values[0] = m_impl.coeff(inputIndex);
552    EIGEN_UNROLL_LOOP
553    for (int i = 1; i < PacketSize; ++i) {
554  if (innermostLoc + i < m_impl.dimensions()[NumDims - 1]) {
555  values[i] = m_impl.coeff(inputIndex + i);
556  } else {
557  values[i] = coeffRowMajor(originalIndex + i);
558  }
559  }
560  PacketReturnType rslt = internal::pload<PacketReturnType>(values);
561  return rslt;
562  }
563  }
564 
565  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
566   double compute_cost = TensorOpCost::AddCost<Index>();
567  if (!isCopy && NumDims > 0) {
568    EIGEN_UNROLL_LOOP
569    for (int i = NumDims - 1; i > 0; --i) {
570  compute_cost += TensorOpCost::DivCost<Index>();
571  if (internal::index_statically_eq<Broadcast>(i, 1)) {
572  compute_cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
573  } else {
574  if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
575  compute_cost +=
576  TensorOpCost::MulCost<Index>() + TensorOpCost::ModCost<Index>() + TensorOpCost::AddCost<Index>();
577  }
578  }
579  compute_cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
580  }
581  }
582  return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
583  }
584 
585  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
586   // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
587  // tensors. But this might need further tuning.
588  const size_t target_size = m_device.firstLevelCacheSize();
589   return internal::TensorBlockResourceRequirements::merge(
590     m_impl.getResourceRequirements(), internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
591  }
592 
593  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
594                                                          bool /*root_of_expr_ast*/ = false) const {
595  BlockBroadcastingParams params = blockBroadcastingParams(desc);
596 
597  if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
598  return emptyBlock();
599  }
600 
601  // Prepare storage for the materialized broadcasting result.
602  const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch);
603  ScalarNoConst* materialized_output = block_storage.data();
604 
605  // We potentially will need to materialize input blocks.
606  size_t materialized_input_size = 0;
607  ScalarNoConst* materialized_input = NULL;
608 
609   // Initialize block broadcasting iterator state for outer dimensions (outer
610   // with regard to bcast dimension). Dimensions in this array are always in
611  // inner_most -> outer_most order (col major layout).
612   array<BlockBroadcastingIteratorState, NumDims> it;
613   int idx = 0;
614 
615  for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
616  const Index dim = IsColMajor ? i : NumDims - 1 - i;
617  it[idx].size = params.output_dims[dim];
618  it[idx].count = 0;
619  it[idx].output_stride = m_outputStrides[dim];
620  it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
621  idx++;
622  }
623 
624  // Write output into the beginning of `materialized_output`.
625  Index output_offset = 0;
626 
627  // We will fill output block by broadcasting along the bcast dim, and
628  // iterating over outer dimension.
629  const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
630 
631  for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
632  ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
633  Index bcast_offset = desc.offset() + output_offset;
634 
635  // Broadcast along the bcast dimension.
636  num_output_coeffs += BroadcastBlockAlongBcastDim(params, bcast_offset, scratch, bcast_output, &materialized_input,
637  &materialized_input_size);
638 
639  // Switch to the next outer dimension.
640  for (int j = 0; j < idx; ++j) {
641  if (++it[j].count < it[j].size) {
642  output_offset += it[j].output_stride;
643  break;
644  }
645  it[j].count = 0;
646  output_offset -= it[j].output_span;
647  }
648  }
649 
650  return block_storage.AsTensorMaterializedBlock();
651  }
652 
653  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
654 
655  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
656 
657  Broadcast functor() const { return m_broadcast; }
658 
659  private:
660  static constexpr bool IsColMajor = static_cast<int>(Layout) == static_cast<int>(ColMajor);
661 
662  // We will build a general case block broadcasting on top of a broadcasting
663  // primitive that will do broadcasting only for the inner dimension(s) along
664  // the first dimension smaller than the input size (it's called `bcast_dim`).
665  //
666  // Example:
667  // dim: 0 1 2 (ColMajor)
668  // input size: [9, 3, 6]
669  // block size: [9, 2, 6]
670  //
671  // We will compute broadcasted block by iterating over the outer dimensions
672  // before `bcast_dim` (only dimension `2` in this example) and computing
673  // broadcasts along the `bcast_dim` (dimension `1` in this example).
674 
675  // BlockBroadcastingParams holds precomputed parameters for broadcasting a
676  // single block along the broadcasting dimension. Sizes and strides along the
677  // `bcast_dim` might be invalid, they will be adjusted later in
678  // `BroadcastBlockAlongBcastDim`.
679  struct BlockBroadcastingParams {
680  Dimensions input_dims; // input expression dimensions
681  Dimensions output_dims; // output block sizes
682  Dimensions output_strides; // output block strides
683 
684  int inner_dim_count; // count inner dimensions matching in size
685  int bcast_dim; // broadcasting dimension index
686  Index bcast_dim_size; // broadcasting dimension size
687  Index inner_dim_size; // inner dimensions size
688 
689  // Block sizes and strides for the input block where all dimensions before
690  // `bcast_dim` are equal to `1`.
691   Dimensions input_block_sizes;
692   Dimensions input_block_strides;
693 
694  // Block sizes and strides for blocks with extra dimensions and strides `0`.
695   BroadcastDimensions bcast_block_sizes;
696   BroadcastDimensions bcast_block_strides;
697   BroadcastDimensions bcast_input_strides;
698  };
699 
700  struct BlockBroadcastingIteratorState {
701   Index size;
702   Index count;
703   Index output_stride;
704   Index output_span;
705  };
706 
707  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams blockBroadcastingParams(TensorBlockDesc& desc) const {
708   BlockBroadcastingParams params;
709 
710  params.input_dims = Dimensions(m_impl.dimensions());
711 
712  // Output block sizes and strides.
713  params.output_dims = desc.dimensions();
714  params.output_strides = internal::strides<Layout>(params.output_dims);
715 
716  // Find the broadcasting dimension (first dimension with output size smaller
717  // than the input size).
718  params.bcast_dim = 0;
719  params.bcast_dim_size = 1;
720  params.inner_dim_size = 1;
721 
722  // Count the number of inner dimensions that have the same size in the block
723  // and in the broadcast expression.
724  params.inner_dim_count = 0;
725 
726  for (int i = 0; i < NumDims; ++i) {
727  const int dim = IsColMajor ? i : NumDims - i - 1;
728 
729  if (params.output_dims[dim] == m_dimensions[dim]) {
730  params.inner_dim_size *= params.output_dims[dim];
731  ++params.inner_dim_count;
732  continue;
733  }
734 
735  // First non-matching dimension is the broadcasting dimension.
736  eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
737  params.bcast_dim = dim;
738  params.bcast_dim_size = params.output_dims[dim];
739  break;
740  }
741 
742  // Calculate the input block size for looking into the input.
743  for (int i = 0; i < params.inner_dim_count; ++i) {
744  const int dim = IsColMajor ? i : NumDims - i - 1;
745  params.input_block_sizes[dim] = params.input_dims[dim];
746  }
747  for (int i = params.inner_dim_count; i < NumDims; ++i) {
748  const int dim = IsColMajor ? i : NumDims - i - 1;
749  params.input_block_sizes[dim] = 1;
750  }
751  params.input_block_strides = internal::strides<Layout>(params.input_block_sizes);
752 
753  // Broadcast with the 0-stride trick: Create 1 extra dim for each
754  // broadcast, set the input stride to 0.
755  //
756  // When ColMajor:
757  //
758  // - bcast_block_sizes:
759  // [d_0, b_0, d_1, b_1, ...]
760  //
761  // - bcast_block_strides:
762  // [output_block_strides[0], output_block_strides[0] * d_0,
763  // output_block_strides[1], output_block_strides[1] * d_1,
764  // ...]
765  //
766  // - bcast_input_strides:
767  // [input_block_strides[0], 0,
768  // input_block_strides[1], 0,
769  // ...].
770  //
771  for (int i = 0; i < params.inner_dim_count; ++i) {
772  const int dim = IsColMajor ? i : NumDims - i - 1;
773 
774  const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
775  const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;
776 
777  params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
778  params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
779  params.bcast_block_strides[copy_dim] = params.output_strides[dim];
780  params.bcast_block_strides[broadcast_dim] = params.output_strides[dim] * params.input_dims[dim];
781  params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
782  params.bcast_input_strides[broadcast_dim] = 0;
783  }
784 
785  for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
786  const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
787  params.bcast_block_sizes[dim] = 1;
788  params.bcast_block_strides[dim] = 0;
789  params.bcast_input_strides[dim] = 0;
790  }
791 
792  return params;
793  }
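  // Editorial note, a concrete instance of the scheme above (illustrative, col-major):
  // with input dims [2, 3], broadcast [1, 2] and a full output block of dims [2, 6]
  // (output strides [1, 2]), inner_dim_count == 2 and
  //   bcast_block_sizes   = [2, 1, 3, 2]
  //   bcast_block_strides = [1, 2, 2, 6]
  //   bcast_input_strides = [1, 0, 2, 0]
  // i.e. each input dimension is followed by its broadcast factor, and the zero
  // input strides re-read the same input coefficients for every broadcast copy.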
794 
795  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
796   DSizes<Index, NumDims> dimensions;
797   for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
798   return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
799  }
800 
801  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
802     BlockBroadcastingParams params, Index bcast_offset, TensorBlockScratch& scratch,
803  ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, size_t* materialized_input_size) const {
804  if (params.bcast_dim_size == 1) {
805  // We just need one block read using the ready-set values above.
806  return BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
807  params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch,
808  materialized_output, materialized_input, materialized_input_size);
809 
810  } else if (params.input_dims[params.bcast_dim] == 1) {
811  // Broadcast bcast dimension (< NumDims) by bcast_dim_size.
812  const int broadcast_bcast_dim =
813  IsColMajor ? 2 * params.inner_dim_count + 1 : 2 * NumDims - 2 * params.inner_dim_count - 2;
814 
815  params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
816  params.bcast_input_strides[broadcast_bcast_dim] = 0;
817  params.bcast_block_strides[broadcast_bcast_dim] = params.output_strides[params.bcast_dim];
818 
819  return BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
820  params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch,
821  materialized_output, materialized_input, materialized_input_size);
822 
823  } else {
824  // Keep track of the total number of the coefficients written to the
825  // output block.
826  Index num_output_coeffs = 0;
827 
828  // The general case. Let's denote the output block as
829  //
830  // x[..., a:a+bcast_dim_size, :, ..., :]
831  //
832  // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
833  // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
834  // sub-blocks:
835  //
836  // (1) a:b, where b is the smallest multiple of
837  // input_dims[bcast_dim_start] in [a, a+bcast_dim_size].
838  //
839  // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start]
840  // in [a, a+bcast_dim_size].
841  //
842  // (3) c:a+bcast_dim_size .
843  //
844  // Or, when b and c do not exist, we just need to process the whole block
845  // together.
846 
847  // Find a.
848  const Index bcast_dim_left_index = bcast_offset / m_outputStrides[params.bcast_dim];
849 
850  // Find b and c.
851  const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];
852 
853  // First multiple after a. This is b when <= bcast_dim_left_index +
854  // bcast_dim_size.
855  const Index first_multiple =
856  numext::div_ceil<Index>(bcast_dim_left_index, input_bcast_dim_size) * input_bcast_dim_size;
857 
858  if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
859  // b exists, so does c. Find it.
860  const Index last_multiple =
861  (bcast_dim_left_index + params.bcast_dim_size) / input_bcast_dim_size * input_bcast_dim_size;
862  const int copy_bcast_dim =
863  IsColMajor ? 2 * params.inner_dim_count : 2 * NumDims - 2 * params.inner_dim_count - 1;
864  const int broadcast_bcast_dim =
865  IsColMajor ? 2 * params.inner_dim_count + 1 : 2 * NumDims - 2 * params.inner_dim_count - 2;
866 
867  if (first_multiple > bcast_dim_left_index) {
868  const Index head_size = first_multiple - bcast_dim_left_index;
869  params.input_block_sizes[params.bcast_dim] = head_size;
870  params.bcast_block_sizes[copy_bcast_dim] = head_size;
871  params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim];
872  params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim];
873  params.bcast_block_sizes[broadcast_bcast_dim] = 1;
874  params.bcast_input_strides[broadcast_bcast_dim] = 0;
875  params.bcast_block_strides[broadcast_bcast_dim] =
876  params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim];
877 
878  num_output_coeffs +=
879  BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
880  params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch,
881  materialized_output, materialized_input, materialized_input_size);
882  }
883  if (first_multiple < last_multiple) {
884  params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
885  params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
886  params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim];
887  params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim];
888  params.bcast_block_sizes[broadcast_bcast_dim] = (last_multiple - first_multiple) / input_bcast_dim_size;
889  params.bcast_input_strides[broadcast_bcast_dim] = 0;
890  params.bcast_block_strides[broadcast_bcast_dim] =
891  params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim];
892  const Index offset = (first_multiple - bcast_dim_left_index) * m_outputStrides[params.bcast_dim];
893 
894  num_output_coeffs +=
895  BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
896  params.bcast_block_strides, params.bcast_input_strides, bcast_offset, offset, scratch,
897  materialized_output, materialized_input, materialized_input_size);
898  }
899  if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
900  const Index tail_size = bcast_dim_left_index + params.bcast_dim_size - last_multiple;
901  params.input_block_sizes[params.bcast_dim] = tail_size;
902  params.bcast_block_sizes[copy_bcast_dim] = tail_size;
903  params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim];
904  params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim];
905  params.bcast_block_sizes[broadcast_bcast_dim] = 1;
906  params.bcast_input_strides[broadcast_bcast_dim] = 0;
907  params.bcast_block_strides[broadcast_bcast_dim] =
908  params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim];
909  const Index offset = (last_multiple - bcast_dim_left_index) * m_outputStrides[params.bcast_dim];
910 
911  num_output_coeffs +=
912  BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
913  params.bcast_block_strides, params.bcast_input_strides, bcast_offset, offset, scratch,
914  materialized_output, materialized_input, materialized_input_size);
915  }
916  } else {
917  // b and c do not exist.
918  const int copy_bcast_dim =
919  IsColMajor ? 2 * params.inner_dim_count : 2 * NumDims - 2 * params.inner_dim_count - 1;
920  params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
921  params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
922  params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim];
923  params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim];
924 
925  num_output_coeffs +=
926  BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
927  params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch,
928  materialized_output, materialized_input, materialized_input_size);
929  }
930 
931  return num_output_coeffs;
932  }
933  }
934 
935  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
936     const Dimensions& input_block_sizes, const Dimensions& input_block_strides,
937  const BroadcastDimensions& bcast_block_sizes, const BroadcastDimensions& bcast_block_strides,
938  const BroadcastDimensions& bcast_input_strides, Index bcast_offset, Index offset, TensorBlockScratch& scratch,
939  ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, size_t* materialized_input_size) const {
940  // ---------------------------------------------------------------------- //
941  // Tensor block descriptor for reading block from the input.
942  const Index input_offset = bcast_offset + offset;
943  TensorBlockDesc input_desc(IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
944  input_block_sizes);
945 
946  ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
947 
948  // ---------------------------------------------------------------------- //
949  // Materialize input block into a temporary memory buffer only if it's not
950  // already available in the arg block.
951  const ScalarNoConst* input_buffer = NULL;
952 
953  if (input_block.data() != NULL) {
954  // Input block already has raw data, there is no need to materialize it.
955  input_buffer = input_block.data();
956 
957  } else {
958  // Otherwise we have to do block assignment into a temporary buffer.
959 
960  // Maybe reuse previously allocated buffer, or allocate a new one with a
961  // scratch allocator.
962  const size_t input_total_size = input_block_sizes.TotalSize();
963  if (*materialized_input == NULL || *materialized_input_size < input_total_size) {
964  *materialized_input_size = input_total_size;
965  void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
966  *materialized_input = static_cast<ScalarNoConst*>(mem);
967  }
968 
969    typedef internal::TensorBlockAssignment<ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
970       TensorBlockAssignment;
971 
972  TensorBlockAssignment::Run(
973  TensorBlockAssignment::target(input_block_sizes, input_block_strides, *materialized_input),
974  input_block.expr());
975 
976  input_buffer = *materialized_input;
977  }
978 
979  // ---------------------------------------------------------------------- //
980  // Copy data from materialized input block to the materialized output, using
981  // given broadcast strides (strides with zeroes).
982   typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout> TensorBlockIO;
983 
984  typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
985  typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides, materialized_output + offset);
986 
987  return TensorBlockIO::Copy(dst, src);
988  }
989 
990  protected:
991  const Device EIGEN_DEVICE_REF m_device;
992  const std::remove_reference_t<Broadcast> m_broadcast;
993  Dimensions m_dimensions;
994  array<Index, NumDims> m_outputStrides;
995  array<Index, NumDims> m_inputStrides;
996  TensorEvaluator<ArgType, Device> m_impl;
997 };
998 
999 } // end namespace Eigen
1000 
1001 #endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
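Usage sketch (editorial addition, not part of the header above): the broadcasting op is normally created through TensorBase::broadcast(), and a rank-0 (scalar) tensor has to be reshaped to a one-element tensor before broadcasting, as the static assert in the evaluator's constructor requires. Values and dimensions below are chosen only for illustration.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 2> t(2, 3);
  t.setValues({{1, 2, 3}, {4, 5, 6}});

  // Repeat twice along dim 0 and three times along dim 1: the result is 4 x 9.
  Eigen::array<Eigen::Index, 2> bcast{{2, 3}};
  Eigen::Tensor<float, 2> r = t.broadcast(bcast);

  // A rank-0 tensor cannot be broadcast directly; reshape it to rank 1 first.
  Eigen::Tensor<float, 0> s;
  s() = 7.f;
  Eigen::array<Eigen::Index, 1> one{{1}};
  Eigen::array<Eigen::Index, 1> rep{{5}};
  Eigen::Tensor<float, 1> v = s.reshape(one).broadcast(rep);  // [7, 7, 7, 7, 7]

  std::cout << r << "\n\n" << v << "\n";
  return 0;
}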